<a href="https://colab.research.google.com/github/EAFIT-BI/BI-Introduction-2024-1/blob/main/Transformaci%C3%B3n_de_datos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformación de datos
## Transformación de datos numéricos
### Agrupación o *Binning*

In [1]:
#cargamos librerías y datos
import pandas as pd

data = pd.read_excel("/content/Customers.xlsx",
                     sheet_name = "Customers")
data.head(2)


Unnamed: 0,CustID,Sex,Race,BirthDate,College,HouseholdSize,ZipCode,Income,Spending2020,Spending2021,NumOfOrders,DaysSinceLast,Satisfaction,Channel
0,1530016,Female,Black,1986-12-16,Yes,5,90047,53000,287,241,3,101,Very Dissatisfied,SM
1,1531136,Male,White,1993-05-09,Yes,5,90026,94000,1227,843,12,262,Neutral,TV


In [None]:
#creamos el score de "reciente" a la inversa
data["DaysSinceLastReverse"] = data.DaysSinceLast * -1

In [None]:
#creamos los 5 grupos basados en los cuantiles para
#cada variable
data['Recency'] = pd.qcut(data.DaysSinceLastReverse, q = 5,
                          labels = range(1,6))
data['Frequency'] = pd.qcut(data.NumOfOrders, q = 5,
                            labels = range(1,6))
data['Monetary'] = pd.qcut(data.Spending2021, q = 5,
                           labels = range(1,6))

In [None]:
data.head(1)

In [None]:
#creamos la variable RFM
data['RFM'] = data.Recency.astype('str') + \
data.Frequency.astype('str') + data.Monetary.astype('str')

In [None]:
#dividimos los ingresos en los 5 grupos de igual tamaño

data['BinnedIncome'] = pd.cut(data.Income, bins = 5,
                              labels = range(1,6))
#visualizamos el tamaño de los grupos
data.groupby('BinnedIncome').size()

In [None]:
#para generar las membresías, utilizaremos la misma función
#con variaciones en los argumentos

data['Membership_Tier'] = pd.cut(data.Spending2021,
                                 bins = [0, 250, 1000, float('inf')],
                                   labels = ['Bronze', 'Silver', 'Gold'])

In [None]:
data.head(3)

Unnamed: 0,CustID,Sex,Race,BirthDate,College,HouseholdSize,ZipCode,Income,Spending2020,Spending2021,...,DaysSinceLast,Satisfaction,Channel,DaysSinceLastReverse,Recency,Frequency,Monetary,RFM,BinnedIncome,Membership_Tier
0,1530016,Female,Black,1986-12-16,Yes,5,90047,53000,287,241,...,101,Very Dissatisfied,SM,-101,4,1,1,411,1,Bronze
1,1531136,Male,White,1993-05-09,Yes,5,90026,94000,1227,843,...,262,Neutral,TV,-262,2,4,4,244,3,Silver
2,1532160,Male,Black,1966-05-22,Yes,2,90027,64000,523,719,...,122,Very Satisfied,TV,-122,4,3,3,433,2,Silver


## Transformaciones matemáticas

In [None]:
#para analizar la diferencia de los gastos

data['SpendingDiff'] = data.Spending2021 - data.Spending2020

In [None]:
#para la diferencia porcentual

data['PctSpendingDiff'] = round((data.SpendingDiff/
                                 data.Spending2020)*100, 2).astype('str') + '%'

In [None]:
#ahora calculamos el logaritmo del ingreso
import numpy as np

data['IncomeLn'] = np.log(data.Income)

In [None]:
#calculamos la edad con referencia a la fecha de hoy
# 17 de abril de 2024

data['Age'] =np.floor((pd.to_datetime('2024-04-17') - data.BirthDate)/
                      np.timedelta64(1, 'Y'))

In [None]:
#extraemos el mes del cumpleaños
data['BirthMonth'] = pd.DatetimeIndex(data.BirthDate).month

In [None]:
data.head(1)

Unnamed: 0,CustID,Sex,Race,BirthDate,College,HouseholdSize,ZipCode,Income,Spending2020,Spending2021,...,Frequency,Monetary,RFM,BinnedIncome,Membership_Tier,SpendingDiff,PctSpendingDiff,IncomeLn,Age,BirthMonth
0,1530016,Female,Black,1986-12-16,Yes,5,90047,53000,287,241,...,1,1,411,1,Bronze,-46,-16.03%,10.878047,37.0,12


## Transformación de datos categóricos

### Reducción de categorías


In [None]:
#cargamos nuevamente los datos para descartar la
#información agregada en los pasos anteriores

data = pd.read_excel("/content/Customers.xlsx",
                     sheet_name = "Customers")

In [None]:
#visualizamos las categorías de la variable raza
data.groupby('Race').size()

In [None]:
#agruparemos las dos categorías con menor
#cantidad de individuos
data['Race1'] = np.where(data.Race.isin(['American Indian',
                                         'Pacific Islander']),
                         'Other', data.Race)

In [None]:
#visualizamos la distribución de las categorías de race1
data.groupby('Race1').size()

## Variables indicadoras (*one_hot encoding*)

Codifica las categorías según su presencia o ausencia(1,0)

In [2]:
channelDummies = pd.get_dummies(data.Channel,
                                prefix = 'Channel').drop(columns =
                                                         'Channel_Web')

In [None]:
channelDummies.head(3)

In [None]:
data = pd.concat([data, channelDummies], axis = 1)
data.head(2)

## Puntuación de categoría (*label enconding*)

En este caso, la transformación tiene, de forma implícita, un orden.

In [6]:
data['SatisfactionScore'] = data.Satisfaction.replace({
    'Very Dissatisfied': 1,
    'Somewhat Dissatisfied': 2,
    'Neutral': 3,
    'Somewhat Satisfied': 4,
    'Very Satisfied': 5})

data.head(3)

Unnamed: 0,CustID,Sex,Race,BirthDate,College,HouseholdSize,ZipCode,Income,Spending2020,Spending2021,...,DaysSinceLast,Satisfaction,Channel,Channel_Referral,Channel_SM,Channel_TV,Channel_Referral.1,Channel_SM.1,Channel_TV.1,SatisfactionScore
0,1530016,Female,Black,1986-12-16,Yes,5,90047,53000,287,241,...,101,Very Dissatisfied,SM,False,True,False,False,True,False,1
1,1531136,Male,White,1993-05-09,Yes,5,90026,94000,1227,843,...,262,Neutral,TV,False,False,True,False,False,True,3
2,1532160,Male,Black,1966-05-22,Yes,2,90027,64000,523,719,...,122,Very Satisfied,TV,False,False,True,False,False,True,5
