In [55]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from scipy.stats.mstats import winsorize


## Funciones

In [56]:
def windsorize_columns(df, columns, limits):
    for column in columns:
        # Aplicar la winsorización a cada columna
        df[column] = winsorize(df[column], limits=limits)
    return df

## Cargo el dataset con las modificaciones iniciales

In [57]:
df = pd.read_csv('./data/data_preprocessing.csv', index_col=0)
df.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,...,5,0,0,0,0,0,0,3,11,0


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   int64 

## Primeras modificaciones

In [59]:
# Cambio el índice
df.set_index('ID', inplace=True)

# Cambio tipo a datetime
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'])

# Eliminar columnas
df.drop(['Z_CostContact', 'Z_Revenue'], axis=1, inplace=True)

# Cambiar tipo a categóricas
cols_to_category = ['Education', 'Marital_Status','Complain']
df[cols_to_category] = df[cols_to_category].astype('category')

# Eliminación de YOLO y cambio de nombre a Absurd
df['Marital_Status'] = df['Marital_Status'].cat.rename_categories({'Absurd': 'Others'})
df = df[df['Marital_Status'] != 'YOLO']

df = df.drop_duplicates()


In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2056 entries, 5524 to 9405
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Year_Birth           2056 non-null   int64         
 1   Education            2056 non-null   category      
 2   Marital_Status       2056 non-null   category      
 3   Income               2032 non-null   float64       
 4   Kidhome              2056 non-null   int64         
 5   Teenhome             2056 non-null   int64         
 6   Dt_Customer          2056 non-null   datetime64[ns]
 7   Recency              2056 non-null   int64         
 8   MntWines             2056 non-null   int64         
 9   MntFruits            2056 non-null   int64         
 10  MntMeatProducts      2056 non-null   int64         
 11  MntFishProducts      2056 non-null   int64         
 12  MntSweetProducts     2056 non-null   int64         
 13  MntGoldProds         2056 non-null 

In [61]:
df.loc[(df['Marital_Status'] == 'Absurd') | (df['Marital_Status']== 'YOLO' )]

Unnamed: 0_level_0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,...,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


## Variable target

In [82]:
df['Response'].value_counts(True)

Response
0    0.848249
1    0.151751
Name: proportion, dtype: float64

Se encuentra muy desbalanceada.

## División en train y test

In [63]:
train_set, test_set = train_test_split(df,test_size=0.20,random_state=42,stratify=df['Response'])
print('Train_set',train_set.shape)
print('Test_set', test_set.shape)

Train_set (1644, 26)
Test_set (412, 26)


## Imputación de nulos

Flag con los nulos

In [64]:
train_set['income_missing'] = np.where(train_set['Income'].isnull(),1,0)

Otras variables con correlación

In [65]:
corr_matrix = train_set.corr(numeric_only=True)
abs(corr_matrix['Income']).sort_values(ascending=False) > 0.5
corr_matrix[(corr_matrix['Income'] > 0.5)].index

Index(['Income', 'MntWines', 'MntMeatProducts', 'MntFishProducts',
       'MntSweetProducts', 'NumCatalogPurchases', 'NumStorePurchases'],
      dtype='object')

IterativeImputer para realizar la imputación de nulos

In [66]:
features_iterative = ['Income', 'MntWines', 'MntMeatProducts', 'NumCatalogPurchases']
imputer_iterativo = IterativeImputer()

imputer_iterativo.fit(train_set[features_iterative])

train_set['Income'] = imputer_iterativo.transform(train_set[features_iterative])

# Aplicar al test_set
test_set['Income'] = imputer_iterativo.transform(test_set[features_iterative])

Comprobación

In [67]:
train_set['Income'].isna().sum()

0

## Nuevas variables

Imputación a train_set

In [68]:
# Edad tomando como referencia 2015 para poder obtener la variables
train_set['age']= 2015 - train_set['Year_Birth']

# Antigüedad como cliente tomando como referencia 2015 para poder obtener la variable
train_set['customes_seniority'] = 2015 - train_set['Dt_Customer'].dt.year

# Miembros totales en casa
dicc_status = {
    'Together': 2,  # Generalmente representa a dos personas viviendo juntas
    'Divorced': 1,  # Generalmente representa a una persona que está divorciada
    'Married': 2,   # Generalmente representa a dos personas casadas
    'Single': 1,    # Generalmente representa a una persona soltera
    'Widow': 1,     # Generalmente representa a una persona viuda
    'Alone': 1,     # Generalmente representa a una persona viviendo sola
    'Others': 1     # Puede variar, pero generalmente representa a una persona
}
train_set['Status_members'] = train_set['Marital_Status'].map(dicc_status)
train_set['Household_members'] = train_set['Status_members']+train_set['Kidhome']+train_set['Teenhome']

# Gasto total
train_set['Total_amount'] = train_set.filter(like='Mnt').sum(axis = 1)

# Nº compras totales
train_set['Total_purchase'] = train_set.filter(like='Purchases').sum(axis = 1)

# Gasto promedio
train_set['Median_amount_purchase'] = np.where(
    train_set['Total_purchase'] != 0,
    train_set['Total_amount'] / train_set['Total_purchase'],
    0)


# Nº ofertas aceptadas
train_set['Total_cmp'] = train_set.filter(like='Accepted').apply(pd.to_numeric, errors='coerce').sum(axis=1)

# % de ofertas aceptadas
train_set_numeric = train_set.filter(like='Cmp').apply(pd.to_numeric, errors='coerce')
train_set['Total_%_cmp'] = train_set['Total_cmp'] / len(train_set_numeric.columns)

# Elimino columnas intermedias
train_set = train_set.drop(columns=['Status_members'])

In [69]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1644 entries, 10270 to 10779
Data columns (total 35 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Year_Birth              1644 non-null   int64         
 1   Education               1644 non-null   category      
 2   Marital_Status          1644 non-null   category      
 3   Income                  1644 non-null   float64       
 4   Kidhome                 1644 non-null   int64         
 5   Teenhome                1644 non-null   int64         
 6   Dt_Customer             1644 non-null   datetime64[ns]
 7   Recency                 1644 non-null   int64         
 8   MntWines                1644 non-null   int64         
 9   MntFruits               1644 non-null   int64         
 10  MntMeatProducts         1644 non-null   int64         
 11  MntFishProducts         1644 non-null   int64         
 12  MntSweetProducts        1644 non-null   int64   

In [70]:
train_set.iloc[:,25:36]

Unnamed: 0_level_0,Response,income_missing,age,customes_seniority,Household_members,Total_amount,Total_purchase,Median_amount_purchase,Total_cmp,Total_%_cmp
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10270,0,0,34,2,3.0,66,8,8.250000,0,0.0
6969,0,0,33,2,3.0,233,13,17.923077,0,0.0
3921,0,0,63,3,1.0,205,10,20.500000,0,0.0
4356,0,0,44,2,3.0,1574,14,112.428571,2,0.4
8727,0,0,37,2,2.0,1150,29,39.655172,0,0.0
...,...,...,...,...,...,...,...,...,...,...
1646,0,0,43,1,3.0,46,7,6.571429,0,0.0
6336,0,0,55,1,4.0,48,10,4.800000,0,0.0
10203,0,0,25,2,2.0,42,5,8.400000,0,0.0
7533,0,0,51,2,4.0,187,13,14.384615,0,0.0


In [71]:
train_set['Median_amount_purchase'].describe

<bound method NDFrame.describe of ID
10270      8.250000
6969      17.923077
3921      20.500000
4356     112.428571
8727      39.655172
            ...    
1646       6.571429
6336       4.800000
10203      8.400000
7533      14.384615
10779      9.800000
Name: Median_amount_purchase, Length: 1644, dtype: float64>

Imputación a test_set

In [72]:
# Edad tomando como referencia 2000
test_set['age'] = 2015 - test_set['Year_Birth']

# Antigüedad como cliente tomando como referencia 2020
test_set['customes_seniority'] = 2015 - test_set['Dt_Customer'].dt.year

# Miembros totales en casa
dicc_status = {
    'Together': 2,  # Generalmente representa a dos personas viviendo juntas
    'Divorced': 1,  # Generalmente representa a una persona que está divorciada
    'Married': 2,   # Generalmente representa a dos personas casadas
    'Single': 1,    # Generalmente representa a una persona soltera
    'Widow': 1,     # Generalmente representa a una persona viuda
    'Alone': 1,     # Generalmente representa a una persona viviendo sola
    'Others': 1     # Puede variar, pero generalmente representa a una persona
}
test_set['Status_members'] = test_set['Marital_Status'].map(dicc_status)
test_set['Household_members'] = test_set['Status_members'] + test_set['Kidhome'] + test_set['Teenhome']

# Gasto total
test_set['Total_amount'] = test_set.filter(like='Mnt').sum(axis=1)

# Nº compras totales
test_set['Total_purchase'] = test_set.filter(like='Purchases').sum(axis=1)

# Gasto promedio
test_set['Median_amount_purchase'] = np.where(
    test_set['Total_purchase'] != 0,
    test_set['Total_amount'] / test_set['Total_purchase'],
    0)

# Nº ofertas aceptadas
test_set['Total_cmp'] = test_set.filter(like='Accepted').apply(pd.to_numeric, errors='coerce').sum(axis=1)

# % de ofertas aceptadas
test_set_numeric = test_set.filter(like='Cmp').apply(pd.to_numeric, errors='coerce')
test_set['Total_%_cmp'] = test_set['Total_cmp'] / len(test_set_numeric.columns)

# Elimino columnas intermedias creadas para crear terceras columnas
test_set = test_set.drop(columns=['Status_members'])

In [73]:
test_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 412 entries, 8955 to 2678
Data columns (total 34 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Year_Birth              412 non-null    int64         
 1   Education               412 non-null    category      
 2   Marital_Status          412 non-null    category      
 3   Income                  412 non-null    float64       
 4   Kidhome                 412 non-null    int64         
 5   Teenhome                412 non-null    int64         
 6   Dt_Customer             412 non-null    datetime64[ns]
 7   Recency                 412 non-null    int64         
 8   MntWines                412 non-null    int64         
 9   MntFruits               412 non-null    int64         
 10  MntMeatProducts         412 non-null    int64         
 11  MntFishProducts         412 non-null    int64         
 12  MntSweetProducts        412 non-null    int64      

In [74]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1644 entries, 10270 to 10779
Data columns (total 35 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Year_Birth              1644 non-null   int64         
 1   Education               1644 non-null   category      
 2   Marital_Status          1644 non-null   category      
 3   Income                  1644 non-null   float64       
 4   Kidhome                 1644 non-null   int64         
 5   Teenhome                1644 non-null   int64         
 6   Dt_Customer             1644 non-null   datetime64[ns]
 7   Recency                 1644 non-null   int64         
 8   MntWines                1644 non-null   int64         
 9   MntFruits               1644 non-null   int64         
 10  MntMeatProducts         1644 non-null   int64         
 11  MntFishProducts         1644 non-null   int64         
 12  MntSweetProducts        1644 non-null   int64   

## Tratamiento de outliers

Hay outliers en Income, con un valor de 666666 y otros 10 por encima de 120000.    
También en age con valores de más de 100 años

In [75]:
train_set[['Income','age']].describe().loc[['min','max']]

Unnamed: 0,Income,age
min,2447.0,19.0
max,162397.0,116.0


In [76]:
train_set[train_set['Income'] > 100000].shape[0]/train_set[train_set['Income'] < 100000].shape[0]

0.006119951040391677

In [77]:
train_set[train_set['age'] > 70].shape[0]/train_set[train_set['age'] < 70].shape[0]

0.009247842170160296

In [78]:
train_set[['Income','age']] = np.asarray(train_set[['Income','age']])
windsorize_columns(train_set,['Income','age'],[0.003,0.006])

Unnamed: 0_level_0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,...,Response,income_missing,age,customes_seniority,Household_members,Total_amount,Total_purchase,Median_amount_purchase,Total_cmp,Total_%_cmp
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10270,1981,2n Cycle,Married,35523.0,1,0,2013-10-03,8,11,5,...,0,0,34.0,2,3.0,66,8,8.250000,0,0.0
6969,1982,Graduation,Together,50272.0,1,0,2013-08-07,75,99,13,...,0,0,33.0,2,3.0,233,13,17.923077,0,0.0
3921,1952,2n Cycle,Widow,28457.0,0,0,2012-10-28,96,24,1,...,0,0,63.0,3,1.0,205,10,20.500000,0,0.0
4356,1971,Graduation,Together,71819.0,0,1,2013-08-28,70,1224,28,...,0,0,44.0,2,3.0,1574,14,112.428571,2,0.4
8727,1978,Graduation,Divorced,63693.0,0,1,2013-10-14,63,587,43,...,0,0,37.0,2,2.0,1150,29,39.655172,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1646,1972,Basic,Married,25224.0,1,0,2014-03-16,36,4,9,...,0,0,43.0,1,3.0,46,7,6.571429,0,0.0
6336,1960,Graduation,Married,29315.0,1,1,2014-04-06,55,13,2,...,0,0,55.0,1,4.0,48,10,4.800000,0,0.0
10203,1990,Graduation,Single,26095.0,1,0,2013-06-30,77,11,7,...,0,0,25.0,2,2.0,42,5,8.400000,0,0.0
7533,1964,Graduation,Married,49096.0,1,1,2013-09-24,15,144,1,...,0,0,51.0,2,4.0,187,13,14.384615,0,0.0


In [79]:
train_set[['Income','age']].describe().loc[['min','max']]

Unnamed: 0,Income,age
min,4861.0,20.0
max,101970.0,71.0


In [80]:
test_set[['Income','age']] = np.asarray(test_set[['Income','age']])
windsorize_columns(test_set,['Income','age'],[0.003,0.006])

Unnamed: 0_level_0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,...,Complain,Response,age,customes_seniority,Household_members,Total_amount,Total_purchase,Median_amount_purchase,Total_cmp,Total_%_cmp
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8955,1957,PhD,Together,40451.0,0,2,2014-02-16,54,35,0,...,0,0,58.0,1,4.0,49,5,9.800000,1,0.2
8749,1984,Graduation,Together,37235.0,1,0,2014-02-01,68,20,2,...,0,0,31.0,1,3.0,61,5,12.200000,0,0.0
3107,1976,Graduation,Together,71322.0,0,1,2014-05-02,31,121,24,...,0,0,39.0,1,3.0,350,13,26.923077,0,0.0
2337,1971,Graduation,Divorced,29819.0,1,0,2013-10-24,77,9,1,...,0,0,44.0,2,2.0,39,9,4.333333,0,0.0
5180,1968,PhD,Divorced,50616.0,0,1,2014-04-18,56,128,0,...,0,0,47.0,1,2.0,149,10,14.900000,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,1972,Master,Single,46423.0,1,1,2013-09-18,6,68,0,...,0,0,43.0,2,3.0,92,9,10.222222,0,0.0
4837,1965,PhD,Married,71322.0,0,1,2013-02-16,57,753,43,...,0,0,50.0,2,3.0,1305,28,46.607143,0,0.0
3900,1972,Graduation,Married,65685.0,0,1,2014-03-29,54,642,14,...,0,0,43.0,1,3.0,769,21,36.619048,0,0.0
5885,1973,2n Cycle,Married,35688.0,2,1,2012-08-22,94,73,3,...,0,0,42.0,3,5.0,211,16,13.187500,0,0.0


In [81]:
train_set.to_csv('./data/train_set.csv')
test_set.to_csv('./data/test_set.csv')