In [15]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split


## Cargo el dataset con las modificaciones iniciales

In [16]:
df = pd.read_csv('./data/data_preprocessing.csv', index_col=0)
df.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,...,5,0,0,0,0,0,0,3,11,0


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   int64 

## Primeras modificaciones

In [18]:
# Cambio el índice
df.set_index('ID', inplace=True)

# Cambio tipo a datetime
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'])

# Eliminar columnas
df.drop(['Z_CostContact', 'Z_Revenue'], axis=1, inplace=True)

# Cambiar tipo a categóricas
cols_to_category = ['Education', 'Marital_Status',  
                        'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 
                        'AcceptedCmp1', 'AcceptedCmp2', 'Complain']
df[cols_to_category] = df[cols_to_category].astype('category')

# Eliminación de YOLO y cambio de nombre a Absurd
df['Marital_Status'] = df['Marital_Status'].cat.rename_categories({'Absurd': 'Others'})
df = df[df['Marital_Status'] != 'YOLO']


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2238 entries, 5524 to 9405
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Year_Birth           2238 non-null   int64         
 1   Education            2238 non-null   category      
 2   Marital_Status       2238 non-null   category      
 3   Income               2214 non-null   float64       
 4   Kidhome              2238 non-null   int64         
 5   Teenhome             2238 non-null   int64         
 6   Dt_Customer          2238 non-null   datetime64[ns]
 7   Recency              2238 non-null   int64         
 8   MntWines             2238 non-null   int64         
 9   MntFruits            2238 non-null   int64         
 10  MntMeatProducts      2238 non-null   int64         
 11  MntFishProducts      2238 non-null   int64         
 12  MntSweetProducts     2238 non-null   int64         
 13  MntGoldProds         2238 non-null 

In [20]:
df.loc[(df['Marital_Status'] == 'Absurd') | (df['Marital_Status']== 'YOLO' )]

Unnamed: 0_level_0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,...,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


## Variable target

In [21]:
df['Response'].value_counts(True)

Response
0    0.851206
1    0.148794
Name: proportion, dtype: float64

Se encuentra muy desbalanceada.

## División en train y test

In [22]:
train_set, test_set = train_test_split(df,test_size=0.20,random_state=42)
print('Train_set',train_set.shape)
print('Test_set', test_set.shape)

Train_set (1790, 26)
Test_set (448, 26)


## Imputación de nulos

A la variable Income se le imputa la mediana del importe de Income en función de la variable NumCatalogPurchases

In [23]:
train_set['Income_median'] = train_set.groupby(['NumCatalogPurchases'])['Income'].transform('median')

train_set['Income'] = train_set['Income'].fillna(train_set['Income_median'])

In [24]:
train_set['Income'].isna().sum()

0

Guardo los valores para aplicarlos a test.

In [25]:

income_median = train_set.groupby('NumCatalogPurchases')['Income_median'].median().reset_index()
income_median

Unnamed: 0,NumCatalogPurchases,Income_median
0,0,31880.0
1,1,40851.0
2,2,54880.0
3,3,64795.0
4,4,67102.0
5,5,69556.0
6,6,75507.0
7,7,78482.5
8,8,71853.0
9,9,78642.0


In [26]:
dicc_income = pd.Series(train_set['Income_median'].values, index=train_set['NumCatalogPurchases']).to_dict()
dicc_income

{6: 75507.0,
 2: 54880.0,
 0: 31880.0,
 1: 40851.0,
 5: 69556.0,
 3: 64795.0,
 11: 77044.0,
 4: 67102.0,
 9: 78642.0,
 7: 78482.5,
 8: 71853.0,
 10: 78093.0,
 22: 157243.0,
 28: 157146.0}

In [27]:
train_set['Marital_Status'].unique()

['Single', 'Together', 'Married', 'Widow', 'Divorced', 'Others', 'Alone']
Categories (8, object): ['Others', 'Alone', 'Divorced', 'Married', 'Single', 'Together', 'Widow', 'YOLO']

## Nuevas variables

In [28]:
# Edad tomando como referencia 2020
train_set['age']= 2020 - train_set['Year_Birth']

# Antigüedad como cliente tomando como referencia 2020
train_set['customes_seniority'] = 2020 - train_set['Dt_Customer'].dt.year

# Miembros totales en casa
dicc_status = {
    'Together': 2,  # Generalmente representa a dos personas viviendo juntas
    'Divorced': 1,  # Generalmente representa a una persona que está divorciada
    'Married': 2,   # Generalmente representa a dos personas casadas
    'Single': 1,    # Generalmente representa a una persona soltera
    'Widow': 1,     # Generalmente representa a una persona viuda
    'Alone': 1,     # Generalmente representa a una persona viviendo sola
    'Others': 1     # Puede variar, pero generalmente representa a una persona
}
train_set['Status_members'] = train_set['Marital_Status'].map(dicc_status)
train_set['Household_members'] = train_set['Status_members']+train_set['Kidhome']+train_set['Teenhome']

# Gasto total
train_set['Total_amount'] = train_set.filter(like='Mnt').sum(axis = 1)

# Nº compras totales
train_set['Total_purchase'] = train_set.filter(like='Purchases').sum(axis = 1)

# Gasto promedio
train_set['Median_amount_purchase'] = train_set['Total_amount']/train_set['Total_purchase']

# Nº ofertas aceptadas
train_set['Total_cmp'] = train_set.filter(like='Accepted').apply(pd.to_numeric, errors='coerce').sum(axis=1)

# % de ofertas aceptadas
train_set_numeric = train_set.filter(like='Cmp').apply(pd.to_numeric, errors='coerce')
train_set['Total_%_cmp'] = train_set['Total_cmp'] / len(train_set_numeric.columns)

# Elimino columnas intermedias
train_set = train_set.drop(columns=['Status_members','Income_median'])

In [29]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1790 entries, 9400 to 4297
Data columns (total 34 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Year_Birth              1790 non-null   int64         
 1   Education               1790 non-null   category      
 2   Marital_Status          1790 non-null   category      
 3   Income                  1790 non-null   float64       
 4   Kidhome                 1790 non-null   int64         
 5   Teenhome                1790 non-null   int64         
 6   Dt_Customer             1790 non-null   datetime64[ns]
 7   Recency                 1790 non-null   int64         
 8   MntWines                1790 non-null   int64         
 9   MntFruits               1790 non-null   int64         
 10  MntMeatProducts         1790 non-null   int64         
 11  MntFishProducts         1790 non-null   int64         
 12  MntSweetProducts        1790 non-null   int64     

In [30]:
train_set.iloc[:,25:36]

Unnamed: 0_level_0,Response,age,customes_seniority,Household_members,Total_amount,Total_purchase,Median_amount_purchase,Total_cmp,Total_%_cmp
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
9400,0,62,6,1.0,1383,19,72.789474,0,0.0
2804,1,45,8,3.0,120,11,10.909091,0,0.0
1503,0,44,7,4.0,107,1,107.000000,0,0.0
5491,0,69,7,3.0,319,16,19.937500,0,0.0
2894,1,35,7,1.0,2013,21,95.857143,3,0.6
...,...,...,...,...,...,...,...,...,...
10785,0,51,6,4.0,41,7,5.857143,0,0.0
9964,0,41,7,2.0,424,15,28.266667,0,0.0
3412,0,69,7,3.0,957,17,56.294118,1,0.2
2811,0,57,6,3.0,62,7,8.857143,0,0.0


In [32]:
train_set.to_csv('./data/train_set.csv')
test_set.to_csv('./data/test_set.csv')