Predicciones sobre:
- Cantidad total de gasto
    - Cantidad parciales de gasto

In [119]:
## Importación librerías

In [120]:
# Librerías para visualización de datos
import matplotlib.pyplot as plt
import seaborn as sns

# Librerías para manipulación y análisis de datos
import numpy as np
import pandas as pd


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor, Pool


from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

from toolbox_DS import *
from toolbox_ML import *

import warnings
warnings.filterwarnings(action="ignore", message=r'.*Use subset.*of np.ndarray is not recommended')

## Funciones

In [121]:
# Funciones de preprocesamiento
def establecer_indice(df, columna_id):
    df.set_index(columna_id, inplace=True)
    return df

def convertir_a_datetime(df, columna_fecha):
    df[columna_fecha] = pd.to_datetime(df[columna_fecha])
    return df

def convertir_a_categoricas(df, columnas):
    df[columnas] = df[columnas].astype('category')
    return df

def eliminar_columnas(df, columnas):
    df = df.drop(columns=columnas)
    return df

def eliminar_outliers(df, columna, valor_outlier):
    return df[df[columna] != valor_outlier]

def dividir_X_y(df, columna_objetivo):
    X = df.drop(columns=columna_objetivo)
    y = df[columna_objetivo]
    return X, y

# Primera parte: Preprocesamiento de los conjuntos de datos
def preprocesar_datos(train_set, test_set, columnas_eliminar_train, columnas_eliminar_test, columna_objetivo, valor_outlier):
    # Preprocesamiento de train_set
    train_set = establecer_indice(train_set, 'ID')
    train_set = convertir_a_datetime(train_set, 'Dt_Customer')
    train_set = convertir_a_categoricas(train_set, ['Education', 'Marital_Status'])
    train_set = eliminar_columnas(train_set, columnas_eliminar_train)
    train_set = eliminar_outliers(train_set, 'Income', valor_outlier)
    X_train, y_train = dividir_X_y(train_set, columna_objetivo)
    numerical_features = X_train.select_dtypes(['int','float']).columns

    # Preprocesamiento de test_set
    test_set = establecer_indice(test_set, 'ID')
    test_set = convertir_a_datetime(test_set, 'Dt_Customer')
    test_set = convertir_a_categoricas(test_set, ['Education', 'Marital_Status'])
    test_set = eliminar_columnas(test_set, columnas_eliminar_test)
    test_set = eliminar_outliers(test_set, 'Income', valor_outlier)
    X_test, y_test = dividir_X_y(test_set, columna_objetivo)

    return X_train, y_train, X_test, y_test, numerical_features

# Segunda parte: Tratamiento de variables, pipeline y transformación
def tratamiento_y_pipeline(X_train, y_train, X_test, numerical_features, categorical_features_onehot, categorical_features_ordinal):
    # Tratamiento de variables y pipeline
    ordinal_encoder = OrdinalEncoder(categories=[['Basic', '2n Cycle', 'Graduation', 'Master', 'PhD']])
    imputer = SimpleImputer(strategy='median')
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', imputer, numerical_features),
            ('cat_onehot', OneHotEncoder(), categorical_features_onehot),
            ('cat_ordinal', Pipeline([
                ('ordinal', ordinal_encoder),
                ('scaler', MinMaxScaler())
            ]), categorical_features_ordinal)
        ]
    )
    pipeline = Pipeline(steps=[
        ('preprocesor', preprocessor),
        ('algoritmo', RandomForestClassifier())
    ])
    pipeline.fit(X_train, y_train)

    # Transformación de los conjuntos de datos
    X_train_transform = pipeline.named_steps['preprocesor'].transform(X_train)
    X_test_transform = pipeline.named_steps['preprocesor'].transform(X_test)

    # Convertir a DataFrame
    features_transformed = pipeline.named_steps['preprocesor'].get_feature_names_out()
    X_train_transform_df = pd.DataFrame(X_train_transform, columns=features_transformed)
    X_test_transform_df = pd.DataFrame(X_test_transform, columns=features_transformed)

    return X_train_transform_df, X_test_transform_df

## Carga de datos

In [122]:
train_set = pd.read_csv('./data/train_set.csv')
test_set = pd.read_csv('./data/test_set.csv')

In [123]:
columnas_eliminar_train = ['income_missing', 'Year_Birth', 'Total_%_cmp', 'Dt_Customer', 'Median_amount_purchase']
columnas_eliminar_test = ['Year_Birth', 'Total_%_cmp', 'Dt_Customer', 'Median_amount_purchase']
categorical_features_onehot = ['Marital_Status']
categorical_features_ordinal = ['Education']

## Predicción Total_amount

### Preparación de datos

In [124]:
train_set_mnt = train_set.copy()
test_set_mnt = test_set.copy()

In [125]:
X_train_mtn, y_train_mnt, X_test_mnt, y_test_mnt, numerical_features_mnt = preprocesar_datos(train_set_mnt,test_set_mnt,columnas_eliminar_train,columnas_eliminar_test,'Total_amount',666666)

In [126]:
X_train_mtn.columns == X_test_mnt.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

In [127]:
X_train_mtn.columns

Index(['Education', 'Marital_Status', 'Income', 'Kidhome', 'Teenhome',
       'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts',
       'MntFishProducts', 'MntSweetProducts', 'MntGoldProds',
       'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
       'NumStorePurchases', 'NumWebVisitsMonth', 'AcceptedCmp3',
       'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2',
       'Complain', 'Response', 'age', 'customes_seniority',
       'Household_members', 'Total_purchase', 'Total_cmp'],
      dtype='object')

In [128]:
numerical_features_mnt

Index(['Income', 'Kidhome', 'Teenhome', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Response', 'age', 'customes_seniority',
       'Household_members', 'Total_purchase', 'Total_cmp'],
      dtype='object')

In [129]:
X_train_transform_df_mnt, X_test_transform_df_mnt = tratamiento_y_pipeline(X_train_mtn, y_train_mnt, X_test_mnt,numerical_features_mnt,categorical_features_onehot, categorical_features_ordinal)

In [133]:
features_amount_select = ['num__Income', 'num__Kidhome', 'num__Teenhome', 'num__age','cat_onehot__Marital_Status_Alone','num__Household_members',
       'cat_onehot__Marital_Status_Divorced',
       'cat_onehot__Marital_Status_Married',
       'cat_onehot__Marital_Status_Others',
       'cat_onehot__Marital_Status_Single',
       'cat_onehot__Marital_Status_Together',
       'cat_onehot__Marital_Status_Widow', 'cat_ordinal__Education']

### Comparación modelos

In [152]:
# Instancio modelos

xgb = XGBRegressor(random_state = 42)
rf = RandomForestRegressor(random_state=42)
lgb = LGBMRegressor(random_state = 42, verbose = -3)
cat = CatBoostRegressor(random_state=42, verbose=False)
lin_reg = LinearRegression()

TypeError: LinearRegression.__init__() got an unexpected keyword argument 'max_iter'

In [145]:
# Definir los nombres de los modelos y los modelos correspondientes
model_names = ['XGBoost', 'RandomForest', 'LightGBoost', 'CatBoost', 'LinearRegression']
model_set = [xgb, rf, lgb, cat,lin_reg]

# Crear un diccionario para almacenar las métricas de validación cruzada
metricas_cv = {}

# Lista para almacenar los valores de recall
valores = []

# Realizar validación cruzada para cada modelo
for nombre, modelo in zip(model_names, model_set):
    # Calcular las métricas de validación cruzada
    scores = cross_val_score(modelo, X_train_transform_df_mnt[features_amount_select], y_train_mnt, cv=5, scoring='neg_mean_absolute_error')
    
    # Almacenar el resultado
    metricas_cv[nombre] = scores
    valores.append(np.mean(scores))

# Encontrar el modelo con la mayor métrica de recall
ganador = list(metricas_cv.keys())[np.argmax(valores)]

# Imprimir los resultados
print("Resultados de la validación cruzada:")
for nombre, scores in metricas_cv.items():
    print(f"{nombre}: MAE = {-np.mean(scores):.3f}")
print('*'*25)
print(f"\nEl ganador es: {ganador}")

Resultados de la validación cruzada:
XGBoost: MAE = 221.709
RandomForest: MAE = 209.420
LightGBoost: MAE = 215.936
CatBoost: MAE = 209.985
LinearRegression: MAE = 245.977
*************************

El ganador es: RandomForest


### Modelización e hiperparámetros

#### RandomForest

In [146]:
param_rf = {
    'n_estimators': [80,100,200],
    'max_depth': [10,12,13],
    'min_samples_split': [2,3,5],
    'min_samples_leaf': [11,12,13],
    'criterion': 'absolute_error'
}

grid_rf = GridSearchCV(rf, 
                       param_grid=param_rf, 
                       cv=5, 
                       scoring=('neg_mean_absolute_error'), 
                       n_jobs=-1)

grid_rf.fit(X_train_transform_df_mnt[features_amount_select], y_train_mnt)

# Obtener el mejor modelo y parámetros
best_model_rf = grid_rf.best_estimator_
best_params_rf = grid_rf.best_params_
print("Mejor modelo y parámetros:", best_params_rf)

# Imprimir el mejor resultado del MAE
best_MAE_rf = grid_rf.best_score_
print("Mejor resultado del MAE:", best_MAE_rf)

Mejor modelo y parámetros: {'criterion': 'absolute_error', 'max_depth': 12, 'min_samples_leaf': 12, 'min_samples_split': 3, 'n_estimators': 100}
Mejor resultado del recall para la clase positiva: -201.78001463155096


In [147]:
importancias = best_model_rf.feature_importances_
nombres_caracteristicas = features_amount_select

importancias_df = pd.DataFrame({
    'Característica': nombres_caracteristicas,
    'Importancia': importancias
})

importancias_df.sort_values(by='Importancia', ascending=False, inplace=True)

print(importancias_df)

                         Característica  Importancia
0                           num__Income     0.908930
3                              num__age     0.026291
1                          num__Kidhome     0.019849
2                         num__Teenhome     0.018988
12               cat_ordinal__Education     0.010012
5                num__Household_members     0.008499
7    cat_onehot__Marital_Status_Married     0.003484
10  cat_onehot__Marital_Status_Together     0.001651
9     cat_onehot__Marital_Status_Single     0.001502
6   cat_onehot__Marital_Status_Divorced     0.000794
4      cat_onehot__Marital_Status_Alone     0.000000
8     cat_onehot__Marital_Status_Others     0.000000
11     cat_onehot__Marital_Status_Widow     0.000000


In [161]:
m

NameError: name 'm' is not defined

### CatBoost

In [150]:
 
param_cat = {
    'iterations': [500,1000,1500],
    'learning_rate': [0.1,0.2,0.4],
    'depth': [6,8,12],
    'l2_leaf_reg': [2,3,4],
    'min_data_in_leaf':[4,8,12],
       
}

grid_cat = GridSearchCV(cat, 
                       param_grid=param_cat, 
                       cv=5, 
                       scoring=('neg_mean_absolute_error'), 
                       n_jobs=-1)

grid_cat.fit(X_train_transform_df_mnt[features_amount_select], y_train_mnt)

# Obtener el mejor modelo y parámetros
best_model_cat = grid_cat.best_estimator_
best_params_cat = grid_cat.best_params_
print("Mejor modelo y parámetros:", best_params_rf)

# Imprimir el mejor resultado del MAE
best_MAE_cat = grid_cat.best_score_
print("Mejor resultado del MAE:", best_MAE_cat)



Mejor modelo y parámetros: {'criterion': 'absolute_error', 'max_depth': 12, 'min_samples_leaf': 12, 'min_samples_split': 3, 'n_estimators': 100}
Mejor resultado del recall para la clase positiva: -210.65900841503944


In [160]:
importancias = best_model_cat.feature_importances_
nombres_caracteristicas = features_amount_select

importancias_df = pd.DataFrame({
    'Característica': nombres_caracteristicas,
    'Importancia': importancias
})

importancias_df.sort_values(by='Importancia', ascending=False, inplace=True)

print(importancias_df)

                         Característica  Importancia
0                           num__Income    62.287638
3                              num__age    11.639605
1                          num__Kidhome     6.181174
2                         num__Teenhome     6.127764
12               cat_ordinal__Education     4.601861
5                num__Household_members     3.345526
7    cat_onehot__Marital_Status_Married     2.076928
10  cat_onehot__Marital_Status_Together     1.263993
9     cat_onehot__Marital_Status_Single     0.935488
6   cat_onehot__Marital_Status_Divorced     0.844080
11     cat_onehot__Marital_Status_Widow     0.692133
8     cat_onehot__Marital_Status_Others     0.003439
4      cat_onehot__Marital_Status_Alone     0.000371


### Evaluación contra test

In [153]:
# RandomForest:
y_pred_mnt_rf = grid_rf.predict(X_test_transform_df_mnt[features_amount_select])
mae = mean_absolute_error(y_test_mnt, y_pred_mnt_rf)
print(f"RandomForest: El MAE es: {mae}")

mape = mean_absolute_percentage_error(y_test_mnt, y_pred_mnt_rf)
print(f"RandomForest: El MAPE es: {mape}")

# CatBoost:
y_pred_mnt_cat = grid_cat.predict(X_test_transform_df_mnt[features_amount_select])
mae = mean_absolute_error(y_test_mnt, y_pred_mnt_cat)
print(f"CatBoost: El MAE es: {mae}")

mape = mean_absolute_percentage_error(y_test_mnt, y_pred_mnt_cat)
print(f"CatBoost: El MAPE es: {mape}")


RandomForest: El MAE es: 193.534609375
RandomForest: El MAPE es: 0.9930382783861701
CatBoost: El MAE es: 200.63835994774166
CatBoost: El MAPE es: 1.214370051537649


In [158]:
import joblib
import os

path_rf = './models/rf_mnt_1.joblib'

joblib.dump(grid_rf,path_rf)



['./models/rf_mnt_1.joblib']

In [159]:
path_cat = './models/cat_mnt_1.joblib'

joblib.dump(grid_cat,path_cat)

['./models/cat_mnt_1.joblib']

## Predicción gasto en fruta

In [None]:
train_set_mnt_fruit = train_set.copy()
test_set_mnt_fruit = test_set.copy()

In [None]:
X_train_mtn_fruit, y_train_mnt_fruit, X_test_mnt_fruit, y_test_mnt_fruit, numerical_features_mnt = preprocesar_datos(train_set_mnt_fruit,test_set_mnt_fruit,columnas_eliminar_train,columnas_eliminar_test,'MntFruits',666666)

In [None]:
X_train_mtn_fruit.columns == X_test_mnt_fruit.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

In [None]:
X_train_mtn_fruit.columns

Index(['Education', 'Marital_Status', 'Income', 'Kidhome', 'Teenhome',
       'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts',
       'MntFishProducts', 'MntSweetProducts', 'MntGoldProds',
       'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
       'NumStorePurchases', 'NumWebVisitsMonth', 'AcceptedCmp3',
       'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2',
       'Complain', 'Response', 'age', 'customes_seniority',
       'Household_members', 'Total_purchase', 'Total_cmp'],
      dtype='object')

In [None]:
numerical_features_mnt

Index(['Income', 'Kidhome', 'Teenhome', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Response', 'age', 'customes_seniority',
       'Household_members', 'Total_purchase', 'Total_cmp'],
      dtype='object')

In [None]:
X_train_transform_df_mnt_fruit, X_test_transform_df_mnt_fruit = tratamiento_y_pipeline(X_train_mtn_fruit, y_train_mnt_fruit, X_test_mnt_fruit,numerical_features_mnt,categorical_features_onehot, categorical_features_ordinal)

In [None]:
features_amount_select = ['num__Income', 'num__Kidhome', 'num__Teenhome', 'num__age','cat_onehot__Marital_Status_Alone','num__Household_members',
       'cat_onehot__Marital_Status_Divorced',
       'cat_onehot__Marital_Status_Married',
       'cat_onehot__Marital_Status_Others',
       'cat_onehot__Marital_Status_Single',
       'cat_onehot__Marital_Status_Together',
       'cat_onehot__Marital_Status_Widow', 'cat_ordinal__Education']

### Comparación modelos

In [None]:
# Instancio modelos

xgb = XGBRegressor(random_state = 42)
rf = RandomForestRegressor(random_state=42)
lgb = LGBMRegressor(random_state = 42, verbose = -3)
cat = CatBoostRegressor(random_state=42, verbose=False)
lin_reg = LinearRegression()

In [None]:
# Definir los nombres de los modelos y los modelos correspondientes
model_names = ['XGBoost', 'RandomForest', 'LightGBoost', 'CatBoost', 'LinearRegression']
model_set = [xgb, rf, lgb, cat,lin_reg]

# Crear un diccionario para almacenar las métricas de validación cruzada
metricas_cv = {}

# Lista para almacenar los valores de recall
valores = []

# Realizar validación cruzada para cada modelo
for nombre, modelo in zip(model_names, model_set):
    # Calcular las métricas de validación cruzada
    scores = cross_val_score(modelo, X_train_transform_df_mnt_fruit[features_amount_select], y_train_mnt_fruit, cv=5, scoring='neg_mean_absolute_error')
    
    # Almacenar el resultado
    metricas_cv[nombre] = scores
    valores.append(np.mean(scores))

# Encontrar el modelo con la mayor métrica de recall
ganador = list(metricas_cv.keys())[np.argmax(valores)]

# Imprimir los resultados
print("Resultados de la validación cruzada:")
for nombre, scores in metricas_cv.items():
    print(f"{nombre}: MAE = {-np.mean(scores):.3f}")
print('*'*25)
print(f"\nEl ganador es: {ganador}")

Resultados de la validación cruzada:
XGBoost: MAE = 221.709
RandomForest: MAE = 209.420
LightGBoost: MAE = 215.936
CatBoost: MAE = 209.985
LinearRegression: MAE = 245.977
*************************

El ganador es: RandomForest
