In [34]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Binarizer

from sklearn.pipeline import Pipeline

from feature_engine.imputation import(
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder
)

from feature_engine.transformation import LogTransformer

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

import joblib

In [35]:
pd.pandas.set_option('display.max_columns', None) #habilitamos despliegue maximo de columnas

In [46]:
data = pd.read_csv("marketing_campaign.tsv", sep='\t', engine='python')
data.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,88,546,172,88,88,3,8,10,4,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,1,6,2,1,6,2,1,1,2,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,49,127,111,21,42,1,8,2,10,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,4,20,10,3,5,2,2,0,4,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,43,118,46,27,15,5,5,3,6,5,0,0,0,0,0,0,3,11,0


In [47]:
data.isnull().sum()

ID                      0
Year_Birth              0
Education               0
Marital_Status          0
Income                 24
Kidhome                 0
Teenhome                0
Dt_Customer             0
Recency                 0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
AcceptedCmp3            0
AcceptedCmp4            0
AcceptedCmp5            0
AcceptedCmp1            0
AcceptedCmp2            0
Complain                0
Z_CostContact           0
Z_Revenue               0
Response                0
dtype: int64

In [48]:
data['MntTotalProducts'] = data['MntWines'] + data['MntFruits'] + data['MntMeatProducts'] + data['MntFishProducts'] + data['MntSweetProducts'] + data['MntGoldProds']

In [96]:
#Variables categoricas con NA
CATEGORICAL_VARS_WITH_NA_FREQUENT = []

#Variable categoricas con NA pero indicador de Missing
CATEGORICAL_VARS_WITH_NA_MISSING = []


#Variables numéricas con NA
NUMERICAL_VARS_WITH_NA = ['Income']

DROP_FEATURES = ['Dt_Customer', 'Marital_Status','MntMeatProducts']

#Variables para binarización por sesgo fuerte
BINARIZE_VARS = ['MntWines','MntFruits','MntFishProducts','MntSweetProducts']

#Variables categoricas a codificar sin ordinalidad
CATEGORICAL_VARS = ['Education' ]

#Variables seleccionadas según análisis de Lasso
FEATURES = [
    'ID', 'Education', 'Marital_Status', 'Income', 'Kidhome', 'Teenhome',
       'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts',
       'MntFishProducts', 'MntSweetProducts', 'MntGoldProds',
       'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
       'NumStorePurchases', 'NumWebVisitsMonth','Dt_Customer'
]

In [97]:
X_train, X_test, y_train, y_test = train_test_split(
        data.drop(['MntTotalProducts'], axis=1),
        data['MntTotalProducts'],
        test_size=0.1,
        random_state=2022)

X_train.shape, X_test.shape

((2016, 29), (224, 29))

In [98]:
#Selección de variables para entrenamiento
X_train = X_train[FEATURES]
X_train

Unnamed: 0,ID,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Dt_Customer
876,773,Master,Divorced,42835.0,1,1,64,379,4,93,12,9,98,7,6,6,4,6,30-06-2013
912,9760,PhD,Widow,52278.0,0,1,24,953,0,71,0,0,174,6,10,5,10,8,25-01-2013
683,10120,Graduation,Together,38946.0,0,1,84,116,6,82,6,6,41,2,3,1,6,5,24-10-2013
2080,4643,Graduation,Married,27803.0,1,0,40,8,26,46,38,9,49,2,3,0,4,8,26-08-2012
467,9213,Graduation,Widow,31880.0,1,0,13,4,1,5,2,0,3,1,1,0,2,8,31-10-2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1713,7396,Graduation,Married,80398.0,0,0,92,342,51,936,207,35,26,1,5,8,12,3,10-11-2012
624,6583,2n Cycle,Married,72635.0,0,0,54,390,22,323,104,35,107,1,6,8,6,3,03-06-2013
173,1880,PhD,Together,53537.0,1,1,17,81,0,6,0,0,6,2,2,1,3,5,30-01-2014
1244,1291,PhD,Together,27683.0,1,0,90,152,9,121,12,12,45,4,6,2,4,8,04-08-2012


In [99]:
#Seleccionamos variables para predicción
X_test = X_test[FEATURES]
X_test

Unnamed: 0,ID,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Dt_Customer
616,8953,Master,Single,35791.0,2,1,94,27,0,5,0,0,3,2,1,0,3,8,06-05-2013
2133,642,Basic,Married,16005.0,1,0,69,1,3,2,20,30,47,3,2,1,2,8,14-08-2012
2117,8659,PhD,Together,69805.0,0,1,50,750,71,174,13,10,20,2,6,8,11,2,21-01-2014
1149,10525,Graduation,Single,26576.0,1,0,40,10,0,8,0,0,9,1,1,0,2,9,13-10-2012
596,5252,Graduation,Divorced,23910.0,1,0,80,16,12,18,7,1,13,1,2,0,3,7,26-10-2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1147,5430,Graduation,Together,54450.0,1,1,0,454,0,171,8,19,32,12,9,2,8,8,14-09-2012
567,891,Master,Together,29298.0,1,1,60,6,0,2,2,0,1,1,1,0,2,5,07-09-2013
89,771,Graduation,Together,54178.0,0,1,79,135,9,39,4,0,7,1,2,2,5,2,21-01-2014
1522,1998,Graduation,Single,37697.0,1,0,82,34,6,21,11,4,8,1,2,1,3,6,07-02-2014


In [100]:
marketing_pipeline = Pipeline([
    
    #============= IMPUTACIONES ===================#
    
    #1. Imputación de varaibles categoricas
    ('missing_imputation', 
         CategoricalImputer(imputation_method='missing', variables=CATEGORICAL_VARS_WITH_NA_MISSING)
    ),
    
    #2. Imputación de variables categoricas con NA basado en frequiencia.
    ('frequent_imputation', 
         CategoricalImputer(imputation_method='frequent', variables=CATEGORICAL_VARS_WITH_NA_FREQUENT)
    ),
    
    #3. Indicamos Faltante en variables numéricas para imputar
    ('missing_indicator', AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA)),
    
    #4. Imputación de mediana para variables categoricas
    ('mean_imputation', MeanMedianImputer(
        imputation_method='mean', variables=NUMERICAL_VARS_WITH_NA)
    ),
    
    #6. Drop de variables
    ('drop_features', DropFeatures(features_to_drop=DROP_FEATURES)),
   
    #8. Binarización de Variables con Sesgo Fuerte
    ('binarizer', SklearnTransformerWrapper(
        transformer=Binarizer(threshold=0), variables=BINARIZE_VARS)
    ),
    
    #============ CODIFICACION DE VARIABLES CATEGORICAS NOMINALES ============
    
    ('rare_label_encoder', RareLabelEncoder(
        tol=0.01, n_categories=1, variables=CATEGORICAL_VARS)),
    
    ('categorical_encoder', OrdinalEncoder(
        encoding_method='ordered', variables=CATEGORICAL_VARS)),
    
    #=========== SCALER ==============
    ('scaler', MinMaxScaler()),
    
    #=========== ENTRENAMIENTO DEL MODELO ============
    ('Lasso', Lasso(alpha=0.01, random_state=2022)),
]) 

In [101]:
marketing_pipeline.fit(X_train, y_train)

Pipeline(steps=[('missing_imputation', CategoricalImputer(variables=[])),
                ('frequent_imputation',
                 CategoricalImputer(imputation_method='frequent',
                                    variables=[])),
                ('missing_indicator',
                 AddMissingIndicator(variables=['Income'])),
                ('mean_imputation',
                 MeanMedianImputer(imputation_method='mean',
                                   variables=['Income'])),
                ('drop_features',
                 DropFeatures(features_to_d...
                 SklearnTransformerWrapper(transformer=Binarizer(threshold=0),
                                           variables=['MntWines', 'MntFruits',
                                                      'MntFishProducts',
                                                      'MntSweetProducts'])),
                ('rare_label_encoder',
                 RareLabelEncoder(n_categories=1, tol=0.01,
                         

In [102]:
preds = marketing_pipeline.predict(X_test)

In [103]:
from sklearn.metrics import mean_squared_error 

In [104]:
mean_squared_error(y_test, preds, squared=False)

284.28012021754995

In [105]:
joblib.dump(marketing_pipeline, 'MarketingPpipeline.pkl')

['MarketingPpipeline.pkl']