<a href="https://colab.research.google.com/github/Benmoussa-marouane/data-science/blob/master/b_inter_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pipeline and ColumnTransformer

In [64]:
# Obviously
import pandas as pd

# Some sklearn tools for preprocessing and building a pipeline. 
# ColumnTransformer was introduced in 0.20 so make sure you have this version
# !pip install xgboost
import numpy as np 

# To build our pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

!pip install impyute
from impyute.imputation.cs import mice

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

# Other helpers
from sklearn.metrics import accuracy_score, classification_report


!pip install category_encoders

from category_encoders.target_encoder import TargetEncoder



In [65]:
!pip install lightgbm

from lightgbm import LGBMClassifier

!pip install catboost

from catboost import CatBoostClassifier



In [66]:
from google.colab import drive
drive.mount('/content/drive')

import sys
sys.path.append('/content/drive/My Drive/Colab Notebooks/Data scientist')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### The Dataset

The dataset can be downloaded [here](https://archive.ics.uci.edu/ml/datasets/bank+marketing). It consists of data from marketing campaigns of a Portuguese bank. We will try to build classifiers that can predict whether or not the client targeted by the campaign ended up subscribing to a term deposit (column `y`).

In [67]:
communes_df=pd.read_pickle('/content/drive/My Drive/Colab Notebooks/Data scientist/data/commune_scoring.pkl')
# Separation Variables d'intérêts et variables explicatives
print(communes_df.camping.value_counts())
print(communes_df.info())

0    31610
1     5067
Name: camping, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36677 entries, 0 to 36676
Data columns (total 96 columns):
codgeo                                                             36677 non-null object
nb_pharmacies_et_parfumerie                                        36676 non-null float64
dynamique_entrepreneuriale                                         36676 non-null float64
dynamique_entrepreneuriale_service_et_commerce                     36676 non-null float64
synergie_medicale_commune                                          36677 non-null int64
orientation_economique                                             36677 non-null object
indice_fiscal_partiel                                              36677 non-null float64
score_fiscal                                                       36677 non-null float64
indice_synergie_medicale                                           36677 non-null float64
score_synergie_medicale            

 keep in mind data in unbalanced we need to pick carefully good evaluation metric, maybe even oversampling..


In [68]:
dff = communes_df.select_dtypes(exclude=["number","bool"])
df_categorical = dff.assign(score_equipement_de_sante_bv=communes_df['score_equipement_de_sante_bv'])
df_categorical = df_categorical.fillna(df_categorical.mode().iloc[0]) # impute missing data using Mode or most frequent For categorical data 
# df_categorical = df_categorical.str.strip()

string_features = ['orientation_economique','seg_croissance_pop','libgeo','urbanite_ruralite','dynamique_demographique_bv',
                   'seg_environnement_demographique_obsolete','environnement_demographique'
                  ,'syn_medical','seg_cap_fiscale','seg_dyn_entre','dyn_setc']

# for fe in string_features:
#     df_categorical[fe] = df_categorical[fe].map(lambda x: "".join(w.strip() for w in x))



##############################

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
X_incomplete = communes_df.select_dtypes(include=numerics).astype(float)
df_numeric = X_incomplete[X_incomplete.columns.difference(['score_equipement_de_sante_bv'])]### score_equipment_de_sante  is categorical

print(len(X_incomplete))

36677


In [69]:
df_numeric.columns

Index(['camping', 'capacite_fisc', 'capacite_fiscale', 'capacite_hotel',
       'densite_medicale_bv', 'dep_moyenne_salaires_cadre_horaires',
       'dep_moyenne_salaires_employe_horaires',
       'dep_moyenne_salaires_horaires', 'dep_moyenne_salaires_ouvrie_horaires',
       'dep_moyenne_salaires_prof_intermediaire_horaires',
       'dynamique_demographique_insee', 'dynamique_entrepreneuriale',
       'dynamique_entrepreneuriale_service_et_commerce', 'evolution_pop_',
       'evolution_population', 'indice_demographique', 'indice_fiscal_partiel',
       'indice_menages', 'indice_synergie_medicale',
       'moyenne_revenus_fiscaux_departementaux',
       'moyenne_revenus_fiscaux_regionaux', 'moyenne_revnus_fiscaux',
       'nb_actifs_non_salaries', 'nb_actifs_salaries', 'nb_atifs',
       'nb_creation_commerces', 'nb_creation_construction',
       'nb_creation_enteprises', 'nb_creation_industrielles',
       'nb_creation_services', 'nb_de_commerce',
       'nb_de_services_aux_particuli

In [0]:
for f in df_numeric.columns:
  X_incomplete[f].fillna((X_incomplete[f].mean()), inplace=True)

In [0]:
# Source: https://maxhalford.github.io/blog/target-encoding-done-the-right-way/
def calc_smooth_mean(df1, cat_name, target, weight):
    # Compute the global mean
    mean = communes_df[target].mean()

    # Compute the number of values and the mean of each group
    agg = communes_df.groupby(cat_name)[target].agg(['count', 'mean'])
    counts = agg['count']
    means = agg['mean']

    # Compute the "smoothed" means
    smooth = (counts * means + weight * mean) / (counts + weight)

    return df1[cat_name].map(smooth)

Feature 'codgeo' has 36677 unique categories note good idea to use target encoding

Feature 'libgeo' has 34125 unique categories

Feature 'dep' has 100 unique categories

Feature 'cp' has 100 unique categories

In [0]:
WEIGHT = 300
df_categorical['codgeo'] = calc_smooth_mean(df1=communes_df, cat_name='codgeo', target='camping', weight=WEIGHT).astype('category')
df_categorical['libgeo'] = calc_smooth_mean(df1=communes_df, cat_name='libgeo', target='camping', weight=WEIGHT).astype('category')
df_categorical['dep'] = calc_smooth_mean(df1=communes_df, cat_name='dep', target='camping', weight=WEIGHT).astype('category')
df_categorical['cp'] = calc_smooth_mean(df1=communes_df, cat_name='cp', target='camping', weight=WEIGHT).astype('category')

In [73]:
#0.137693	0.137693	0.0883396	0.0883396
#
#
df_categorical[['codgeo','libgeo','dep','cp']].head()

Unnamed: 0,codgeo,libgeo,dep,cp
0,0.137693,0.137693,0.143874,0.143874
1,0.137693,0.137693,0.143874,0.143874
2,0.137693,0.137693,0.143874,0.143874
3,0.141015,0.141015,0.143874,0.143874
4,0.137693,0.137693,0.143874,0.143874


In [74]:
# Decide which categorical variables you want to use in model
for col_name in df_categorical.columns:
#     if df_numeric[col_name].dtypes == 'object':
      unique_cat = len(df_categorical[col_name].unique())
      print("Feature '{col_name}' has {unique_cat} unique categories".format(col_name=col_name, unique_cat=unique_cat))

Feature 'codgeo' has 2 unique categories
Feature 'orientation_economique' has 5 unique categories
Feature 'seg_croissance_pop' has 2 unique categories
Feature 'libgeo' has 35 unique categories
Feature 'dep' has 99 unique categories
Feature 'urbanite_ruralite' has 6 unique categories
Feature 'dynamique_demographique_bv' has 7 unique categories
Feature 'seg_environnement_demographique_obsolete' has 8 unique categories
Feature 'environnement_demographique' has 7 unique categories
Feature 'syn_medical' has 3 unique categories
Feature 'seg_cap_fiscale' has 3 unique categories
Feature 'seg_dyn_entre' has 3 unique categories
Feature 'dyn_setc' has 3 unique categories
Feature 'cp' has 99 unique categories
Feature 'score_equipement_de_sante_bv' has 5 unique categories


In [75]:
df_categorical['urbanite_ruralite'].value_counts()

Com rurale < 2 000 m habts    30052
Com < 10 m habts               2505
Com > 200 m habts              1369
Com < 50 m habts               1325
Com < 200 m habts               836
Com rurale > 2 000 habts        590
Name: urbanite_ruralite, dtype: int64

In [76]:
# In this case, bucket low frequecy categories as "Other"
df_categorical['urbanite_ruralite'] = ['Com rurale < 2 000 m habts ' if x == 'Com rurale < 2 000 m habts' else 'Other' for x in df_categorical['urbanite_ruralite']]

print(df_categorical['urbanite_ruralite'].value_counts().sort_values(ascending=False))

Com rurale < 2 000 m habts     30052
Other                           6625
Name: urbanite_ruralite, dtype: int64


In [77]:
df_categorical['seg_cap_fiscale'].value_counts() 

Fiscalité moyenne    28780
Fiscalité faible      6628
Fiscalité élevée      1269
Name: seg_cap_fiscale, dtype: int64

In [78]:
# In this case, bucket low frequecy categories as "Other"
df_categorical['seg_cap_fiscale'] = ['Fiscalité moyenne' if x == 'Fiscalité moyenne' else 'Other' for x in df_categorical['seg_cap_fiscale']]

print(df_categorical['seg_cap_fiscale'].value_counts().sort_values(ascending=False))

Fiscalité moyenne    28780
Other                 7897
Name: seg_cap_fiscale, dtype: int64


In [79]:
df_categorical['syn_medical'].value_counts()

Faible Synergie Médicale    24669
Forte Synergie Médicale      6048
Synergie Médicale            5960
Name: syn_medical, dtype: int64

In [80]:
# In this case, bucket low frequecy categories as "Other"
df_categorical['syn_medical'] = ['Faible Synergie Médicale' if x == 'Faible Synergie Médicale' else 'Other' for x in df_categorical['syn_medical']]

print(df_categorical['syn_medical'].value_counts().sort_values(ascending=False))

Faible Synergie Médicale    24669
Other                       12008
Name: syn_medical, dtype: int64


In [81]:
df_categorical['seg_dyn_entre'].value_counts()

Faible dynamique        30187
Dynamique Economique     5059
Moyenne dynamique        1431
Name: seg_dyn_entre, dtype: int64

In [82]:
# In this case, bucket low frequecy categories as "Other"
df_categorical['seg_dyn_entre'] = ['Faible dynamique' if x == 'Faible dynamique' else 'Other' for x in df_categorical['seg_dyn_entre']]

print(df_categorical['seg_dyn_entre'].value_counts().sort_values(ascending=False))

Faible dynamique    30187
Other                6490
Name: seg_dyn_entre, dtype: int64


In [83]:
df_categorical.dyn_setc.value_counts()

Faible Dynamique Serv et Com              31916
Bonne Dynamique Entreprise Serv et Com     3788
Dynamique Serv et Com                       973
Name: dyn_setc, dtype: int64

In [84]:
# In this case, bucket low frequecy categories as "Other"
df_categorical['dyn_setc'] = ['Faible Dynamique Serv et Com' if x == 'Faible Dynamique Serv et Com' else 'Other' for x in df_categorical['dyn_setc']]

print(df_categorical['dyn_setc'].value_counts().sort_values(ascending=False))

Faible Dynamique Serv et Com    31916
Other                            4761
Name: dyn_setc, dtype: int64


In [0]:
# df_categorical.seg_environnement_demographique_obsolete.value_counts()

In [0]:
# # In this case, bucket low frequecy categories as "Other"
# for x in df_categorical['seg_environnement_demographique_obsolete'] :
  
#     if x is 'Zone rurale en croissance démographique':  
#         df_categorical['seg_environnement_demographique_obsolete'] = 'Zone rurale en croissance démographique'
    
#     elif x is 'Zone rurale en déclin démographique':  
#         df_categorical['seg_environnement_demographique_obsolete'] = 'Zone rurale en déclin démographique'
    
#     else:
#         df_categorical['seg_environnement_demographique_obsolete'] = 'other'
        
        
    
# print(df_categorical['seg_environnement_demographique_obsolete'].value_counts().sort_values(ascending=False))

In [87]:
for c in df_categorical.columns:
  print(df_categorical[c].value_counts().sort_values(ascending=False).head(10))
  print()
 

0.137693    31610
0.141015     5067
Name: codgeo, dtype: int64

Bassin Résidentiel         16870
Bassin Urbain               8671
Bassin Industriel           7122
Bassins Agroalimentaire     2564
Bassin diversifié           1450
Name: orientation_economique, dtype: int64

en croissance démographique    36666
en déclin démographique           11
Name: seg_croissance_pop, dtype: int64

0.137693    27899
0.141015     4613
0.137237     1824
0.136784      558
0.140548      446
0.136334      244
0.140084      162
0.135887      150
0.139624      116
0.135443       66
Name: libgeo, dtype: int64

0.149452    894
0.082463    866
0.063123    816
0.107621    782
0.095163    745
0.072277    730
0.082866    707
0.104817    706
0.064900    693
0.076354    675
Name: dep, dtype: int64

Com rurale < 2 000 m habts     30052
Other                           6625
Name: urbanite_ruralite, dtype: int64

Grande Ville                                          8671
1.Accroissement par excédent naturel et migratoi

### Create our "pipeline" models with `ColumnTransformer` and `Pipeline`

We'll define a new `ColumnTransformer` object that keeps our numerical features and apply one hot encoding on our categorical features. 

That will allow us to create a clean pipeline that includes both features engineering (one hot encoding here) and training the model (a nice way to avoid data leakage)

In [0]:
numeric_features =list(df_numeric)
numeric_features.remove('camping')

categorical_features =list(df_categorical)

te_features = ['codgeo','libgeo','dep','cp']


for i in te_features:
  categorical_features.remove(i)
 


numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])


                                 
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
         ])

                                 
# target_encoding = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
#     ('TE', ce.TargetEncoder(cols=te_features) )
#          ])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(class_weight="balanced", 
                                                  solver="liblinear", 
                                                  random_state=42,max_iter = 1100))])

In [141]:
# print(data.shape)

df_numeric = pd.DataFrame(data=X_incomplete,index= range(0,len(X_incomplete)),columns=X_incomplete.columns)  
# print(df_numeric.shape)

merged =pd.concat([df_numeric, df_categorical], axis=1)


#duplicated columns

merged = merged.loc[:,~merged.columns.duplicated()]

print(merged.shape)

(36677, 96)


Now we can define our 4 models as sklearn `Pipeline` objects, containing our preprocessing step and training of one given algorithm.

Let's split the data into training and test sets.

In [142]:
# Get X, y
y = merged['camping']
X = merged.drop("camping", axis=1)

# X.reset_index(drop=True).head()
np.shape(X)

(36677, 95)

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,test_size=.3)



We're good to go!

### Train Logistic Regression

First let's fine tune our logistic regression and evaluate its performance. We can treat our pipeline object like a normal sklearn model and call `.fit` or `.predict` on it. We can also directly pass it to the `GridSearchCV` function.

In [144]:
clf.fit(X_train, y_train)

print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.819


In [160]:
grid_search = GridSearchCV(clf, {'classifier__C': [0.1, 1.0, 10]}, 
                  n_jobs=-1, cv=5, scoring="accuracy")


grid_search.fit(X_train, y_train)

print(("best logistic regression from grid search: %.3f"
       % grid_search.score(X_test, y_test)))

best logistic regression from grid search: 0.819


Let's see our best parameters and score

In [161]:
print(grid_search.best_params_)
print(grid_search.best_score_)

{'classifier__C': 1.0}
0.8221477817161998


Let's retrain our model on the whole training data with the best parameters. First we need to update the parameters inside our pipeline. We can use the `set_params` method for that.

In [162]:
clf.set_params(**grid_search.best_params_)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

We can run `get_params` to get the parameters of our model as a sanity check that we have properly updated the parameters.

In [163]:
clf.get_params("classifier")

{'classifier': LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                    fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                    max_iter=1100, multi_class='warn', n_jobs=None, penalty='l2',
                    random_state=42, solver='liblinear', tol=0.0001, verbose=0,
                    warm_start=False),
 'classifier__C': 1.0,
 'classifier__class_weight': 'balanced',
 'classifier__dual': False,
 'classifier__fit_intercept': True,
 'classifier__intercept_scaling': 1,
 'classifier__l1_ratio': None,
 'classifier__max_iter': 1100,
 'classifier__multi_class': 'warn',
 'classifier__n_jobs': None,
 'classifier__penalty': 'l2',
 'classifier__random_state': 42,
 'classifier__solver': 'liblinear',
 'classifier__tol': 0.0001,
 'classifier__verbose': 0,
 'classifier__warm_start': False,
 'memory': None,
 'preprocessor': ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                   transformer_weights=None,
               

Now we can fit the model on the whole training set and calculate accuracy on the test set.

In [164]:
clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

Generate predictions

In [0]:
y_pred = clf.predict(X_test)

In [166]:
accuracy_score(y_test, y_pred)

0.8185205379861868

In [167]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.95      0.84      0.89      9484
         1.0       0.41      0.70      0.52      1520

    accuracy                           0.82     11004
   macro avg       0.68      0.77      0.70     11004
weighted avg       0.87      0.82      0.84     11004



Here the pipeline creates a temporary DataFrame that contains the one hot encoded features. If you want to debug your pipeline, you can access intermediary stages at any time. For instance below we get the preprocessor, extract the stage for categorical features and get the categories it has learnt:

In [148]:
ohe_categories = clf.named_steps["preprocessor"].transformers_[1][1].named_steps['onehot'].get_feature_names()

ohe_categories

array(['x0_Bassin Industriel', 'x0_Bassin Résidentiel',
       'x0_Bassin Urbain', 'x0_Bassin diversifié',
       'x0_Bassins Agroalimentaire', 'x1_en croissance démographique',
       'x1_en déclin démographique', 'x2_Com rurale < 2 000 m habts ',
       'x2_Other',
       'x3_1.Accroissement par excédent naturel et migratoire',
       'x3_2.Accroissement par excédent naturel',
       'x3_3.Accroissement par excédent migratoire',
       'x3_4.Déclin par déficit naturel et migratoire',
       'x3_5.Déclin par déficit naturel',
       'x3_6.Déclin par déficit migratoire', 'x3_Grande Ville',
       'x4_Com < 10 m habts en croissance démographique',
       'x4_Com < 200 m habts en croissance démographique',
       'x4_Com < 50 m habts en croissance démographique',
       'x4_Com > 200 m habts en croissance démographique',
       'x4_Com rurale < 2 000 m habts en croissance démographique',
       'x4_Com rurale > 2 000 habts en croissance démographique',
       'x4_Zone rurale en croissanc

In [154]:
all_features = numeric_features + list(ohe_categories)

len(all_features)

125

  Great, so now we have a nice list of columns after processing. Let's visualise the data in a dataframe just for sanity check. For that we can also call methods directly on specificy stages:

In [155]:
df = pd.DataFrame(clf.named_steps["preprocessor"].transform(X_train),columns=all_features)
df.head()

#124==codgeo or libgeo

Unnamed: 0,nb_pharmacies_et_parfumerie,dynamique_entrepreneuriale,dynamique_entrepreneuriale_service_et_commerce,synergie_medicale_commune,indice_fiscal_partiel,score_fiscal,indice_synergie_medicale,score_synergie_medicale,reg,nb_omnipraticiens_bv,nb_infirmiers_liberaux_bv,nb_dentistes_liberaux_bv,nb_pharmaciens_liberaux_bv,densite_medicale_bv,score_equipement_de_sante_bv,indice_demographique,score_demographique,indice_menages,score_menages,population,evolution_population,evolution_pop_,nb_menages,nb_residences_principales,nb_proprietaire,nb_logement,nb_residences_secondaires,nb_log_vacants,nb_occupants_residence_principale,nb_femme,nb_homme,nb_mineurs,nb_majeurs,nb_etudiants,nb_entreprises_secteur_services,nb_entreprises_secteur_commerce,nb_entreprises_secteur_construction,nb_entreprises_secteur_industrie,nb_creation_enteprises,nb_creation_industrielles,...,x0_Bassins Agroalimentaire,x1_en croissance démographique,x1_en déclin démographique,x2_Com rurale < 2 000 m habts,x2_Other,x3_1.Accroissement par excédent naturel et migratoire,x3_2.Accroissement par excédent naturel,x3_3.Accroissement par excédent migratoire,x3_4.Déclin par déficit naturel et migratoire,x3_5.Déclin par déficit naturel,x3_6.Déclin par déficit migratoire,x3_Grande Ville,x4_Com < 10 m habts en croissance démographique,x4_Com < 200 m habts en croissance démographique,x4_Com < 50 m habts en croissance démographique,x4_Com > 200 m habts en croissance démographique,x4_Com rurale < 2 000 m habts en croissance démographique,x4_Com rurale > 2 000 habts en croissance démographique,x4_Zone rurale en croissance démographique,x4_Zone rurale en déclin démographique,x5_Bassin Industriel en croissance démographique,x5_Bassin Résidentiel en croissance démographique,x5_Bassin Résidentiel en déclin démographique,x5_Bassin Urbain en croissance démographique,x5_Bassin diversifié en croissance démographique,x5_Bassin diversifié en déclin démographique,x5_Bassins Agroalimentaire en croissance démographique,x6_Faible Synergie Médicale,x6_Other,x7_Fiscalité moyenne,x7_Other,x8_Faible dynamique,x8_Other,x9_Faible Dynamique Serv et Com,x9_Other,x10_0.0,x10_1.0,x10_2.0,x10_3.0,x10_4.0
0,-0.031952,-0.000935,-0.020801,0.037905,0.18316,0.18316,0.037905,0.0379,1.273932,-0.362748,-0.318632,-0.296227,-0.296227,1.783006,-1.158708,-0.032733,-0.032734,-0.028934,-0.028929,-0.032733,0.008337,0.989114,-0.028934,-0.029064,-0.02797,0.030318,0.787752,0.003262,-0.031525,-0.036572,-0.029365,-0.033317,-0.033002,-0.040133,-0.018203,-0.023444,-0.007082,0.000195,-0.017005,-0.063009,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.031952,-0.048776,-0.037056,-0.087281,0.100406,0.100406,-0.087281,-0.087277,1.626843,-0.056218,-0.250125,-0.009434,-0.009434,-0.562825,1.115972,-0.089965,-0.089963,-0.074591,-0.074592,-0.089965,-0.086741,-0.26657,-0.074591,-0.074571,-0.118779,-0.072366,-0.047716,-0.057718,-0.089883,-0.088361,-0.091401,-0.093879,-0.084024,-0.075567,-0.035516,-0.038285,-0.050467,-0.062164,-0.043199,0.045509,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.031952,0.013977,-0.000766,-0.015746,0.28358,0.28358,-0.015746,-0.015741,0.881808,-0.439381,-0.592658,-0.439624,-0.439624,-0.443309,1.115972,0.02238,0.022374,0.00587,0.005869,0.02238,0.027353,0.630347,0.00587,0.005971,0.070887,-0.00532,-0.099552,-0.034588,0.025234,0.020635,0.021877,0.02576,0.014883,0.006897,0.001034,-0.002667,0.079687,0.020981,0.045861,0.371061,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,-0.031952,-0.020196,-0.015131,-0.007798,-2.336639,-2.336639,-0.007798,-0.007791,-0.333775,1.782961,0.84598,0.994343,0.994343,-0.591614,1.115972,-0.070701,-0.070706,-0.05775,-0.057754,-0.070701,-0.062292,0.450963,-0.05775,-0.057739,-0.075865,-0.06232,-0.114797,-0.053512,-0.070196,-0.067886,-0.074184,-0.074187,-0.066124,-0.063971,-0.013394,-0.016518,-0.033113,-0.012277,-0.032721,-0.063009,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.031952,-0.046291,-0.033276,-0.113113,0.826592,0.826592,-0.113113,-0.113107,-1.039597,-0.439381,-0.455645,-0.583021,-0.583021,-0.414521,-0.400481,-0.076249,-0.076246,-0.064986,-0.064988,-0.076249,-0.062292,0.450963,-0.064986,-0.064971,-0.09464,-0.067079,-0.093453,-0.054563,-0.076141,-0.073547,-0.078693,-0.078147,-0.072856,-0.061394,-0.035516,-0.031359,-0.067821,-0.049692,-0.032721,-0.063009,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


###Global interpretation

In [169]:
# !pip install eli5
import eli5

eli5.show_weights(clf.named_steps["classifier"],feature_names=all_features)



Weight?,Feature
+4.756,nb_de_commerce
+4.753,nb_hotel
+3.460,nb_homme
+3.424,nb_logement_secondaire_et_occasionnel
+3.255,nb_actifs_non_salaries
+1.665,nb_sante_action_sociale
+1.592,nb_entreprises_secteur_services
+1.505,nb_residences_secondaires
… 61 more positive …,… 61 more positive …
… 45 more negative …,… 45 more negative …


That gives us the weights associated to each feature, that can be seen as the contribution of each feature into predicting that the class will be y=1 

In [176]:
i = 15
X_test.iloc[[i]]

Unnamed: 0,nb_pharmacies_et_parfumerie,dynamique_entrepreneuriale,dynamique_entrepreneuriale_service_et_commerce,synergie_medicale_commune,indice_fiscal_partiel,score_fiscal,indice_synergie_medicale,score_synergie_medicale,reg,nb_omnipraticiens_bv,nb_infirmiers_liberaux_bv,nb_dentistes_liberaux_bv,nb_pharmaciens_liberaux_bv,densite_medicale_bv,score_equipement_de_sante_bv,indice_demographique,score_demographique,indice_menages,score_menages,population,evolution_population,evolution_pop_,nb_menages,nb_residences_principales,nb_proprietaire,nb_logement,nb_residences_secondaires,nb_log_vacants,nb_occupants_residence_principale,nb_femme,nb_homme,nb_mineurs,nb_majeurs,nb_etudiants,nb_entreprises_secteur_services,nb_entreprises_secteur_commerce,nb_entreprises_secteur_construction,nb_entreprises_secteur_industrie,nb_creation_enteprises,nb_creation_industrielles,...,valeur_ajoutee_regionale,score_urbanite,nb_atifs,nb_actifs_salaries,nb_actifs_non_salaries,nb_logement_secondaire_et_occasionnel,nb_hotel,capacite_hotel,taux_etudiants,taux_propriete,dynamique_demographique_insee,capacite_fisc,capacite_fiscale,moyenne_revnus_fiscaux,nb_education_sante_action_sociale,nb_services_personnels_et_domestiques,nb_sante_action_sociale,nb_industries_des_biens_intermediaires,nb_de_commerce,nb_de_services_aux_particuliers,nb_institution_de_education_sante_action_sociale_administration,pib_regionnal,score_croissance_population,score_croissance_entrepreneuriale,score_va_region,score_pib,codgeo,orientation_economique,seg_croissance_pop,libgeo,dep,urbanite_ruralite,dynamique_demographique_bv,seg_environnement_demographique_obsolete,environnement_demographique,syn_medical,seg_cap_fiscale,seg_dyn_entre,dyn_setc,cp
2315,0.0,77.0,24.0,165.0,101.93878,59.04139,165.82085,0.20287,82.0,41.0,48.0,23.0,23.0,0.089842,4.0,43.22229,0.03335,41.74097,0.02493,709.0,13.0,1.0,277.0,277.0,197.0,317.0,25.0,15.0,707.0,712.0,684.0,864.0,532.0,44.0,11.0,8.0,4.0,13.0,5.0,0.0,...,86957.458359,0.0,282.0,240.0,42.0,25.0,0.0,0.0,0.0,62.0,0.0,109.0,109.0,9898.5,0.0,0.0,0.0,9364.0,9350.0,3372.0,15105.0,173681.0,71.72131,0.02651,32.42578,33.83811,0.137693,Bassin Industriel,en croissance démographique,0.137693,0.282387,Com rurale < 2 000 m habts,1.Accroissement par excédent naturel et migrat...,Zone rurale en croissance démographique,Bassin Industriel en croissance démographique,Other,Fiscalité moyenne,Faible dynamique,Faible Dynamique Serv et Com,0.282387


In [177]:
y_test.iloc[i]

0.0

Our client subsribed to the term deposit after the campaign! Let's see what our model would have predicted and how it would explain it.

We'll need to first transform our row into the format expected by our model as eli5 cannot work directly with our pipeline.

Note: eli5 actually does support pipeline, but with a limited number of transformations only. In our pipeline it does not support the passthrough transformation (which, funny enough, doesn't do anything...)

In [178]:
eli5.show_prediction(clf.named_steps["classifier"], 
                     clf.named_steps["preprocessor"].transform(X_test)[i],feature_names=all_features, show_feature_values=True)

Contribution?,Feature,Value
1.589,score_pib,1.05
1.589,pib_regionnal,1.05
0.958,nb_institution_de_education_sante_action_sociale_administration,1.021
0.541,nb_de_services_aux_particuliers,0.45
0.508,nb_industries_des_biens_intermediaires,2.372
0.383,x2_Com rurale < 2 000 m habts,1.0
0.381,densite_medicale_bv,-0.586
0.305,reg_moyenne_salaires_prof_intermediaire_horaires,0.598
0.286,nb_logement_secondaire_et_occasionnel,-0.084
0.28,x8_Faible dynamique,1.0


as we may see for test data number 22 it did use features 124 and 123 as ELI5 suggested in first time

### Train a Decision Tree

Define the new model

In [0]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree
dt_model = Pipeline([("preprocessor", preprocessor), 
                     ("model", DecisionTreeClassifier(class_weight="balanced"))])

In [180]:
gs = GridSearchCV(dt_model, {"model__max_depth": [3, 5, 7], 
                             "model__min_samples_split": [2, 5]}, 
                  n_jobs=-1, cv=5, scoring="accuracy")

gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                    

Let's see our best parameters and score

In [181]:
print(gs.best_params_)
print(gs.best_score_)

{'model__max_depth': 5, 'model__min_samples_split': 5}
0.8153702333190511


In [182]:
dt_model.set_params(**gs.best_params_)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [0]:
dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)

In [184]:
accuracy_score(y_test, y_pred)

0.8170665212649946

In [185]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.95      0.83      0.89      9484
         1.0       0.41      0.74      0.53      1520

    accuracy                           0.82     11004
   macro avg       0.68      0.78      0.71     11004
weighted avg       0.88      0.82      0.84     11004



In [187]:
eli5.show_weights(dt_model.named_steps["model"],feature_names=all_features)

Weight,Feature
0.6784,nb_residences_secondaires
0.1840,nb_entreprises_secteur_commerce
0.0416,nb_logement_secondaire_et_occasionnel
0.0233,taux_propriete
0.0176,score_urbanite
0.0145,reg_moyenne_salaires_cadre_horaires
0.0143,dep_moyenne_salaires_employe_horaires
0.0055,dep_moyenne_salaires_prof_intermediaire_horaires
0.0054,dep_moyenne_salaires_cadre_horaires
0.0044,reg


In [188]:

eli5.show_prediction(dt_model.named_steps["model"], 
                     dt_model.named_steps["preprocessor"].transform(X_test)[i],
                     feature_names=all_features, show_feature_values=True)

Contribution?,Feature,Value
0.5,<BIAS>,1.0
0.102,nb_residences_secondaires,-0.084
0.084,taux_propriete,0.261
0.024,reg_moyenne_salaires_cadre_horaires,0.841
-0.172,nb_entreprises_secteur_commerce,-0.031


### Train a Random Forest

In [0]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest
rf_model = Pipeline([("preprocessor", preprocessor), 
                     ("model", RandomForestClassifier(class_weight="balanced", 
                                                      n_estimators=100, n_jobs=-1))])

In [190]:
gs = GridSearchCV(rf_model, {"model__max_depth": [10, 15], 
                             "model__min_samples_split": [5, 10]}, 
                  n_jobs=-1, cv=5, scoring="accuracy")

gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                    

Let's see our best parameters and score

In [191]:
print(gs.best_params_)
print(gs.best_score_)

{'model__max_depth': 15, 'model__min_samples_split': 5}
0.8924940599072956


In [192]:
rf_model.set_params(**gs.best_params_)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [0]:
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

In [194]:
accuracy_score(y_test, y_pred)

0.888858596873864

In [195]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.93      0.94      0.94      9484
         1.0       0.61      0.54      0.57      1520

    accuracy                           0.89     11004
   macro avg       0.77      0.74      0.75     11004
weighted avg       0.88      0.89      0.89     11004



In [197]:
!pip install lime

from lime.lime_tabular import LimeTabularExplainer




### Train an XGB model

Pipeline can work with any class compatible with scikit-learn, here we are using it with the XGBClassifier from xgboost for instance:

In [0]:
from xgboost.sklearn import XGBClassifier

# XGBoost
xgb_model = Pipeline([("prepyrocessor", preprocessor), 
                      # Add a scale_pos_weight to make it balanced
                      ("model", XGBClassifier(scale_pos_weight=(1 - y.mean()), 
                                              n_jobs=-1))])

In [199]:
gs = GridSearchCV(xgb_model, {"model__max_depth": [5, 10],
                              "model__min_child_weight": [5, 10],
                              "model__n_estimators": [25]},
                  n_jobs=-1, cv=5, scoring="accuracy")

gs.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('prepyrocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                   

Let's see our best parameters and score.

In [204]:
print(gs.best_params_)
print(gs.best_score_)
xgb_model.set_params(**gs.best_params_)
xgb_model.fit(X_train, y_train)

{'model__max_depth': 10, 'model__min_child_weight': 10, 'model__n_estimators': 25}
0.8979083083395006


Pipeline(memory=None,
         steps=[('prepyrocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                            

Generate predictions

In [0]:
y_pred = xgb_model.predict(X_test)

In [206]:
accuracy_score(y_test, y_pred)

0.896037804434751

In [207]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.91      0.97      0.94      9484
         1.0       0.71      0.42      0.52      1520

    accuracy                           0.90     11004
   macro avg       0.81      0.69      0.73     11004
weighted avg       0.88      0.90      0.88     11004



###LGBM and CAtboost

In [0]:
# LGBm
lgb_model = Pipeline([("preprocessor", preprocessor), 
                      ("model",  LGBMClassifier(n_estimators=40))])

In [129]:
param_dist = {"model__max_depth": [5, 10],
              "model__min_child_weight": [5, 10],
              "model__n_estimators": [25]
              }
gs = GridSearchCV(lgb_model, n_jobs=-1, param_grid=param_dist, cv = 3, scoring="roc_auc", )


gs.fit(X_train, y_train)



GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
             

In [130]:
print(gs.best_params_)
print(gs.best_score_)
lgb_model.set_params(**gs.best_params_)
lgb_model.fit(X_train, y_train)

{'model__max_depth': 5, 'model__min_child_weight': 5, 'model__n_estimators': 25}
1.0


Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                      

In [0]:
y_pred = lgb_model.predict(X_test)

In [132]:
accuracy_score(y_test, y_pred)

1.0

In [133]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      9484
         1.0       1.00      1.00      1.00      1520

    accuracy                           1.00     11004
   macro avg       1.00      1.00      1.00     11004
weighted avg       1.00      1.00      1.00     11004



In [134]:
from sklearn import metrics

def auc2(m, train, test): 
    return (metrics.roc_auc_score(y_train,m.predict(train)),
                            metrics.roc_auc_score(y_test,m.predict(test)))
  
  

auc2(lgb_model, X_train,X_test)

(1.0, 1.0)

In [0]:
# catboost
catb_model = Pipeline([("preprocessor", preprocessor), 
                      # Add a scale_pos_weight to make it balanced
                      ("model",  CatBoostClassifier(iterations =100, learning_rate = 0.03,eval_metric='AUC') )])


In [136]:
catb_model.fit(X_train, y_train)

0:	total: 73.2ms	remaining: 7.25s
1:	total: 95.6ms	remaining: 4.68s
2:	total: 118ms	remaining: 3.81s
3:	total: 141ms	remaining: 3.38s
4:	total: 163ms	remaining: 3.1s
5:	total: 197ms	remaining: 3.09s
6:	total: 223ms	remaining: 2.96s
7:	total: 245ms	remaining: 2.82s
8:	total: 268ms	remaining: 2.71s
9:	total: 298ms	remaining: 2.68s
10:	total: 321ms	remaining: 2.6s
11:	total: 345ms	remaining: 2.53s
12:	total: 368ms	remaining: 2.46s
13:	total: 391ms	remaining: 2.4s
14:	total: 414ms	remaining: 2.34s
15:	total: 436ms	remaining: 2.29s
16:	total: 458ms	remaining: 2.24s
17:	total: 480ms	remaining: 2.19s
18:	total: 510ms	remaining: 2.17s
19:	total: 534ms	remaining: 2.14s
20:	total: 559ms	remaining: 2.1s
21:	total: 580ms	remaining: 2.06s
22:	total: 604ms	remaining: 2.02s
23:	total: 627ms	remaining: 1.99s
24:	total: 651ms	remaining: 1.95s
25:	total: 674ms	remaining: 1.92s
26:	total: 700ms	remaining: 1.89s
27:	total: 726ms	remaining: 1.87s
28:	total: 748ms	remaining: 1.83s
29:	total: 771ms	remaining

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                      

In [0]:
y_pred = catb_model.predict(X_test)

In [138]:
accuracy_score(y_test, y_pred)

1.0