# Pipeline and ColumnTransformer

In [1]:
# Obviously
import pandas as pd

# Some sklearn tools for preprocessing and building a pipeline. 
# ColumnTransformer was introduced in 0.20 so make sure you have this version
# !pip install xgboost
import numpy as np 

# To build our pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

!pip install impyute
from impyute.imputation.cs import mice

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

# Other helpers
from sklearn.metrics import accuracy_score, classification_report


!pip install category_encoders

from category_encoders.target_encoder import TargetEncoder

Collecting impyute
  Downloading https://files.pythonhosted.org/packages/37/28/86829f67c9affb847facaab94687761d3555539ec675f7577778c5b2680a/impyute-0.0.8-py2.py3-none-any.whl
Installing collected packages: impyute
Successfully installed impyute-0.0.8
Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/6e/a1/f7a22f144f33be78afeb06bfa78478e8284a64263a3c09b1ef54e673841e/category_encoders-2.0.0-py2.py3-none-any.whl (87kB)
[K     |████████████████████████████████| 92kB 3.3MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.0.0


In [71]:
!pip install lightgbm

from lightgbm import LGBMClassifier

!pip install catboost

from catboost import CatBoostClassifier



In [3]:
from google.colab import drive
drive.mount('/content/drive')

import sys
sys.path.append('/content/drive/My Drive/Colab Notebooks/Data scientist')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


### The Dataset

The dataset can be downloaded [here](https://archive.ics.uci.edu/ml/datasets/bank+marketing). It consists of data from marketing campaigns of a Portuguese bank. We will try to build classifiers that can predict whether or not the client targeted by the campaign ended up subscribing to a term deposit (column `y`).

In [4]:
communes_df=pd.read_pickle('/content/drive/My Drive/Colab Notebooks/Data scientist/data/commune_scoring.pkl')
# Separation Variables d'intérêts et variables explicatives
print(communes_df.camping.value_counts())
print(communes_df.shape)

0    31610
1     5067
Name: camping, dtype: int64
(36677, 96)


 keep in mind data in unbalanced we need to pick carefully good evaluation metric, maybe even oversampling..


In [5]:
dff = communes_df.select_dtypes(exclude=["number","bool"])
df_categorical = dff.assign(score_equipement_de_sante_bv=communes_df['score_equipement_de_sante_bv'])
df_categorical = df_categorical.fillna(df_categorical.mode().iloc[0]) # impute missing data using Mode or most frequent For categorical data 
# df_categorical = df_categorical.str.strip()

string_features = ['orientation_economique','seg_croissance_pop','libgeo','urbanite_ruralite','dynamique_demographique_bv',
                   'seg_environnement_demographique_obsolete','environnement_demographique'
                  ,'syn_medical','seg_cap_fiscale','seg_dyn_entre','dyn_setc']

# for fe in string_features:
#     df_categorical[fe] = df_categorical[fe].map(lambda x: "".join(w.strip() for w in x))



##############################

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
X_incomplete = communes_df.select_dtypes(include=numerics).astype(float)
df_numeric = X_incomplete[X_incomplete.columns.difference(['score_equipement_de_sante_bv'])]### score_equipment_de_sante  is categorical

print(len(X_incomplete))

36677


In [6]:
df_categorical.head()

Unnamed: 0,codgeo,orientation_economique,seg_croissance_pop,libgeo,dep,urbanite_ruralite,dynamique_demographique_bv,seg_environnement_demographique_obsolete,environnement_demographique,syn_medical,seg_cap_fiscale,seg_dyn_entre,dyn_setc,cp,score_equipement_de_sante_bv
0,1001,Bassin Industriel,en croissance démographique,L' Abergement-Clémenciat,1,Com rurale < 2 000 m habts,1.Accroissement par excédent naturel et migrat...,Zone rurale en croissance démographique,Bassin Industriel en croissance démographique,Synergie Médicale,Fiscalité moyenne,Faible dynamique,Faible Dynamique Serv et Com,1,4
1,1002,Bassin Résidentiel,en croissance démographique,L' Abergement-de-Varey,1,Com rurale < 2 000 m habts,1.Accroissement par excédent naturel et migrat...,Zone rurale en croissance démographique,Bassin Résidentiel en croissance démographique,Forte Synergie Médicale,Fiscalité moyenne,Faible dynamique,Faible Dynamique Serv et Com,1,4
2,1004,Bassin Résidentiel,en croissance démographique,Ambérieu-en-Bugey,1,Com < 50 m habts,1.Accroissement par excédent naturel et migrat...,Zone rurale en croissance démographique,Bassin Résidentiel en croissance démographique,Forte Synergie Médicale,Fiscalité moyenne,Dynamique Economique,Bonne Dynamique Entreprise Serv et Com,1,4
3,1005,Bassin Urbain,en croissance démographique,Ambérieux-en-Dombes,1,Com rurale < 2 000 m habts,Grande Ville,Com rurale < 2 000 m habts en croissance démog...,Bassin Urbain en croissance démographique,Forte Synergie Médicale,Fiscalité moyenne,Moyenne dynamique,Faible Dynamique Serv et Com,1,1
4,1006,Bassin Résidentiel,en croissance démographique,Ambléon,1,Com rurale < 2 000 m habts,3.Accroissement par excédent migratoire,Zone rurale en croissance démographique,Bassin Résidentiel en croissance démographique,Forte Synergie Médicale,Fiscalité moyenne,Faible dynamique,Faible Dynamique Serv et Com,1,3


In [0]:
for f in df_numeric.columns:
  X_incomplete[f].fillna((X_incomplete[f].mean()), inplace=True)

In [0]:
# Source: https://maxhalford.github.io/blog/target-encoding-done-the-right-way/
def calc_smooth_mean(df1, df2, cat_name, target, weight):
    # Compute the global mean
    mean = communes_df[target].mean()

    # Compute the number of values and the mean of each group
    agg = communes_df.groupby(cat_name)[target].agg(['count', 'mean'])
    counts = agg['count']
    means = agg['mean']

    # Compute the "smoothed" means
    smooth = (counts * means + weight * mean) / (counts + weight)

    # Replace each value by the according smoothed mean
    if df2 is None:
        return df1[cat_name].map(smooth)
    else:
        return df1[cat_name].map(smooth)

Feature 'codgeo' has 36677 unique categories note good idea to use target encoding

Feature 'libgeo' has 34125 unique categories

Feature 'dep' has 100 unique categories

Feature 'cp' has 100 unique categories

In [0]:
WEIGHT = 300
df_categorical['codgeo'] = calc_smooth_mean(df1=communes_df, df2=None, cat_name='codgeo', target='camping', weight=WEIGHT).astype('category')
df_categorical['libgeo'] = calc_smooth_mean(df1=communes_df, df2=None, cat_name='libgeo', target='camping', weight=WEIGHT).astype('category')
df_categorical['dep'] = calc_smooth_mean(df1=communes_df, df2=None, cat_name='dep', target='camping', weight=WEIGHT).astype('category')
df_categorical['cp'] = calc_smooth_mean(df1=communes_df, df2=None, cat_name='cp', target='camping', weight=WEIGHT).astype('category')

In [10]:
df_categorical['cp'].head()

0    0.143874
1    0.143874
2    0.143874
3    0.143874
4    0.143874
Name: cp, dtype: category
Categories (99, float64): [0.063123, 0.064257, 0.064900, 0.067950, ..., 0.254847, 0.262826, 0.282068,
                           0.282387]

In [11]:
# Decide which categorical variables you want to use in model
for col_name in df_categorical.columns:
#     if df_numeric[col_name].dtypes == 'object':
      unique_cat = len(df_categorical[col_name].unique())
      print("Feature '{col_name}' has {unique_cat} unique categories".format(col_name=col_name, unique_cat=unique_cat))

Feature 'codgeo' has 2 unique categories
Feature 'orientation_economique' has 5 unique categories
Feature 'seg_croissance_pop' has 2 unique categories
Feature 'libgeo' has 35 unique categories
Feature 'dep' has 99 unique categories
Feature 'urbanite_ruralite' has 6 unique categories
Feature 'dynamique_demographique_bv' has 7 unique categories
Feature 'seg_environnement_demographique_obsolete' has 8 unique categories
Feature 'environnement_demographique' has 7 unique categories
Feature 'syn_medical' has 3 unique categories
Feature 'seg_cap_fiscale' has 3 unique categories
Feature 'seg_dyn_entre' has 3 unique categories
Feature 'dyn_setc' has 3 unique categories
Feature 'cp' has 99 unique categories
Feature 'score_equipement_de_sante_bv' has 5 unique categories


In [12]:
df_categorical['urbanite_ruralite'].value_counts()

Com rurale < 2 000 m habts    30052
Com < 10 m habts               2505
Com > 200 m habts              1369
Com < 50 m habts               1325
Com < 200 m habts               836
Com rurale > 2 000 habts        590
Name: urbanite_ruralite, dtype: int64

In [13]:
# In this case, bucket low frequecy categories as "Other"
df_categorical['urbanite_ruralite'] = ['Com rurale < 2 000 m habts ' if x == 'Com rurale < 2 000 m habts' else 'Other' for x in df_categorical['urbanite_ruralite']]

rint(df_categorical['urbanite_ruralite'].value_counts().sort_values(ascending=False))

Com rurale < 2 000 m habts     30052
Other                           6625
Name: urbanite_ruralite, dtype: int64


In [14]:
df_categorical['seg_cap_fiscale'].value_counts() 

Fiscalité moyenne    28780
Fiscalité faible      6628
Fiscalité élevée      1269
Name: seg_cap_fiscale, dtype: int64

In [15]:
# In this case, bucket low frequecy categories as "Other"
df_categorical['seg_cap_fiscale'] = ['Fiscalité moyenne' if x == 'Fiscalité moyenne' else 'Other' for x in df_categorical['seg_cap_fiscale']]

print(df_categorical['seg_cap_fiscale'].value_counts().sort_values(ascending=False))

Fiscalité moyenne    28780
Other                 7897
Name: seg_cap_fiscale, dtype: int64


In [16]:
df_categorical['syn_medical'].value_counts()

Faible Synergie Médicale    24669
Forte Synergie Médicale      6048
Synergie Médicale            5960
Name: syn_medical, dtype: int64

In [17]:
# In this case, bucket low frequecy categories as "Other"
df_categorical['syn_medical'] = ['Faible Synergie Médicale' if x == 'Faible Synergie Médicale' else 'Other' for x in df_categorical['syn_medical']]

print(df_categorical['syn_medical'].value_counts().sort_values(ascending=False))

Faible Synergie Médicale    24669
Other                       12008
Name: syn_medical, dtype: int64


In [18]:
df_categorical['seg_dyn_entre'].value_counts()

Faible dynamique        30187
Dynamique Economique     5059
Moyenne dynamique        1431
Name: seg_dyn_entre, dtype: int64

In [19]:
# In this case, bucket low frequecy categories as "Other"
df_categorical['seg_dyn_entre'] = ['Faible dynamique' if x == 'Faible dynamique' else 'Other' for x in df_categorical['seg_dyn_entre']]

print(df_categorical['seg_dyn_entre'].value_counts().sort_values(ascending=False))

Faible dynamique    30187
Other                6490
Name: seg_dyn_entre, dtype: int64


In [20]:
df_categorical.dyn_setc.value_counts()

Faible Dynamique Serv et Com              31916
Bonne Dynamique Entreprise Serv et Com     3788
Dynamique Serv et Com                       973
Name: dyn_setc, dtype: int64

In [21]:
# In this case, bucket low frequecy categories as "Other"
df_categorical['dyn_setc'] = ['Faible Dynamique Serv et Com' if x == 'Faible Dynamique Serv et Com' else 'Other' for x in df_categorical['dyn_setc']]

print(df_categorical['dyn_setc'].value_counts().sort_values(ascending=False))

Faible Dynamique Serv et Com    31916
Other                            4761
Name: dyn_setc, dtype: int64


In [0]:
# df_categorical.seg_environnement_demographique_obsolete.value_counts()

In [0]:
# # In this case, bucket low frequecy categories as "Other"
# for x in df_categorical['seg_environnement_demographique_obsolete'] :
  
#     if x is 'Zone rurale en croissance démographique':  
#         df_categorical['seg_environnement_demographique_obsolete'] = 'Zone rurale en croissance démographique'
    
#     elif x is 'Zone rurale en déclin démographique':  
#         df_categorical['seg_environnement_demographique_obsolete'] = 'Zone rurale en déclin démographique'
    
#     else:
#         df_categorical['seg_environnement_demographique_obsolete'] = 'other'
        
        
    
# print(df_categorical['seg_environnement_demographique_obsolete'].value_counts().sort_values(ascending=False))

In [24]:
for c in df_categorical.columns:
  print(df_categorical[c].value_counts().sort_values(ascending=False).head(10))
  print()
 

0.137693    31610
0.141015     5067
Name: codgeo, dtype: int64

Bassin Résidentiel         16870
Bassin Urbain               8671
Bassin Industriel           7122
Bassins Agroalimentaire     2564
Bassin diversifié           1450
Name: orientation_economique, dtype: int64

en croissance démographique    36666
en déclin démographique           11
Name: seg_croissance_pop, dtype: int64

0.137693    27899
0.141015     4613
0.137237     1824
0.136784      558
0.140548      446
0.136334      244
0.140084      162
0.135887      150
0.139624      116
0.135443       66
Name: libgeo, dtype: int64

0.149452    894
0.082463    866
0.063123    816
0.107621    782
0.095163    745
0.072277    730
0.082866    707
0.104817    706
0.064900    693
0.076354    675
Name: dep, dtype: int64

Com rurale < 2 000 m habts     30052
Other                           6625
Name: urbanite_ruralite, dtype: int64

Grande Ville                                          8671
1.Accroissement par excédent naturel et migratoi

### Create our "pipeline" models with `ColumnTransformer` and `Pipeline`

We'll define a new `ColumnTransformer` object that keeps our numerical features and apply one hot encoding on our categorical features. 

That will allow us to create a clean pipeline that includes both features engineering (one hot encoding here) and training the model (a nice way to avoid data leakage)

In [0]:
numeric_features =list(df_numeric)
numeric_features.remove('camping')

categorical_features =list(df_categorical)

te_features = ['codgeo','libgeo','dep','cp']


for i in te_features:
  categorical_features.remove(i)
 


numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])


                                 
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
         ])

                                 
# target_encoding = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
#     ('TE', ce.TargetEncoder(cols=te_features) )
#          ])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
                ])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(class_weight="balanced", 
                                                  solver="liblinear", 
                                                  random_state=42,max_iter = 1100))])

In [26]:
# print(data.shape)

df_numeric = pd.DataFrame(data=X_incomplete,index= range(0,len(X_incomplete)),columns=X_incomplete.columns)  
# print(df_numeric.shape)

merged =pd.concat([df_numeric, df_categorical], axis=1)


#duplicated columns

merged = merged.loc[:,~merged.columns.duplicated()]

print(merged.shape)

(36677, 96)


Now we can define our 4 models as sklearn `Pipeline` objects, containing our preprocessing step and training of one given algorithm.

In [0]:
# from sklearn.linear_model import LogisticRegression

# # # Logistic Regression
# # lr_model = Pipeline([("preprocessor", preprocessor), 
# #                      ("model", LogisticRegression(class_weight="balanced", 
# #                                                   solver="liblinear", 
# #                                                   random_state=42))])

# # Now we have a full prediction pipeline.
# lr_model = Pipeline(steps=[('preprocessor', preprocessor),
#                       ('classifier', LogisticRegression(solver='lbfgs'))])

Let's split the data into training and test sets.

In [28]:
# Get X, y
y = merged['camping']
X = merged.drop("camping", axis=1)

X.reset_index(drop=True).head()

Unnamed: 0,nb_pharmacies_et_parfumerie,dynamique_entrepreneuriale,dynamique_entrepreneuriale_service_et_commerce,synergie_medicale_commune,indice_fiscal_partiel,score_fiscal,indice_synergie_medicale,score_synergie_medicale,reg,nb_omnipraticiens_bv,nb_infirmiers_liberaux_bv,nb_dentistes_liberaux_bv,nb_pharmaciens_liberaux_bv,densite_medicale_bv,score_equipement_de_sante_bv,indice_demographique,score_demographique,indice_menages,score_menages,population,evolution_population,evolution_pop_,nb_menages,nb_residences_principales,nb_proprietaire,nb_logement,nb_residences_secondaires,nb_log_vacants,nb_occupants_residence_principale,nb_femme,nb_homme,nb_mineurs,nb_majeurs,nb_etudiants,nb_entreprises_secteur_services,nb_entreprises_secteur_commerce,nb_entreprises_secteur_construction,nb_entreprises_secteur_industrie,nb_creation_enteprises,nb_creation_industrielles,...,valeur_ajoutee_regionale,score_urbanite,nb_atifs,nb_actifs_salaries,nb_actifs_non_salaries,nb_logement_secondaire_et_occasionnel,nb_hotel,capacite_hotel,taux_etudiants,taux_propriete,dynamique_demographique_insee,capacite_fisc,capacite_fiscale,moyenne_revnus_fiscaux,nb_education_sante_action_sociale,nb_services_personnels_et_domestiques,nb_sante_action_sociale,nb_industries_des_biens_intermediaires,nb_de_commerce,nb_de_services_aux_particuliers,nb_institution_de_education_sante_action_sociale_administration,pib_regionnal,score_croissance_population,score_croissance_entrepreneuriale,score_va_region,score_pib,codgeo,orientation_economique,seg_croissance_pop,libgeo,dep,urbanite_ruralite,dynamique_demographique_bv,seg_environnement_demographique_obsolete,environnement_demographique,syn_medical,seg_cap_fiscale,seg_dyn_entre,dyn_setc,cp
0,0.0,57.0,23.0,114.0,101.93878,59.04139,114.56713,0.13481,82.0,9.0,14.0,7.0,7.0,0.09286,4.0,44.19769,0.0341,37.22029,0.02223,725.0,16.0,2.0,247.0,248.0,196.0,289.0,32.0,9.0,728.0,694.0,714.0,909.0,499.0,51.0,7.0,11.0,2.0,2.0,4.0,0.0,...,86957.458359,0.0,295.0,254.0,41.0,32.0,0.0,0.0,0.0,67.0,-1.0,117.0,117.0,11483.5,3.0,1.0,0.0,9364.0,9350.0,3372.0,15105.0,173681.0,72.13115,0.01585,32.42578,33.83811,0.137693,Bassin Industriel,en croissance démographique,0.137693,0.143874,Com rurale < 2 000 m habts,1.Accroissement par excédent naturel et migrat...,Zone rurale en croissance démographique,Bassin Industriel en croissance démographique,Other,Fiscalité moyenne,Faible dynamique,Faible Dynamique Serv et Com,0.143874
1,0.0,45.0,4.0,143.0,101.93878,59.04139,143.71141,0.17351,82.0,31.0,36.0,18.0,18.0,0.099229,4.0,10.18071,0.00786,10.09619,0.00603,167.0,4.0,2.0,67.0,67.0,61.0,142.0,71.0,4.0,168.0,162.0,164.0,202.0,124.0,5.0,4.0,0.0,1.0,0.0,1.0,0.0,...,86957.458359,0.0,57.0,49.0,8.0,71.0,0.0,0.0,0.0,42.0,0.0,110.0,110.0,11483.5,0.0,0.0,0.0,9364.0,9350.0,3372.0,15105.0,173681.0,72.13115,0.00173,32.42578,33.83811,0.137693,Bassin Résidentiel,en croissance démographique,0.137693,0.143874,Com rurale < 2 000 m habts,1.Accroissement par excédent naturel et migrat...,Zone rurale en croissance démographique,Bassin Résidentiel en croissance démographique,Other,Fiscalité moyenne,Faible dynamique,Faible Dynamique Serv et Com,0.143874
2,0.0,634.0,828.0,366.0,101.93878,59.04139,367.8208,0.47115,82.0,31.0,36.0,18.0,18.0,0.099229,4.0,696.92134,0.53776,699.19896,0.41767,11432.0,512.0,4.0,4640.0,4635.0,1968.0,5184.0,135.0,414.0,11015.0,11350.0,10878.0,13624.0,8604.0,904.0,342.0,301.0,58.0,108.0,83.0,4.0,...,86957.458359,37.5,4556.0,4203.0,353.0,135.0,2.0,52.0,0.0,37.0,-55.0,250.0,250.0,11483.5,113.0,41.0,118.0,9364.0,9350.0,3372.0,15105.0,173681.0,72.95082,0.38471,32.42578,33.83811,0.137693,Bassin Résidentiel,en croissance démographique,0.137693,0.143874,Other,1.Accroissement par excédent naturel et migrat...,Zone rurale en croissance démographique,Bassin Résidentiel en croissance démographique,Other,Fiscalité moyenne,Other,Other,0.143874
3,0.0,113.0,62.0,132.0,101.93878,59.04139,132.65668,0.15883,82.0,12.0,12.0,6.0,6.0,1.0,1.0,85.774,0.06619,71.2761,0.04258,1407.0,39.0,2.0,473.0,473.0,344.0,505.0,14.0,18.0,1406.0,1324.0,1402.0,1758.0,968.0,97.0,22.0,26.0,17.0,10.0,6.0,0.0,...,86957.458359,0.0,621.0,535.0,86.0,14.0,2.0,17.0,0.0,68.0,-3.0,127.0,127.0,11483.5,5.0,2.0,7.0,9364.0,9350.0,3372.0,15105.0,173681.0,72.13115,0.02824,32.42578,33.83811,0.141015,Bassin Urbain,en croissance démographique,0.141015,0.143874,Com rurale < 2 000 m habts,Grande Ville,Com rurale < 2 000 m habts en croissance démog...,Bassin Urbain en croissance démographique,Other,Fiscalité moyenne,Other,Faible Dynamique Serv et Com,0.143874
4,0.0,42.0,1.0,121.0,101.93878,59.04139,121.60196,0.14415,82.0,26.0,21.0,10.0,10.0,0.100905,3.0,5.24276,0.00405,6.17827,0.00369,86.0,-8.0,-9.0,41.0,41.0,28.0,57.0,13.0,3.0,86.0,86.0,86.0,101.0,71.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,86957.458359,0.0,37.0,33.0,4.0,13.0,0.0,0.0,0.0,49.0,0.0,109.0,109.0,11483.5,0.0,0.0,0.0,9364.0,9350.0,3372.0,15105.0,173681.0,67.62295,0.0,32.42578,33.83811,0.137693,Bassin Résidentiel,en croissance démographique,0.137693,0.143874,Com rurale < 2 000 m habts,3.Accroissement par excédent migratoire,Zone rurale en croissance démographique,Bassin Résidentiel en croissance démographique,Other,Fiscalité moyenne,Faible dynamique,Faible Dynamique Serv et Com,0.143874


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,test_size=.3)


print(np.shape(X_train))

(25673, 95)


We're good to go!

### Train Logistic Regression

First let's fine tune our logistic regression and evaluate its performance. We can treat our pipeline object like a normal sklearn model and call `.fit` or `.predict` on it. We can also directly pass it to the `GridSearchCV` function.

In [30]:
clf.fit(X_train, y_train)

print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.819


In [33]:
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'classifier__C': [0.1, 1.0, 10],
}

grid_search = GridSearchCV(clf, param_grid, cv=5, verbose =1,iid=False)
grid_search.fit(X_train, y_train)

print(("best logistic regression from grid search: %.3f"
       % grid_search.score(X_test, y_test)))

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  3.5min finished


best logistic regression from grid search: 0.819


Let's see our best parameters and score

In [34]:
print(grid_search.best_params_)
print(grid_search.best_score_)

{'classifier__C': 1.0, 'preprocessor__num__imputer__strategy': 'mean'}
0.822693330997702


Let's retrain our model on the whole training data with the best parameters. First we need to update the parameters inside our pipeline. We can use the `set_params` method for that.

In [35]:
clf.set_params(**grid_search.best_params_)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                               

We can run `get_params` to get the parameters of our model as a sanity check that we have properly updated the parameters.

In [0]:
# clf.get_params("classifier")

Now we can fit the model on the whole training set and calculate accuracy on the test set.

In [0]:
# clf.fit(X_train, y_train)

Generate predictions

In [0]:
y_pred = clf.predict(X_test)

In [37]:
accuracy_score(y_test, y_pred)

0.819429298436932

In [38]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.95      0.84      0.89      9484
         1.0       0.41      0.70      0.52      1520

    accuracy                           0.82     11004
   macro avg       0.68      0.77      0.70     11004
weighted avg       0.87      0.82      0.84     11004



Here the pipeline creates a temporary DataFrame that contains the one hot encoded features. If you want to debug your pipeline, you can access intermediary stages at any time. For instance below we get the preprocessor, extract the stage for categorical features and get the categories it has learnt:

In [39]:
ohe_categories = clf.named_steps["preprocessor"].transformers_[1][1].named_steps['onehot'].get_feature_names()

ohe_categories

array(['x0_Bassin Industriel', 'x0_Bassin Résidentiel',
       'x0_Bassin Urbain', 'x0_Bassin diversifié',
       'x0_Bassins Agroalimentaire', 'x1_en croissance démographique',
       'x1_en déclin démographique', 'x2_Com rurale < 2 000 m habts ',
       'x2_Other',
       'x3_1.Accroissement par excédent naturel et migratoire',
       'x3_2.Accroissement par excédent naturel',
       'x3_3.Accroissement par excédent migratoire',
       'x3_4.Déclin par déficit naturel et migratoire',
       'x3_5.Déclin par déficit naturel',
       'x3_6.Déclin par déficit migratoire', 'x3_Grande Ville',
       'x4_Com < 10 m habts en croissance démographique',
       'x4_Com < 200 m habts en croissance démographique',
       'x4_Com < 50 m habts en croissance démographique',
       'x4_Com > 200 m habts en croissance démographique',
       'x4_Com rurale < 2 000 m habts en croissance démographique',
       'x4_Com rurale > 2 000 habts en croissance démographique',
       'x4_Zone rurale en croissanc

Now we can create a list with all those features so we can reuse it later:

In [0]:
all_features = numeric_features + te_features + list(ohe_categories)

In [41]:
len(all_features)

127

Great, so now we have a nice list of columns after processing. Let's visualise the data in a dataframe just for sanity check. For that we can also call methods directly on specificy stages:

In [42]:
pd.DataFrame(clf.named_steps["preprocessor"].transform(X_train), 
             columns=all_features).head(100)

ValueError: ignored

Now that we have trained our first model, we can easily do the same with our three others. Having the preprocessing steps in a pipeline makes the code cleaner and easier to read: here all code is related to training the model only, not tweaking the data.

### Train a Decision Tree

Define the new model

In [0]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree
dt_model = Pipeline([("preprocessor", preprocessor), 
                     ("model", DecisionTreeClassifier(class_weight="balanced"))])

In [44]:
gs = GridSearchCV(dt_model, {"model__max_depth": [3, 5, 7], 
                             "model__min_samples_split": [2, 5]}, 
                  n_jobs=-1, cv=5, scoring="accuracy")

gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                    

Let's see our best parameters and score

In [45]:
print(gs.best_params_)
print(gs.best_score_)

{'model__max_depth': 3, 'model__min_samples_split': 2}
0.8042690764616524


In [46]:
dt_model.set_params(**gs.best_params_)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                               

In [0]:
dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)

In [48]:
accuracy_score(y_test, y_pred)

0.8297891675754271

In [49]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.94      0.86      0.90      9484
         1.0       0.43      0.67      0.52      1520

    accuracy                           0.83     11004
   macro avg       0.68      0.76      0.71     11004
weighted avg       0.87      0.83      0.84     11004



### Train a Random Forest

In [0]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest
rf_model = Pipeline([("preprocessor", preprocessor), 
                     ("model", RandomForestClassifier(class_weight="balanced", 
                                                      n_estimators=100, n_jobs=-1))])

In [51]:
gs = GridSearchCV(rf_model, {"model__max_depth": [10, 15], 
                             "model__min_samples_split": [5, 10]}, 
                  n_jobs=-1, cv=5, scoring="accuracy")

gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                    

Let's see our best parameters and score

In [52]:
print(gs.best_params_)
print(gs.best_score_)

{'model__max_depth': 15, 'model__min_samples_split': 5}
0.8898453628325478


In [53]:
rf_model.set_params(**gs.best_params_)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                               

In [0]:
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

In [55]:
accuracy_score(y_test, y_pred)

0.8897673573246092

In [56]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.93      0.94      0.94      9484
         1.0       0.61      0.56      0.58      1520

    accuracy                           0.89     11004
   macro avg       0.77      0.75      0.76     11004
weighted avg       0.89      0.89      0.89     11004



### Train an XGB model

Pipeline can work with any class compatible with scikit-learn, here we are using it with the XGBClassifier from xgboost for instance:

In [0]:
from xgboost.sklearn import XGBClassifier

# XGBoost
xgb_model = Pipeline([("prepyrocessor", preprocessor), 
                      # Add a scale_pos_weight to make it balanced
                      ("model", XGBClassifier(scale_pos_weight=(1 - y.mean()), 
                                              n_jobs=-1))])

In [58]:
gs = GridSearchCV(xgb_model, {"model__max_depth": [5, 10],
                              "model__min_child_weight": [5, 10],
                              "model__n_estimators": [25]},
                  n_jobs=-1, cv=5, scoring="accuracy")

gs.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                    

Let's see our best parameters and score.

In [59]:
print(gs.best_params_)
print(gs.best_score_)
xgb_model.set_params(**gs.best_params_)
xgb_model.fit(X_train, y_train)

{'model__max_depth': 10, 'model__min_child_weight': 10, 'model__n_estimators': 25}
0.8972461340708137


Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                               

Generate predictions

In [0]:
y_pred = xgb_model.predict(X_test)

In [61]:
accuracy_score(y_test, y_pred)

0.900945110868775

In [62]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.91      0.98      0.94      9484
         1.0       0.74      0.43      0.55      1520

    accuracy                           0.90     11004
   macro avg       0.83      0.70      0.75     11004
weighted avg       0.89      0.90      0.89     11004



In [0]:
# LGBm
lgb_model = Pipeline([("preprocessor", preprocessor), 
                      # Add a scale_pos_weight to make it balanced
                      ("model",  LGBMClassifier(n_estimators=40, max_depth=8, min_samples_split=.005, n_jobs=-1))])

In [73]:
lgb_model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                               

In [0]:
y_pred = lgb_model.predict(X_test)

In [75]:
accuracy_score(y_test, y_pred)

0.9005816066884769

In [0]:
# catboost
catb_model = Pipeline([("preprocessor", preprocessor), 
                      # Add a scale_pos_weight to make it balanced
                      ("model",  CatBoostClassifier(iterations =100, learning_rate = 0.03) )])


In [79]:
catb_model.fit(X_train, y_train)

0:	learn: 0.6636183	total: 72.2ms	remaining: 7.14s
1:	learn: 0.6354058	total: 93.7ms	remaining: 4.59s
2:	learn: 0.6072342	total: 118ms	remaining: 3.82s
3:	learn: 0.5806002	total: 140ms	remaining: 3.36s
4:	learn: 0.5580426	total: 161ms	remaining: 3.06s
5:	learn: 0.5389987	total: 183ms	remaining: 2.86s
6:	learn: 0.5196062	total: 204ms	remaining: 2.71s
7:	learn: 0.5024240	total: 225ms	remaining: 2.59s
8:	learn: 0.4855466	total: 246ms	remaining: 2.49s
9:	learn: 0.4702562	total: 267ms	remaining: 2.4s
10:	learn: 0.4567752	total: 293ms	remaining: 2.37s
11:	learn: 0.4434738	total: 317ms	remaining: 2.32s
12:	learn: 0.4316584	total: 340ms	remaining: 2.28s
13:	learn: 0.4218781	total: 362ms	remaining: 2.22s
14:	learn: 0.4111974	total: 383ms	remaining: 2.17s
15:	learn: 0.4023282	total: 405ms	remaining: 2.13s
16:	learn: 0.3927706	total: 428ms	remaining: 2.09s
17:	learn: 0.3857266	total: 449ms	remaining: 2.04s
18:	learn: 0.3776366	total: 470ms	remaining: 2s
19:	learn: 0.3712788	total: 493ms	remaining

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                               

In [0]:
y_pred = lgb_model.predict(X_test)

In [81]:
accuracy_score(y_test, y_pred)

0.9005816066884769