# Import des librairies

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import imblearn

# Import des données

In [2]:
df = pd.read_csv('donnée/data_rdy.csv')

In [3]:
train_df = df[df['TARGET'].notnull()]

# Destiné à Kaggle
test_df = df[df['TARGET'].isnull()]

# Etude du DataFrame

In [4]:
# On regarde notre DataFrame final

train_df

Unnamed: 0,index,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
0,0,100002,1.0,0,0,0,0,202500.0,406597.5,24700.5,...,,,,,,,,,,
1,1,100003,0.0,1,0,1,0,270000.0,1293502.5,35698.5,...,,,,,,,,,,
2,2,100004,0.0,0,1,0,0,67500.0,135000.0,6750.0,...,,,,,,,,,,
3,3,100006,0.0,1,0,0,0,135000.0,312682.5,29686.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
4,4,100007,0.0,0,0,0,0,121500.0,513000.0,21865.5,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307502,307506,456251,0.0,0,0,1,0,157500.0,254700.0,27558.0,...,,,,,,,,,,
307503,307507,456252,0.0,1,0,0,0,72000.0,269550.0,12001.5,...,,,,,,,,,,
307504,307508,456253,0.0,1,0,0,0,153000.0,677664.0,29979.0,...,,,,,,,,,,
307505,307509,456254,1.0,1,0,0,0,171000.0,370107.0,20205.0,...,,,,,,,,,,


In [5]:
# On regarde la répartition entre les clients autoriser à faire des cédits et ceux qui ne sont pas autorisés :
# On cherche les clients susceptible de ne pas rembourser
# 0 correspond au client autorisé à faire des crédits
# 1 correspond au client non autorisé

train_df['TARGET'].unique()

array([1., 0.])

In [6]:
# On regarde si il y a une imbalance

dict(train_df['TARGET'].value_counts()/len(train_df['TARGET'])*100)

{0.0: 91.92701304360551, 1.0: 8.07298695639449}

In [7]:
# On retire les informations qui ne sont pas des features

feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

correlation = train_df[feats].corrwith(train_df['TARGET'])
correlation

CODE_GENDER                        -0.054710
FLAG_OWN_CAR                       -0.021850
FLAG_OWN_REALTY                     0.006146
CNT_CHILDREN                        0.019189
AMT_INCOME_TOTAL                   -0.003982
                                      ...   
CC_NAME_CONTRACT_STATUS_nan_MAX          NaN
CC_NAME_CONTRACT_STATUS_nan_MEAN         NaN
CC_NAME_CONTRACT_STATUS_nan_SUM          NaN
CC_NAME_CONTRACT_STATUS_nan_VAR          NaN
CC_COUNT                           -0.060481
Length: 795, dtype: float64

In [8]:
feature = list(correlation.abs().sort_values(ascending=False)[:20].index)
feature

['EXT_SOURCE_3',
 'EXT_SOURCE_2',
 'EXT_SOURCE_1',
 'CC_CNT_DRAWINGS_ATM_CURRENT_MEAN',
 'CC_CNT_DRAWINGS_CURRENT_MAX',
 'BURO_DAYS_CREDIT_MEAN',
 'CC_AMT_BALANCE_MEAN',
 'CC_AMT_TOTAL_RECEIVABLE_MEAN',
 'CC_AMT_RECIVABLE_MEAN',
 'CC_AMT_RECEIVABLE_PRINCIPAL_MEAN',
 'CC_CNT_DRAWINGS_CURRENT_MEAN',
 'BURO_MONTHS_BALANCE_SIZE_MEAN',
 'BURO_CREDIT_ACTIVE_Closed_MEAN',
 'DAYS_BIRTH',
 'PREV_NAME_CONTRACT_STATUS_Refused_MEAN',
 'BURO_CREDIT_ACTIVE_Active_MEAN',
 'BURO_DAYS_CREDIT_MIN',
 'DAYS_EMPLOYED',
 'PREV_CODE_REJECT_REASON_XAP_MEAN',
 'CC_AMT_INST_MIN_REGULARITY_MEAN']

In [9]:
train_df[feature].dropna()

Unnamed: 0,EXT_SOURCE_3,EXT_SOURCE_2,EXT_SOURCE_1,CC_CNT_DRAWINGS_ATM_CURRENT_MEAN,CC_CNT_DRAWINGS_CURRENT_MAX,BURO_DAYS_CREDIT_MEAN,CC_AMT_BALANCE_MEAN,CC_AMT_TOTAL_RECEIVABLE_MEAN,CC_AMT_RECIVABLE_MEAN,CC_AMT_RECEIVABLE_PRINCIPAL_MEAN,CC_CNT_DRAWINGS_CURRENT_MEAN,BURO_MONTHS_BALANCE_SIZE_MEAN,BURO_CREDIT_ACTIVE_Closed_MEAN,DAYS_BIRTH,PREV_NAME_CONTRACT_STATUS_Refused_MEAN,BURO_CREDIT_ACTIVE_Active_MEAN,BURO_DAYS_CREDIT_MIN,DAYS_EMPLOYED,PREV_CODE_REJECT_REASON_XAP_MEAN,CC_AMT_INST_MIN_REGULARITY_MEAN
36,0.754406,0.681699,0.842763,0.060606,10.0,-1904.000000,208572.600000,208397.449091,208397.449091,203647.547727,0.363636,2.000000,1.000000,-17199,0.555556,0.000000,-2639.0,-768.0,0.444444,11279.115000
85,0.824595,0.495765,0.447675,0.191489,2.0,-1866.400000,40102.360851,40075.166968,40075.166968,38209.465053,0.212766,32.400000,1.000000,-15909,0.000000,0.000000,-2830.0,-1094.0,1.000000,2713.067903
111,0.452534,0.707972,0.339403,1.625000,10.0,-690.500000,239297.514375,236263.530000,236263.530000,229597.291875,3.125000,21.000000,0.000000,-13830,0.000000,1.000000,-875.0,-1169.0,1.000000,10370.178750
119,0.096319,0.601408,0.567775,1.066667,7.0,-733.500000,192715.035600,192650.827200,192650.827200,181626.355200,1.266667,8.000000,0.500000,-13286,0.272727,0.500000,-1252.0,-2305.0,0.636364,9687.838378
124,0.739412,0.746486,0.647045,0.000000,0.0,-1489.714286,1614.429844,1587.911719,1587.911719,1545.474844,0.000000,46.285714,0.571429,-16282,0.166667,0.428571,-2857.0,-4375.0,0.833333,254.476875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307469,0.565608,0.264486,0.585450,0.441176,8.0,-1375.727273,246443.762647,247080.352500,247080.352500,239288.582647,0.441176,45.636364,0.818182,-14589,0.000000,0.181818,-2216.0,-2145.0,1.000000,12882.023182
307471,0.424130,0.583214,0.634729,0.000000,0.0,-1082.900000,5511.337660,5440.027979,5440.027979,5268.558830,0.000000,25.700000,0.600000,-13416,0.181818,0.400000,-2624.0,-2405.0,0.818182,453.269681
307475,0.352340,0.226036,0.407174,0.700000,15.0,-664.000000,72884.079000,71516.529000,71516.529000,68680.174500,3.800000,19.875000,0.625000,-13346,0.000000,0.375000,-1450.0,-1972.0,1.000000,3189.397500
307496,0.337673,0.789389,0.896042,1.048780,8.0,-1740.956522,131834.730732,130767.060732,130767.060732,127608.373537,1.365854,48.608696,0.826087,-20390,0.055556,0.173913,-2906.0,-5326.0,0.944444,6514.200000


# Imbalance

Il faut que l'on s'occuppe de l'imbalance entre les deux classes de la Target

In [10]:
train_rdy = train_df.dropna(subset = feature)
train_rdy

Unnamed: 0,index,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
36,36,100043,0.0,1,0,0,2,198000.0,641173.5,23157.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,33.0
85,85,100100,0.0,0,1,0,2,202500.0,796396.5,38443.5,...,1.0,0.031915,3.0,0.031229,0.0,0.0,0.0,0.0,0.0,94.0
111,111,100131,0.0,1,0,0,0,270000.0,891072.0,45625.5,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,24.0
119,119,100139,0.0,1,0,1,1,157500.0,302341.5,24016.5,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,75.0
124,124,100145,0.0,1,1,0,1,202500.0,260725.5,16789.5,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,96.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307469,307473,456213,0.0,1,1,0,1,90000.0,258709.5,20439.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,34.0
307471,307475,456215,1.0,1,0,1,1,144000.0,1303200.0,46809.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,94.0
307475,307479,456219,0.0,1,0,0,1,112500.0,521280.0,31630.5,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,10.0
307496,307500,456244,0.0,1,0,0,0,261000.0,1303812.0,35982.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,41.0


In [11]:
# On vérifie que l'imbalance n'a pas trop changé après le dropna

dict(train_rdy['TARGET'].value_counts()/len(train_rdy['TARGET'])*100)

{0.0: 90.37421817025336, 1.0: 9.625781829746634}

In [12]:
# On récupère notre x et notre y

X = train_rdy[feature]
y = train_rdy['TARGET']

In [13]:
from sklearn.model_selection import train_test_split

# On prend un X_train et y_train

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 0)

In [14]:
# import library
from imblearn.over_sampling import SMOTE

# On teste le SMOTE. On utilisera un pipeline plus tard pour s'assurer que tout va bien

smote = SMOTE()

# fit predictor and target variable
X_smote, y_smote = smote.fit_resample(X_train, y_train)

print('original dataset shape:\n', y.value_counts()/len(y)*100)
print('Resample dataset shape:\n', y_smote.value_counts()/len(y_smote)*100)

original dataset shape:
 0.0    90.374218
1.0     9.625782
Name: TARGET, dtype: float64
Resample dataset shape:
 0.0    50.0
1.0    50.0
Name: TARGET, dtype: float64


# Gridsearch

In [15]:
# Score
from sklearn.metrics import fbeta_score, make_scorer, roc_auc_score

#Pipeline
from imblearn.pipeline import Pipeline

# Gridsearch

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Classifier

from sklearn.ensemble import RandomForestClassifier

In [16]:
# On prépare le fbetascore pour le gridsearch

# Le fbeta_score permet de maximiser la pénalité sur les faux négatifs. Il servira à évaluer la performance final de nos modèles
ftwo_scorer = make_scorer(fbeta_score, beta=10)

In [17]:
# On prépare le roc_auc_score pour le gridsearch

roc_auc = make_scorer(roc_auc_score)

## Random Forest

## fbeta score

In [18]:
# Random Forest

# Number of trees in random forest
# original
#n_estimators = [10,20,30,50,70,100]

n_estimators = np.arange(10,20,1)

# Number of features to consider at every split

# original
#max_features = ['auto', 'sqrt']

#max_features = ['sqrt']

# a été retiré

# Maximum number of levels in tree

# original
max_depth = [2,3,4]

#max_depth = [2]

# Minimum number of samples required to split a node

# original
#min_samples_split = [3, 5, 10]

min_samples_split = [4,5, 6,7]

# Minimum number of samples required at each leaf node

# original
min_samples_leaf = [ 3, 4, 5]

#min_samples_leaf = [ 4]

# Create the random grid
random_grid = {'classification__n_estimators': n_estimators,
               #'classification__max_features': max_features,
               'classification__max_depth': max_depth,
               'classification__min_samples_split': min_samples_split,
               'classification__min_samples_leaf': min_samples_leaf}

In [18]:
random_grid ={"classification__n_estimators":[10, 30, 50, 100],
              "classification__criterion":["gini", "entropy"],
              "classification__max_depth":[2],
              "classification__min_samples_split":range(30, 50, 70), 
              "classification__min_samples_leaf":[20, 25, 30], 
              "classification__max_features": range(2, 7) 
             }

In [19]:
model_tree = Pipeline([
        ('sampling', SMOTE()),
        ('classification', RandomForestClassifier())
    ])

In [20]:
search_tree_ftwo = RandomizedSearchCV(estimator = model_tree,scoring = ftwo_scorer,\
                               param_distributions = random_grid, n_iter = 400, cv = 5,\
                               verbose=2, random_state=0, n_jobs = -1)

In [21]:
search_tree_ftwo.fit(X_train,y_train)



Fitting 5 folds for each of 120 candidates, totalling 600 fits


In [22]:
search_tree_ftwo.best_params_

{'classification__n_estimators': 10,
 'classification__min_samples_split': 30,
 'classification__min_samples_leaf': 20,
 'classification__max_features': 2,
 'classification__max_depth': 2,
 'classification__criterion': 'entropy'}

In [23]:
search_tree_ftwo.best_score_

0.5466868543856265

In [24]:
search_tree_ftwo.score(X_test, y_test)

0.510068808787324

In [25]:
search_tree_ftwo.score(X_train, y_train)

0.4887762398488176

In [26]:
y_predict = search_tree_ftwo.predict(X_train)

fbeta_score(y_train,y_predict,beta=10)

0.4887762398488176

In [27]:
y_predict = search_tree_ftwo.predict(X_test)

fbeta_score(y_test,y_predict,beta=10)

0.510068808787324

In [28]:
current_model = search_tree_ftwo

y_predict = current_model.predict(X_test)

report = pd.DataFrame({'model':['search_tree'],'score':['fbeta'],'best_params':[current_model.best_params_],\
          'best_score':[current_model.best_score_],'train':[current_model.score(X_train, y_train)],\
          'test':[current_model.score(X_test, y_test)],'fbeta_score_test':[fbeta_score(y_test,y_predict,beta=10)]})

report

Unnamed: 0,model,score,best_params,best_score,train,test,fbeta_score_test
0,search_tree,fbeta,"{'classification__n_estimators': 10, 'classifi...",0.546687,0.488776,0.510069,0.510069


## au_roc

In [29]:
search_tree_roc = RandomizedSearchCV(estimator = model_tree,scoring = roc_auc,\
                               param_distributions = random_grid, n_iter = 400, cv = 5,\
                               verbose=2, random_state=0, n_jobs = -1)

In [30]:
search_tree_roc.fit(X_train,y_train)



Fitting 5 folds for each of 120 candidates, totalling 600 fits


In [31]:
search_tree_roc.best_params_

{'classification__n_estimators': 50,
 'classification__min_samples_split': 30,
 'classification__min_samples_leaf': 25,
 'classification__max_features': 2,
 'classification__max_depth': 2,
 'classification__criterion': 'entropy'}

In [32]:
search_tree_roc.best_score_

0.6148020572870012

In [33]:
search_tree_roc.score(X_test, y_test)

0.6065334445624148

In [34]:
search_tree_roc.score(X_train, y_train)

0.6040487099765449

In [35]:
# Fbeta est le score que l'on cherche à maximiser

y_predict = search_tree_roc.predict(X_train)

fbeta_score(y_train,y_predict,beta=10)

0.4909452353469283

In [36]:
y_predict = search_tree_roc.predict(X_test)

fbeta_score(y_test,y_predict,beta=10)

0.4874579284943164

In [37]:
current_model = search_tree_roc

y_predict = current_model.predict(X_test)

report_new = pd.DataFrame({'model':['search_tree'],'score':['roc_auc'],'best_params':[current_model.best_params_],\
          'best_score':[current_model.best_score_],'train':[current_model.score(X_train, y_train)],\
          'test':[current_model.score(X_test, y_test)],'fbeta_score_test':[fbeta_score(y_test,y_predict,beta=10)]})

report_final = pd.concat([report,report_new])
report_final

Unnamed: 0,model,score,best_params,best_score,train,test,fbeta_score_test
0,search_tree,fbeta,"{'classification__n_estimators': 13, 'classifi...",0.52659,0.527499,0.528562,0.528562
0,search_tree,roc_auc,"{'classification__n_estimators': 13, 'classifi...",0.613108,0.611411,0.614265,0.487458


# Regression logistic

## fbeta_score

In [38]:
from sklearn.linear_model import LogisticRegression

In [39]:
model_logistic = Pipeline([
        ('sampling', SMOTE()),
        ('classification', LogisticRegression())
    ])

In [40]:
#random_grid = {'classification__max_iter':[1000,5000,10_000]}

#
random_grid = {'classification__penalty': ['l1', 'l2'],
              'classification__C':np.logspace(-5, 5,1000), 
              'classification__max_iter':[500, 1000],
              'classification__solver': ["liblinear"],
              'classification__tol':np.logspace(-4, -2, 100)
             }

In [41]:
search_logi_ftwo = RandomizedSearchCV(estimator = model_logistic,scoring = ftwo_scorer,\
                               param_distributions = random_grid, n_iter = 5, cv = 5,\
                               verbose=2, random_state=0, n_jobs = -1)

In [42]:
search_logi_ftwo.fit(X_train,y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [43]:
search_logi_ftwo.best_params_

{'classification__tol': 0.0047508101621027985,
 'classification__solver': 'liblinear',
 'classification__penalty': 'l1',
 'classification__max_iter': 500,
 'classification__C': 9098.272894455567}

In [44]:
search_logi_ftwo.best_score_

0.6426793510362524

In [45]:
search_logi_ftwo.score(X_test,y_test)

0.625189489641233

In [46]:
search_logi_ftwo.score(X_train,y_train)

0.6401881102080458

In [47]:
y_predict = search_logi_ftwo.predict(X_test)

fbeta_score(y_test,y_predict,beta=10)

0.625189489641233

In [48]:
current_model = search_logi_ftwo

y_predict = current_model.predict(X_test)

report_new = pd.DataFrame({'model':'search_logi','score':['fbeta'],'best_params':[current_model.best_params_],\
          'best_score':[current_model.best_score_],'train':[current_model.score(X_train, y_train)],\
          'test':[current_model.score(X_test, y_test)],'fbeta_score_test':[fbeta_score(y_test,y_predict,beta=10)]})

report_final = pd.concat([report_final,report_new])
report_final

Unnamed: 0,model,score,best_params,best_score,train,test,fbeta_score_test
0,search_tree,fbeta,"{'classification__n_estimators': 13, 'classifi...",0.52659,0.527499,0.528562,0.528562
0,search_tree,roc_auc,"{'classification__n_estimators': 13, 'classifi...",0.613108,0.611411,0.614265,0.487458
0,search_logi,fbeta,"{'classification__tol': 0.0047508101621027985,...",0.642679,0.640188,0.625189,0.625189


## au_roc

In [49]:
search_logi_roc = RandomizedSearchCV(estimator = model_logistic,scoring = roc_auc,\
                               param_distributions = random_grid, n_iter = 3, cv = 5,\
                               verbose=2, random_state=0, n_jobs = -1)

In [50]:
search_logi_roc.fit(X_train,y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [51]:
search_logi_roc.best_params_

{'classification__tol': 0.0001668100537200059,
 'classification__solver': 'liblinear',
 'classification__penalty': 'l2',
 'classification__max_iter': 500,
 'classification__C': 444.27067496068827}

In [52]:
search_logi_roc.best_score_

0.6099282757484276

In [53]:
search_logi_roc.score(X_test,y_test)

0.5916597148490823

In [54]:
search_logi_roc.score(X_train,y_train)

0.6163080237822506

In [55]:
y_predict = search_logi_roc.predict(X_test)

fbeta_score(y_test,y_predict,beta=10)

0.5504949254479388

In [56]:
current_model = search_logi_roc

y_predict = current_model.predict(X_test)

report_new = pd.DataFrame({'model':'search_logi','score':['roc_auc'],'best_params':[current_model.best_params_],\
          'best_score':[current_model.best_score_],'train':[current_model.score(X_train, y_train)],\
          'test':[current_model.score(X_test, y_test)],'fbeta_score_test':[fbeta_score(y_test,y_predict,beta=10)]})

report_final = pd.concat([report_final,report_new])
report_final

Unnamed: 0,model,score,best_params,best_score,train,test,fbeta_score_test
0,search_tree,fbeta,"{'classification__n_estimators': 13, 'classifi...",0.52659,0.527499,0.528562,0.528562
0,search_tree,roc_auc,"{'classification__n_estimators': 13, 'classifi...",0.613108,0.611411,0.614265,0.487458
0,search_logi,fbeta,"{'classification__tol': 0.0047508101621027985,...",0.642679,0.640188,0.625189,0.625189
0,search_logi,roc_auc,"{'classification__tol': 0.0001668100537200059,...",0.609928,0.616308,0.59166,0.550495


# KNN Classifier

## fwto scorer

In [57]:
from sklearn.neighbors import KNeighborsClassifier

In [58]:
model_KNN = Pipeline([
        ('sampling', SMOTE()),
        ('classification', KNeighborsClassifier())
    ])

In [59]:
random_grid = {'classification__n_neighbors':[2,5,10,20,30]}

In [60]:
search_KNN_ftwo = RandomizedSearchCV(estimator = model_KNN,scoring = ftwo_scorer,\
                               param_distributions = random_grid, n_iter = 3, cv = 5,\
                               verbose=2, random_state=0, n_jobs = -1)

In [61]:
search_KNN_ftwo.fit(X_train,y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [62]:
search_KNN_ftwo.best_score_

0.41663421774439674

In [63]:
search_KNN_ftwo.best_params_

{'classification__n_neighbors': 10}

In [64]:
search_KNN_ftwo.score(X_test,y_test)

0.3683350100603622

In [65]:
search_KNN_ftwo.score(X_train,y_train)

0.9320340021442793

In [66]:
# Très clairement, ce modèle overfit

In [67]:
y_predict = search_KNN_ftwo.predict(X_test)

fbeta_score(y_test,y_predict,beta=10)

0.3683350100603622

In [68]:
current_model = search_KNN_ftwo

y_predict = current_model.predict(X_test)

report_new = pd.DataFrame({'model':'search_KNN','score':['fbeta'],'best_params':[current_model.best_params_],\
          'best_score':[current_model.best_score_],'train':[current_model.score(X_train, y_train)],\
          'test':[current_model.score(X_test, y_test)],'fbeta_score_test':[fbeta_score(y_test,y_predict,beta=10)]})

report_final = pd.concat([report_final,report_new])
report_final

Unnamed: 0,model,score,best_params,best_score,train,test,fbeta_score_test
0,search_tree,fbeta,"{'classification__n_estimators': 13, 'classifi...",0.52659,0.527499,0.528562,0.528562
0,search_tree,roc_auc,"{'classification__n_estimators': 13, 'classifi...",0.613108,0.611411,0.614265,0.487458
0,search_logi,fbeta,"{'classification__tol': 0.0047508101621027985,...",0.642679,0.640188,0.625189,0.625189
0,search_logi,roc_auc,"{'classification__tol': 0.0001668100537200059,...",0.609928,0.616308,0.59166,0.550495
0,search_KNN,fbeta,{'classification__n_neighbors': 10},0.416634,0.932034,0.368335,0.368335


## au_roc

In [69]:
search_KNN_roc = RandomizedSearchCV(estimator = model_KNN,scoring = roc_auc,\
                               param_distributions = random_grid, n_iter = 3, cv = 5,\
                               verbose=2, random_state=0, n_jobs = -1)

In [70]:
search_KNN_roc.fit(X_train,y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [71]:
current_model = search_KNN_roc

print("best_score :",current_model.best_score_)
print("best_params :",current_model.best_params_)
print("score X_train :",current_model.score(X_train,y_train))
print("score X_test :",current_model.score(X_test,y_test))

y_predict = current_model.predict(X_test)

print("score fbeta :",fbeta_score(y_test,y_predict,beta=10))

best_score : 0.5351907047094764
best_params : {'classification__n_neighbors': 10}
score X_train : 0.8311187476135928
score X_test : 0.5154045957834066
score fbeta : 0.3752755211285345


In [72]:
current_model = search_KNN_roc

y_predict = current_model.predict(X_test)

report_new = pd.DataFrame({'model':'search_KNN','score':['roc_auc'],'best_params':[current_model.best_params_],\
          'best_score':[current_model.best_score_],'train':[current_model.score(X_train, y_train)],\
          'test':[current_model.score(X_test, y_test)],'fbeta_score_test':[fbeta_score(y_test,y_predict,beta=10)]})

report_final = pd.concat([report_final,report_new])
report_final

Unnamed: 0,model,score,best_params,best_score,train,test,fbeta_score_test
0,search_tree,fbeta,"{'classification__n_estimators': 13, 'classifi...",0.52659,0.527499,0.528562,0.528562
0,search_tree,roc_auc,"{'classification__n_estimators': 13, 'classifi...",0.613108,0.611411,0.614265,0.487458
0,search_logi,fbeta,"{'classification__tol': 0.0047508101621027985,...",0.642679,0.640188,0.625189,0.625189
0,search_logi,roc_auc,"{'classification__tol': 0.0001668100537200059,...",0.609928,0.616308,0.59166,0.550495
0,search_KNN,fbeta,{'classification__n_neighbors': 10},0.416634,0.932034,0.368335,0.368335
0,search_KNN,roc_auc,{'classification__n_neighbors': 10},0.535191,0.831119,0.515405,0.375276


# Gradiant boosting classifier

## fwto scorer

In [73]:
from sklearn.ensemble import GradientBoostingClassifier

In [74]:
# Gradiant Boosting

#Original
random_grid = {'classification__n_estimators':np.arange(1,100,1),
       'classification__max_depth':[2,3,4],
       'classification__min_samples_split': [3, 5, 10],
        'classification__max_features' : ['auto', 'sqrt'],
        'classification__min_samples_leaf' : [ 3, 4, 5]}

model_gradiant = Pipeline([
        ('sampling', SMOTE()),
        ('classification', GradientBoostingClassifier())
    ])

In [75]:
search_gradiant_ftwo = RandomizedSearchCV(estimator = model_gradiant,scoring = ftwo_scorer,\
                               param_distributions = random_grid, n_iter = 100, cv = 5,\
                               verbose=2, random_state=0, n_jobs = -1)

In [76]:
search_gradiant_ftwo.fit(X_train,y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [77]:
current_model = search_gradiant_ftwo

print("best_score :",current_model.best_score_)
print("best_params :",current_model.best_params_)
print("score X_train :",current_model.score(X_train,y_train))
print("score X_test :",current_model.score(X_test,y_test))

y_predict = current_model.predict(X_test)

print("score fbeta :",fbeta_score(y_test,y_predict,beta=10))

best_score : 0.49265432883583227
best_params : {'classification__n_estimators': 10, 'classification__min_samples_split': 3, 'classification__min_samples_leaf': 5, 'classification__max_features': 'sqrt', 'classification__max_depth': 3}
score X_train : 0.493365129428055
score X_test : 0.4998413806230569
score fbeta : 0.4998413806230569


In [78]:
current_model = search_gradiant_ftwo

y_predict = current_model.predict(X_test)

report_new = pd.DataFrame({'model':'search_gradiant','score':['ftwo'],'best_params':[current_model.best_params_],\
          'best_score':[current_model.best_score_],'train':[current_model.score(X_train, y_train)],\
          'test':[current_model.score(X_test, y_test)],'fbeta_score_test':[fbeta_score(y_test,y_predict,beta=10)]})

report_final = pd.concat([report_final,report_new])
report_final

# Légère typo à la ligne 7 le score ftwo correspond au fbeta (corrigé dans le rapport et les diapos)

Unnamed: 0,model,score,best_params,best_score,train,test,fbeta_score_test
0,search_tree,fbeta,"{'classification__n_estimators': 13, 'classifi...",0.52659,0.527499,0.528562,0.528562
0,search_tree,roc_auc,"{'classification__n_estimators': 13, 'classifi...",0.613108,0.611411,0.614265,0.487458
0,search_logi,fbeta,"{'classification__tol': 0.0047508101621027985,...",0.642679,0.640188,0.625189,0.625189
0,search_logi,roc_auc,"{'classification__tol': 0.0001668100537200059,...",0.609928,0.616308,0.59166,0.550495
0,search_KNN,fbeta,{'classification__n_neighbors': 10},0.416634,0.932034,0.368335,0.368335
0,search_KNN,roc_auc,{'classification__n_neighbors': 10},0.535191,0.831119,0.515405,0.375276
0,search_gradiant,ftwo,"{'classification__n_estimators': 10, 'classifi...",0.492654,0.493365,0.499841,0.499841


## au_roc

In [79]:
search_gradiant_roc = RandomizedSearchCV(estimator = model_gradiant,scoring = roc_auc,\
                               param_distributions = random_grid, n_iter = 100, cv = 5,\
                               verbose=2, random_state=0, n_jobs = -1)

In [80]:
search_gradiant_roc.fit(X_train,y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits






In [81]:
current_model = search_gradiant_roc

print("best_score :",current_model.best_score_)
print("best_params :",current_model.best_params_)
print("score X_train :",current_model.score(X_train,y_train))
print("score X_test :",current_model.score(X_test,y_test))

y_predict = current_model.predict(X_test)

print("score fbeta :",fbeta_score(y_test,y_predict,beta=10))

best_score : 0.6184467513940858
best_params : {'classification__n_estimators': 43, 'classification__min_samples_split': 5, 'classification__min_samples_leaf': 5, 'classification__max_features': 'auto', 'classification__max_depth': 2}
score X_train : 0.6236499754540992
score X_test : 0.6162615652965266
score fbeta : 0.45739252455670365


In [82]:
current_model = search_gradiant_roc

y_predict = current_model.predict(X_test)

report_new = pd.DataFrame({'model':'search_gradiant','score':['au_roc'],'best_params':[current_model.best_params_],\
          'best_score':[current_model.best_score_],'train':[current_model.score(X_train, y_train)],\
          'test':[current_model.score(X_test, y_test)],'fbeta_score_test':[fbeta_score(y_test,y_predict,beta=10)]})

report_final = pd.concat([report_final,report_new])
report_final

# Légère typo à la ligne 7 le score ftwo correspond au fbeta (corrigé dans le rapport et les diapos)

Unnamed: 0,model,score,best_params,best_score,train,test,fbeta_score_test
0,search_tree,fbeta,"{'classification__n_estimators': 13, 'classifi...",0.52659,0.527499,0.528562,0.528562
0,search_tree,roc_auc,"{'classification__n_estimators': 13, 'classifi...",0.613108,0.611411,0.614265,0.487458
0,search_logi,fbeta,"{'classification__tol': 0.0047508101621027985,...",0.642679,0.640188,0.625189,0.625189
0,search_logi,roc_auc,"{'classification__tol': 0.0001668100537200059,...",0.609928,0.616308,0.59166,0.550495
0,search_KNN,fbeta,{'classification__n_neighbors': 10},0.416634,0.932034,0.368335,0.368335
0,search_KNN,roc_auc,{'classification__n_neighbors': 10},0.535191,0.831119,0.515405,0.375276
0,search_gradiant,ftwo,"{'classification__n_estimators': 10, 'classifi...",0.492654,0.493365,0.499841,0.499841
0,search_gradiant,au_roc,"{'classification__n_estimators': 43, 'classifi...",0.618447,0.62365,0.616262,0.457393


# Affinage

In [None]:
# Un affinage plus complet est réalisé dans le notebook suivant

In [83]:
model_logistic = Pipeline([
        ('sampling', SMOTE()),
        ('classification', LogisticRegression())
    ])

random_grid = {'classification__penalty': ['l1', 'l2'],
              'classification__C':np.logspace(-5, 5,1000), 
              'classification__max_iter':[500, 1000],
              'classification__solver': ["liblinear"],
              'classification__tol':np.logspace(-4, -2, 100)
             }

search_logi_ftwo_1 = RandomizedSearchCV(estimator = model_logistic,scoring = ftwo_scorer,\
                               param_distributions = random_grid, n_iter = 100, cv = 5,\
                               verbose=2, random_state=0, n_jobs = -1)

In [84]:
search_logi_ftwo_1.fit(X_train,y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [85]:
search_logi_ftwo_1.best_score_

0.6479624375524301

In [93]:
search_logi_ftwo_1.best_params_

{'classification__tol': 0.0009326033468832199,
 'classification__solver': 'liblinear',
 'classification__penalty': 'l1',
 'classification__max_iter': 1000,
 'classification__C': 18589.56679635688}

In [128]:
current_model = search_logi_ftwo_1

y_predict = current_model.predict(X_test)

report_new_1 = pd.DataFrame({'model':'search_gradiant','score':['ftwo'],'best_params':[current_model.best_params_],\
          'best_score':[current_model.best_score_],'train':[current_model.score(X_train, y_train)],\
          'test':[current_model.score(X_test, y_test)],'fbeta_score_test':[fbeta_score(y_test,y_predict,beta=10)]})

#report_final = pd.concat([report_final,report_new])
#report_final

report_new_1
# Par erreur, on a appellé le modèle search_gradiant au lieu de search_logi
# Le score ftwo correspond au fbeta (corrigé dans le rapport et les diapos)

Unnamed: 0,model,score,best_params,best_score,train,test,fbeta_score_test
0,search_gradiant,ftwo,"{'classification__tol': 0.0009326033468832199,...",0.647962,0.643953,0.637707,0.637707


In [87]:
search_logi_roc_1 = RandomizedSearchCV(estimator = model_logistic,scoring =  roc_auc,\
                               param_distributions = random_grid, n_iter = 100, cv = 5,\
                               verbose=2, random_state=0, n_jobs = -1)

In [88]:
search_logi_roc_1.fit(X_train,y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [129]:
current_model = search_logi_roc_1

y_predict = current_model.predict(X_test)

report_new_2 = pd.DataFrame({'model':'search_gradiant','score':['roc'],'best_params':[current_model.best_params_],\
          'best_score':[current_model.best_score_],'train':[current_model.score(X_train, y_train)],\
          'test':[current_model.score(X_test, y_test)],'fbeta_score_test':[fbeta_score(y_test,y_predict,beta=10)]})

report_final_1 = pd.concat([report_new_1,report_new_2])
report_final_1
# Par erreur, on a appellé le modèle search_gradiant au lieu de search_logi
# Le score ftwo correspond au fbeta (corrigé dans le rapport et les diapos)

Unnamed: 0,model,score,best_params,best_score,train,test,fbeta_score_test
0,search_gradiant,ftwo,"{'classification__tol': 0.0009326033468832199,...",0.647962,0.643953,0.637707,0.637707
0,search_gradiant,roc,"{'classification__tol': 0.0009326033468832199,...",0.673914,0.671483,0.672751,0.637546


In [95]:
search_logi_roc_1.best_params_

{'classification__tol': 0.0009326033468832199,
 'classification__solver': 'liblinear',
 'classification__penalty': 'l1',
 'classification__max_iter': 1000,
 'classification__C': 18589.56679635688}

In [97]:
# On va exporter notre pipeline pour ne pas avoir à le faire tourner à chaque fois
import joblib

In [99]:
joblib.dump(search_logi_ftwo_1, 'Pipeline\search_logi_ftwo_1.joblib')
joblib.dump(search_logi_roc_1, 'Pipeline\search_logi_roc_1.joblib')

['Pipeline\\search_logi_roc_1.joblib']

# Feature importance

In [125]:
coefficient = search_logi_ftwo.best_estimator_.named_steps['classification'].coef_

pd.DataFrame({'feature':X_train.columns,'coef':coefficient[0]})

Unnamed: 0,feature,coef
0,EXT_SOURCE_3,-2.256766
1,EXT_SOURCE_2,-1.866373
2,EXT_SOURCE_1,-2.549993
3,CC_CNT_DRAWINGS_ATM_CURRENT_MEAN,0.3622152
4,CC_CNT_DRAWINGS_CURRENT_MAX,0.0286624
5,BURO_DAYS_CREDIT_MEAN,0.0001033761
6,CC_AMT_BALANCE_MEAN,9.444904e-07
7,CC_AMT_TOTAL_RECEIVABLE_MEAN,1.35146e-06
8,CC_AMT_RECIVABLE_MEAN,1.02054e-06
9,CC_AMT_RECEIVABLE_PRINCIPAL_MEAN,4.466358e-07
