# Projet : Créer un modèle de scoring pour un organisme de crédit

## Sommaire

* [1. Observation des données et preprocessing (Kaggle)](#1)
    * [1.1 Imports et fonctions d'analyse et de preprocessing](#1.1)
    * [1.2 Observation des données déséquilibrées](#1.2)
    * [1.3 Suppression de features et catégorisation](#1.3)
    * [1.4 Remplacement des valeurs nulles et des outliers](#1.4)
    * [1.5 Observation des features finales](#1.5)
* [2. Pipe de transformation](#2)
* [3. Modélisations et MLFlow](#3)
    * [3.1 DummyClassifier](#3.1)
    * [3.2 LogisticRegressor](#3.2)
    * [3.3 LightGBM](#3.3)
    * [3.4 XGBoost](#3.4)
    * [3.5 AdaBoost](#3.5)
* [4. ROC-Curve, comparaison des meilleurs modèles](#4)
* [5. Pipeline de référence](#5)
* [6. Feature importance et interprétabilité (globale et locale)](#6)
    * [6.1 Features les plus corrélées aux targets](#6.1)
    * [6.2 Feature importance de LGBM](#6.2)
    * [6.3 SHAP](#6.3)
    * [6.4 LIME](#6.4)
* [7. Analyse du Data Drift](#7)
* [8. Réduction de la mémoire des datasets pour export vers Github](#8)

## 1. Observation des données et preprocessing (Kaggle) <a class="anchor" id="1"></a>

### 1.1 Imports et fonctions d'analyse et de preprocessing <a class="anchor" id="1.1"></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import figure

# ----------------------------------------------------
import sklearn
import scipy
import statsmodels.api as sm 
from scipy.stats import shapiro

# ----------------------------------------------------
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer

# ----------------------------------------------------
from imblearn.pipeline import make_pipeline, Pipeline

# ----------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# ----------------------------------------------------
import warnings
warnings.filterwarnings("ignore")

In [None]:
def outlier_detect(df, col):
    """Return the rows of ``df`` whose ``col`` value lies outside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``; the per-column
    values are read from the module-level ``Q1``, ``Q3`` and ``IQR`` lookups,
    which must be defined before this function is called.
    """
    first_quartile = Q1[col]
    spread = IQR[col]
    third_quartile = Q3[col]
    values = df[col]
    below = values < (first_quartile - 1.5 * spread)
    above = values > (third_quartile + 1.5 * spread)
    return df[below | above]

# ----------------------------------------------------------
def lower_outlier(df, col):
    """Return the rows of ``df`` whose ``col`` value is below the lower IQR fence.

    The fence is ``Q1[col] - 1.5 * IQR[col]``, read from the module-level
    ``Q1`` and ``IQR`` lookups (``Q3`` is not needed here). The comparison is
    strict, so values equal to the fence are not returned.
    """
    lower_fence = Q1[col] - 1.5 * IQR[col]
    return df[df[col] < lower_fence]

# ----------------------------------------------------------
def upper_outlier(df, col):
    """Return the rows of ``df`` whose ``col`` value is above the upper IQR fence.

    The fence is ``Q3[col] + 1.5 * IQR[col]``, read from the module-level
    ``Q3`` and ``IQR`` lookups (``Q1`` is not needed here). The comparison is
    strict, so values equal to the fence are not returned.
    """
    upper_fence = Q3[col] + 1.5 * IQR[col]
    return df[df[col] > upper_fence]

# ----------------------------------------------------------
def show_num_col(df, col):
    """Print the outlier counts of ``df[col]`` and draw its box and density plots.

    The counts come from ``lower_outlier`` and ``upper_outlier``; the two
    plots are stacked vertically in a single figure.
    """
    print("*********************** {} ***********************\n".format(col))
    n_lower = lower_outlier(df, col).shape[0]
    n_upper = upper_outlier(df, col).shape[0]
    print("lower outlier: {} ****** upper outlier: {}\n".format(n_lower, n_upper))

    plt.figure(figsize=(10, 8))
    # Row 1: horizontal box plot, row 2: density estimate
    for row, kind, extra in ((1, 'box', {'vert': False}), (2, 'density', {})):
        plt.subplot(2, 1, row)
        df[col].plot(kind=kind, subplots=True, sharex=False, **extra)
    plt.show()

# ----------------------------------------------------------
def show_cat_col(df, col):
    """Print a banner with the column name and draw a bar chart of its value counts."""
    print("******************** {} ********************\n".format(col))
    counts = df[col].value_counts()
    counts.plot(kind='bar')
    plt.xticks(rotation='vertical')
    plt.show()
    
# ----------------------------------------------------------
def replace_upper(df, col):
    """Cap ``df[col]`` in place at the upper IQR fence ``Q3 + 1.5 * IQR``.

    The fence is read from the module-level ``Q3`` and ``IQR`` lookups.
    Values above the fence are set to the fence; every other value is kept.

    The previous implementation went through a sentinel value (9999999),
    which overwrote genuine observations equal to the sentinel and turned
    NaN into the fence. ``clip`` avoids both: NaN values are left as NaN.
    Returns ``None``; ``df`` is modified in place.
    """
    upper = Q3[col] + 1.5 * IQR[col]
    df[col] = df[col].clip(upper=upper)

# ----------------------------------------------------------
def replace_lower(df, col):
    """Raise ``df[col]`` in place to the lower IQR fence ``Q1 - 1.5 * IQR``.

    The fence is read from the module-level ``Q1`` and ``IQR`` lookups.
    Values below the fence are set to the fence; every other value is kept.

    The previous implementation went through a sentinel value (1111111),
    which overwrote genuine observations equal to the sentinel and turned
    NaN into the fence. ``clip`` avoids both: NaN values are left as NaN.
    Returns ``None``; ``df`` is modified in place.
    """
    lower = Q1[col] - 1.5 * IQR[col]
    df[col] = df[col].clip(lower=lower)

# ----------------------------------------------------------
def replace_mode(df, col):
    """Fill the missing values of ``df[col]`` in place with the column's mode.

    When several values tie, the first one returned by ``Series.mode`` is
    used. A message naming the column and the mode is printed afterwards.
    """
    most_frequent = df[col].mode()[0]
    df[col] = df[col].fillna(most_frequent)
    mode_after_fill = df[col].mode()[0]
    print("NaN de {} remplacés par le mode {}".format(col, mode_after_fill))

# ----------------------------------------------------------
def replace_mean(df, col):
    """Fill the missing values of ``df[col]`` in place with the column's mean.

    A message naming the column and the mean of the filled column is
    printed afterwards.
    """
    column_mean = df[col].mean()
    df[col] = df[col].fillna(column_mean)
    mean_after_fill = df[col].mean()
    print("NaN de {} remplacés par la moyenne {}".format(col, mean_after_fill))

In [None]:
# Load the training and test application tables from the working directory
train, test = (pd.read_csv(path) for path in ("application_train.csv",
                                               "application_test.csv"))

### 1.2 Observation des données déséquilibrées <a class="anchor" id="1.2"></a>

In [None]:
# Bar chart of the class counts, followed by the counts themselves
sns.countplot(data=train, x="TARGET")
train['TARGET'].value_counts()

In [None]:
# Share of rows with TARGET == 0 (the complement is the share of TARGET == 1)
len(train.loc[train['TARGET'] == 0]) / len(train)

Données déséquilibrées : un peu plus de 8% de "mauvais payeurs".

### 1.3 Suppression de features et catégorisation <a class="anchor" id="1.3"></a>

In [None]:
# 60% of the number of training rows, used below as a non-null count threshold
threshold_train = 0.60 * len(train)
int(threshold_train)

In [None]:
# 60% of the number of test rows, used below as a non-null count threshold
threshold_test = 0.60 * len(test)
int(threshold_test)

In [None]:
# List the columns whose number of missing values exceeds the 60% threshold
train_nan_counts = train.isna().sum()
test_nan_counts = test.isna().sum()

print("Train:\n")
print(train.columns[train_nan_counts > int(threshold_train)])
print("******************************************")
print("Test:\n")
print(test.columns[test_nan_counts > int(threshold_test)])

In [None]:
# Keep only the features that have at least `threshold` non-null values,
# i.e. that are filled in for at least 60% of the customers

train_new = train.dropna(axis="columns", thresh=threshold_train)
test_new = test.dropna(axis="columns", thresh=threshold_test)
print(train_new.shape)
print("******************************************")
print(test_new.shape)

On sépare les features par types.

In [None]:
# Boolean mask (one entry per column) of the non-object columns of the train
# set, and the corresponding list of column names
numeric_feature = train_new.dtypes!=object
final_numeric_feature = train_new.columns[numeric_feature].tolist()

# Same selection for the test set
numeric_feature_test = test_new.dtypes!=object
final_numeric_feature_test = test_new.columns[numeric_feature_test].tolist()

In [None]:
# Sub-frame holding only the non-object columns of the train set
numeric = train_new[final_numeric_feature]

# Same sub-frame for the test set
numeric_test = test_new[final_numeric_feature_test]

In [None]:
# Columns stored as int64 are treated as the discrete features (train set)
discrete_features = numeric.dtypes==np.int64
final_discrete_feature = numeric.columns[discrete_features].tolist()
discrete = numeric[final_discrete_feature]

# Same selection for the test set
discrete_features_test = numeric_test.dtypes==np.int64
final_discrete_feature_test = numeric_test.columns[discrete_features_test].tolist()
discrete_test = numeric_test[final_discrete_feature_test]

In [None]:
# Columns stored as float64 are treated as the continuous features (train set)
continuous_features = numeric.dtypes==np.float64
final_continuous_feature = numeric.columns[continuous_features].tolist()
continuous = numeric[final_continuous_feature]

# Same selection for the test set
continuous_features_test = numeric_test.dtypes==np.float64
final_continuous_feature_test = numeric_test.columns[continuous_features_test].tolist()
continuous_test = numeric_test[final_continuous_feature_test]

In [None]:
# Names of the continuous (float64) columns of the train set
continuous_col = continuous.columns

### 1.4 Remplacement des valeurs nulles et des outliers <a class="anchor" id="1.4"></a>

In [None]:
# show_num_col relies on the module-level Q1 / Q3 / IQR lookups, so they have
# to exist before the first call (they were previously only defined in a
# later cell). numeric_only=True skips the object columns of train_new.
Q1 = train_new.quantile(0.25, numeric_only=True)
Q3 = train_new.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1

# Outlier counts, box plot and density plot for every continuous column
for name in continuous_col:
    show_num_col(continuous[continuous_col], name)

In [None]:
# Continuous columns of the train set that contain at least one missing value
continuous_is_null = continuous.isna().any()
final_continuous_feature = continuous.columns[continuous_is_null].tolist()
print("Features continues pour le train: \n",final_continuous_feature)

print("****************************************")
# Same selection for the test set
continuous_is_null_test = continuous_test.isna().any()
final_continuous_feature_test = continuous_test.columns[continuous_is_null_test].tolist()
print("Features continues pour le test: \n",final_continuous_feature_test)

In [None]:
# Mean-impute every continuous column that has missing values
print("Pour le train:\n")
for name in final_continuous_feature:
    replace_mean(continuous, name)

print("************************************")
print("Pour le test:\n")
for name in final_continuous_feature_test:
    replace_mean(continuous_test, name)

In [None]:
# Copy the imputed continuous columns back into the numeric train frame
numeric[continuous_col] = continuous[continuous_col]

# Same for the test frame (the train column names are reused here)
numeric_test[continuous_col] = continuous_test[continuous_col]

In [None]:
# Names of all numeric columns of the train frame
col_names = numeric.columns

# Names of all numeric columns of the test frame
col_names_test = numeric_test.columns

In [None]:
# Number of rows outside the IQR fences, for every numeric column
print("Nb d'outliers par colonne du train:\n")
for name in col_names:
    n_outliers = outlier_detect(numeric, name).shape[0]
    print("{}: {}".format(name, n_outliers))

print("\n\n***************************************\n")
print("Nb d'outliers par colonne du test:\n")
for name in col_names_test:
    n_outliers = outlier_detect(numeric_test, name).shape[0]
    print("{}: {}".format(name, n_outliers))

In [None]:
# Per-column quartiles and inter-quartile range of the train set; they are
# the lookups read by the outlier helpers. numeric_only=True is required
# because train_new still contains object (categorical) columns, on which
# recent pandas versions refuse to compute quantiles.
Q1 = train_new.quantile(0.25, numeric_only=True)
Q3 = train_new.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1

In [None]:
# Numeric columns of the train set that contain at least one outlier
outlier = [name for name in final_numeric_feature
           if outlier_detect(numeric[final_numeric_feature], name).shape[0] != 0]

# Same list for the test set
outlier_test = [name for name in final_numeric_feature_test
                if outlier_detect(numeric_test[final_numeric_feature_test], name).shape[0] != 0]

In [None]:
# Work on the test-set outlier columns from here on: that list comes from the
# test frame and therefore does not contain the TARGET field
col_names = outlier_test

In [None]:
# Number of values above the upper fence, per selected column
print("Outliers supérieurs pour le train:\n")
for name in col_names:
    n_upper = upper_outlier(numeric, name).shape[0]
    print("{}: {}".format(name, n_upper))

print("\n\n****************************************\n")
print("Outliers supérieurs pour le test:\n")
for name in col_names:
    n_upper = upper_outlier(numeric_test, name).shape[0]
    print("{}: {}".format(name, n_upper))

In [None]:
# Cap the upper outliers of every selected column, train set first, then test set
for frame in (numeric, numeric_test):
    for name in col_names:
        replace_upper(frame, name)

In [None]:
# Number of values below the lower fence, per selected column
print("Outliers inférieurs pour le train:\n")
for name in col_names:
    n_lower = lower_outlier(numeric, name).shape[0]
    print("{}: {}".format(name, n_lower))

print("\n\n****************************************\n")
print("Outliers inférieurs pour le test:\n")
for name in col_names:
    n_lower = lower_outlier(numeric_test, name).shape[0]
    print("{}: {}".format(name, n_lower))

In [None]:
# Raise the lower outliers of every selected column, train set first, then test set
for frame in (numeric, numeric_test):
    for name in col_names:
        replace_lower(frame, name)

In [None]:
# Boolean mask and names of the object-typed (categorical) columns, train set
categorical_feature = train_new.dtypes==object
final_categorical_feature = train_new.columns[categorical_feature].tolist()

# Same selection for the test set
categorical_feature_test = test_new.dtypes==object
final_categorical_feature_test = test_new.columns[categorical_feature_test].tolist()

In [None]:
# Sub-frame holding only the categorical columns of the train set
categorical = train_new[final_categorical_feature]

# Same sub-frame for the test set
categorical_test = test_new[final_categorical_feature_test]

In [None]:
# Names of the categorical columns of the train set
col_names_cat = categorical.columns

In [None]:
# Bar chart of the value counts of every categorical column
for name in col_names_cat:
    show_cat_col(categorical, name)

In [None]:
# Distinct values taken by ORGANIZATION_TYPE, and how many there are
organization_values = train_new['ORGANIZATION_TYPE'].unique()
print("unique number is = {}\nunique values are: \n{} ".format(len(organization_values),
                                                               train_new['ORGANIZATION_TYPE'].unique()))

In [None]:
# Mode-impute every categorical column (the train column names are used for both sets)
print("Pour le train:\n")
for name in col_names_cat:
    replace_mode(categorical, name)

print("\n\n****************************************\n")
print("Pour le test:\n")
for name in col_names_cat:
    replace_mode(categorical_test, name)

In [None]:
# ORGANIZATION_TYPE is removed from both categorical frames, in place
for frame in (categorical, categorical_test):
    frame.drop(columns=['ORGANIZATION_TYPE'], inplace=True)

In [None]:
# Encode the categories as integers. The encoder is fitted on the train set
# only and then reused on the test set, so that a given category gets the same
# code in both frames (refitting a LabelEncoder on the test set, as was done
# before, yields different codes as soon as the category sets differ).
# Categories only present in the test set are encoded as -1.
le = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
categorical = pd.DataFrame(le.fit_transform(categorical).astype('int64'),
                           columns=categorical.columns,
                           index=categorical.index)
# Columns are passed in the train order, which is the order the encoder was fitted with
categorical_test = pd.DataFrame(le.transform(categorical_test[categorical.columns]).astype('int64'),
                                columns=categorical.columns,
                                index=categorical_test.index)

In [None]:
# Shapes of the encoded categorical frames
train_shape, test_shape = categorical.shape, categorical_test.shape
print("Train: ", train_shape)
print("Test: ", test_shape)

In [None]:
# Refresh the column-name lookups: categorical columns left after the drop,
# and all numeric columns of the test frame
col_names_cat = categorical.columns
col_names = numeric_test.columns

In [None]:
# Write the encoded categorical columns and the cleaned numeric columns back
# into the train frame
train_new[col_names_cat] = categorical[col_names_cat]
train_new[col_names] = numeric[col_names]

# Same for the test frame
test_new[col_names] = numeric_test[col_names]
test_new[col_names_cat] = categorical_test[col_names_cat]

In [None]:
# ORGANIZATION_TYPE is removed from both full frames, in place
for frame in (train_new, test_new):
    frame.drop(columns=['ORGANIZATION_TYPE'], inplace=True)

In [None]:
# Shape of the rows flagged as duplicated in the raw frames
train_duplicates = train.duplicated()
test_duplicates = test.duplicated()
print("Train: ", train_new.loc[train_duplicates].shape)
print("Test: ", test_new.loc[test_duplicates].shape)

In [None]:
# Separate the target from the features
y = train_new['TARGET']
x_train = train_new.drop(columns="TARGET")

In [None]:
# Min-max scale the selected features: the scaler is fitted on the train set
# and reused unchanged on the test set
scaler=MinMaxScaler()
col = ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR',
       'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
       'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE',
       'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
       'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'FLAG_MOBIL', 'FLAG_EMP_PHONE',
       'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL',
       'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT',
       'REGION_RATING_CLIENT_W_CITY', 'WEEKDAY_APPR_PROCESS_START',
       'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION',
       'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION',
       'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY',
       'LIVE_CITY_NOT_WORK_CITY', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
       'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
       'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE',
       'DAYS_LAST_PHONE_CHANGE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3',
       'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6',
       'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9',
       'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12',
       'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15',
       'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18',
       'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21',
       'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY',
       'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON',
       'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR']

# Assign the scaled arrays directly: the values are written positionally, so
# the result no longer depends on the frames having a default 0..n-1 index
# (wrapping the array in a bare DataFrame made the assignment align on index).
x_train[col] = scaler.fit_transform(x_train[col])
test_new[col] = scaler.transform(test_new[col])

In [None]:
# Keep the customer identifiers aside (as one-column frames) ...
train_id, test_id = x_train[['SK_ID_CURR']], test_new[['SK_ID_CURR']]
# ... and remove them from the feature frames
x_train = x_train.drop('SK_ID_CURR', axis=1)
test_new = test_new.drop('SK_ID_CURR', axis=1)

In [None]:
# Export the customer identifiers of both sets to CSV files
for ids, path in ((train_id, 'cust_num.csv'), (test_id, 'new_cust_num.csv')):
    ids.to_csv(path)

### 1.5 Observation des features finales <a class="anchor" id="1.5"></a>

In [None]:
# Discrete features that are part of the modelling columns. A list is used
# (not a set): pandas no longer accepts a set as an indexer, and a list keeps
# the column order stable from one run to the next.
disc_col = [name for name in discrete.columns if name in col]
disc_corr = discrete[disc_col].corr()

plt.figure(figsize = (22, 22))
# Mask the upper triangle (diagonal included): the matrix is symmetric
mask = np.triu(np.ones_like(disc_corr))

sns.heatmap(disc_corr, mask = mask, cmap = plt.cm.Spectral, square=True, annot = True)
plt.title('Correlation Heatmap');

In [None]:
# Continuous features that are part of the modelling columns. A list is used
# (not a set): pandas no longer accepts a set as an indexer, and a list keeps
# the column order stable from one run to the next.
cont_col = [name for name in continuous.columns if name in col]
cont_corr = continuous[cont_col].corr()

plt.figure(figsize = (22, 22))
# Mask the upper triangle (diagonal included): the matrix is symmetric
mask = np.triu(np.ones_like(cont_corr))

sns.heatmap(cont_corr, mask = mask, cmap = plt.cm.Spectral, square=True, annot = True)
plt.title('Correlation Heatmap');

In [None]:
# Correlation of every numeric raw feature with the target, sorted ascending.
# numeric_only=True is needed because the raw frame contains object columns,
# on which recent pandas versions refuse to compute correlations.
correlations = train.corr(numeric_only=True)['TARGET'].sort_values()

In [None]:
# `correlations` is sorted ascending: the tail holds the largest values,
# the head the smallest ones
most_positive = correlations.tail(15)
most_negative = correlations.head(15)
print('Most Positive Correlations:\n', most_positive)
print('\nMost Negative Correlations:\n', most_negative)

In [None]:
# Distribution of DAYS_BIRTH in the raw train set
sns.histplot(data=train, x='DAYS_BIRTH');

In [None]:
# Distribution of DAYS_LAST_PHONE_CHANGE in the raw train set
sns.histplot(data=train, x='DAYS_LAST_PHONE_CHANGE');

In [None]:
# Distribution of REG_CITY_NOT_WORK_CITY in the raw train set
sns.histplot(data=train, x='REG_CITY_NOT_WORK_CITY');

In [None]:
# Distribution of OWN_CAR_AGE in the raw train set
sns.histplot(data=train, x='OWN_CAR_AGE');

In [None]:
# Distribution of EXT_SOURCE_3 in the raw train set
sns.histplot(data=train, x='EXT_SOURCE_3');

In [None]:
# Distribution of DAYS_EMPLOYED in the preprocessed train set
sns.histplot(data=train_new, x='DAYS_EMPLOYED');

In [None]:
# Distribution of AMT_GOODS_PRICE in the preprocessed train set
sns.histplot(data=train_new, x='AMT_GOODS_PRICE');

## 2. Pipe de transformation <a class="anchor" id="2"></a>

Création d'un pipe de transformation synthétisant les transformations effectuées et sauvegarde des 70 features utilisées pour la modélisation.

In [None]:
# Names of the features kept for modelling
col = x_train.columns.tolist()

In [None]:
# Display the list of features kept for modelling
col

In [None]:
# Rebuild the preprocessing as a single transformer, starting from the raw
# train frame restricted to the selected features
X = train[col]

# Split the columns by dtype
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(exclude='object').columns

# Imputers: most frequent value for categories, mean for numbers
imp_cat = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_num = SimpleImputer(missing_values=np.nan, strategy='mean')

# Categories: impute then ordinal-encode; numbers: impute then min-max scale
cat_transformer = Pipeline(steps=[('LabelImputer', imp_cat),
                                  ('Encoder', OrdinalEncoder())])
numerical_transformer = Pipeline(steps=[('NumImputer', imp_num),
                                        ('Scaler', MinMaxScaler())])

ct = ColumnTransformer(transformers=[('Cat', cat_transformer, categorical_features),
                                     ('Num', numerical_transformer, numeric_features)])

ct = ct.fit(X)

In [None]:
# joblib is only imported further down in the notebook, so import it here to
# keep a top-to-bottom run working
import joblib

# Persist the fitted column transformer
joblib.dump(ct, 'col_transfo.joblib')

In [None]:
# Single-step pipeline wrapping the fitted column transformer
pipe = Pipeline(steps=[('transformer', ct)])

In [None]:
# ColumnTransformer concatenates its outputs in transformer order: first the
# 'Cat' columns, then the 'Num' columns. Label the result accordingly;
# reusing X.columns (original order) would attach the wrong name to most
# features and corrupt any later feature-importance reading.
transformed_columns = list(categorical_features) + list(numeric_features)
X = pd.DataFrame(ct.transform(X), columns = transformed_columns)

## 3. Modélisations et MLFlow <a class="anchor" id="3"></a>

In [None]:
from imblearn.over_sampling import SMOTE
from time import time
from collections import Counter

# ----------------------------------------------------
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression

# ----------------------------------------------------
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate, cross_val_predict
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# ----------------------------------------------------
from sklearn import metrics as met
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, auc, roc_auc_score, roc_curve

# Shared cross-validation splitter: 5 stratified, shuffled folds with a fixed seed
kfold = StratifiedKFold(n_splits=5, random_state=100, shuffle=True)

# ----------------------------------------------------------
import mlflow
from mlflow.models.signature import infer_signature
import joblib

# ----------------------------------------------------------
import shap

In [None]:
# Business cost function: with the default weights a false negative costs
# 10 times more than a false positive
def cost_score(y_true,y_pred,fn_cost=10, fp_cost=1):
    """Return the total misclassification cost ``fn * fn_cost + fp * fp_cost``.

    Parameters
    ----------
    y_true, y_pred : array-like of 0/1 labels, same length
        Compared position by position. They are converted to NumPy arrays
        first, so plain lists work and pandas indexes are ignored.
    fn_cost : cost of one false negative (true 1 predicted 0).
    fp_cost : cost of one false positive (true 0 predicted 1).

    Returns
    -------
    The cost; lower is better.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    fn = np.sum((y_true == 1) & (y_pred == 0))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    return fn * fn_cost + fp * fp_cost

# ----------------------------------------------------------
# Decision threshold that minimises the business cost function
def best_thresh(y_true, y_prob, fn_cost=10, fp_cost=1,step=0.0001):
    """Return the threshold on ``y_prob`` giving the lowest misclassification cost.

    Every threshold of ``np.arange(0.0, 1.0, step)`` is tried; a sample is
    predicted positive when its probability is ``>=`` the threshold and the
    cost is ``fn * fn_cost + fp * fp_cost``. On ties the smallest threshold
    wins.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_prob : array-like of positive-class probabilities, same length.
    fn_cost, fp_cost : cost of one false negative / one false positive.
    step : spacing of the threshold grid.
    """
    # Work on plain arrays: lists are accepted and pandas indexes are ignored
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    # The class masks do not depend on the threshold: compute them once
    is_positive = y_true == 1
    is_negative = y_true == 0

    cost_min = np.inf
    opti_thresh = 0.0
    for threshold in np.arange(0.0, 1.0, step):
        predicted_positive = y_prob >= threshold
        fn = np.sum(is_positive & ~predicted_positive)
        fp = np.sum(is_negative & predicted_positive)
        cost = fn * fn_cost + fp * fp_cost
        # Strict comparison keeps the first (smallest) threshold on ties
        if cost < cost_min:
            cost_min = cost
            opti_thresh = threshold

    return opti_thresh

# ----------------------------------------------------------
# Model selection with RandomizedSearchCV (see model_res below)
def _threshold_scores(y_true, y_proba, threshold):
    """Binarise ``y_proba`` at ``threshold`` and score the result.

    Returns ``(scores, y_pred)`` where ``scores`` is ordered
    [accuracy, precision, recall, f1, roc_auc, business cost, threshold].
    """
    y_pred = (y_proba >= threshold).astype(int)
    scores = [accuracy_score(y_true, y_pred),
              precision_score(y_true, y_pred),
              recall_score(y_true, y_pred),
              f1_score(y_true, y_pred),
              roc_auc_score(y_true, y_proba),
              cost_score(y_true, y_pred),
              threshold]
    return scores, y_pred


def _tune_and_predict(estimator, X, y, params, scoring):
    """Run the randomized search, then cross-val predict with the best estimator.

    Returns ``(best_estimator, best_params, positive_class_probabilities,
    elapsed_time_per_fold)``.
    """
    search = RandomizedSearchCV(estimator = estimator, param_distributions = params, scoring = scoring,
                                cv = kfold, n_jobs=-1, random_state=100)
    search.fit(X, y)
    best_model = search.best_estimator_

    start = time()
    y_proba = cross_val_predict(best_model, X, y, cv=kfold, method='predict_proba')[:, 1]
    # Average duration per fold, using the actual number of folds of the splitter
    fold_time = (time() - start) / kfold.get_n_splits()
    return best_model, search.best_params_, y_proba, fold_time


def _plot_confusion_pair(y_true, predictions, titles):
    """Draw two confusion matrices side by side (AUC-tuned, business-tuned)."""
    fig, ax = plt.subplots(1,2, figsize = (18,7))
    for axis, y_pred, title in zip(ax, predictions, titles):
        sns.heatmap(met.confusion_matrix(y_true, y_pred), annot=True,  fmt='',
                    xticklabels = ["0", "1"], yticklabels = ["0", "1"], ax = axis)
        axis.set_title(title)
        axis.set_ylabel('Actual')
        axis.set_xlabel('Predicted')
    plt.suptitle('Confusion Matrix')
    plt.show()


def model_res(model, X, y, params, smote = False, disp = True):
    """Tune ``model`` twice (ROC AUC, then business cost) and evaluate both results.

    With ``smote`` the model is wrapped in a pipeline that applies SMOTE
    before the classifier (step name ``'classifier'``, so ``params`` keys
    must be prefixed accordingly). Without it the model is searched as is
    and only labelled ``class_weight``: setting ``class_weight`` on the
    model is left to the caller.

    For each of the two searches, out-of-fold probabilities are scored at
    the 0.5 threshold and at the cost-minimising threshold of ``best_thresh``.

    Parameters
    ----------
    model : estimator exposing ``predict_proba``.
    X, y : features and binary target.
    params : ``param_distributions`` passed to ``RandomizedSearchCV``.
    smote : oversample with SMOTE inside the pipeline.
    disp : draw the ROC curves, probability histograms and confusion
        matrices, print the classification reports and display the summary.

    Returns
    -------
    ``(df, (model1, model2), (hyperparams1, hyperparams2),
    (scores1, scores2, scores1bis, scores2bis), curve)``.
    Index 1 refers to the AUC search, 2 to the business-cost search, ``bis``
    to the optimal threshold. Every score list is ordered
    [accuracy, precision, recall, f1, roc_auc, business cost, threshold].
    ``df`` and ``curve`` (the ROC figure) are ``None`` when ``disp`` is false.
    """
    model_name = type(model).__name__

    if smote:
        model_name += ' & smote'
        pipeline = Pipeline(steps = [('smote', SMOTE(random_state = 11)),
                                     ('classifier', model)])
    else:
        model_name += ' & class_weight'
        pipeline = model

    # Search 1: optimise ROC AUC
    model1, hyperparams1, y_pred_proba1, laps1 = _tune_and_predict(pipeline, X, y, params, 'roc_auc')
    scores1, y_pred1 = _threshold_scores(y, y_pred_proba1, 0.5)
    thresh1 = best_thresh(y, y_pred_proba1)
    scores1bis, y_pred1bis = _threshold_scores(y, y_pred_proba1, thresh1)

    # Search 2: optimise the business cost (a cost, hence greater_is_better=False)
    scorer = met.make_scorer(cost_score, greater_is_better = False)
    model2, hyperparams2, y_pred_proba2, laps2 = _tune_and_predict(pipeline, X, y, params, scorer)
    scores2, y_pred2 = _threshold_scores(y, y_pred_proba2, 0.5)
    thresh2 = best_thresh(y, y_pred_proba2)
    scores2bis, y_pred2bis = _threshold_scores(y, y_pred_proba2, thresh2)

    df = None
    # Defined up front so that the return statement also works with disp=False
    curve = None

    if disp:

        # ROC curves of both tuned models
        fpr1, tpr1, _ = roc_curve(y, y_pred_proba1)
        fpr2, tpr2, _ = roc_curve(y, y_pred_proba2)
        curve = plt.figure()
        plt.plot(fpr1, tpr1, 'b', label = 'AUC score optimization AUC = %0.2f' % auc(fpr1, tpr1))
        plt.plot(fpr2, tpr2, 'orange', label = 'Business score optimization AUC = %0.2f' % auc(fpr2, tpr2))
        plt.plot([0, 1], [0, 1],'r--')
        plt.xlim([0, 1])
        plt.ylim([0, 1])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.legend(loc = 'lower right')
        plt.suptitle('Receiver Operating Characteristic')

        # Distribution of the predicted probabilities
        fig, ax = plt.subplots(1,1)
        sns.histplot(pd.Series(y_pred_proba1, name = 'TARGET'), ax = ax, color = 'blue',
                     label = 'AUC score optimization')
        sns.histplot(pd.Series(y_pred_proba2, name = 'TARGET'), ax = ax, color = 'orange',
                     label = 'Business score optimization')
        plt.legend()
        plt.title('Probability Distribution')
        plt.show()

        # Confusion matrices at 0.5, then at the optimal thresholds
        _plot_confusion_pair(y, (y_pred1, y_pred2),
                             ('AUC score optimization / threshold = 0.5',
                              'Business score optimization / threshold = 0.5'))
        _plot_confusion_pair(y, (y_pred1bis, y_pred2bis),
                             (f'AUC score optimization / threshold = {thresh1}',
                              f'Business score optimization / threshold = {thresh2}'))

        print('Classification Report / AUC score Optimization \n', met.classification_report(y, y_pred1), '\n\n',
              'Classification Report / Business score Optimization \n', met.classification_report(y, y_pred2))

        # Summary table: the column order matches the order of the score lists
        columns = ['accuracy', 'precision', 'recall', 'f1-score', 'roc_auc', 'business_score', 'threshold', 'time']
        model_name1 = model_name + ' (scoring AUC)'
        model_name2 = model_name + ' (scoring Business)'
        model_name1bis = model_name + ' (scoring AUC) / best thresh'
        model_name2bis = model_name + ' (scoring Business) / best thresh'
        dic_mod = {model_name1 : scores1 + [laps1], model_name1bis : scores1bis + [laps1],
                   model_name2 : scores2 + [laps2], model_name2bis : scores2bis + [laps2]}
        df = pd.DataFrame.from_dict(dic_mod, orient = 'index', columns = columns)

        display(df)

    return df, (model1, model2), (hyperparams1, hyperparams2), (scores1, scores2, scores1bis, scores2bis), curve

# ----------------------------------------------------------
# fonction du MLFlow permettant d'enregistrer les expériences 
def run_mlflow_experiment(model, exp_name, fig_name, hyperparams, scores, time, curve):
    """Record one trained model, its parameters, metrics and figure in MLflow.

    Parameters
    ----------
    model : fitted estimator, or a pipeline exposing ``named_steps`` with a
        ``'classifier'`` step.
    exp_name : str
        Used both as the MLflow experiment name and as the run name.
    fig_name : str
        File name used for the saved image and for the figure artifact.
    hyperparams : dict
        Hyper-parameters passed to ``mlflow.log_params``.
    scores : sequence
        Indexed 0..6 and logged, in order, as Accuracy, Precision, Recall,
        F1_score, Auc_score, Business_score and Threshold.
    time : float
        Logged as the ``Fit_time`` metric.
    curve : figure object passed to ``mlflow.log_figure``.
    """
    # NOTE(review): this saves the *current* pyplot figure, which is not
    # necessarily `curve` -- confirm this is the intended image.
    plt.savefig(fig_name)

    # Derive the artifact name from the estimator class itself rather than
    # from the textual repr of its type, so that estimators from any module
    # are supported. Pipelines are labelled with their final classifier.
    if hasattr(model, 'named_steps'):
        model_name = 'Smote' + '/' + type(model.named_steps['classifier']).__name__
    else:
        model_name = type(model).__name__

    # Select (or create) the experiment.
    mlflow.set_experiment(exp_name)

    metric_names = ("Accuracy", "Precision", "Recall", "F1_score",
                    "Auc_score", "Business_score", "Threshold")

    # Start a run.
    with mlflow.start_run(run_name = exp_name) as run:

        # Hyper-parameters.
        mlflow.log_params(hyperparams)

        # Metrics: explicit indexing keeps the IndexError on a too-short
        # `scores` instead of silently logging fewer metrics.
        for position, metric_name in enumerate(metric_names):
            mlflow.log_metric(metric_name, scores[position])
        mlflow.log_metric("Fit_time", time)

        # Figure.
        mlflow.log_figure(curve, artifact_file = fig_name)

        # Trained model.
        mlflow.sklearn.log_model(model, model_name)

### 3.1 DummyClassifier <a class="anchor" id="3.1"></a>

Classifieur de référence pour comparer les scores

In [None]:
dummy_clf = DummyClassifier(strategy="stratified")
df_dum_sm, model_dum_sm, params_dum_sm, scores_dum_sm, curve_dum_sm = model_res(dummy_clf, X, y, {}, smote = True)

In [None]:
dummy_clf = DummyClassifier(strategy="stratified")
df_dum, model_dum, params_dum, scores_dum, curve_dum = model_res(dummy_clf, X, y, {}, smote = False)

In [None]:
# Rank the runs: lowest business score first, ties broken by higher ROC AUC,
# then higher accuracy, then shorter fit time. A single multi-key sort is used
# because chained single-key sorts only act as tie-breakers when every sort is
# stable, which the default single-column algorithm does not guarantee.
df_dum_final = pd.concat([df_dum, df_dum_sm])
df_dum_final.sort_values(by = ['business_score', 'roc_auc', 'accuracy', 'time'],
                         ascending = [True, False, False, True])

In [None]:
# Log the DummyClassifier reference run to MLflow, using element 1 of the
# model/params tuples and element 1 of the scores tuple returned by model_res.
# NOTE(review): the fit time is a hard-coded literal, presumably copied from
# the results table -- confirm it matches the logged row.
run_mlflow_experiment(model_dum[1], 
                      'DummyClassifier_ref', 'dummy_roc.png', params_dum[1], scores_dum[1], 0.137990, curve_dum)

In [None]:
# curve_dum.savefig('dummy_roc.png')

### 3.2 LogisticRegressor <a class="anchor" id="3.2"></a>

In [None]:
lr = LogisticRegression()
df_lr_sm, model_lr_sm, params_lr_sm, scores_lr_sm, curve_lr_sm = model_res(lr, X, y, {}, smote = True)

In [None]:
lr = LogisticRegression()
df_lr, model_lr, params_lr, scores_lr, curve_lr = model_res(lr, X, y, {}, smote = False)

In [None]:
# Rank the runs: lowest business score first, ties broken by higher ROC AUC,
# then higher accuracy, then shorter fit time. A single multi-key sort is used
# because chained single-key sorts only act as tie-breakers when every sort is
# stable, which the default single-column algorithm does not guarantee.
df_lr_final = pd.concat([df_lr, df_lr_sm])
df_lr_final.sort_values(by = ['business_score', 'roc_auc', 'accuracy', 'time'],
                        ascending = [True, False, False, True])

In [None]:
# Log the LogisticRegression run to MLflow, using element 1 of the model/params
# tuples and element 3 of the scores tuple returned by model_res.
# NOTE(review): the fit time is a hard-coded literal, presumably copied from
# the results table -- confirm it matches the logged row.
run_mlflow_experiment(model_lr[1], 
                      'Best_LogReg_simple', 'logreg_roc.png', params_lr[1], scores_lr[3], 3.509839, curve_lr)

In [None]:
# curve_lr.savefig('logreg_roc.png')

### 3.3 LightGBM <a class="anchor" id="3.3"></a>

In [None]:
# Search space for LGBMClassifier.
param_lgb = {
    'learning_rate': [0.2, 0.1, 0.01, 0.05, 0.001],
    'num_leaves': range(10, 100, 10),
    'min_child_samples': range(500, 1000, 100),
    'reg_alpha': [0.1, 0.01, 0.2, 0.3],
    'reg_lambda': [0.1, 0.01, 0.2, 0.3],
    'n_estimators': range(50, 300, 50),
    'max_bin': range(500, 1500, 100),
}

# Same grid with every key prefixed by 'classifier__', for use when the
# estimator is the 'classifier' step of a pipeline.
param_lgb2 = {'classifier__' + name: grid for name, grid in param_lgb.items()}

In [None]:
df_lgb_sm, model_lgb_sm, params_lgb_sm, scores_lgb_sm, curve_lgb_sm = model_res(LGBMClassifier(random_state = 100, 
                                                                                                   n_jobs=-1),
                                                                                    X, y, param_lgb2, smote = True)

In [None]:
df_lgb, model_lgb, params_lgb, scores_lgb, curve_lgb = model_res(LGBMClassifier(random_state = 100, n_jobs=-1,
                                                                                class_weight = 'balanced'), 
                                                                 X, y, param_lgb)

In [None]:
# Rank the runs: lowest business score first, ties broken by higher ROC AUC,
# then higher accuracy, then shorter fit time. A single multi-key sort is used
# because chained single-key sorts only act as tie-breakers when every sort is
# stable, which the default single-column algorithm does not guarantee.
df_lgb_final = pd.concat([df_lgb, df_lgb_sm])
df_lgb_final.sort_values(by = ['business_score', 'roc_auc', 'accuracy', 'time'],
                         ascending = [True, False, False, True])

In [None]:
# Log the LightGBM run to MLflow, using element 0 of the model/params tuples
# and element 2 of the scores tuple returned by model_res.
# NOTE(review): the fit time is a hard-coded literal, presumably copied from
# the results table -- confirm it matches the logged row.
run_mlflow_experiment(model_lgb[0], 
                      'Best_LGMClassifier', 'lgbm_roc.png', params_lgb[0], scores_lgb[2], 2.363127, curve_lgb)

In [None]:
# curve_lgb.savefig('lgbm_roc.png')

### 3.4 XGBoost <a class="anchor" id="3.4"></a>

In [None]:
# Search space for XGBClassifier.
param_xgb = {
    'learning_rate': [0.2, 0.1, 0.01, 0.05, 0.001],
    'subsample': [1, 0.5, 0.2, 0.1],
    'max_depth': range(2, 11, 1),
    'n_estimators': range(50, 300, 50),
}

# Same grid with every key prefixed by 'classifier__', for use when the
# estimator is the 'classifier' step of a pipeline.
param_xgb2 = {'classifier__' + name: grid for name, grid in param_xgb.items()}

In [None]:
# XGBoost search with SMOTE. The curve is stored as `curve_xgb_sm`, following
# the `curve_<model>_sm` naming used by the other SMOTE cells.
(df_xgb_sm, model_xgb_sm, params_xgb_sm,
 scores_xgb_sm, curve_xgb_sm) = model_res(XGBClassifier(random_state = 100, n_jobs=-1),
                                          X, y, param_xgb2, smote = True)

In [None]:
counter = Counter(y)
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

In [None]:
df_xgb, model_xgb, params_xgb, scores_xgb, curve_xgb = model_res(XGBClassifier(random_state = 100, n_jobs=-1, 
                                                                               scale_pos_weight = estimate), 
                                                                 X, y, param_xgb)

In [None]:
# Rank the runs: lowest business score first, ties broken by higher ROC AUC,
# then higher accuracy, then shorter fit time. A single multi-key sort is used
# because chained single-key sorts only act as tie-breakers when every sort is
# stable, which the default single-column algorithm does not guarantee.
df_xgb_final = pd.concat([df_xgb, df_xgb_sm])
df_xgb_final.sort_values(by = ['business_score', 'roc_auc', 'accuracy', 'time'],
                         ascending = [True, False, False, True])

In [None]:
# Log the XGBoost run to MLflow, using element 0 of the model/params tuples
# and element 2 of the scores tuple returned by model_res.
# NOTE(review): the fit time is a hard-coded literal, presumably copied from
# the results table -- confirm it matches the logged row.
run_mlflow_experiment(model_xgb[0], 
                      'Best_XGBClassifier', 'xgb_roc.png', params_xgb[0], scores_xgb[2], 34.514814, curve_xgb)

In [None]:
# curve_xgb.savefig('xgb_roc.png')

### 3.5 AdaBoost <a class="anchor" id="3.5"></a>

In [None]:
# Search space for AdaBoostClassifier.
param_ada = {
    'learning_rate': [0.2, 0.1, 0.01, 0.05, 0.001],
    'algorithm': ['SAMME', 'SAMME.R'],
    'n_estimators': range(50, 300, 50),
}

# Same grid with every key prefixed by 'classifier__', for use when the
# estimator is the 'classifier' step of a pipeline.
param_ada2 = {'classifier__' + name: grid for name, grid in param_ada.items()}

In [None]:
df_ada, model_ada, params_ada, scores_ada, curve_ada = model_res(AdaBoostClassifier(random_state = 100), 
                                                                 X, y, param_ada)

In [None]:
df_ada_sm, model_ada_sm, params_ada_sm, scores_ada_sm, curve_ada_sm = model_res(AdaBoostClassifier(random_state = 100), 
                                                                                X, y, param_ada2, smote = True)

In [None]:
# Rank the runs: lowest business score first, ties broken by higher ROC AUC,
# then higher accuracy, then shorter fit time. A single multi-key sort is used
# because chained single-key sorts only act as tie-breakers when every sort is
# stable, which the default single-column algorithm does not guarantee.
df_ada_final = pd.concat([df_ada, df_ada_sm])
df_ada_final.sort_values(by = ['business_score', 'roc_auc', 'accuracy', 'time'],
                         ascending = [True, False, False, True])

In [None]:
# Log the AdaBoost run to MLflow, using element 1 of the model/params tuples
# and element 3 of the scores tuple returned by model_res.
# NOTE(review): the fit time is a hard-coded literal, presumably copied from
# the results table -- confirm it matches the logged row.
run_mlflow_experiment(model_ada[1], 
                      'Best_AdaBoostClassifier', 'ada_roc.png', params_ada[1], scores_ada[3], 182.060805, curve_ada)

In [None]:
# curve_ada.savefig('ada_roc.png')

## 4. ROC-Curve, comparaison des meilleurs modèles <a class="anchor" id="4"></a>

In [None]:
# Keep, for each model family, the single best row: lowest business score,
# ties broken by higher ROC AUC, then higher accuracy, then shorter fit time.
# One multi-key sort replaces the chained single-key sorts, whose earlier
# passes only act as tie-breakers if every sort is stable (not guaranteed by
# the default single-column algorithm).
liste_df = [df_dum_final, df_lr_final, df_ada_final, df_xgb_final, df_lgb_final]
rank_keys = ['business_score', 'roc_auc', 'accuracy', 'time']
rank_ascending = [True, False, False, True]

# The model label lives in the index; reset_index exposes it as a column
# before the frames are stacked.
best_rows = [
    frame.reset_index().sort_values(by = rank_keys, ascending = rank_ascending).head(1)
    for frame in liste_df
]
df_comp = pd.concat(best_rows)
df_comp

In [None]:
# Compute the ROC curve and AUC of each retained model on (X, y) and collect
# them in a table indexed by the model's class name.
models = [model_dum[1], model_lr[1], model_ada[1], model_xgb[0], model_lgb[0]]

# Rows are gathered in a list and turned into a frame once:
# DataFrame.append was removed in pandas 2.0.
roc_rows = []
for model in models:
    # Probability of the positive class.
    yproba = model.predict_proba(X)[:, 1]

    fpr, tpr, _ = roc_curve(y, yproba)
    aucroc = roc_auc_score(y, yproba)

    roc_rows.append({'models': model.__class__.__name__,
                     'fpr': fpr,
                     'tpr': tpr,
                     'auc': aucroc})

result_table = pd.DataFrame(roc_rows, columns=['models', 'fpr', 'tpr', 'auc'])
result_table.set_index('models', inplace=True)

In [None]:
# Overlay the ROC curves stored in result_table on one figure.
fig = plt.figure(figsize=(10,8))

# Iterate over rows instead of looking each label up with .loc: a label
# lookup returns several rows when two models share a class name.
for model_label, roc_row in result_table.iterrows():
    plt.plot(roc_row['fpr'],
             roc_row['tpr'],
             label="{}, AUC={:.3f}".format(model_label, roc_row['auc']))

# Diagonal: reference line of a random classifier.
plt.plot([0,1], [0,1], color='orange', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False positive rate", fontsize=15)

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True positive rate", fontsize=15)

plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size':13}, loc='lower right')

plt.show()

Le meilleur classifieur LGBM sera retenu ; il est par ailleurs beaucoup plus rapide que XGBoost.

## 5. Pipeline de référence <a class="anchor" id="5"></a>

Combinaison du pipe de transformation enregistré et du modèle entraîné.

In [None]:
transfo = joblib.load('col_transfo.joblib')

In [None]:
transfo

In [None]:
logged_model = 'runs:/e46c27c6fbfe4fd3884be624f3f7cad3/LGBMClassifier'
model_ref = mlflow.sklearn.load_model(logged_model)

In [None]:
model_ref

In [None]:
final_pipe = Pipeline([('transformer',transfo), ('model', model_ref)])

In [None]:
joblib.dump(final_pipe, 'model.joblib')

## 6. Feature importance et interprétabilité (globale et locale) <a class="anchor" id="6"></a>

In [None]:
pipe_test = joblib.load('model.joblib')

In [None]:
y_pred_test = pipe_test.predict_proba(train)

In [None]:
# Attach the predicted probability of class 1 to a copy of the training data.
# The column is assigned positionally from the array: going through an
# intermediate DataFrame would align on a fresh 0..n-1 index and produce
# NaN / misplaced values whenever `train` has a different index.
train_test = train.copy()
train_test['proba'] = y_pred_test[:, 1]

### 6.1 Features les plus corrélées aux targets <a class="anchor" id="6.1"></a>

On reprend les features les plus corrélées aux targets pour observer la répartition des clients de référence au regard de la probabilité que le modèle leur aurait donnée.

In [None]:
sns.relplot(x='DAYS_BIRTH', y='proba', hue='TARGET', data=train_test)
sns.kdeplot(x='DAYS_BIRTH', y='proba', data=train_test);

In [None]:
sns.kdeplot(data=train_test, x="DAYS_BIRTH", y="proba", hue="TARGET", fill=True);

In [None]:
sns.relplot(x='DAYS_LAST_PHONE_CHANGE', y='proba', hue='TARGET', data=train_test)
sns.kdeplot(x='DAYS_LAST_PHONE_CHANGE', y='proba', data=train_test);

In [None]:
sns.kdeplot(data=train_test, x="DAYS_LAST_PHONE_CHANGE", y="proba", hue="TARGET", fill=True);

In [None]:
sns.relplot(x='REG_CITY_NOT_WORK_CITY', y='proba', hue='TARGET', data=train_test)
sns.kdeplot(x='REG_CITY_NOT_WORK_CITY', y='proba', data=train_test);

In [None]:
sns.kdeplot(data=train_test, x="REG_CITY_NOT_WORK_CITY", y="proba", hue="TARGET", fill=True);

In [None]:
sns.relplot(x='OWN_CAR_AGE', y='proba', hue='TARGET', data=train_test)
sns.kdeplot(x='OWN_CAR_AGE', y='proba', data=train_test);

In [None]:
sns.kdeplot(data=train_test, x="OWN_CAR_AGE", y="proba", hue="TARGET", fill=True);

In [None]:
sns.relplot(x='EXT_SOURCE_3', y='proba', hue='TARGET', data=train_test)
sns.kdeplot(x='EXT_SOURCE_3', y='proba', data=train_test);

In [None]:
sns.kdeplot(data=train_test, x="EXT_SOURCE_3", y="proba", hue="TARGET", fill=True);

In [None]:
sns.relplot(x='DAYS_EMPLOYED', y='proba', hue='TARGET', data=train_test)
sns.kdeplot(x='DAYS_EMPLOYED', y='proba', data=train_test);

In [None]:
sns.kdeplot(data=train_test, x="DAYS_EMPLOYED", y="proba", hue="TARGET", fill=True);

In [None]:
sns.relplot(x='AMT_GOODS_PRICE', y='proba', hue='TARGET', data=train_test)
sns.kdeplot(x='AMT_GOODS_PRICE', y='proba', data=train_test);

In [None]:
sns.kdeplot(data=train_test, x="AMT_GOODS_PRICE", y="proba", hue="TARGET", fill=True);

### 6.2 Feature importance de LGBM <a class="anchor" id="6.2"></a>

In [None]:
# Feature importances of the pipeline's second step, labelled with X's column
# names and ordered from most to least important.
raw_importances = pd.DataFrame(pipe_test[1].feature_importances_, index = X.columns)
df_feat = raw_importances.sort_values(by = 0, ascending = False)

# Bar chart of the 20 highest-ranked features.
top_features = df_feat.head(20)
fig, ax = plt.subplots()
sns.barplot(x = top_features[0], y = top_features.index, ax = ax);

In [None]:
fig.savefig('Feat_importance.png', bbox_inches = 'tight')

In [None]:
# Predicted probability of class 1 for the test set, attached to a copy of it.
# The column is assigned positionally from the array: going through an
# intermediate DataFrame would align on a fresh 0..n-1 index and produce
# NaN / misplaced values whenever `test` has a different index.
y_pred_test_proba = pipe_test.predict_proba(test)
test_test = test.copy()
test_test['proba'] = y_pred_test_proba[:, 1]

In [None]:
test_test.loc[0, 'proba']

In [None]:
# For each of the first four features of df_feat: scatter of the predicted
# probability against the feature on the training data, a red horizontal line
# at 0.5157 and a green marker for the test row labelled 0.
# NOTE(review): 0.5157 is presumably the retained decision threshold -- confirm.
top4 = df_feat.head(4).index
for feat in top4:
    fig, ax = plt.subplots(1,1)
    sns.scatterplot(data = train_test, x = feat, y = 'proba', hue = 'TARGET', ax = ax)
    # Span the line over the current x-range of the axes.
    x_min, x_max = plt.axis()[:2]
    plt.hlines(y = 0.5157, xmin = x_min, xmax = x_max, color = 'red')
    xp, yp = test_test.loc[0, feat], test_test.loc[0, 'proba']
    plt.plot(xp, yp, marker = 'o', color = 'green')
    plt.xticks(rotation = 45, ha = 'right');

Regardons si la réduction du dataset de train à 50% change l'allure des graphs...

In [None]:
# Stratified 50% sample of the scored training data. The split is seeded so
# that the sample (and anything exported from it) can be regenerated.
train_samp, X_samp = train_test_split(train_test, test_size = 0.5,
                                      stratify = train_test[['TARGET']],
                                      random_state = 100)

# Same plots as on the full training set, drawn on the sample: scatter of the
# predicted probability against each of the first four features of df_feat, a
# red horizontal line at 0.5157 and a green marker for the test row labelled 0.
# NOTE(review): 0.5157 is presumably the retained decision threshold -- confirm.
for feat in df_feat.head(4).index:
    fig, ax = plt.subplots(1,1)
    sns.scatterplot(x=feat, y ='proba', hue = 'TARGET', data = train_samp, ax = ax)
    x_min, x_max, y_min, y_max = plt.axis()
    plt.hlines(y = 0.5157, xmin = x_min, xmax = x_max, color = 'red')
    xp = test_test.loc[0, feat]
    yp = test_test.loc[0, 'proba']
    plt.plot(xp, yp, marker = 'o', color = 'green')
    plt.xticks(rotation = 45, ha = 'right');

In [None]:
# For each of the first three features of df_feat, draw a boxplot of the
# feature per TARGET value on train_test, then overlay a green marker at
# x-position 0 whose height is the feature value of the test row labelled 0.
for feat in df_feat.head(3).index:
    fig, ax = plt.subplots(1,1)
    sns.boxplot(x= 'TARGET', y = feat, data = train_test)
    xp = test_test.loc[0, feat]
    plt.plot(0, xp, marker = 'o', color = 'green');

In [None]:
# Same boxplots as for train_test, drawn on train_samp: one figure for each of
# the first three features of df_feat, with a green marker at x-position 0
# whose height is the feature value of the test row labelled 0.
for feat in df_feat.head(3).index:
    fig, ax = plt.subplots(1,1)
    sns.boxplot(x= 'TARGET', y = feat, data = train_samp)
    xp = test_test.loc[0, feat]
    plt.plot(0, xp, marker = 'o', color = 'green');

On considère que l'on pourra utiliser le dataset réduit pour l'explicabilité.

In [None]:
train_samp.to_csv('new_train.csv')

In [None]:
max(train['SK_ID_CURR'])

In [None]:
max(test['SK_ID_CURR'])

### 6.3 SHAP <a class="anchor" id="6.3"></a>

In [None]:
explainer = shap.TreeExplainer(pipe_test[1])

In [None]:
%%time
X_shap = pd.DataFrame(pipe_test[0].transform(train[col]), columns = col)
shap_val = explainer.shap_values(X_shap)

In [None]:
# Draw the SHAP summary plot without displaying it (show = False) so that the
# current pyplot figure can be written to 'Shap_exp.png' before plt.show().
# NOTE(review): `fig` is not used afterwards in this cell; what summary_plot
# returns may depend on the shap version -- confirm before relying on it.
fig = shap.summary_plot(shap_val, X_shap, show = False)
plt.savefig('Shap_exp.png')
plt.show()

In [None]:
%%time
X_shap2 = pd.DataFrame(pipe_test[0].transform(train_samp[col]), columns = col)
shap_val2 = explainer.shap_values(X_shap2)

In [None]:
shap.summary_plot(shap_val2, X_shap2)

L'explainer de Shap est plus stable que celui du modèle. On préférera donc SHAP pour l'interprétabilité globale.

In [None]:
df_shap = pd.DataFrame(shap_val2[0], columns = X_shap2.columns)

In [None]:
# Global importance of each feature = mean absolute SHAP value over the rows
# of df_shap, paired with the names in `col` and sorted from largest to smallest.
vals = np.mean(np.abs(df_shap.values), axis = 0)
name_value_pairs = list(zip(col, vals))
shap_importance = pd.DataFrame(name_value_pairs,
                               columns = ['col_name', 'feature_importance_vals'])
shap_importance.sort_values(by = ['feature_importance_vals'], ascending = False, inplace = True)

In [None]:
features = list(shap_importance['col_name'])

In [None]:
shap.summary_plot(shap_val[1], X_shap)

In [None]:
shap.summary_plot(shap_val2[0], X_shap2)

In [None]:
shap.dependence_plot("EXT_SOURCE_3", shap_val[0], X_shap, interaction_index="auto")

In [None]:
shap.dependence_plot("EXT_SOURCE_2", shap_val[0], X_shap, interaction_index="auto")

In [None]:
shap.dependence_plot("DAYS_BIRTH", shap_val[0], X_shap, interaction_index="auto")

In [None]:
shap.dependence_plot("CODE_GENDER", shap_val[0], X_shap, interaction_index="auto")

In [None]:
shap.dependence_plot("AMT_ANNUITY", shap_val[0], X_shap)

Pour l'explicabilité locale, il est nécessaire d'avoir fait travailler l'explainer sur les données individuelles.

In [None]:
pipe_test[1].predict_proba(pd.DataFrame(X_shap.iloc[0]).T)[0][1]

In [None]:
shap.initjs()

In [None]:
shap.force_plot(explainer.expected_value[0], shap_val[0][0], X_shap.iloc[0])

In [None]:
shap.force_plot(explainer.expected_value[0], shap_val[0][0:1000], X_shap.iloc[0:1000])

### 6.4 LIME <a class="anchor" id="6.4"></a>

In [None]:
import lime
from lime.lime_tabular import LimeTabularExplainer

In [None]:
%%time

lime_explainer = LimeTabularExplainer(pipe_test[0].transform(train[col]), 
                                                       feature_names=col, 
                                                       class_names=['0', '1'], 
                                                       verbose=True)

In [None]:
test.loc[0,col]

In [None]:
# Explain the prediction for the test row labelled 0 with LIME: the row is
# reshaped into a one-row frame, passed through the pipeline's first step,
# then explained against the second step's predict_proba.
single_row = pd.DataFrame(test.loc[0,col]).T
transform_data = pipe_test[0].transform(single_row)
lime_explanation = lime_explainer.explain_instance(transform_data[0],
                                                   pipe_test[1].predict_proba)

# Map each (feature, importance) pair returned by LIME into a dictionary.
feature_importance = dict(lime_explanation.as_list())

In [None]:
feature_importance

In [None]:
lime_explanation.show_in_notebook(show_table=True)

Avec LIME, les features les plus importantes sont légèrement différentes mais on retrouve tout de même les principales.

In [None]:
lime_explanation.as_pyplot_figure();

En revanche, contrairement à Shap, il est possible d'interpréter la position d'un client inconnu sans avoir à le rentrer dans l'explainer... On choisira donc Shap pour l'interprétabilité locale.

## 7. Analyse du Data Drift <a class="anchor" id="7"></a>

In [None]:
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset, DataQualityPreset, RegressionPreset

In [None]:
report = Report(metrics=[
    DataDriftPreset(), 
])

report.run(reference_data=train.drop(columns='TARGET'), current_data=test)
report

In [None]:
report.save_html('data_drift_report.html')

In [None]:
# on se limite ici aux 70 features utilisées

report2 = Report(metrics=[
    DataDriftPreset(), 
])

report2.run(reference_data=train[col], current_data=test[col])

In [None]:
report2.save_html('data_drift_model_feat.html')

Il n'y a pas de Data Drift.

Regardons l'importance relative des features qui ont un léger drift.

In [None]:
import json
dic_report = json.loads(report2.json())

In [None]:
dic_report.keys()

In [None]:
drifted_feat = dic_report['metrics'][1]['result']['drift_by_columns']

In [None]:
# Keep only the columns flagged as drifted, mapped to their drift score
# rounded to 3 decimals.
dico_drift = {
    feat: round(info['drift_score'], 3)
    for feat, info in drifted_feat.items()
    if info['drift_detected']
}

In [None]:
dico_drift

In [None]:
drift_col = list(dico_drift.keys())

In [None]:
# For every drifted column, show side-by-side histograms of the reference
# (train) and new (test) data; the title reports the drift score and the
# column's position in df_feat's index.
importance_order = list(df_feat.index)
panels = ((train, 'Initial data'), (test, 'New data'))

for columns in drift_col:
    fig, ax = plt.subplots(1,2, figsize = (15, 4))
    for axis, (frame, panel_title) in zip(ax, panels):
        sns.histplot(frame[columns], bins = 50, ax = axis)
        axis.set_title(panel_title)
        axis.set_xlabel('')
        axis.set_ylabel('')
    plt.suptitle(f'{columns} / drift_score = {dico_drift[columns]} / importance_rank = {importance_order.index(columns)}')
    plt.show()

Les features qui ont un léger drift ne sont pas parmi les principales permettant d'expliquer la classification.  
Les nouveaux clients peuvent donc être considérés par le modèle.

## 8. Réduction de la taille des datasets pour export vers Github <a class="anchor" id="8"></a>

Github classique n'acceptant pas les gros fichiers, il est nécessaire de les réduire autant que possible pour les exporter afin de les utiliser pour le déploiement. Nous créons donc de nouveaux fichiers csv moins lourds, en retypant les colonnes.

In [None]:
def reduce_memory_usage(df):
    """Downcast the numeric columns of ``df`` in place and report the gain.

    * NumPy integer columns are cast to the smallest signed type among
      int8/int16/int32/int64 whose range contains the column's minimum and
      maximum (bounds included), provided that type is narrower than the
      current one.
    * NumPy float columns are cast to float16, else float32, when their
      minimum and maximum lie strictly inside that type's finite range and
      the type is narrower than the current one. Only the range is checked:
      values may lose precision when narrowed.
    * Every other column (object, bool, datetime, category and other pandas
      extension dtypes) is left untouched.

    The memory usage before and after, and the relative decrease, are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate types, narrowest first.
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable integers, strings, ...) have
        # no safe NumPy downcast here.
        if not isinstance(col_type, np.dtype):
            continue

        if pd.api.types.is_integer_dtype(col_type):
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in int_types:
                bounds = np.iinfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    # Never widen a column that is already compact.
                    if np.dtype(candidate).itemsize < col_type.itemsize:
                        df[col] = df[col].astype(candidate)
                    break
        elif pd.api.types.is_float_dtype(col_type):
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in float_types:
                bounds = np.finfo(candidate)
                # Strict bounds: +/-inf and all-NaN columns match no
                # candidate and therefore keep their dtype.
                if bounds.min < c_min and c_max < bounds.max:
                    if np.dtype(candidate).itemsize < col_type.itemsize:
                        df[col] = df[col].astype(candidate)
                    break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
col_ref = col + ['TARGET', 'proba', 'SK_ID_CURR']

In [None]:
# reduce_memory_usage modifies the frame it receives, so it is handed an
# explicit copy of the selected columns rather than a selection of train_test.
app_train = reduce_memory_usage(train_test[col_ref].copy())

In [None]:
# reduce_memory_usage modifies the frame it receives, so it is handed an
# explicit copy of the selected columns rather than a selection of test_test.
app_test = reduce_memory_usage(test_test[col + ['proba', 'SK_ID_CURR']].copy())

In [None]:
# Keep a stratified 24% sample of the reduced training data for export.
# The split is stratified on the frame actually being split and seeded so
# that the exported sample can be regenerated.
X_samp, train_samp = train_test_split(app_train, test_size = 0.24,
                                      stratify = app_train['TARGET'],
                                      random_state = 100)
end_mem = train_samp.memory_usage().sum() / 1024**2
print('Finale memory usage for train: {:.2f} MB'.format(end_mem))

In [None]:
train_samp.to_csv('new_train.csv')

In [None]:
app_test.to_csv('application_test.csv')