# Implementez un modèle de scoring

## Import des libraries

## Rapple de la mission
Construire un modèle de scoring qui donnera une prédiction sur la probabilité de faillite d'un client de façon automatique.

In [92]:
import time
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import make_column_selector, make_column_transformer
from IPython.display import display, HTML
from imblearn.over_sampling import SMOTE
from contextlib import contextmanager
from imblearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression  
from sklearn.metrics import classification_report
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier


In [93]:
def display_info(title = "",titleTag="h2",tag="div",pTag="p",message="Params allowed: title,tag,message,color",bgd="#96d6ee", styleClass="alert alert-block alert-info", fontsize="" , color="black"):
    if (title!=""):
        display(HTML(f"""<{titleTag} style='color:{color};font-size:{fontsize}'>{title}</{titleTag}>"""))
    else:
        display(HTML(f"""
                     <{tag} style='background:{bgd}; padding-top:5px; padding-bottom:5px; padding-left:10px;' class='{styleClass}'><{pTag} style='color:{color}; font-size:{fontsize}'>{message}</{pTag}></{tag}>
"""))

In [94]:
app_train = pd.read_csv('../datas/application_train.csv')
app_test = pd.read_csv('../datas/application_test.csv')
bureau = pd.read_csv('../datas/bureau.csv')
bureau_balance = pd.read_csv('../datas/bureau_balance.csv')
pos_CASH_balance = pd.read_csv('../datas/POS_CASH_balance.csv')
credit_card_balance = pd.read_csv('../datas/credit_card_balance.csv')
previous_application = pd.read_csv('../datas/previous_application.csv')
installments_payments = pd.read_csv('../datas/installments_payments.csv')
columns_description = pd.read_csv('../datas/HomeCredit_columns_description.csv', sep=';', encoding="unicode_escape")

In [95]:
@contextmanager
def timer(title):
    t0=time.time()
    yield
    print("{} - terminé en {:.000f}s".format(title, time.time()-t0))

In [96]:
display_info(message="Split des données en 2 en gardant la même proposrtion au niveau de 'TARGET'")
def split_df(df):
    X=df.drop('TARGET', axis=1)
    y=df['TARGET']
    X_train, X_test,y_train, y_test = train_test_split(X,y,test_size=0.5, stratify=y, random_state=42)
    return X_train, X_test,y_train, y_test

In [97]:
display_info(message="""En lisant la documentations sklearn, on comprend que le SMOTE est en version expérimental. On utilisera donc imblearn qui en plus est facilement intégrable dans un pipeline""")

In [98]:
display_info(title="PIPELINE", color="red",fontsize="30px")
display_info(message="""On va realiser notre modelisation en automatisant toute notre logique.</br>
             On va donc utiliser le plus possible les pipeline mis à disposition dans sklearn""")
display_info(message="On commence par séparer nos variable en utilisant make_column_selector")

In [102]:
numerical_features = make_column_selector(dtype_include=np.number)
categorical_features = make_column_selector(dtype_exclude=np.number)

display_info(message="On creer un pipeline pour chaque type de variables")
numerical_pipeline = Pipeline([
    ("imputer",SimpleImputer(strategy="mean")),
    ("scaler",StandardScaler())
    ])

categorical_pipeline = Pipeline([
    ("imputer",SimpleImputer(strategy='most_frequent')),
    ("encoder",OneHotEncoder(handle_unknown="ignore"))
    ])

# categorical_pipeline = Pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder())

display_info(message="On creer un pipeline générale qui va appliquer la modifcation souhaité sur nos differents type de variables")

preprocessor =make_column_transformer(
    (numerical_pipeline,numerical_features),
    (categorical_pipeline, categorical_features)
    )

models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression( random_state=42),
    'LGBM': LGBMClassifier(random_state=42),
    'XGBOOST': XGBClassifier(random_state=42)
}


X_train, X_initial_test, y_train, y_initial_test= split_df(app_train)

X_test,X_val,y_test,y_val=train_test_split(X_initial_test,y_initial_test, test_size=0.5, stratify=y_initial_test, random_state=42)

display_info(message="On teste ensuite nos différents modèle")
best_model_name = None
best_score = -float('inf')
for name, model in models.items():
    print(f"Démarrage du test du modèle {name}")
    with timer(f'Calcul du score pour le model {name}'):
        model_pipeline = Pipeline([
            ("preprocessor",preprocessor),
            ("smote",SMOTE(random_state=42)),
            ("classifier",model)
        ])
        kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        scores = cross_val_score(model_pipeline, X_train, y_train, cv=kf, scoring='roc_auc')
        mean_score=scores.mean()
        print(f"Scores de validation croisée pour {name}:", scores)
        print(f"Scores moyen {name}:", mean_score)

        if mean_score > best_score:
            best_score=mean_score
            best_model_name=name
print("\nMeilleur modèle :", best_model_name)
print("\nScore moyen du meilleur modèle :", best_score)

Démarrage du test du modèle RandomForest
Scores de validation croisée pour RandomForest: [0.69461822 0.69900245 0.69811818 0.69733826 0.69587459]
Scores moyen RandomForest: 0.6969903382562115
Calcul du score pour le model RandomForest - terminé en 395s
Démarrage du test du modèle LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Scores de validation croisée pour LogisticRegression: [0.72832956 0.7309579  0.73751136 0.74237018 0.73144054]
Scores moyen LogisticRegression: 0.7341219077168792
Calcul du score pour le model LogisticRegression - terminé en 26s
Démarrage du test du modèle LGBM
[LightGBM] [Info] Number of positive: 113074, number of negative: 113074
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033114 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 50648
[LightGBM] [Info] Number of data points in the train set: 226148, number of used features: 236
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 113074, number of negative: 113074
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030451 seconds.
You can set `force_row_wise=true` to remove the overh

In [None]:
display_info(message=f" Le meilleur algorithme sans amélioration des hyper paramètre {best_model_name}")

# Paramètres pour GridSearchCV : on teste chaque modèle
param_grid = [
    {'classifier': [models[{best_model_name}]]}
]
grid_search = GridSearchCV(model_pipeline,param_grid, cv=5, scoring='roc_auc')

display_info(message="on test notre model sur notre jeu de données de test")
with timer(title=(f"Modelisation et prediction du model")):
    model_pipeline.fit(X_train,y_train)
    y_pred = model_pipeline.predict(X_test)
display_info(message="on compare nos valeur predite sur notre jeu de données de test au valeur a predire y_test mis de côté")
print(classification_report(y_test, y_pred))
model.fit(X_train, y_train)