In [1]:
!pip install imblearn
!pip install xgboost
!pip install --user numba shap



In [2]:
!pip install --upgrade scipy
!pip install --upgrade shap



In [3]:
!pip install --upgrade scikit-learn --user



# Projet de Modèle de Scoring

## Introduction
Ce projet vise à développer un modèle de scoring de crédit qui prédit la probabilité qu'un client rembourse son crédit. Ce modèle aidera la société financière à prendre des décisions éclairées sur l'octroi de prêts.


## Importation des bibliothèques

In [4]:
# Print the installed scikit-learn version. The import cell below pulls FixedThresholdClassifier
# from sklearn.model_selection, which was added in scikit-learn 1.5.
import sklearn
print(sklearn.__version__)

1.5.0


In [5]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split, GridSearchCV, FixedThresholdClassifier
from sklearn.metrics import roc_auc_score, roc_curve, classification_report, confusion_matrix, make_scorer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_pipeline_imblearn
from imblearn.under_sampling import RandomUnderSampler
import mlflow
import mlflow.sklearn
from collections import Counter
from xgboost import XGBClassifier

# Configuration de MLFlow

In [6]:
import mlflow
from mlflow import log_metric, log_param, log_artifacts

# Point MLflow at the local run store (path is relative to the working directory)
path_mlruns = '../mlruns/'
mlflow.set_tracking_uri(path_mlruns)

# Echo the tracking URI that MLflow now reports, as a sanity check
active_tracking_uri = mlflow.get_tracking_uri()
print(active_tracking_uri)

../mlruns/


In [7]:
# Disabled snippet: the experiment-creation code below is wrapped in a bare triple-quoted string,
# so it is never executed; the notebook merely echoes the string as this cell's output.
'''
# Créer une nouvelle expérience et obtenir son ID
experiment_id = mlflow.create_experiment("model_scoring")
print(f"ID de la nouvelle expérience : {experiment_id}")
'''

'\n# Créer une nouvelle expérience et obtenir son ID\nexperiment_id = mlflow.create_experiment("model_scoring")\nprint(f"ID de la nouvelle expérience : {experiment_id}")\n'

## Chargement des Données

In [8]:
# Folder holding the pre-processed CSV exports
path_data_processed = "../data/processed/"

# Build both file paths first, then read each file using its first column as the index
train_csv_path = path_data_processed + 'processed_data_train.csv'
test_csv_path = path_data_processed + 'processed_data_test.csv'
data = pd.read_csv(train_csv_path, index_col=[0])
data_test = pd.read_csv(test_csv_path, index_col=[0])

## Séparation des données en ensembles d'entraînement et de test

In [9]:
# Target column on one side, every other column on the other
y = data['TARGET']
X = data.drop(columns='TARGET')

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

## Définition des Fonctions Utiles

In [10]:
def plot_roc_curve(y_test, y_scores, title="ROC Curve"):
    """
    Draw the ROC curve for a set of scores and return the area under it.

    Parameters:
    - y_test: array, true labels.
    - y_scores: array, scores or probabilities for the positive class.
    - title: str, title shown above the figure.

    Returns:
    - float, ROC AUC computed with roc_auc_score.
    """
    false_positive_rate, true_positive_rate, _thresholds = roc_curve(y_test, y_scores)
    auc = roc_auc_score(y_test, y_scores)

    plt.figure(figsize=(8, 6))
    # Model curve, labelled with its AUC rounded to two decimals
    plt.plot(false_positive_rate, true_positive_rate, label=f'AUC = {auc:.2f}')
    # Dashed diagonal for visual comparison
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.title(title)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()

    return auc

In [11]:
def business_cost(y_true, y_pred, cost_fn=10, cost_fp=1):
    """
    Negated business cost of a set of binary (0/1) predictions.

    The counts are computed directly with numpy instead of unpacking
    ``confusion_matrix(...).ravel()``: that unpacking fails when only one class
    is present in both inputs, because the matrix is then 1x1.

    Parameters:
    - y_true: array-like, true labels (0 = negative, 1 = positive).
    - y_pred: array-like, predicted labels (0/1), same length as y_true.
    - cost_fn: cost associated with a false negative.
    - cost_fp: cost associated with a false positive.

    Returns:
    - ``-(cost_fn * FN + cost_fp * FP)``; higher (closer to 0) is better.

    Raises:
    - ValueError, if y_true and y_pred do not have the same number of elements.
    """
    truth = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()
    if truth.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {truth.size} and {predicted.size}"
        )

    fn = int(np.sum((truth == 1) & (predicted == 0)))
    fp = int(np.sum((truth == 0) & (predicted == 1)))
    return -(cost_fn * fn + cost_fp * fp)  # negated so that maximising it minimises the cost

In [12]:
def business_score(y_true, y_pred, cost_fn=10, cost_fp=1):
    """
    Calculate the normalized business score for binary (0/1) predictions.

    The counts are computed directly with numpy instead of unpacking
    ``confusion_matrix(...).ravel()``: that unpacking fails when only one class
    is present in both inputs, because the matrix is then 1x1.

    Parameters:
    - y_true: array-like, true labels (0 = negative, 1 = positive).
    - y_pred: array-like, predicted labels (0/1), same length as y_true.
    - cost_fn: int, cost associated with a false negative.
    - cost_fp: int, cost associated with a false positive.

    Returns:
    - float, normalized business score between 0 and 1 (higher is better).
      1.0 is returned when no cost can be incurred at all (empty input or a
      zero worst-case cost) instead of dividing by zero.

    Raises:
    - ValueError, if y_true and y_pred do not have the same number of elements.
    """
    truth = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()
    if truth.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {truth.size} and {predicted.size}"
        )

    is_pos, is_neg = truth == 1, truth == 0
    tp = int(np.sum(is_pos & (predicted == 1)))
    fn = int(np.sum(is_pos & (predicted == 0)))
    fp = int(np.sum(is_neg & (predicted == 1)))
    tn = int(np.sum(is_neg & (predicted == 0)))

    total_cost = cost_fn * fn + cost_fp * fp
    # Worst case scenario: every positive is missed and every negative is flagged
    max_cost = cost_fn * (fn + tp) + cost_fp * (fp + tn)
    if max_cost == 0:
        return 1.0

    # Normalize and subtract from 1 to flip the scale: higher is better
    return 1 - (total_cost / max_cost)

## Prétraitement des Données
### Vérification des Valeurs Manquantes

In [13]:
# Count the missing cells once, then report the count and its share of all cells
n_missing = data.isna().sum().sum()
missing_share = n_missing / data.size
print(f"Missing values: {n_missing} ({missing_share:.2%})")

Missing values: 36088183 (19.62%)


In [14]:
# Show the columns for a quick visual check
print(data.columns)

# Split the features by dtype family rather than by exact dtype name: an exact match on
# 'int64'/'float64'/'object' leaves out bool, int32, float32 or category columns, and the
# ColumnTransformer built from these lists drops every column that is in neither list.
numeric_features = X.select_dtypes(include=['number', 'bool']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Make any column that still falls outside both lists visible instead of losing it silently
unassigned_features = [col for col in X.columns
                       if col not in set(numeric_features) | set(categorical_features)]
if unassigned_features:
    print("WARNING - features in neither list (they will be dropped):", unassigned_features)

# Show the resulting feature lists
print("Numeric Features:", numeric_features)
print("Categorical Features:", categorical_features)

Index(['TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR',
       'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
       'AMT_ANNUITY', 'REGION_POPULATION_RELATIVE',
       ...
       'PREV_PRODUCT_COMBINATION_Cash X-Sell: low_MEAN',
       'PREV_PRODUCT_COMBINATION_Cash X-Sell: middle_MEAN',
       'PREV_PRODUCT_COMBINATION_POS household with interest_MEAN',
       'PREV_PRODUCT_COMBINATION_POS household without interest_MEAN',
       'PREV_PRODUCT_COMBINATION_POS industry with interest_MEAN',
       'PREV_PRODUCT_COMBINATION_POS industry without interest_MEAN',
       'PREV_PRODUCT_COMBINATION_POS mobile with interest_MEAN',
       'PREV_PRODUCT_COMBINATION_POS mobile without interest_MEAN',
       'PREV_PRODUCT_COMBINATION_POS other with interest_MEAN',
       'PREV_PRODUCT_COMBINATION_POS others without interest_MEAN'],
      dtype='object', length=598)
Numeric Features: ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CH

### Création de pipelines et de transformations

In [15]:
# Numeric columns: fill missing values with the column median, then standardize
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill missing values with the most frequent category, then one-hot encode
# (categories unseen during fit are ignored at transform time)
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own transformer
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)


# Entraînement et évaluation de XGBoost

In [16]:
# Disabled experiment: SMOTE-based variant of the XGBoost grid search. The code is wrapped in a
# bare triple-quoted string, so nothing below is executed; the notebook merely echoes the string
# as this cell's output.
'''
# Démarrage de la session MLFlow
mlflow.start_run(run_name="Optimized XGBoost with Fixed Thresholds", experiment_id=313886723643097310)

# Configuration de MLFlow pour le suivi des métriques
def log_metrics(y_true, y_pred, y_proba, threshold):
    auc = roc_auc_score(y_true, y_proba[:, 1])
    business_score_value = business_score(y_true, (y_proba[:, 1] > threshold).astype(int))
    mlflow.log_metric("AUC", auc)
    mlflow.log_metric("best_business_score", business_score_value)
    mlflow.log_metric("Optimized Threshold", threshold)

# Définition de la pipeline avec XGBoost et FixedThresholdClassifier
model_xgb = XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=10)
fixed_threshold_classifier = FixedThresholdClassifier(estimator=model_xgb, threshold=0.5, response_method='predict_proba')

model_xgb_pipeline = make_pipeline_imblearn(
    preprocessor,
    SMOTE(sampling_strategy=0.5, random_state=42),
    fixed_threshold_classifier
)

# Paramètres pour GridSearchCV
param_grid = {
    'fixedthresholdclassifier__estimator__max_depth': [3, 5],
    'fixedthresholdclassifier__estimator__learning_rate': [0.1, 0.2],
    'fixedthresholdclassifier__estimator__n_estimators': [100, 200],
    'fixedthresholdclassifier__estimator__subsample': [0.7, 1],
    'fixedthresholdclassifier__threshold': [0.1, 0.5, 1.0]
}

# Configuration de GridSearchCV avec une métrique personnalisée
business_scorer = make_scorer(business_score, greater_is_better=True, needs_proba=False, cost_fn=10, cost_fp=1)
grid_search = GridSearchCV(model_xgb_pipeline, param_grid, scoring=business_scorer, cv=3, verbose=2)
grid_search.fit(X_train, y_train)

# Récupération des résultats
best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)

# Log des métriques
log_metrics(y_test, y_pred, y_proba, best_params['fixedthresholdclassifier__threshold'])
print("Meilleurs paramètres : ", best_params)
print("Meilleur score business : ", best_score)

# Log dans MLFlow
mlflow.log_params(best_params)
mlflow.log_metric("best_business_score", best_score)

# Vérification et log des importances des caractéristiques
if hasattr(best_model.named_steps['fixedthresholdclassifier'].estimator, 'feature_importances_'):
    feature_importances = best_model.named_steps['fixedthresholdclassifier'].estimator.feature_importances_
    feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    feature_importance_df.sort_values('Importance', ascending=False, inplace=True)
    print(feature_importance_df.head(20))

mlflow.sklearn.log_model(best_model, "model_v1")
mlflow.end_run()
'''

'\n# Démarrage de la session MLFlow\nmlflow.start_run(run_name="Optimized XGBoost with Fixed Thresholds", experiment_id=313886723643097310)\n\n# Configuration de MLFlow pour le suivi des métriques\ndef log_metrics(y_true, y_pred, y_proba, threshold):\n    auc = roc_auc_score(y_true, y_proba[:, 1])\n    business_score_value = business_score(y_true, (y_proba[:, 1] > threshold).astype(int))\n    mlflow.log_metric("AUC", auc)\n    mlflow.log_metric("best_business_score", business_score_value)\n    mlflow.log_metric("Optimized Threshold", threshold)\n\n# Définition de la pipeline avec XGBoost et FixedThresholdClassifier\nmodel_xgb = XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=10)\nfixed_threshold_classifier = FixedThresholdClassifier(estimator=model_xgb, threshold=0.5, response_method=\'predict_proba\')\n\nmodel_xgb_pipeline = make_pipeline_imblearn(\n    preprocessor,\n    SMOTE(sampling_strategy=0.5, random_state=42),\n    fixed_threshold_classifier\n)\n\n

In [None]:
# Class-imbalance weight handed to XGBoost: ratio of the 'balanced' class weights
# (weight of class 1 divided by weight of class 0).
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
weight = class_weights[1] / class_weights[0]

# Pipeline: preprocessing followed by XGBoost wrapped in a FixedThresholdClassifier,
# so the decision threshold can be tuned like any other hyper-parameter.
model_xgb = XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1, scale_pos_weight=weight, random_state=10)
fixed_threshold_classifier = FixedThresholdClassifier(estimator=model_xgb, threshold=0.5, response_method='predict_proba')

model_xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', fixed_threshold_classifier)
])

# Grid explored by GridSearchCV
# NOTE(review): a threshold of 1.0 should almost never predict the positive class, so a third
# of the fits are spent on a degenerate candidate - consider a finer grid between 0.1 and 0.6.
param_grid = {
    'classifier__estimator__max_depth': [3, 5],
    'classifier__estimator__learning_rate': [0.1, 0.2],
    'classifier__estimator__n_estimators': [100, 200],
    'classifier__estimator__subsample': [0.7, 1],
    'classifier__threshold': [0.1, 0.5, 1.0]
}

# Custom business metric. The scorer uses hard predictions, which is make_scorer's default;
# the deprecated `needs_proba` argument is therefore no longer passed.
business_scorer = make_scorer(business_score, greater_is_better=True)
grid_search = GridSearchCV(model_xgb_pipeline, param_grid, scoring=business_scorer, cv=3, verbose=2)

# The context manager closes the MLflow run even when the search fails, so re-running the cell
# does not hit an "already active run" error. MLflow documents experiment_id as a string.
with mlflow.start_run(run_name="Optimized XGBoost with class_weight and Fixed Thresholds",
                      experiment_id="313886723643097310"):
    grid_search.fit(X_train, y_train)

    # Cross-validation results: print a ranked summary instead of dumping the whole dict
    cv_results = grid_search.cv_results_
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    cv_summary = (
        pd.DataFrame(cv_results)
        .loc[:, ['rank_test_score', 'mean_test_score', 'std_test_score', 'params']]
        .sort_values('rank_test_score')
    )
    print("CV Results: ", cv_summary.head(10).to_string(), sep="\n")
    print("Meilleurs paramètres : ", best_params)
    print("Meilleur score business : ", best_score)

    # Record the winning configuration in MLflow
    mlflow.log_params(best_params)
    mlflow.log_metric("best_business_score", best_score)

    # Feature importances: the fitted model is the wrapper's `estimator_` (the `estimator`
    # parameter stays unfitted), and the names come from the fitted preprocessing step of the
    # refitted pipeline (the template pipeline passed to GridSearchCV is never fitted itself).
    best_model = grid_search.best_estimator_
    fitted_xgb = best_model.named_steps['classifier'].estimator_
    if hasattr(fitted_xgb, 'feature_importances_'):
        feature_importances = fitted_xgb.feature_importances_
        feature_names = list(best_model.named_steps['preprocessor'].get_feature_names_out())
        feature_importance_df = (
            pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
            .sort_values('Importance', ascending=False)
        )
        print(feature_importance_df.head(20))

    mlflow.sklearn.log_model(best_model, "model_v2")



Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] END classifier__estimator__learning_rate=0.1, classifier__estimator__max_depth=3, classifier__estimator__n_estimators=100, classifier__estimator__subsample=0.7, classifier__threshold=0.1; total time=  54.7s
[CV] END classifier__estimator__learning_rate=0.1, classifier__estimator__max_depth=3, classifier__estimator__n_estimators=100, classifier__estimator__subsample=0.7, classifier__threshold=0.1; total time=  53.0s
[CV] END classifier__estimator__learning_rate=0.1, classifier__estimator__max_depth=3, classifier__estimator__n_estimators=100, classifier__estimator__subsample=0.7, classifier__threshold=0.1; total time=  46.3s
[CV] END classifier__estimator__learning_rate=0.1, classifier__estimator__max_depth=3, classifier__estimator__n_estimators=100, classifier__estimator__subsample=0.7, classifier__threshold=0.5; total time=  45.5s
[CV] END classifier__estimator__learning_rate=0.1, classifier__estimator__max_depth=3, cla

In [None]:
# Stray scratch input ("dddd") removed: it raised NameError and stopped Restart & Run All here.

## Importance des caractéristiques
Extraction de l'importance des caractéristiques après l'entraînement des modèles :
* **Ajustement du préprocesseur :** J'ai ajusté le preprocessor sur les données d'entraînement, ce qui est essentiel pour préparer les features avant de récupérer leur nom.

* **Gestion des features catégorielles :** J'ai conditionné l'ajustement des caractéristiques catégorielles et la récupération de leurs noms. C'est utile si la liste des caractéristiques catégorielles peut varier ou être vide.

* **Extraction et affichage des importances des caractéristiques :** Après avoir entraîné le modèle, j'ai extrait l'importance de chaque caractéristique et affiché les plus importantes. Cela nous aide à identifier quelles variables ont le plus d'impact sur les prédictions du modèle.

In [None]:
# Fit the module-level preprocessor on the training split so that its fitted attributes
# (transformers_, named_transformers_) exist when the feature names are read in the next cell.
preprocessor.fit(X_train)

In [None]:
# Names produced by the first fitted transformer (the numeric pipeline)
numeric_pipeline = preprocessor.transformers_[0][1]
feature_names = numeric_pipeline.get_feature_names_out().tolist()

# One-hot column names are appended only when categorical features exist
if categorical_features:
    onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
    feature_names.extend(list(onehot_encoder.get_feature_names_out(categorical_features)))

In [None]:
# Importances of the trained model. The classifier is the pipeline's final step (named
# 'classifier' in the grid-search cell, so it is looked up by position rather than by a
# hard-coded name), and the fitted XGBoost model is the wrapper's `estimator_` attribute;
# the `estimator` parameter is only the unfitted template.
best_pipeline = grid_search.best_estimator_
fitted_xgb = best_pipeline.steps[-1][1].estimator_
feature_importances = fitted_xgb.feature_importances_

# Fail with a clear message if the names and the importances are out of sync
if len(feature_names) != len(feature_importances):
    raise ValueError(
        f"{len(feature_names)} feature names for {len(feature_importances)} importances; "
        "re-run the feature-name cell against the same training data"
    )

# Build the table sorted by decreasing importance and show the top 20
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values('Importance', ascending=False)
)
print(feature_importance_df.head(20))

## Outils d'interprétation des modèles
Utilisation de SHAP pour l'interprétation des prédictions du modèle XGBoost :

In [None]:
import shap

# The fitted XGBoost model is the `estimator_` of the pipeline's final step (the threshold
# wrapper); its `estimator` parameter is only the unfitted template.
fitted_pipeline = grid_search.best_estimator_
fitted_xgb = fitted_pipeline.steps[-1][1].estimator_

# The model was trained on the preprocessed matrix, so SHAP must see that same matrix,
# not the raw X_train.
preprocessing = fitted_pipeline[:-1]
X_train_prepared = preprocessing.transform(X_train)
if hasattr(X_train_prepared, 'toarray'):  # ColumnTransformer may return a sparse matrix
    X_train_prepared = X_train_prepared.toarray()
X_train_prepared = pd.DataFrame(
    X_train_prepared,
    columns=preprocessing.get_feature_names_out(),
    index=X_train.index,
)

explainer = shap.Explainer(fitted_xgb, X_train_prepared)
shap_values = explainer(X_train_prepared)

In [None]:
# `shap_values` is an Explanation object that carries the feature matrix and the feature names it
# was computed from; letting the plot read them from it guarantees that the point colours and
# labels match the explained data.
shap.summary_plot(shap_values)

In [None]:
import shap

# TreeExplainer needs the fitted tree model itself: the `estimator_` of the pipeline's final
# step, not the FixedThresholdClassifier wrapper around it.
fitted_pipeline = grid_search.best_estimator_
fitted_xgb = fitted_pipeline.steps[-1][1].estimator_

# Explain the matrix the model was actually trained on (the preprocessed features)
preprocessing = fitted_pipeline[:-1]
X_train_prepared = preprocessing.transform(X_train)
if hasattr(X_train_prepared, 'toarray'):  # ColumnTransformer may return a sparse matrix
    X_train_prepared = X_train_prepared.toarray()
X_train_prepared = pd.DataFrame(
    X_train_prepared,
    columns=preprocessing.get_feature_names_out(),
    index=X_train.index,
)

# Compute the SHAP values
explainer = shap.TreeExplainer(fitted_xgb)
shap_values = explainer.shap_values(X_train_prepared)

# Plot; feature labels are taken from the DataFrame columns, so they cannot get out of sync
shap.summary_plot(shap_values, X_train_prepared)

In [None]:
# Build an Explanation object for the test data. The fitted XGBoost model is the `estimator_`
# of the pipeline's final step, and it must be given the preprocessed test features (the same
# representation it was trained on), not the raw X_test.
fitted_pipeline = grid_search.best_estimator_
fitted_xgb = fitted_pipeline.steps[-1][1].estimator_

preprocessing = fitted_pipeline[:-1]
X_test_prepared = preprocessing.transform(X_test)
if hasattr(X_test_prepared, 'toarray'):  # ColumnTransformer may return a sparse matrix
    X_test_prepared = X_test_prepared.toarray()
X_test_prepared = pd.DataFrame(
    X_test_prepared,
    columns=preprocessing.get_feature_names_out(),
    index=X_test.index,
)

explainer = shap.Explainer(fitted_xgb)
shap_values_test = explainer(X_test_prepared)

# Force plot for one specific test instance. matplotlib=True draws the figure immediately;
# the default JavaScript widget is only shown when it is the cell's last expression.
index = 0  # position of the instance to explain
instance_explanation = shap_values_test[index]
shap.force_plot(
    float(instance_explanation.base_values),
    instance_explanation.values,
    X_test_prepared.iloc[index],
    matplotlib=True,
)

# Global feature importance (mean absolute SHAP value per feature)
shap.summary_plot(shap_values_test.values, X_test_prepared, plot_type="bar")

# Detailed waterfall for the same instance (shap.plots.waterfall takes an Explanation row)
shap.plots.waterfall(instance_explanation)