In [None]:

import pandas as pd
import sys
import os

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import mlflow
import shap
import boto3

from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from mlflow import MlflowClient
from mlflow.exceptions import MlflowException
from pprint import pprint
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer,confusion_matrix,ConfusionMatrixDisplay,roc_auc_score
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [None]:
# Put the parent directory of the notebook's working directory on the import path
# so that the local `src` package can be imported below.
sys.path.append(os.path.abspath('..'))

In [None]:
from src.lightgbm_with_simple_features import main
from src.model_prep import get_initial_splits, get_grid_cv_scores,plot_metric_results

# MLFlow setup

Before this, run 
```bash
mlflow server --host 127.0.0.1 --port 8080
```
in the terminal

In [None]:
# Client bound to the local MLflow tracking server (it must already be running, see the command above).
client = MlflowClient(tracking_uri="http://127.0.0.1:8080")

In [None]:
# Description stored on the experiment under the `mlflow.note.content` tag.
experiment_description = (
    "OpenClassrooms - Projet 7 "
    "Implémentation d'un modèle de scoring pour des crédits"
)
experiment_tags = {
    "mlflow.note.content": experiment_description,
}

# Create the experiment only when it does not exist yet. Looking it up first
# (instead of printing whatever MlflowException is raised) keeps the notebook
# re-runnable, guarantees `scoring_experiment` is always defined, and no longer
# hides unrelated tracking-server errors.
existing_experiment = client.get_experiment_by_name("Loan_Risk_Models")
if existing_experiment is None:
    scoring_experiment = client.create_experiment(
        name="Loan_Risk_Models",
        tags=experiment_tags,
        artifact_location='s3://p7mlflowartifacts/20240405_artifacts/',
    )
else:
    # Same kind of value as create_experiment returns: the experiment id.
    scoring_experiment = existing_experiment.experiment_id

In [None]:
# Sanity check: list the experiments returned by the tracking server.
all_experiments = client.search_experiments()

print(all_experiments)

In [None]:
# Point the module-level `mlflow` API at the same server as `client`
# and select the "Loan_Risk_Models" experiment for the runs logged below.
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")
mlflow.set_experiment("Loan_Risk_Models")

# Data loading

The *LightGBM with simple features* kernel from Kaggle ([link](https://www.kaggle.com/code/jsaguiar/lightgbm-with-simple-features/script)) has been slightly modified and is used for data preprocessing and feature engineering.
The modifications are:
- Dropping columns with missing values.
- Replacing inf values due to division by zero with nans.

In [None]:
# Run the modified Kaggle preprocessing / feature-engineering script and keep the DataFrame it returns.
df=main()

In [None]:
df

In [None]:
# Split the engineered data into train/test features and targets.
# NOTE(review): presumably `kaggle_df` holds the unlabeled Kaggle submission rows — confirm in src.model_prep.
kaggle_df,X_train,X_test,y_train,y_test=get_initial_splits(df)

In [None]:
# Class balance of the target: count each class once and reuse the counts
# for both the wedges and the legend labels.
target_counts = df['TARGET'].value_counts()
plt.pie(target_counts, autopct='%1.0f%%')
plt.legend(labels=target_counts.index)
plt.title("Proportion d'individus par classe")
plt.show()

In [None]:
# Learn the scaling statistics on the training split only, then apply the very
# same transform to both splits, keeping the original column names and indexes.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [None]:
# These features come from the feature selection performed after the first run
# of model selection (see the "Feature selection with random feature" section below).
features_to_keep=['NAME_INCOME_TYPE_Working',
 'HOUSETYPE_MODE_block of flats',
 'NAME_EDUCATION_TYPE_Higher education',
 'FLAG_OWN_CAR',
 'CNT_CHILDREN',
 'WALLSMATERIAL_MODE_Stone, brick',
 'REGION_RATING_CLIENT_W_CITY',
 'DAYS_REGISTRATION',
 'FLAG_PHONE',
 'REGION_RATING_CLIENT',
 'REGION_POPULATION_RELATIVE',
 'NAME_EDUCATION_TYPE_Secondary / secondary special',
 'NAME_INCOME_TYPE_Commercial associate',
 'NAME_INCOME_TYPE_Pensioner',
 'NAME_TYPE_SUITE_Unaccompanied',
 'WEEKDAY_APPR_PROCESS_START_TUESDAY',
 'REG_CITY_NOT_WORK_CITY']

In [None]:
# Keep only the selected columns of the scaled training features and show the resulting shape.
X_train_select=X_train_scaled[features_to_keep]
X_train_select.shape

In [None]:
# Same column selection for the scaled test features.
X_test_select=X_test_scaled[features_to_keep]
X_test_select.shape

X_test will also be used as "new" clients data to make predictions with the API.

In [None]:
# File name used as the object key of the S3 upload
output_file = 'X_test.csv'

# Save a local copy of the (unscaled) test features
X_test.to_csv('../data/X_test.csv')

# Save a copy to AWS: upload that local file to the bucket
s3_client = boto3.client('s3')
s3_client.upload_file("../data/X_test.csv", "clientsdataxtest", output_file)

# Model selection

In [None]:
print(X_train_scaled.shape,y_train.shape)

Computing times on the full dataset are too high, so the first run of model selection is done on a sample of the data.

In [None]:
# Draw 1000 training rows at random. Features and target are concatenated first
# so that both are sampled together, then separated again.
sample = pd.concat([X_train_scaled, y_train], axis=1).sample(n=1000, random_state=33)
X_train_sample = sample.drop(columns='TARGET')
y_train_sample = sample['TARGET'].copy()

In [None]:
# Check whether the class proportions were kept by the sampling:
# relative frequency of each class before and after.
initial_proportion = y_train.value_counts() / len(y_train)
sample_proportion = y_train_sample.value_counts() / len(y_train_sample)

print('Initial proportion :', initial_proportion)

print('Sample proportion', sample_proportion)

In [None]:
def custom_loss(y_true, y_pred):
    """Compute the business cost of a set of predictions (lower is better).

    A false negative (loan granted to a client who does not reimburse it) is
    considered 10x worse than a false positive (loan refused to a client who
    would have reimbursed it).

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True labels (0 or 1).
    y_pred : array-like of shape (n_samples,)
        Labels predicted by the model (0 or 1).

    Returns
    -------
    int
        ``10 * FN + FP``.
    """
    # Work on plain arrays so the comparison is positional: pandas indexes of
    # y_true / y_pred are ignored.
    true_labels = np.asarray(y_true).ravel()
    predicted_labels = np.asarray(y_pred).ravel()

    # Counting the two error types directly (instead of unpacking a confusion
    # matrix) also works when only one class is present, in which case the
    # matrix would not be 2x2 and could not be unpacked into four values.
    # FP = loan not given to a good client, FN = loan given to a bad client.
    fp = np.count_nonzero((true_labels == 0) & (predicted_labels == 1))
    fn = np.count_nonzero((true_labels == 1) & (predicted_labels == 0))
    return int(fn * 10 + fp)

# custom_loss is a cost, hence greater_is_better=False: lower values are better for model selection.
custom_score = make_scorer(custom_loss, greater_is_better=False)

### Dummy Baseline

In [None]:
# Name the autologged runs after the model actually trained in this section:
# a DummyClassifier, not a regressor.
mlflow.sklearn.autolog(extra_tags={'mlflow.runName': 'DummyClassifier'})

In [None]:
# Baseline: DummyClassifier restricted to the 'most_frequent' strategy.
model=DummyClassifier()
params={'classification__strategy':['most_frequent']}
# The first return value is discarded; only the results table is kept and displayed.
_,results_dummy=get_grid_cv_scores(model,params,custom_score=custom_score,X_train=X_train_select,y_train=y_train)
results_dummy

The dummy classifier's accuracy is very high (0.92) because of the class imbalance. This shows the need for a custom score.

### Logistic Regression

In [None]:
mlflow.sklearn.autolog(extra_tags={'mlflow.runName': 'LogisticRegression'})

In [None]:
# The 'elasticnet' penalty is only supported by the 'saga' solver. With the
# default solver every elasticnet candidate fails to fit (NaN scores), so the
# search would in practice only evaluate penalty=None. 'saga' also supports
# penalty=None, so the whole grid becomes valid.
model=LogisticRegression(random_state=33, solver='saga')
params={'classification__penalty' : ['elasticnet', None], 'classification__l1_ratio':[0,0.2,0.5,0.7,1],'classification__C': [0.1,1,2,5, 10.]}
best_estimator_, results_lr=get_grid_cv_scores(model,params,custom_score=custom_score,X_train=X_train_select,y_train=y_train)

In [None]:
best_estimator_

In [None]:
results_lr.dropna(axis=0).sort_values('mean_test_Custom_scorer',ascending=False)

In [None]:
plot_metric_results(results_lr.dropna(axis=0).sort_values('mean_test_Custom_scorer',ascending=False))

### SVC with gaussian kernel

In [None]:
mlflow.sklearn.autolog(extra_tags={'mlflow.runName': 'SVC'})

In [None]:
# SVC restricted to the RBF (gaussian) kernel; the grid explores C and gamma.
model=SVC(random_state=33)
params={'classification__kernel':['rbf'], 'classification__C':[0.01,1,10], 'classification__gamma':[0.1,0.5,1]}
best_estimator_, results_svc=get_grid_cv_scores(model,params,custom_score=custom_score,X_train=X_train_select,y_train=y_train)

In [None]:
results_svc.dropna(axis=0)

In [None]:
plot_metric_results(results_svc.dropna(axis=0))

### Random Forest

In [None]:
mlflow.sklearn.autolog(extra_tags={'mlflow.runName': 'RandomForest'})

In [None]:
model=RandomForestClassifier(random_state=33)
# First step: only n_estimators is searched, every other parameter keeps its default value.
params={'classification__n_estimators':[5,10,20,50,100,200,300],
        }
best_estimator_, results_rf=get_grid_cv_scores(model,params,custom_score=custom_score,X_train=X_train_select,y_train=y_train)

In [None]:
best_estimator_

In [None]:
results_rf.dropna(axis=0)

In [None]:
plot_metric_results(results_rf.dropna(axis=0))

In [None]:
model=RandomForestClassifier(random_state=33)
# Second step: n_estimators is pinned to the value selected above (20) while
# leaf size, depth and max_features are searched.
# Note: this overwrites `best_estimator_` and `results_rf` from the first step.
params={'classification__n_estimators':[20],
        'classification__min_samples_leaf':[1, 2, 4, 10,12,13,14,15,16, 20],
        'classification__max_depth':[3, 5, 10, 14, None],
        'classification__max_features':['log2', 'sqrt', None]
        }
best_estimator_, results_rf=get_grid_cv_scores(model,params,custom_score=custom_score,X_train=X_train_select,y_train=y_train)

In [None]:
best_estimator_

In [None]:
results_rf.dropna(axis=0).sort_values('mean_test_Custom_scorer',ascending=False).head(10)

In [None]:
plot_metric_results(results_rf.dropna(axis=0).sort_values('mean_test_Custom_scorer',ascending=False).head(10))

### XGBoost

In [None]:
mlflow.sklearn.autolog(extra_tags={'mlflow.runName': 'XGBoost'})

In [None]:
# First step: determine the best n_estimators with a high learning rate
model=XGBClassifier(random_state=33)
# Only n_estimators varies here; the learning rate is held at 0.1
params= {'classification__learning_rate' : [0.1],
        'classification__n_estimators' : [40,60,80,100,150,250,300]}
best_estimator_, results_xgb=get_grid_cv_scores(model,params,custom_score=custom_score,X_train=X_train_select,y_train=y_train)

In [None]:
results_xgb.dropna(axis=0).sort_values('mean_test_Custom_scorer',ascending=False).head(10)

In [None]:
# Plot the same 10 best candidates as the table above. The scorer is
# "greater is better", so the sort must be descending; the default ascending
# order would plot the 10 worst candidates.
plot_metric_results(results_xgb.dropna(axis=0).sort_values('mean_test_Custom_scorer',ascending=False).head(10))

In [None]:
best_estimator_

In [None]:
# Second step: find the other parameters
model=XGBClassifier(random_state=33)
# The learning rate stays at 0.1 and n_estimators is pinned to the value selected
# in the first step (40); subsampling, depth and gamma are searched.
params= {'classification__learning_rate' : [0.1],
              'classification__n_estimators' : [40],
              'classification__subsample' : [0.1,0.3, 0.5, 0.7,0.9],
              'classification__colsample_bytree' : [0.1,0.3,0.5,0.7,0.9],
              'classification__max_depth': [4,6,8,10],
              'classification__gamma': [0, 0.25, 0.5, 1.0],
              }

best_estimator_, results_xgb=get_grid_cv_scores(model,params,custom_score=custom_score,X_train=X_train_select,y_train=y_train)

In [None]:
best_estimator_

In [None]:
results_xgb.dropna(axis=0).sort_values('mean_test_Custom_scorer',ascending=False).head(10)

In [None]:
plot_metric_results(results_xgb.dropna(axis=0).sort_values('mean_test_Custom_scorer',ascending=False).head(10))

In [None]:
# Last step: lower the learning rate and increase n_estimators while fine-tuning
# the regularization parameters. The parameters tuned in the second step are
# pinned to single values.
model=XGBClassifier(random_state=33)
# NOTE(review): `alpha` / `lambda` are the native XGBoost parameter names; the
# scikit-learn wrapper documents them as `reg_alpha` / `reg_lambda` — confirm
# that these grid values actually reach the booster.
params= {'classification__learning_rate' :  [0.01,0.03,0.05,0.1],
              'classification__n_estimators' : [40,80,100,150,200,300],
              'classification__subsample' : [0.7],
              'classification__colsample_bytree' : [0.3],
              'classification__max_depth': [4],
              'classification__gamma': [0.5],
              'classification__alpha' : [0.001,0.0025,0.005,0.007,0.1], # L1 regularization
              'classification__lambda' : [0.001, 0.0025,0.005,0.007,0.1] # L2 regularization
              }

best_estimator_, results_xgb=get_grid_cv_scores(model,params,custom_score=custom_score,X_train=X_train_select,y_train=y_train)

In [None]:
best_estimator_

In [None]:
results_xgb.dropna(axis=0).sort_values('mean_test_Custom_scorer',ascending=False).head(10)

In [None]:
plot_metric_results(results_xgb.dropna(axis=0).sort_values('mean_test_Custom_scorer',ascending=False).head(10))

In [None]:
results_xgb.dropna(axis=0).sort_values('mean_test_Custom_scorer',ascending=False).iloc[0]

In [None]:
results_xgb.dropna(axis=0).sort_values('mean_test_Custom_scorer',ascending=False).iloc[0]['params']

### LightGBM with kernel parameters

In [None]:
mlflow.sklearn.autolog(extra_tags={'mlflow.runName': 'LightGBM'})

In [None]:
# Model as proposed in the selected Kaggle kernel (hyper-parameters copied as-is).
# NOTE(review): `silent` may no longer be a recognised LGBMClassifier argument in
# recent LightGBM releases (`verbose=-1` already silences logging) — confirm
# against the installed version.
model = LGBMClassifier(
            nthread=4,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1, )

In [None]:
# Work on a copy whose column names are replaced by their integer positions before fitting LightGBM.
# NOTE(review): presumably needed because some feature names contain characters
# LightGBM rejects (e.g. the comma in 'WALLSMATERIAL_MODE_Stone, brick') — confirm.
X_train_select_without_columns=X_train_select.copy()
X_train_select_without_columns.columns=list(range(len(X_train_select.columns)))
X_train_select_without_columns

In [None]:
# Single-candidate "grid": n_estimators fixed at 1000, scored through the same
# helper and custom scorer as the other models.
params={'classification__n_estimators':[1000]}
_,results_lgbm=get_grid_cv_scores(model,params,custom_score=custom_score,X_train=X_train_select_without_columns,y_train=y_train)
results_lgbm

## Best estimator feature importance

### Model feature importance

In [None]:
best=LogisticRegression(penalty=None, random_state=33)

In [None]:
# SMOTE draws synthetic minority samples at random: seed it so the resampled
# training set, and therefore the fitted model and everything derived from it,
# is reproducible from one run to the next.
pipeline = Pipeline([
            ('sampling', SMOTE(random_state=33)),
            ('classification', best)
        ])

In [None]:
#Fit on the whole training dataset
pipeline.fit(X_train_select,y_train)

In [None]:
# Confusion matrix on the test split, with rows/columns ordered like the
# classes known to the fitted classifier.
class_labels = pipeline['classification'].classes_
pred = pipeline.predict(X_test_select)
matrix = confusion_matrix(y_test, pred, labels=class_labels)
cm = ConfusionMatrixDisplay(matrix, display_labels=class_labels)
cm.plot()
plt.show()

In [None]:
# ROC AUC has to be computed from continuous scores: use the predicted
# probability of the second class in `classes_` rather than the hard 0/1
# predictions, which reduce the ROC curve to a single operating point.
positive_proba = pipeline.predict_proba(X_test_select)[:, 1]
# The built-in round() works whether the metric returns a Python or a numpy float.
round(roc_auc_score(y_test, positive_proba), 2)

In [None]:
pipeline['classification']

In [None]:
# `shap_values` was only defined further down the notebook, so this cell failed
# on a fresh kernel (Restart & Run All). Compute it here before inspecting its shape.
explainer = shap.LinearExplainer(pipeline['classification'], X_test_select)
shap_values = explainer.shap_values(X_test_select)
shap_values.shape

In [None]:
# Explain the fitted classifier of the pipeline with a linear SHAP explainer,
# using the selected test features as background data.
explainer = shap.LinearExplainer(pipeline['classification'],X_test_select)
shap_values = explainer.shap_values(X_test_select)
plt.figure(figsize=(15,10))
# Bar summary of the SHAP values (at most 50 features displayed).
shap.summary_plot(shap_values, X_test_select,plot_type='bar',max_display=50)
plt.show()

In [None]:
shap.summary_plot(shap_values, X_test_select,plot_type='violin')
plt.show()

In [None]:
shap.waterfall_plot(explainer(X_test_select)[33])

### Feature selection with random feature

After the first run of model selection on a sample of the dataset, the best model is a RandomForest. The feature importance of this model is used to select features, and model selection is then re-run on a new dataset made of all rows but only the selected columns.

In [None]:
best=RandomForestClassifier(max_depth=5, max_features='log2', min_samples_leaf=10,n_estimators=10, random_state=33)

In [None]:
# Add a feature consisting of random numbers (uniform in [0, 1)). It serves as
# the reference below: a real feature ranked under pure noise carries no signal.
# The generator is seeded so that the resulting feature selection is reproducible.
noise_rng = np.random.default_rng(33)
X2=X_train_scaled.copy()
X2['random']=noise_rng.random(X_train_scaled.shape[0])
X2_test=X_test_scaled.copy()
X2_test['random']=noise_rng.random(X2_test.shape[0])


In [None]:
# `best` was reassigned to the random forest AFTER `pipeline` was built, so the
# existing pipeline still wraps the logistic regression. Rebuild it here;
# otherwise the wrong model would be refitted and explained.
pipeline = Pipeline([
    ('sampling', SMOTE(random_state=33)),
    ('classification', best)
])
pipeline.fit(X=X2,y=y_train)
plt.figure(figsize=(15,10))
# Explain the fitted classifier on the test features (random column included).
explainer = shap.Explainer(pipeline['classification'])
shap_values = explainer.shap_values(X2_test)
shap.summary_plot(shap_values[:,:], X2_test,plot_type='bar',max_display=50)
plt.show()

In [None]:
shap.summary_plot(shap_values[:,:], X2_test,plot_type='violin',max_display=50)
plt.show()

All features ranked below the randomly generated feature can be removed.

In [None]:
# Mean absolute SHAP value per feature, taken on index 0 of the last axis of shap_values
vals = np.abs(shap_values[:, :, 0]).mean(0)
columns = X2.columns
importance_by_feature = dict(zip(columns, vals))
feature_importance = (
    pd.DataFrame.from_dict(importance_by_feature, orient='index')
    .sort_values(0, ascending=False)
)

# Keep every feature ranked strictly above the 'random' one:
# slice up to and including 'random', then drop that last row.
above_random = feature_importance.loc[:'random'].iloc[:-1]
display(above_random)
features_to_keep = above_random.index.to_list()

In [None]:
features_to_keep

In [None]:
X_train_select=X_train_scaled[features_to_keep]
X_train_select.shape

## Find best threshold

In [None]:
def get_scores_for_threshold(pipeline,threshold_list,X=X_train_sample,y=y_train_sample):
    """Evaluate the custom loss obtained for several decision thresholds.

    For each threshold, a sample is labelled 1 when the probability found in the
    first column of ``pipeline.predict_proba(X)`` is strictly below the
    threshold, and 0 otherwise. The resulting labels are scored with
    ``custom_loss``.

    Parameters
    ----------
    pipeline : sklearn pipeline or equivalent
        Fitted object exposing ``predict_proba``.
    threshold_list : list
        Thresholds (floats) to test.
    X : pd.DataFrame, optional
        The features dataframe, by default X_train_sample.
    y : pd.Series, optional
        The target, by default y_train_sample.

    Returns
    -------
    pd.DataFrame
        One row per threshold (used as index) with a single ``custom_score`` column.
    """
    # First predict_proba column; wrapping in a DataFrame gives it a fresh 0..n-1 index
    first_column_proba = pd.DataFrame(pipeline.predict_proba(X))[0]
    scores = {
        threshold: custom_loss(y, (first_column_proba < threshold).astype(int))
        for threshold in threshold_list
    }
    return pd.DataFrame.from_dict(scores, orient="index", columns=['custom_score'])

In [None]:
# Score a range of thresholds using the function defaults (X_train_sample / y_train_sample).
# NOTE(review): `pipeline` must have been fitted on the same columns as
# X_train_sample; above it is fitted on X_train_select and then on X2 — confirm
# which fitted pipeline is intended here.
scores_by_thresh=get_scores_for_threshold(pipeline,[0.1,0.2,0.3,0.4,0.5,0.55,0.6,0.65,0.7,0.8,0.9])
scores_by_thresh

In [None]:
sns.lineplot(scores_by_thresh,markers=True)
plt.title('Custom score by threshold')
# Label each point with its score, slightly above the marker. Iterating over the
# column yields plain (threshold, score) scalars, so annotate() receives numeric
# coordinates instead of a one-element Series as y value.
for threshold, score in scores_by_thresh['custom_score'].items():
    plt.annotate(score, (threshold, score + 0.5))
plt.legend([])
plt.show()

In [None]:
# The custom score is a cost, so the best threshold is the index label of the smallest value.
best_threshold=scores_by_thresh.idxmin().values[0]
best_threshold