In [3]:
import pandas as pd
import sys
import os

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from mlflow import MlflowClient
from pprint import pprint
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer,confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# MLflow setup

In [4]:
# Tracking server location: honour MLFLOW_TRACKING_URI when it is set so the
# notebook can target another server without editing this cell; otherwise
# fall back to the local development server used so far.
tracking_uri = os.environ.get("MLFLOW_TRACKING_URI", "http://127.0.0.1:8080")
client = MlflowClient(tracking_uri=tracking_uri)

In [None]:
# Provide an Experiment description that will appear in the UI
experiment_description = (
    "OC - Projet 7 "
    "Implémentation d'un modèle de scoring pour des crédits"
)
experiment_name = "Scoring_Models"

# The description only reaches the UI if it is attached to the experiment:
# MLflow reads it from the reserved "mlflow.note.content" tag.
experiment_tags = {"mlflow.note.content": experiment_description}

# Create the Experiment, providing a unique name. create_experiment raises if
# the name already exists, so reuse the existing experiment when this cell is
# re-run. `scoring_experiment` holds the experiment ID in both cases.
existing_experiment = client.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    scoring_experiment = client.create_experiment(
        name=experiment_name,
        tags=experiment_tags,
    )
else:
    scoring_experiment = existing_experiment.experiment_id

In [None]:
# Query the tracking server through `client` for its experiments; the result
# is reused by the next cell to look up the "Default" experiment.
all_experiments = client.search_experiments()

# Plain repr of everything that was returned.
print(all_experiments)

In [None]:
# Summarise (name + lifecycle stage) every experiment called "Default", then
# keep the first match. Indexing [0] raises IndexError if there is none.
default_matches = [
    dict(name=exp.name, lifecycle_stage=exp.lifecycle_stage)
    for exp in all_experiments
    if exp.name == "Default"
]
default_experiment = default_matches[0]

pprint(default_experiment)

# Data loading

In [6]:
# Make the parent directory importable so the `src` / `notebooks` packages
# resolve. The membership check keeps the cell idempotent: re-running it no
# longer appends the same entry to sys.path again.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

In [7]:
from src.lightgbm_with_simple_features import main
from notebooks.src.model_prep import get_initial_splits, get_grid_cv_scores

The `lightgbm_with_simple_features` kernel from Kaggle ([link](https://www.kaggle.com/code/jsaguiar/lightgbm-with-simple-features/script)) has been slightly modified and is used here for data preprocessing and feature engineering.
The modifications are:
- Dropping columns with missing values.
- Replacing inf values caused by division by zero with NaNs.

In [8]:
# Run the (modified) Kaggle preprocessing / feature-engineering script and
# keep the DataFrame it returns. The logged step timings show this takes
# several minutes.
df = main()

Train samples: 307511, test samples: 48744
Bureau df shape: (305811, 116)
Process bureau and bureau_balance - done in 36s
Previous applications df shape: (338857, 249)
Process previous_applications - done in 61s
Pos-cash balance df shape: (337252, 18)
Process POS-CASH balance - done in 31s
Installments payments df shape: (339587, 26)
Process installments payments - done in 41s
Credit card balance df shape: (103558, 141)
Process credit card balance - done in 34s
Dropping columns with missing values
Initial shape : (356251, 798)
Final shape : (356251, 182)


In [9]:
# Preview the DataFrame returned by main().
df

Unnamed: 0,index,SK_ID_CURR,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,REGION_POPULATION_RELATIVE,DAYS_BIRTH,...,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,INCOME_CREDIT_PERC,TARGET
0,0,100002,0,0,0,0,202500.0,406597.5,0.018801,-9461,...,False,False,False,False,True,False,True,False,0.498036,1.0
1,1,100003,1,0,1,0,270000.0,1293502.5,0.003541,-16765,...,False,False,False,False,False,False,True,False,0.208736,0.0
2,2,100004,0,1,0,0,67500.0,135000.0,0.010032,-19046,...,False,False,False,False,False,False,False,False,0.500000,0.0
3,3,100006,1,0,0,0,135000.0,312682.5,0.008019,-19005,...,False,False,False,False,False,False,False,False,0.431748,0.0
4,4,100007,0,0,0,0,121500.0,513000.0,0.028663,-19932,...,False,False,False,False,False,False,False,False,0.236842,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
356250,48739,456221,1,0,0,0,121500.0,412560.0,0.002042,-19970,...,False,False,False,False,False,False,False,False,0.294503,
356251,48740,456222,1,0,1,2,157500.0,622413.0,0.035792,-11186,...,False,False,False,False,False,False,False,False,0.253047,
356252,48741,456223,1,1,0,1,202500.0,315000.0,0.026392,-15922,...,False,False,False,False,True,False,True,False,0.642857,
356253,48742,456224,0,0,1,0,225000.0,450000.0,0.018850,-13968,...,False,False,False,True,False,False,True,False,0.500000,


In [10]:
# Split the engineered dataset with the project helper.
# NOTE(review): presumably `kaggle_df` holds the rows without TARGET and the
# other four values are a train/test split of the labelled rows; confirm in
# notebooks/src/model_prep.py.
kaggle_df, X_train, X_test, y_train, y_test = get_initial_splits(df)

# Model selection

In [11]:
# Sanity check: the feature matrix and the target must have the same number of rows.
print(X_train.shape, y_train.shape)

(215254, 182) (215254,)


In [12]:
def custom_loss(y_true, y_pred, fn_cost=10, fp_cost=1):
    """Business cost of a set of binary predictions (lower is better).

    FP = loan not given to a good client, FN = loan given to a bad client.
    A false negative is weighted ``fn_cost`` (default 10) and a false positive
    ``fp_cost`` (default 1), so the default result is ``fn*10 + fp``.

    The errors are counted directly rather than by unpacking a confusion
    matrix, which only yields four cells when both classes are present in the
    evaluated data. This version stays defined on single-class input.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels, encoded as 0 (negative) / 1 (positive).
    fn_cost, fp_cost : number, optional
        Cost of one false negative / one false positive.

    Returns
    -------
    number
        ``fn * fn_cost + fp * fp_cost``.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same number of elements.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        # Without this check a length-1 input would silently broadcast.
        raise ValueError(
            f"y_true and y_pred must have the same length, got {y_true.shape[0]} and {y_pred.shape[0]}"
        )

    fn = int(np.count_nonzero((y_true == 1) & (y_pred == 0)))
    fp = int(np.count_nonzero((y_true == 0) & (y_pred == 1)))
    return fn * fn_cost + fp * fp_cost

# Expose the business cost as a scorer object; greater_is_better=False
# declares it as a loss (smaller values are better).
custom_score = make_scorer(
    custom_loss,
    greater_is_better=False,
)

In [13]:
def get_grid_cv_scores(model, params, custom_score, X=None, y=None, cv=5,
                       n_jobs=None, random_state=33):
    """Grid-search ``model`` inside a scaling -> SMOTE -> classification pipeline.

    Note: this definition shadows the ``get_grid_cv_scores`` imported from
    ``notebooks.src.model_prep`` earlier in the notebook.

    Parameters
    ----------
    model : estimator
        Classifier placed in the ``'classification'`` step; keys in ``params``
        therefore use the ``classification__`` prefix.
    params : dict or list of dict
        Parameter grid handed to ``GridSearchCV``.
    custom_score : scorer
        Scorer reported as ``"Custom_scorer"`` and used to pick the refit model.
    X, y : array-like, optional
        Training data. When omitted, the notebook-level ``X_train`` /
        ``y_train`` are used, so existing three-argument calls still work.
    cv : int, optional
        Number of cross-validation folds (default 5).
    n_jobs : int or None, optional
        Forwarded to ``GridSearchCV``; pass ``-1`` to use all cores.
    random_state : int or None, optional
        Seed for SMOTE so that resampling, and therefore the scores, are
        reproducible between runs. Pass ``None`` for unseeded resampling.

    Returns
    -------
    tuple
        ``(best_estimator, results)``: the pipeline refit with the best
        ``"Custom_scorer"`` setting, and a DataFrame holding one row per
        setting with the mean fit time, the parameters, and the mean
        train/test value of every metric.
    """
    # Use explicit data when given; fall back to the notebook globals otherwise.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    scoring = {"AUC": "roc_auc", "Accuracy": "accuracy", "F1-score": "f1", "Custom_scorer": custom_score}
    pipeline = Pipeline([
        ('scaling', StandardScaler()),
        ('sampling', SMOTE(random_state=random_state)),
        ('classification', model),
    ])

    grid = GridSearchCV(
        pipeline,
        param_grid=params,
        cv=cv,
        refit="Custom_scorer",
        scoring=scoring,
        return_train_score=True,
        n_jobs=n_jobs,
    )
    grid.fit(X, y)

    # Same columns, same order as before: fit time, params, train metrics, test metrics.
    metric_names = list(scoring)
    result_columns = (
        ['mean_fit_time', 'params']
        + [f'mean_train_{name}' for name in metric_names]
        + [f'mean_test_{name}' for name in metric_names]
    )
    results = pd.DataFrame(grid.cv_results_)[result_columns]

    return grid.best_estimator_, results

In [14]:
# 'elasticnet' is only implemented by the 'saga' solver; with the default
# solver every elastic-net candidate fails to fit. 'saga' also accepts
# penalty=None, so one estimator covers both branches.
model = LogisticRegression(solver='saga', random_state=33)

# Two sub-grids: C and l1_ratio only matter for the elastic-net penalty, so
# the unpenalised model is evaluated once instead of 25 identical times.
params = [
    {
        'classification__penalty': ['elasticnet'],
        'classification__l1_ratio': [0, 0.2, 0.5, 0.7, 1],
        'classification__C': [0.1, 1, 2, 5, 10.],
    },
    {'classification__penalty': [None]},
]
best_estimator_, results = get_grid_cv_scores(model, params, custom_score=custom_score)



In [None]:
# Show the pipeline refit with the best "Custom_scorer" setting
# (first value returned by get_grid_cv_scores).
best_estimator_

In [None]:
# Per-setting cross-validation summary (second value returned by get_grid_cv_scores).
results

In [None]:
def plot_metric_results(results):
    """Plot fit time and train/test metrics for every grid-search setting.

    Draws a 3x2 figure: five line plots (mean fit time, AUC, accuracy,
    F1-score, custom scorer) with one x position per row of ``results``,
    labelled ``setting_<row number>``. The last panel is a table mapping each
    ``setting_<n>`` label to its parameter dictionary.

    Parameters
    ----------
    results : pandas.DataFrame
        Table with the columns produced by ``get_grid_cv_scores``:
        ``mean_fit_time``, ``params`` and ``mean_train_<metric>`` /
        ``mean_test_<metric>`` for AUC, Accuracy, F1-score and Custom_scorer.
    """
    _, axs = plt.subplots(3, 2, figsize=(18, 20))
    idx = ["setting_" + str(i) for i in range(results.shape[0])]

    # One entry per panel: the axes and the result columns drawn on it.
    panels = [
        (axs[0][0], ['mean_fit_time']),
        (axs[0][1], ['mean_train_AUC', 'mean_test_AUC']),
        (axs[1][0], ['mean_train_Accuracy', 'mean_test_Accuracy']),
        (axs[1][1], ['mean_train_F1-score', 'mean_test_F1-score']),
        (axs[2][0], ['mean_train_Custom_scorer', 'mean_test_Custom_scorer']),
    ]
    for ax, columns in panels:
        for column in columns:
            sns.lineplot(x=idx, y=results[column], ax=ax, label=column)
        ax.tick_params('x', labelrotation=90)

    # Last panel: setting label -> parameters. The table is drawn on the
    # target axes with a 2-D list of strings, which is what Axes.table expects.
    table_ax = axs[2][1]
    table_ax.axis('off')
    if idx:  # a table needs at least one row
        table_ax.table(
            cellText=[[str(setting_params)] for setting_params in results['params']],
            rowLabels=idx,
            colLabels=['params'],
            cellLoc='left',
            loc='center',
        )
    plt.show()

In [None]:
# Draw the fit-time and train/test metric panels for the latest `results`;
# the grid-search cell that defines `results` must have been run first.
plot_metric_results(results)

NameError: name 'results' is not defined

In [None]:
# Solve the primal problem (dual=False): the 'l1' penalty with the default
# squared-hinge loss is only available in that formulation, and it is the
# recommended one when there are far more samples than features, as here.
model = LinearSVC(dual=False, random_state=33)
params = {'classification__penalty': ['l1', 'l2'], 'classification__C': [0.01, 0.1, 1, 2, 5, 10]}
best_estimator_, results = get_grid_cv_scores(model, params, custom_score=custom_score)



KeyboardInterrupt: 