# Modelisation of credit data

# Modelisation of credit data

In [1]:
import pandas as pd
import mlflow
import gc
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    RocCurveDisplay,
    auc,
    roc_curve,
    roc_auc_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score
)

In [2]:
## Load data
data = pd.read_csv("Data/cleaned/data_train.csv")
data.drop(columns=["SK_ID_CURR"], inplace=True)
data

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
0,0,1,0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,...,1.0,1.0,62.0,0.5,0.0,0.0,0.0,0.0,0.0,192.0
1,0,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,1.0,1.0,62.0,0.5,0.0,0.0,0.0,0.0,0.0,192.0
2,1,1,1,1,0,67500.0,135000.0,6750.0,135000.0,0.010032,...,1.0,1.0,62.0,0.5,0.0,0.0,0.0,0.0,0.0,192.0
3,0,0,0,1,0,135000.0,312682.5,29686.5,297000.0,0.008019,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
4,0,1,0,1,0,121500.0,513000.0,21865.5,513000.0,0.028663,...,1.0,1.0,62.0,0.5,0.0,0.0,0.0,0.0,0.0,192.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307502,0,1,0,0,0,157500.0,254700.0,27558.0,225000.0,0.032561,...,1.0,1.0,62.0,0.5,0.0,0.0,0.0,0.0,0.0,192.0
307503,0,0,0,1,0,72000.0,269550.0,12001.5,225000.0,0.025164,...,1.0,1.0,62.0,0.5,0.0,0.0,0.0,0.0,0.0,192.0
307504,0,0,0,1,0,153000.0,677664.0,29979.0,585000.0,0.005002,...,1.0,1.0,62.0,0.5,0.0,0.0,0.0,0.0,0.0,192.0
307505,0,0,0,1,0,171000.0,370107.0,20205.0,319500.0,0.005313,...,1.0,1.0,62.0,0.5,0.0,0.0,0.0,0.0,0.0,192.0


In [3]:
# Fill nan with 0 as I created colums indicating the value was missing when I did hot-one-encoded label using get_dummy and parameter dummy_na=True
data.replace(np.nan, 0, inplace=True)
data

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
0,0,1,0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,...,1.0,1.0,62.0,0.5,0.0,0.0,0.0,0.0,0.0,192.0
1,0,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,1.0,1.0,62.0,0.5,0.0,0.0,0.0,0.0,0.0,192.0
2,1,1,1,1,0,67500.0,135000.0,6750.0,135000.0,0.010032,...,1.0,1.0,62.0,0.5,0.0,0.0,0.0,0.0,0.0,192.0
3,0,0,0,1,0,135000.0,312682.5,29686.5,297000.0,0.008019,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
4,0,1,0,1,0,121500.0,513000.0,21865.5,513000.0,0.028663,...,1.0,1.0,62.0,0.5,0.0,0.0,0.0,0.0,0.0,192.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307502,0,1,0,0,0,157500.0,254700.0,27558.0,225000.0,0.032561,...,1.0,1.0,62.0,0.5,0.0,0.0,0.0,0.0,0.0,192.0
307503,0,0,0,1,0,72000.0,269550.0,12001.5,225000.0,0.025164,...,1.0,1.0,62.0,0.5,0.0,0.0,0.0,0.0,0.0,192.0
307504,0,0,0,1,0,153000.0,677664.0,29979.0,585000.0,0.005002,...,1.0,1.0,62.0,0.5,0.0,0.0,0.0,0.0,0.0,192.0
307505,0,0,0,1,0,171000.0,370107.0,20205.0,319500.0,0.005313,...,1.0,1.0,62.0,0.5,0.0,0.0,0.0,0.0,0.0,192.0


In [4]:
scaler = StandardScaler().fit(data)

In [5]:
# Creation of trainning and test dataset
X_train, X_test, y_train, y_test = train_test_split(
    data, data["TARGET"], test_size=0.3, random_state=3, stratify=data["TARGET"]
)

In [6]:
# Ratio Target = 0 / Target = 1 in train dataset
print(
    "There is",
    (X_train.TARGET.value_counts()[1] / len(X_train)) * 100,
    "% of clients that did not paid their loan in the trainning set",
)

There is 8.072788426695904 % of clients that did not paid their loan in the trainning set


In [7]:
# Ratio Target = 0 / Target = 1 in test dataset
print(
    "There is",
    (X_test.TARGET.value_counts()[1] / len(X_test)) * 100,
    "% of clients that did not paid their loan in the trainning set",
)

There is 8.073450185901814 % of clients that did not paid their loan in the trainning set


In [8]:
data_train_scaled = scaler.transform(X_train)
data_test_scaled = scaler.transform(X_test)

In [9]:
data_train_scaled

array([[-0.32437316,  1.38816188, -0.71791022, ...,  0.        ,
         0.        , -1.90627587],
       [-0.32437316,  1.38816188,  1.39293184, ...,  0.        ,
         0.        ,  0.60790709],
       [-0.32437316, -0.72037708, -0.71791022, ...,  0.        ,
         0.        ,  0.60790709],
       ...,
       [-0.32437316,  1.38816188, -0.71791022, ...,  0.        ,
         0.        ,  0.60790709],
       [-0.32437316, -0.72037708, -0.71791022, ...,  0.        ,
         0.        ,  0.60790709],
       [-0.32437316, -0.72037708, -0.71791022, ...,  0.        ,
         0.        , -2.00350947]], shape=(215254, 805))

In [10]:
del data
gc.collect()

0

In [11]:
mlflow.set_tracking_uri("http://127.0.0.1:8080")

# Sets the current active experiment to the "Apple_Models" experiment and
# returns the Experiment metadata
imbalance_experiment = mlflow.set_experiment("imbalance")

### LogisticRegression with imbalance data

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0)))

# Define a run name for this iteration of training.
# If this is not set, a unique name will be auto-generated for your run.
run_name = "LogisticRegression_imbalanced_data_scaled_data"
# Define an artifact path that the model will be saved to.
artifact_path = "lr_imbalanced_scaled"

lr = LogisticRegression()
lr.fit(data_train_scaled, y_train)
predictions = lr.predict(data_test_scaled)

# print classification report
print(classification_report(y_test, predictions))

roc_auc = roc_auc_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
confusion_mat = confusion_matrix(y_test, predictions)
precission = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)

metrics = {"AUC": roc_auc, "F1_score": f1, "Precission": precission, "Recall": recall, "True_negative": confusion_mat[0][0], "False_negative": confusion_mat[1][0], "True_positive": confusion_mat[1][1], "False_positive": confusion_mat[0][1]}

with mlflow.start_run(run_name=run_name) as run:
    # Log the parameters used for the model fit
    mlflow.log_params({"lenght_0": sum(y_train == 0), "lenght_1": sum(y_train == 1), "Sampling": "None"})

    # Log the error metrics that were calculated during validation
    mlflow.log_metrics(metrics)

    # Log an instance of the trained model for later use
    # mlflow.sklearn.log_model(sk_model=lr, input_example=X_test, artifact_path=artifact_path)


Before OverSampling, counts of label '1': 17377
Before OverSampling, counts of label '0': 197877 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     84805
           1       1.00      1.00      1.00      7448

    accuracy                           1.00     92253
   macro avg       1.00      1.00      1.00     92253
weighted avg       1.00      1.00      1.00     92253

🏃 View run LogisticRegression_imbalanced_data_scaled_data at: http://127.0.0.1:8080/#/experiments/641223099707827758/runs/27e3c00d203840c3b92b65af452d5be0
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/641223099707827758


In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0)))

# Define a run name for this iteration of training.
# If this is not set, a unique name will be auto-generated for your run.
run_name = "LogisticRegression_imbalanced_data"
# Define an artifact path that the model will be saved to.
artifact_path = "lr_imbalanced"

lr = LogisticRegression()
lr.fit(X_train, y_train)
predictions = lr.predict(X_test)

# print classification report
print(classification_report(y_test, predictions))

roc_auc = roc_auc_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
confusion_mat = confusion_matrix(y_test, predictions)
precission = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)

metrics = {"AUC": roc_auc, "F1_score": f1, "Precission": precission, "Recall": recall, "True_negative": confusion_mat[0][0], "False_negative": confusion_mat[1][0], "True_positive": confusion_mat[1][1], "False_positive": confusion_mat[0][1]}

with mlflow.start_run(run_name=run_name) as run:
    # Log the parameters used for the model fit
    mlflow.log_params({"lenght_0": sum(y_train == 0), "lenght_1": sum(y_train == 1), "Sampling": "None"})

    # Log the error metrics that were calculated during validation
    mlflow.log_metrics(metrics)

    # Log an instance of the trained model for later use
    # mlflow.sklearn.log_model(sk_model=lr, input_example=X_test, artifact_path=artifact_path)


Before OverSampling, counts of label '1': 17377
Before OverSampling, counts of label '0': 197877 



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0       0.92      1.00      0.96     84805
           1       0.12      0.00      0.01      7448

    accuracy                           0.92     92253
   macro avg       0.52      0.50      0.48     92253
weighted avg       0.85      0.92      0.88     92253

🏃 View run LogisticRegression_imbalanced_data at: http://127.0.0.1:8080/#/experiments/641223099707827758/runs/4f647507b5394fc4a7bc57ad0cb0df49
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/641223099707827758


In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0)))

# Define a run name for this iteration of training.
# If this is not set, a unique name will be auto-generated for your run.
run_name = "LogisticRegression_imbalanced_data_weighted"
# Define an artifact path that the model will be saved to.
artifact_path = "lr_weight"

lr = LogisticRegression(class_weight="balanced")
lr.fit(X_train, y_train)
predictions = lr.predict(X_test)

# print classification report
print(classification_report(y_test, predictions))

roc_auc = roc_auc_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
confusion_mat = confusion_matrix(y_test, predictions)
precission = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)

metrics = {"AUC": roc_auc, "F1_score": f1, "Precission": precission, "Recall": recall, "True_negative": confusion_mat[0][0], "False_negative": confusion_mat[1][0], "True_positive": confusion_mat[1][1], "False_positive": confusion_mat[0][1]}

with mlflow.start_run(run_name=run_name) as run:
    # Log the parameters used for the model fit
    mlflow.log_params({"lenght_0": sum(y_train == 0), "lenght_1": sum(y_train == 1), "Sampling": "None", "class_weight": "balanced"})

    # Log the error metrics that were calculated during validation
    mlflow.log_metrics(metrics)

    # Log an instance of the trained model for later use
    # mlflow.sklearn.log_model(sk_model=lr, input_example=X_test, artifact_path=artifact_path)

Before OverSampling, counts of label '1': 17377
Before OverSampling, counts of label '0': 197877 



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0       0.93      0.65      0.77     84805
           1       0.10      0.46      0.17      7448

    accuracy                           0.64     92253
   macro avg       0.52      0.56      0.47     92253
weighted avg       0.87      0.64      0.72     92253

🏃 View run LogisticRegression_imbalanced_data_weighted at: http://127.0.0.1:8080/#/experiments/641223099707827758/runs/58d62d0d54f8476280e9e68cd489b713
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/641223099707827758


## Using SMOTE Algorithm
### Oversampling

In [42]:
print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0)))

# import SMOTE module from imblearn library
# pip install imblearn (if you don't have imblearn in your system)
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=3)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

print("After OverSampling, the shape of train_X: {}".format(X_train_res.shape))
print("After OverSampling, the shape of train_y: {} \n".format(y_train_res.shape))

print("After OverSampling, counts of label '1': {}".format(sum(y_train_res == 1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res == 0)))

Before OverSampling, counts of label '1': 17377
Before OverSampling, counts of label '0': 197877 

After OverSampling, the shape of train_X: (395754, 805)
After OverSampling, the shape of train_y: (395754,) 

After OverSampling, counts of label '1': 197877
After OverSampling, counts of label '0': 197877


In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

print("Before OverSampling, counts of label '1': {}".format(sum(y_train_res == 1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train_res == 0)))

# Define a run name for this iteration of training.
# If this is not set, a unique name will be auto-generated for your run.
run_name = "LogisticRegression_smote_data"
# Define an artifact path that the model will be saved to.
artifact_path = "lr_SMOTE"

lr = LogisticRegression()
lr.fit(X_train_res, y_train_res)
predictions = lr.predict(X_test)

# print classification report
print(classification_report(y_test, predictions))
roc_auc = roc_auc_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
confusion_mat = confusion_matrix(y_test, predictions)
precission = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)

metrics = {"AUC": roc_auc, "F1_score": f1, "Precission": precission, "Recall": recall, "True_negative": confusion_mat[0][0], "False_negative": confusion_mat[1][0], "True_positive": confusion_mat[1][1], "False_positive": confusion_mat[0][1]}

with mlflow.start_run(run_name=run_name) as run:
    # Log the parameters used for the model fit
    mlflow.log_params({"lenght_0": sum(y_train_res == 0), "lenght_1": sum(y_train_res == 1), "Sampling": "SMOTE"})

    # Log the error metrics that were calculated during validation
    mlflow.log_metrics(metrics)

    # Log an instance of the trained model for later use
    # mlflow.sklearn.log_model(sk_model=lr, input_example=X_test, artifact_path=artifact_path)

Before OverSampling, counts of label '1': 197877
Before OverSampling, counts of label '0': 197877 



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0       0.93      0.67      0.78     84805
           1       0.11      0.45      0.17      7448

    accuracy                           0.65     92253
   macro avg       0.52      0.56      0.47     92253
weighted avg       0.87      0.65      0.73     92253

🏃 View run LogisticRegression_smote_data at: http://127.0.0.1:8080/#/experiments/641223099707827758/runs/dd6cf8cadd8b401ab65a0b92e67d1e30
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/641223099707827758


In [44]:
del X_train_res, y_train_res, sm, predictions
gc.collect()

876

### Undersampling with NearMiss Algorithm:

In [45]:
print("Before Undersampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before Undersampling, counts of label '0': {} \n".format(sum(y_train == 0)))

# apply near miss
from imblearn.under_sampling import NearMiss

nr = NearMiss()

X_train_miss, y_train_miss = nr.fit_resample(X_train, y_train)

print("After Undersampling, the shape of train_X: {}".format(X_train_miss.shape))
print("After Undersampling, the shape of train_y: {} \n".format(y_train_miss.shape))

print("After Undersampling, counts of label '1': {}".format(sum(y_train_miss == 1)))
print("After Undersampling, counts of label '0': {}".format(sum(y_train_miss == 0)))

Before Undersampling, counts of label '1': 17377
Before Undersampling, counts of label '0': 197877 

After Undersampling, the shape of train_X: (34754, 805)
After Undersampling, the shape of train_y: (34754,) 

After Undersampling, counts of label '1': 17377
After Undersampling, counts of label '0': 17377


In [46]:
# Define a run name for this iteration of training.
# If this is not set, a unique name will be auto-generated for your run.
run_name = "LogisticRegression_nearmiss_data"
# Define an artifact path that the model will be saved to.
artifact_path = "lr_NearMiss"


# train the model on train set
lr = LogisticRegression()
lr.fit(X_train_miss, y_train_miss)
predictions = lr.predict(X_test)

# print classification report
print(classification_report(y_test, predictions))
roc_auc = roc_auc_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
confusion_mat = confusion_matrix(y_test, predictions)
precission = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)

metrics = {"AUC": roc_auc, "F1_score": f1, "Precission": precission, "Recall": recall, "True_negative": confusion_mat[0][0], "False_negative": confusion_mat[1][0], "True_positive": confusion_mat[1][1], "False_positive": confusion_mat[0][1]}

with mlflow.start_run(run_name=run_name) as run:
    # Log the parameters used for the model fit
    mlflow.log_params({"lenght_0": sum(y_train_miss == 0), "lenght_1": sum(y_train_miss == 1), "Sampling": "Near_Miss"})

    # Log the error metrics that were calculated during validation
    mlflow.log_metrics(metrics)

    # Log an instance of the trained model for later use
    # mlflow.sklearn.log_model(sk_model=lr, input_example=X_test, artifact_path=artifact_path)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0       0.90      0.15      0.26     84805
           1       0.08      0.81      0.14      7448

    accuracy                           0.20     92253
   macro avg       0.49      0.48      0.20     92253
weighted avg       0.83      0.20      0.25     92253

🏃 View run LogisticRegression_nearmiss_data at: http://127.0.0.1:8080/#/experiments/641223099707827758/runs/dcac2250df7d4c5ab651644f61d0d27a
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/641223099707827758


 combining SMOTE with random undersampling of the majority class

In [47]:
del X_train_miss, y_train_miss
gc.collect()

330

### Undersampling with EditedNearestNeigbours Algorithm:

In [48]:
print("Before Undersampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before Undersampling, counts of label '0': {} \n".format(sum(y_train == 0)))

# apply Edited nearest neighbours
from imblearn.under_sampling import EditedNearestNeighbours

enn = EditedNearestNeighbours()

X_train_enn, y_train_enn = enn.fit_resample(X_train, y_train)

print("After Undersampling, the shape of train_X: {}".format(X_train_enn.shape))
print("After Undersampling, the shape of train_y: {} \n".format(y_train_enn.shape))

print("After Undersampling, counts of label '1': {}".format(sum(y_train_enn == 1)))
print("After Undersampling, counts of label '0': {}".format(sum(y_train_enn == 0)))

Before Undersampling, counts of label '1': 17377
Before Undersampling, counts of label '0': 197877 

After Undersampling, the shape of train_X: (173300, 805)
After Undersampling, the shape of train_y: (173300,) 

After Undersampling, counts of label '1': 17377
After Undersampling, counts of label '0': 155923


In [49]:
# Define a run name for this iteration of training.
# If this is not set, a unique name will be auto-generated for your run.
run_name = "LogisticRegression_EditedNearestNeighbours_data"
# Define an artifact path that the model will be saved to.
artifact_path = "lr_EditedNearestNeighbours"

# train the model on train set
lr = LogisticRegression()
lr.fit(X_train_enn, y_train_enn)
predictions = lr.predict(X_test)

# print classification report
print(classification_report(y_test, predictions))
roc_auc = roc_auc_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
confusion_mat = confusion_matrix(y_test, predictions)
precission = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)

metrics = {"AUC": roc_auc, "F1_score": f1, "Precission": precission, "Recall": recall, "True_negative": confusion_mat[0][0], "False_negative": confusion_mat[1][0], "True_positive": confusion_mat[1][1], "False_positive": confusion_mat[0][1]}

with mlflow.start_run(run_name=run_name) as run:
    # Log the parameters used for the model fit
    mlflow.log_params({"lenght_0": sum(y_train_enn == 0), "lenght_1": sum(y_train_enn == 1), "Sampling": "ENN"})

    # Log the error metrics that were calculated during validation
    mlflow.log_metrics(metrics)

    # Log an instance of the trained model for later use
    # mlflow.sklearn.log_model(sk_model=lr1, input_example=X_test, artifact_path=artifact_path)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0       0.92      1.00      0.96     84805
           1       0.11      0.00      0.01      7448

    accuracy                           0.92     92253
   macro avg       0.51      0.50      0.48     92253
weighted avg       0.85      0.92      0.88     92253

🏃 View run LogisticRegression_EditedNearestNeighbours_data at: http://127.0.0.1:8080/#/experiments/641223099707827758/runs/b4112b4ff54c4834af9ba2f08c654810
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/641223099707827758


In [50]:
del X_train_enn, y_train_enn
gc.collect()

330

### Over and undersampling

In [26]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# define pipeline
over = SMOTE(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.8)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset
X_pipeline, y_pipeline = pipeline.fit_resample(X_train, y_train)


In [27]:
# Define a run name for this iteration of training.
# If this is not set, a unique name will be auto-generated for your run.
run_name = "LR_SMOTE_Under"
# Define an artifact path that the model will be saved to.
artifact_path = "lr_SMOTE&Under"

# train the model on train set
lr = LogisticRegression(class_weight="balanced")
lr.fit(X_pipeline, y_pipeline)
predictions = lr.predict(X_test)

# print classification report
print(classification_report(y_test, predictions))
roc_auc = roc_auc_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
confusion_mat = confusion_matrix(y_test, predictions)
precission = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)

metrics = {"AUC": roc_auc, "F1_score": f1, "Precission": precission, "Recall": recall, "True_negative": confusion_mat[0][0], "False_negative": confusion_mat[1][0], "True_positive": confusion_mat[1][1], "False_positive": confusion_mat[0][1]}

with mlflow.start_run(run_name=run_name) as run:
    # Log the parameters used for the model fit
    mlflow.log_params({"lenght_0": sum(y_pipeline == 0), "lenght_1": sum(y_pipeline == 1), "Sampling": "SMOTE (10%) & Under sampler(80%)", "class_weight": "balanced"})

    # Log the error metrics that were calculated during validation
    mlflow.log_metrics(metrics)

    # Log an instance of the trained model for later use
    # mlflow.sklearn.log_model(sk_model=lr1, input_example=X_test, artifact_path=artifact_path)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0       0.93      0.66      0.77     84805
           1       0.11      0.46      0.17      7448

    accuracy                           0.64     92253
   macro avg       0.52      0.56      0.47     92253
weighted avg       0.87      0.64      0.72     92253

🏃 View run LR_SMOTE_Under at: http://127.0.0.1:8080/#/experiments/641223099707827758/runs/f15cc73f4b0e426aad52d84bfdfa72b0
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/641223099707827758


## Model selection


In [28]:
mlflow.set_tracking_uri("http://127.0.0.1:8080")

# Sets the current active experiment to the "Apple_Models" experiment and
# returns the Experiment metadata
imbalance_experiment = mlflow.set_experiment("Best_model_selection")

### SVC

In [29]:
# Création du modèle SVR (Support Vector Regression)
from sklearn.svm import SVC
svc = SVC(class_weight="balance")

In [30]:
# Paramètres d'optimisation des hyperparamètres kernel, C et epsilon
from sklearn.model_selection import GridSearchCV
svc_grid_param = {"kernel": ["linear",'rbf'], "C" : [0.1, 1, 5]}

svc_grid = GridSearchCV(svc, param_grid=svc_grid_param, cv=5, n_jobs=-1)

In [None]:
# J'entraine le modèle sur mes données
svc_grid.fit(X_pipeline, y_pipeline)

### Descision tree
#### Random forest

In [16]:
from sklearn.ensemble import RandomForestClassifier
# Création du modèle de regression Ramdom Forest
rfc = RandomForestClassifier()

In [19]:
# Paramètres d'optimisation des hyperparamètres
from sklearn.model_selection import GridSearchCV
rfc_grid_param = { 
    'n_estimators': [100, 300, 1000],
    'max_features': ["sqrt", "log2"],
    'max_depth' : list(range(2,20,2)),
    "random_state" : [3]}

rfc_grid = GridSearchCV(estimator=rfc, param_grid=rfc_grid_param, cv= 5, n_jobs=-1)

In [None]:
# J'entraine le modèle sur mes données
rfc_grid.fit(X_train, y_train)