In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression

from functions_PR_7 import *

In [2]:
df = pd.read_csv('data/data_reduced_ML.csv')

In [3]:
X = df.drop('Bankrupt', axis=1).copy()
y = df['Bankrupt'].copy()

## Run model for imbalanced data

In [5]:
X_train, X_test, y_train, y_test = split_scale(X, y, test_size=0.2, random_state=4576)

LR = LogisticRegression()

LR.fit(X_train, y_train)

y_pred = LR.predict(X_test)

In [8]:
scores_LR = model_score(y_test, y_pred)

display(scores_LR)
print(confusion_matrix(y_test, y_pred))

Unnamed: 0,precission,accuracy,recall,f1_score
0,0.75,0.965,0.06,0.111


[[1313    1]
 [  47    3]]


## Upscale data

In [None]:
X_train, X_test, y_train, y_test = split_scale(X, y, test_size=0.2, random_state=4576)

In [10]:
X_train_up, y_train_up = data_resampling('up', X_train, y_train)

In [13]:
X_train.shape, X_train_up.shape

((5455, 18), (10570, 18))

In [15]:
y_train.value_counts()

Bankrupt
0    5285
1     170
Name: count, dtype: int64

In [14]:
y_train_up.value_counts()

Bankrupt
0    5285
1    5285
Name: count, dtype: int64

In [20]:
# Run model

LR_up = LogisticRegression()

LR_up.fit(X_train_up, y_train_up)

y_pred_up = LR_up.predict(X_test)

scores_LR_up = model_score(y_test, y_pred_up)

In [21]:
display(scores_LR_up)
print(confusion_matrix(y_test, y_pred_up))

Unnamed: 0,precission,accuracy,recall,f1_score
0,0.18,0.856,0.82,0.295


[[1127  187]
 [   9   41]]


## Model calibration

In [22]:
from sklearn.calibration import CalibratedClassifierCV

**Calibrating model for upscaled data**

In [24]:
initial_model = LogisticRegression()

calibrated_model = CalibratedClassifierCV(initial_model, method='sigmoid', cv='prefit')

initial_model.fit(X_train_up, y_train_up)

calibrated_model.fit(X_train_up, y_train_up)

y_pred_calibrated = calibrated_model.predict(X_test)

scores_calibrated = model_score(y_test, y_pred_calibrated)

In [25]:
print("Scores for calibrated model (with upscaled training data)")
display(scores_calibrated)
print(confusion_matrix(y_test, y_pred_calibrated))

Scores for calibrated model (with upscaled training data)


Unnamed: 0,precission,accuracy,recall,f1_score
0,0.181,0.857,0.82,0.296


[[1128  186]
 [   9   41]]


## Grid search of hyperparameters

In [28]:
from sklearn.model_selection import GridSearchCV

In [37]:
model = LogisticRegression(max_iter=1000)

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='recall', verbose=1)


In [38]:
grid_search.fit(X_train_up, y_train_up)

best_params = grid_search.best_params_

best_model = grid_search.best_estimator_

# # Evaluate the best model on the test set
# accuracy = best_model.score(X_test, y_test)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Benia\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Benia\anaconda3\Lib\site-packages\sklearn\base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Benia\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1172, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [39]:
best_params

{'C': 1, 'penalty': 'l2'}

In [40]:
best_model

 - no chnage of parameters with respect to default values when precission is to be improved
 - chnage in 'penalty' parameter to improve 'recall' : 'l2' instead of 'l1'