In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression

from functions_PR_7 import *

In [2]:
df = pd.read_csv('data/data.csv')

In [None]:
df.tail(5)

In [5]:
df.shape

(6819, 96)

In [6]:
df.select_dtypes(include='number').shape

(6819, 96)

In [7]:
df.select_dtypes(include='object').shape

(6819, 0)

In [8]:
df.isna().sum().max()

0

In [3]:
# Just cleaning column names

df.rename(columns={'Bankrupt?' : 'Bankrupt'}, inplace=True)

df.rename(columns=lambda x: x.strip() if x.startswith(' ') else x, inplace=True)

In [4]:
df['Bankrupt'].value_counts()

Bankrupt
0    6599
1     220
Name: count, dtype: int64

In [6]:
df.duplicated().sum()

0

## Rmoving uselless columns

 - column 'Net Income Flag' has only one value - so it can be removed since it does not brig any information
 
 - 'Liability-Assets Flag' - only 8 out of 6819 values are equal 1, the rest is 0 - so this column as well is not very usefull and can be removed

In [13]:
# for col in df:
#     print(col, df[col].var())

In [5]:
df.drop('Net Income Flag', axis=1, inplace=True)

df.drop('Liability-Assets Flag', axis=1, inplace=True)

## Run Logistic Regression for whole data set 

**Divide data into atributes data frame (X) and the target value (y) - target is Bankrupt**

In [6]:
X = df.drop('Bankrupt', axis=1).copy()
y = df['Bankrupt'].copy()

**Logistic Regression of full imbalanced data**

In [7]:
X_train, X_test, y_train, y_test = split_scale(X, y, test_size=0.2, random_state=4576)

LR = LogisticRegression()
    
LR.fit(X_train, y_train)
    
y_pred_LR = LR.predict(X_test)
    
score_LR = model_score(y_test, y_pred_LR)

In [8]:
print("Logistic Regression score for inmbalanced data set")
display(score_LR)

print()
print("Confusion matrix for inmbalanced data set")
print(confusion_matrix(y_test, y_pred_LR))

Logistic Regression score for inmbalanced data set


Unnamed: 0,precission,accuracy,recall,f1_score
0,0.6,0.964,0.06,0.109



Confusion matrix for inmbalanced data set
[[1312    2]
 [  47    3]]


In [12]:
X_train.shape, X_test.shape

((5455, 93), (1364, 93))

In [13]:
y_test.value_counts()

Bankrupt
0    1314
1      50
Name: count, dtype: int64

In [14]:
y_train.value_counts()

Bankrupt
0    5285
1     170
Name: count, dtype: int64

**Logistic Refression after removing collinearity in full imbalanced data set**

In [25]:
X_no_corr = remove_collinearity(X, threshold=0.95)

In [26]:
X_no_corr.shape

(6819, 77)

In [27]:
X_train_no_corr, X_test_no_corr, y_train_no_corr, y_test_no_corr = split_scale(X_no_corr, y, test_size=0.2, random_state=4576)

LR_no_corr = LogisticRegression()
    
LR_no_corr.fit(X_train_no_corr, y_train_no_corr)
    
y_pred_LR_no_corr = LR_no_corr.predict(X_test_no_corr)
    
score_LR_no_corr = model_score(y_test_no_corr, y_pred_LR_no_corr)

In [28]:
print("Logistic Regression score for inmbalanced data set without collinearity")
display(score_LR_no_corr)

print()
print("Confusion matrix for inmbalanced data set without collinearity")
confusion_matrix(y_test_no_corr, y_pred_LR_no_corr)

Logistic Regression score for inmbalanced data set without collinearity


Unnamed: 0,precission,accuracy,recall,f1_score
0,0.5,0.963,0.02,0.038



Confusion matrix for inmbalanced data set without collinearity


array([[1313,    1],
       [  49,    1]], dtype=int64)

In [77]:
y_train_no_corr.value_counts()

Bankrupt
0    5285
1     170
Name: count, dtype: int64

## Overscaling the data to balance number of 1 and 0 in target variable
 
 **1. Upscaling bankrupt '1' values**

In [15]:
# Call the fucntion to upscale the data:

X_train_up, y_train_up = data_resampling('up', X_train, y_train)

LR_upscaled = LogisticRegression(max_iter=500)

LR_upscaled.fit(X_train_up, y_train_up)

y_pred_LR_up = LR_upscaled.predict(X_test)

score_LR_up = model_score(y_test, y_pred_LR_up)

In [22]:
print()
print("Logistic Regression score for upscaled data set")
display(score_LR_up)
print("Confusion matrix")
print(confusion_matrix(y_test, y_pred_LR_up))


Logistic Regression score for upscaled data set


Unnamed: 0,precission,accuracy,recall,f1_score
0,0.199,0.872,0.82,0.32


Confusion matrix
[[1149  165]
 [   9   41]]


In [21]:
y_train.value_counts()

Bankrupt
0    5285
1     170
Name: count, dtype: int64

In [19]:
y_train_up.value_counts()

Bankrupt
0    5285
1    5285
Name: count, dtype: int64

In [18]:
y_test.value_counts()

Bankrupt
0    1314
1      50
Name: count, dtype: int64

**Downscaling bankrupt '0' values**

In [23]:
# Call the function to downsample the data

X_train_dw, y_train_dw = data_resampling('down', X_train, y_train)

LR_downscaled = LogisticRegression()

LR_downscaled.fit(X_train_dw, y_train_dw)

y_pred_LR_dw = LR_downscaled.predict(X_test)

score_LR_dw = model_score(y_test, y_pred_LR_dw)

In [26]:
print("Logistic Regression score for downscaled data set")
display(score_LR_dw)
print("Confusion matrix")
print(confusion_matrix(y_test, y_pred_LR_dw))

Logistic Regression score for downscaled data set


Unnamed: 0,precission,accuracy,recall,f1_score
0,0.179,0.853,0.84,0.296


Confusion matrix
[[1122  192]
 [   8   42]]


In [28]:
confusion_matrix(y_test, y_pred_LR_dw).sum()

1364

In [25]:
y_train_dw.value_counts()

Bankrupt
1    170
0    170
Name: count, dtype: int64

In [17]:
X_test.shape, X_train.shape, X.shape

((1364, 93), (5455, 93), (6819, 93))

## Fileter the columns to have the most important features

 - Call kbest_features_selection function that will chose the most important features (Values for selected features are original ones, so no scaling applied)

In [29]:
X_selected = kbest_features_selection(X, y, scaler = MinMaxScaler(), func=f_classif, k_atr=30)

In [30]:
X_selected.shape

(6819, 30)

In [31]:
# Remove collinearity between columns

X_reduced = remove_collinearity(X_selected, threshold=0.95)

In [32]:
X_reduced.shape

(6819, 18)

In [34]:
len(X_reduced.columns)

18

## So now chose only selected features from original data frame and do the models

In [40]:
reduced_df = pd.concat([y, X_reduced], axis=1)

reduced_df.to_csv('data/data_reduced_ML.csv', index=False)

In [41]:
X_reduced.shape

(6819, 18)

In [35]:
X_reduced_train, X_reduced_test, y_reduced_train, y_reduced_test = split_scale(X_reduced, y, test_size=0.2, random_state=4576)

LR_reduced = LogisticRegression()
    
LR_reduced.fit(X_reduced_train, y_reduced_train)
    
y_reduced_pred_LR = LR_reduced.predict(X_reduced_test)
    

score_reduced_LR = model_score(y_reduced_test, y_reduced_pred_LR)

In [36]:
X_reduced_train.shape, X_reduced_test.shape

((5455, 18), (1364, 18))

In [37]:
y_reduced_test.value_counts()

Bankrupt
0    1314
1      50
Name: count, dtype: int64

In [38]:
print("Logistic Regression score for inmbalanced data set after features reduction")
display(score_reduced_LR)
print("Confusion matrix")
print(confusion_matrix(y_reduced_test, y_reduced_pred_LR))


# print()
# print("Logistic Regression score for inmbalanced data set (full)")
# display(score_LR)
# print("Confusion matrix")
# print(confusion_matrix(y_test, y_pred_LR))

# print()
# print("Logistic Regression score for inmbalanced data set without collinearity")
# display(score_LR_no_corr)

# print("Confusion matrix")
# confusion_matrix(y_test_no_corr, y_pred_LR_no_corr)

Logistic Regression score for inmbalanced data set after features reduction


Unnamed: 0,precission,accuracy,recall,f1_score
0,0.75,0.965,0.06,0.111


Confusion matrix
[[1313    1]
 [  47    3]]


In [39]:
y_test.value_counts()

Bankrupt
0    1314
1      50
Name: count, dtype: int64

In [40]:
y_reduced_train.value_counts()

Bankrupt
0    5285
1     170
Name: count, dtype: int64

 - even thoug precission seems to increase after feature reduction, confusion matrix shows that change in classification is minor 
 - generally there is no change between results of full data set and reduced one -> so there is no point in using whole data frame

**Upscaling data with reduced features**

In [41]:
X_reduced_train_up, y_reduced_train_up = data_resampling('up', X_reduced, y)

LR_reduced_up = LogisticRegression()
    
LR_reduced_up.fit(X_reduced_train_up, y_reduced_train_up)
    
y_reduced_pred_LR_up = LR_reduced_up.predict(X_reduced_test)
    
score_reduced_LR_up = model_score(y_reduced_test, y_reduced_pred_LR_up)

In [42]:
y_reduced_test.value_counts()

Bankrupt
0    1314
1      50
Name: count, dtype: int64

In [43]:
y_reduced_train.value_counts()

Bankrupt
0    5285
1     170
Name: count, dtype: int64

In [45]:
y_reduced_train_up.value_counts()

Bankrupt
0    6599
1    6599
Name: count, dtype: int64

**Downscaling data with reduced features**

In [69]:
X_reduced_train_dw, y_reduced_train_dw = data_resampling('down', X_reduced, y)

LR_reduced_dw = LogisticRegression()
    
LR_reduced_dw.fit(X_reduced_train_dw, y_reduced_train_dw)
    
y_reduced_pred_LR_dw = LR_reduced_dw.predict(X_reduced_test_dw)
    
score_reduced_LR_dw = model_score(y_reduced_test_dw, y_reduced_pred_LR_dw)

In [70]:
y_reduced_test_dw.value_counts()

Bankrupt
0    45
1    43
Name: count, dtype: int64

In [72]:
print()
print("Logistic Regression score for reduced data set with upscaled majority data")
display(score_reduced_LR_up)

print("Confusion matrix")
print(confusion_matrix(y_reduced_test, y_reduced_pred_LR_up))


print()
print("Logistic Regression score for reduced data set with dowscaled majority  data")
display(score_reduced_LR_dw)

print("Confusion matrix")
print(confusion_matrix(y_reduced_test_dw, y_reduced_pred_LR_dw))


Logistic Regression score for reduced data set with upscaled majority data


Unnamed: 0,precission,accuracy,recall,f1_score
0,0.087,0.62,0.98,0.159


Confusion matrix
[[797 517]
 [  1  49]]

Logistic Regression score for reduced data set with dowscaled majority  data


Unnamed: 0,precission,accuracy,recall,f1_score
0,0.741,0.807,0.93,0.825


Confusion matrix
[[31 14]
 [ 3 40]]


In [50]:
X_reduced_train_dw.shape, X_reduced_train_up.shape

((352, 18), (10558, 18))

## Calibration methods

- it not actually an algorithm, but rather methods to calibrate used model

In [10]:
from sklearn.calibration import CalibratedClassifierCV

In [48]:
initial_model = LogisticRegression()

calibrated_model = CalibratedClassifierCV(initial_model, method='sigmoid', cv='prefit')

initial_model.fit(X_reduced_train, y_reduced_train)

calibrated_model.fit(X_reduced_train, y_reduced_train)

y_pred_calibrated = calibrated_model.predict(X_reduced_test)

In [53]:
confusion_matrix(y_reduced_test, y_pred_calibrated)

array([[1311,    3],
       [  46,    4]], dtype=int64)

## RFE method

In [50]:
from sklearn.feature_selection import RFE

In [52]:
# Your code here:
model = LogisticRegression()

model.fit(X_train, y_train)

In [55]:
X_train.shape, X_test.shape

((5455, 93), (1364, 93))

In [56]:
selector = RFE(model, n_features_to_select=20, step=1, verbose=0, importance_getter='auto')

In [57]:
# Your code here: X_train, X_test, y_train, y_test
selector.fit(X_train, y_train)

In [60]:
selector.ranking_

array([ 1,  1,  1, 71, 63, 41, 30, 37, 58, 39, 27, 70, 40, 15,  1,  1,  1,
        1,  1, 68, 38, 12,  1, 62, 54, 61, 50, 64, 34, 52, 65, 20, 73, 21,
       42, 74,  1,  1,  8, 14, 43, 16,  1, 29,  1, 33, 45, 47,  3, 46, 23,
        7, 59,  1,  9, 28,  1, 25,  4,  1, 24, 66, 55, 57,  6, 32, 17, 67,
       13,  5, 53, 49, 51, 11, 44,  2, 72, 31,  1,  1, 26, 18, 36, 19,  1,
       10, 48, 69, 56, 22, 35, 60,  1])

In [61]:
len(selector.ranking_)

93

## Calibration for original data frame

In [16]:
initial_model_2 = LogisticRegression()

calibrated_model_2 = CalibratedClassifierCV(initial_model_2, method='sigmoid', cv='prefit')

initial_model_2.fit(X_train, y_train)

calibrated_model_2.fit(X_train, y_train)

y_pred_calibrated_2 = calibrated_model_2.predict(X_test)

confusion_matrix(y_test, y_pred_calibrated_2)

score_model_2 = model_score(y_test, y_pred_calibrated_2)

display(score_model_2)