In [1]:
import pandas as pd
import numpy as np
df=pd.read_csv("data.csv")
df.head()

Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,1,0.370594,0.424389,0.40575,0.601457,0.601457,0.998969,0.796887,0.808809,0.302646,...,0.716845,0.009219,0.622879,0.601453,0.82789,0.290202,0.026601,0.56405,1,0.016469
1,1,0.464291,0.538214,0.51673,0.610235,0.610235,0.998946,0.79738,0.809301,0.303556,...,0.795297,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1,0.020794
2,1,0.426071,0.499019,0.472295,0.60145,0.601364,0.998857,0.796403,0.808388,0.302035,...,0.77467,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1,0.016474
3,1,0.399844,0.451265,0.457733,0.583541,0.583541,0.9987,0.796967,0.808966,0.30335,...,0.739555,0.003252,0.622929,0.583538,0.834697,0.281721,0.026697,0.564663,1,0.023982
4,1,0.465022,0.538432,0.522298,0.598783,0.598783,0.998973,0.797366,0.809304,0.303475,...,0.795016,0.003878,0.623521,0.598782,0.839973,0.278514,0.024752,0.575617,1,0.03549


In [2]:
df.shape

(6819, 96)

In [3]:
# Dividing Data and Labels
y = df['Bankrupt?']
X = df.drop(['Bankrupt?'], axis = 1)

In [4]:
#splitting dataset
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2)

In [5]:
# Outliers removal

def outliers_removal(feature,feature_name,dataset):
    
    # Identify 25th & 75th quartiles

    q25, q75 = np.percentile(feature, 25), np.percentile(feature, 75)
    # print('Quartile 25: {} | Quartile 75: {}'.format(q25, q75))
    feat_iqr = q75 - q25
    # print('iqr: {}'.format(feat_iqr))
    
    feat_cut_off = feat_iqr * 1.5
    feat_lower, feat_upper = q25 - feat_cut_off, q75 + feat_cut_off
    # print('Cut Off: {}'.format(feat_cut_off))
    # print(feature_name +' Lower: {}'.format(feat_lower))
    # print(feature_name +' Upper: {}'.format(feat_upper))
    
    outliers = [x for x in feature if x < feat_lower or x > feat_upper]
    # print(feature_name + ' outliers for close to bankruptcy cases: {}'.format(len(outliers)))
    #print(feature_name + ' outliers:{}'.format(outliers))

    dataset = dataset.drop(dataset[(dataset[feature_name] > feat_upper) | (dataset[feature_name] < feat_lower)].index)
    # print('-' * 65)
    
    return dataset

for col in df:
    new_df = outliers_removal(df[col],str(col),df)

In [6]:
# Dividing Data and Labels
y1 = new_df['Bankrupt?']
X1 = new_df.drop(['Bankrupt?'], axis = 1)

In [7]:
y1.shape

(6270,)

In [8]:
X1.shape

(6270, 95)

In [9]:
def log_transform(df):
    
    for col in df:
        skew = df[col].skew()
        if skew > 0.5 or skew < -0.5:
            df[col] = np.log1p(df[col])
        else:
            continue
            
    return df

data_norm = log_transform(X1)

In [10]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1,test_size=0.2)

In [11]:
print(X_train1.shape, X_test1.shape, y_train1.shape, y_test1.shape)

(5016, 95) (1254, 95) (5016,) (1254,)


In [12]:
from imblearn.over_sampling import SMOTE
#Smote Before Feature selection
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train1, y_train1)

In [13]:
# Count the occurrences of each class in the original training set
original_class_counts = y_train1.value_counts()
print("Original Training Set Class Counts:")
print(original_class_counts)

# Count the occurrences of each class in the SMOTE-resampled training set
smote_class_counts = y_train_smote.value_counts()
print("\nSMOTE-Resampled Training Set Class Counts:")
print(smote_class_counts)

Original Training Set Class Counts:
0    4842
1     174
Name: Bankrupt?, dtype: int64

SMOTE-Resampled Training Set Class Counts:
0    4842
1    4842
Name: Bankrupt?, dtype: int64


In [14]:
print(X_train_smote.shape, y_train_smote.shape, X_test1.shape, y_test1.shape)

(9684, 95) (9684,) (1254, 95) (1254,)


In [15]:
#scaling before PCA
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_smote)
X_test_scaled = scaler.transform(X_test1)

In [16]:
print(X_train_scaled.shape, X_test_scaled.shape, y_train_smote.shape, y_test1.shape)

(9684, 95) (1254, 95) (9684,) (1254,)


In [17]:
explained_variance_ratio = 0.95
pca = PCA(n_components=explained_variance_ratio)
pca.fit(X_train_scaled)

In [18]:
X_train_pca = pca.transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [19]:
X_train_pca.shape 

(9684, 46)

In [20]:
y_train_smote.shape

(9684,)

In [21]:
from sklearn.neighbors import KNeighborsClassifier

# create a KNN classifier object
knn = KNeighborsClassifier(n_neighbors=4)

# fit the model on the standardized training data
knn.fit(X_train_pca, y_train_smote)

# predict on the standardized testing data
y_pred_knn = knn.predict(X_test_pca)

In [22]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# calculate accuracy
accuracy_knn = accuracy_score(y_test1, y_pred_knn)

# calculate precision
precision_knn = precision_score(y_test1, y_pred_knn)

# calculate recall
recall_knn = recall_score(y_test1, y_pred_knn)

# calculate F1-score
f1_knn = f1_score(y_test1, y_pred_knn)

# print the results
print("Accuracy: ", accuracy_knn)
print("Precision: ", precision_knn)
print("Recall: ", recall_knn)
print("F1-score: ", f1_knn)
print("\n")
from sklearn.metrics import classification_report

# calculate the classification report for the KNN model
knn_report=classification_report(y_test1, y_pred_knn)
print(knn_report)

# calculate the confusion matrix
cm = confusion_matrix(y_test1, y_pred_knn)

# print the confusion matrix
print(cm)

Accuracy:  0.9106858054226475
Precision:  0.22580645161290322
Recall:  0.6363636363636364
F1-score:  0.3333333333333333


              precision    recall  f1-score   support

           0       0.99      0.92      0.95      1210
           1       0.23      0.64      0.33        44

    accuracy                           0.91      1254
   macro avg       0.61      0.78      0.64      1254
weighted avg       0.96      0.91      0.93      1254

[[1114   96]
 [  16   28]]


In [23]:
#Now let's try with undersampling (outliers are removed already)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X1, y1,test_size=0.2)

In [24]:
print(X_train2.shape, X_test2.shape, y_train2.shape, y_test2.shape)

(5016, 95) (1254, 95) (5016,) (1254,)


In [25]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)

In [26]:
X_train_resampled, y_train_resampled = rus.fit_resample(X_train2, y_train2)

In [27]:
# Count the occurrences of each class in the original training set
original_class_counts = y_train2.value_counts()
print("Original Training Set Class Counts:")
print(original_class_counts)

# Count the occurrences of each class in the SMOTE-resampled training set
resampled_class_counts = y_train_resampled.value_counts()
print("\nSMOTE-Resampled Training Set Class Counts:")
print(resampled_class_counts)

Original Training Set Class Counts:
0    4845
1     171
Name: Bankrupt?, dtype: int64

SMOTE-Resampled Training Set Class Counts:
0    171
1    171
Name: Bankrupt?, dtype: int64


In [28]:
#scaling before PCA
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled1 = scaler.fit_transform(X_train_resampled)
X_test_scaled1 = scaler.transform(X_test2)

In [29]:
explained_variance_ratio = 0.95
pca = PCA(n_components=explained_variance_ratio)
pca.fit(X_train_scaled1)

In [30]:
X_train_pca1 = pca.transform(X_train_scaled1)
X_test_pca1 = pca.transform(X_test_scaled1)

In [31]:
from sklearn.neighbors import KNeighborsClassifier

# create a KNN classifier object
knn = KNeighborsClassifier(n_neighbors=4)

# fit the model on the standardized training data
knn.fit(X_train_pca1, y_train_resampled)

# predict on the standardized testing data
y_pred_knn2 = knn.predict(X_test_pca1)

In [32]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# calculate accuracy
accuracy_knn = accuracy_score(y_test2, y_pred_knn2)

# calculate precision
precision_knn = precision_score(y_test2, y_pred_knn2)

# calculate recall
recall_knn = recall_score(y_test2, y_pred_knn2)

# calculate F1-score
f1_knn = f1_score(y_test2, y_pred_knn2)

# print the results
print("Accuracy: ", accuracy_knn)
print("Precision: ", precision_knn)
print("Recall: ", recall_knn)
print("F1-score: ", f1_knn)
print("\n")
from sklearn.metrics import classification_report

# calculate the classification report for the KNN model
knn_report=classification_report(y_test2, y_pred_knn2)
print(knn_report)

# calculate the confusion matrix
cm = confusion_matrix(y_test1, y_pred_knn2)

# print the confusion matrix
print(cm)

Accuracy:  0.8708133971291866
Precision:  0.17142857142857143
Recall:  0.6382978723404256
F1-score:  0.27027027027027023


              precision    recall  f1-score   support

           0       0.98      0.88      0.93      1207
           1       0.17      0.64      0.27        47

    accuracy                           0.87      1254
   macro avg       0.58      0.76      0.60      1254
weighted avg       0.95      0.87      0.90      1254

[[1042  168]
 [  37    7]]
