### Setup

In [94]:
import pandas as pd

In [95]:
import random
# Shared seed: passed as `random_state` to train_test_split and LogisticRegression.
seed = 42

In [96]:
# Read the cleaned dataset, using the CSV's first column as the row index.
data = pd.read_csv(filepath_or_buffer='clean_data.csv', index_col=0)
# Preview the first five rows as the cell output.
data.head(n=5)

Unnamed: 0,NetIncometoTotalAssets,ROA(A)beforeinterestand%aftertax,ROA(B)beforeinterestanddepreciationaftertax,ROA(C)beforeinterestanddepreciationbeforeinterest,Networth/Assets,Debtratio%,PersistentEPSintheLastFourSeasons,RetainedEarningstoTotalAssets,Netprofitbeforetax/Paid-incapital,PerShareNetprofitbeforetax(Yuan¥),bankrupt
0,0.716845,0.424389,0.40575,0.370594,0.792424,0.207576,0.169141,0.903225,0.137757,0.138736,1
1,0.795297,0.538214,0.51673,0.464291,0.828824,0.171176,0.208944,0.931065,0.168962,0.169918,1
2,0.77467,0.499019,0.472295,0.426071,0.792484,0.207516,0.180581,0.909903,0.148036,0.142803,1
3,0.739555,0.451265,0.457733,0.399844,0.848535,0.151465,0.193722,0.906902,0.147561,0.148603,1
4,0.795016,0.538432,0.522298,0.465022,0.893491,0.106509,0.212537,0.91385,0.167461,0.168412,1


In [97]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the 'bankrupt' label column.
features = data.drop(columns='bankrupt')
target = data['bankrupt']

# 80/20 split, stratified on the label so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=seed,
    stratify=target,
)
X_train

Unnamed: 0,NetIncometoTotalAssets,ROA(A)beforeinterestand%aftertax,ROA(B)beforeinterestanddepreciationaftertax,ROA(C)beforeinterestanddepreciationbeforeinterest,Networth/Assets,Debtratio%,PersistentEPSintheLastFourSeasons,RetainedEarningstoTotalAssets,Netprofitbeforetax/Paid-incapital,PerShareNetprofitbeforetax(Yuan¥)
318,0.806631,0.550153,0.544622,0.493784,0.945521,0.054479,0.221424,0.932937,0.175145,0.176998
5796,0.827723,0.590765,0.573425,0.534393,0.839526,0.160474,0.249031,0.942162,0.205822,0.206749
4454,0.828627,0.586895,0.566519,0.526398,0.935544,0.064456,0.230689,0.941995,0.187218,0.188898
2225,0.797846,0.536851,0.521066,0.469702,0.829277,0.170723,0.216413,0.935162,0.171141,0.172102
3249,0.756573,0.443197,0.432464,0.391557,0.985851,0.014149,0.202609,0.917002,0.160312,0.157114
...,...,...,...,...,...,...,...,...,...,...
4688,0.811639,0.563890,0.593501,0.550090,0.849684,0.150316,0.239576,0.942843,0.189759,0.191459
1078,0.791904,0.528129,0.530007,0.481597,0.843063,0.156937,0.217548,0.930064,0.167363,0.168261
6687,0.875503,0.707697,0.689330,0.622288,0.974544,0.025456,0.316347,0.966794,0.250486,0.254651
2623,0.856548,0.642172,0.623374,0.586847,0.925930,0.074070,0.248464,0.952396,0.205037,0.205995


### Logistic Regression

In [98]:
from sklearn.linear_model import LogisticRegression

# Per-class weights handed to LogisticRegression via `class_weight`.
class_weights = {0: 1, 1: 10}  # Class 1 is given 10x more weight than Class 0

logreg = LogisticRegression(random_state=seed, class_weight=class_weights, solver='lbfgs')
# Fit on the training split only; X_test / y_test are used for evaluation.
logreg.fit(X_train, y_train)

In [99]:
# Predicted class labels for the held-out test features, from the fitted model.
y_pred_logreg = logreg.predict(X_test)

In [100]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the test-set predictions.
report_logreg = classification_report(y_test, y_pred_logreg)
print(report_logreg)

              precision    recall  f1-score   support

           0       0.99      0.96      0.97      1320
           1       0.33      0.64      0.43        44

    accuracy                           0.95      1364
   macro avg       0.66      0.80      0.70      1364
weighted avg       0.97      0.95      0.95      1364



In [101]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels, columns are the predicted labels.
row_labels = ['Actual_Class_0', 'Actual_Class_1']
col_labels = ['Predicted_Class_0', 'Predicted_Class_1']

cm = confusion_matrix(y_test, y_pred_logreg)
cm_df = pd.DataFrame(cm, index=row_labels, columns=col_labels)

# Title and table on separate lines, emitted by a single print call.
print("Confusion Matrix:", cm_df, sep="\n")

Confusion Matrix:
                Predicted_Class_0  Predicted_Class_1
Actual_Class_0               1263                 57
Actual_Class_1                 16                 28


### Export

In [102]:
import pickle

# Serialize the fitted model to disk. The context manager closes the file
# handle even if pickle.dump raises, which the manual open()/close() pair did not.
with open('model_logreg.pkl', 'wb') as file:
    pickle.dump(logreg, file)