In [None]:
# Import the necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
#Load the dataset
data = pd.read_csv('/american_bankruptcy.csv')

In [None]:
# Preview Data
data.head()

Unnamed: 0,company_name,status_label,year,X1,X2,X3,X4,X5,X6,X7,...,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18
0,C_1,alive,1999,511.267,833.107,18.373,89.031,336.018,35.163,128.348,...,1024.333,740.998,180.447,70.658,191.226,163.816,201.026,1024.333,401.483,935.302
1,C_1,alive,2000,485.856,713.811,18.577,64.367,320.59,18.531,115.187,...,874.255,701.854,179.987,45.79,160.444,125.392,204.065,874.255,361.642,809.888
2,C_1,alive,2001,436.656,526.477,22.496,27.207,286.588,-58.939,77.528,...,638.721,710.199,217.699,4.711,112.244,150.464,139.603,638.721,399.964,611.514
3,C_1,alive,2002,396.412,496.747,27.172,30.745,259.954,-12.41,66.322,...,606.337,686.621,164.658,3.573,109.59,203.575,124.106,606.337,391.633,575.592
4,C_1,alive,2003,432.204,523.302,26.68,47.491,247.245,3.504,104.661,...,651.958,709.292,248.666,20.811,128.656,131.261,131.884,651.958,407.608,604.467


In [None]:
# Data Preprocessing
# Convert Company Status to numerical values
data['status_label'] = data['status_label'].map({'alive': 0, 'failed': 1})

# Handle missing values (you can modify the strategy accordingly)
data = data.dropna()
# Calculating the X1, X2, X3, X4 and X5 ratios below to get the Altman Z-Score
# X1 = ratio of working capital to total assets
data['working capital'] = data['X1'] - data['X14']
data['x1'] = data['working capital'] / data['X10']
# X2 = ratio of retained earnings to total assets
data['x2'] = data['X15'] / data['X10']
# X3 = ratio of earnings before interest and taxes (EBIT) to total assets
data['x3'] = data['X12'] / data['X10']

# X4 = ratio of market value of equity to book value of total liabilities
data['x4'] = data['X8'] / data['X17']

# X5 = ratio of sales to total assets
data['x5'] = data['X9'] / data['X10']

In [None]:
# Prepare features (X) and target (y)
features = ['x1', 'x2', 'x3', 'x4', 'x5']
X = data[features]
y = data['status_label']
# Split dataset into train and test sets (consider years to evaluate during 2008 crisis)
data_pre_2008 = data[data['year'] < 2008]
data_2008 = data[data['year'] == 2008]
data_post_2008 = data[data['year'] > 2008]
# Split pre-2008 data into train and test
X_train, X_test, y_train, y_test = train_test_split(data_pre_2008[features], data_pre_2008['status_label'], test_size=0.2, random_state=42)


In [None]:
# Train the model using Logistic Regression
model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
# Evaluate the model during the 2008 crisis
X_2008 = data_2008[features]
y_2008 = data_2008['status_label']
y_2008_pred = model.predict(X_2008)
print("Accuracy during 2008 financial crisis:", accuracy_score(y_2008, y_2008_pred))
print(classification_report(y_2008, y_2008_pred))

Accuracy during 2008 financial crisis: 0.9263676432460461
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      3573
           1       0.00      0.00      0.00       284

    accuracy                           0.93      3857
   macro avg       0.46      0.50      0.48      3857
weighted avg       0.86      0.93      0.89      3857



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Try with additional features to improve the model
additional_features = ['X1', 'X2', 'X3', 'X13']
X_new = data[features + additional_features]

# Split with additional features
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(data_pre_2008[features + additional_features], data_pre_2008['status_label'], test_size=0.2, random_state=42)

# Train a new model
model_new = LogisticRegression()
model_new.fit(X_train_new, y_train_new)

# Test the new model
y_pred_new = model_new.predict(X_test_new)
print("Accuracy with additional features on pre-2008 data:", accuracy_score(y_test_new, y_pred_new))

# Evaluate the new model during the 2008 crisis
X_2008_new = data_2008[features + additional_features]
y_2008_pred_new = model_new.predict(X_2008_new)
print("Accuracy with additional features during 2008 financial crisis:", accuracy_score(y_2008, y_2008_pred_new))
print(classification_report(y_2008, y_2008_pred_new))

Accuracy with additional features on pre-2008 data: 0.9142995872784656
Accuracy with additional features during 2008 financial crisis: 0.9250712989369977
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      3573
           1       0.00      0.00      0.00       284

    accuracy                           0.93      3857
   macro avg       0.46      0.50      0.48      3857
weighted avg       0.86      0.93      0.89      3857



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Extract the first 10 companies from the 2008 data
first_10_2008 = data_2008.head(10)

# Extract the features for the first 10 companies in 2008
X_first_10_2008 = first_10_2008[features]

# Make predictions for these companies
y_first_10_2008_pred = model.predict(X_first_10_2008)

# Create a DataFrame to show the actual and predicted values side by side
predictions_2008 = first_10_2008[['company_name', 'status_label']].copy()
predictions_2008['Predicted Status'] = y_first_10_2008_pred

# Replace numerical values with labels for easier understanding
predictions_2008['status_label'] = predictions_2008['status_label'].map({0: 'Alive', 1: 'Bankrupt'})
predictions_2008['Predicted Status'] = predictions_2008['Predicted Status'].map({0: 'Alive', 1: 'Bankrupt'})

# Display the results
print(predictions_2008)


    company_name status_label Predicted Status
9            C_1        Alive            Alive
28           C_2        Alive            Alive
39           C_3        Alive            Alive
59           C_6     Bankrupt            Alive
71           C_7        Alive            Alive
99           C_9        Alive            Alive
119         C_10        Alive            Alive
139         C_11        Alive            Alive
159         C_12        Alive            Alive
171         C_13        Alive            Alive
