In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTETomek
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

# Read the raw bankruptcy records; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer='american_bankruptcy.csv')

In [3]:
# Mapping from the anonymised X1..X18 column codes to descriptive names.
COLUMN_LABELS = {
    'X1': 'Current assets',
    'X2': 'Cost of goods sold',
    'X3': 'Depreciation and amortization',
    'X4': 'EBITDA',
    'X5': 'Inventory',
    'X6': 'Net Income',
    'X7': 'Total Receivables',
    'X8': 'Market Value',
    'X9': 'Net Sales',
    'X10': 'Total Assets',
    'X11': 'Total Long-term Debt',
    'X12': 'EBIT',
    'X13': 'Gross Profit',
    'X14': 'Total Current Liabilities',
    'X15': 'Retained Earnings',
    'X16': 'Total Revenue',
    'X17': 'Total Liabilities',
    'X18': 'Total Operating Expenses'
}

# Re-bind the name rather than renaming in place, so the step reads as a pure
# transformation of its input.
df = df.rename(columns=COLUMN_LABELS)

# Inspect data to ensure columns have been renamed correctly.
# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() emitted an extra "None" line after the report.
df.info()
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78682 entries, 0 to 78681
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   company_name                   78682 non-null  object 
 1   status_label                   78682 non-null  object 
 2   year                           78682 non-null  int64  
 3   Current assets                 78682 non-null  float64
 4   Cost of goods sold             78682 non-null  float64
 5   Depreciation and amortization  78682 non-null  float64
 6   EBITDA                         78682 non-null  float64
 7   Inventory                      78682 non-null  float64
 8   Net Income                     78682 non-null  float64
 9   Total Receivables              78682 non-null  float64
 10  Market Value                   78682 non-null  float64
 11  Net Sales                      78682 non-null  float64
 12  Total Assets                   78682 non-null 

In [5]:
# Step 1: Create Financial Ratios
def create_financial_ratios(df):
    """Return a copy of ``df`` extended with financial-ratio columns.

    The input frame is left untouched; all new columns are added to a copy.

    Columns read: 'Current assets', 'Inventory', 'Total Receivables',
    'Total Current Liabilities', 'Net Sales', 'Total Assets',
    'Cost of goods sold', 'Gross Profit', 'EBIT', 'Net Income',
    'Total Liabilities' and 'Total Long-term Debt'.

    Columns added: 'Quick_Ratio', 'Cash_Ratio', 'Asset_Turnover',
    'Inventory_Turnover', 'Receivables_Turnover', 'Gross_Margin',
    'Operating_Margin', 'Net_Profit_Margin', 'ROA', 'Debt_Ratio' and
    'Interest_Coverage'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every +/-inf and every NaN (in any column, not
        only the new ratios) has been replaced by 0.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Work on a copy: assigning columns on the argument itself would leave the
    # caller's frame with eleven extra, still-uncleaned (inf/NaN) columns.
    ratios = df.copy()

    current_liabilities = ratios['Total Current Liabilities']
    net_sales = ratios['Net Sales']
    total_assets = ratios['Total Assets']
    liquid_assets = ratios['Current assets'] - ratios['Inventory']

    # Liquidity Ratios
    ratios['Quick_Ratio'] = liquid_assets / current_liabilities
    ratios['Cash_Ratio'] = (liquid_assets - ratios['Total Receivables']) / current_liabilities

    # Efficiency Ratios
    ratios['Asset_Turnover'] = net_sales / total_assets
    ratios['Inventory_Turnover'] = ratios['Cost of goods sold'] / ratios['Inventory']
    ratios['Receivables_Turnover'] = net_sales / ratios['Total Receivables']

    # Profitability Ratios
    ratios['Gross_Margin'] = ratios['Gross Profit'] / net_sales
    ratios['Operating_Margin'] = ratios['EBIT'] / net_sales
    ratios['Net_Profit_Margin'] = ratios['Net Income'] / net_sales
    ratios['ROA'] = ratios['Net Income'] / total_assets

    # Leverage Ratios
    ratios['Debt_Ratio'] = ratios['Total Liabilities'] / total_assets
    # NOTE(review): the denominator is long-term debt, not interest expense,
    # so this is not a conventional interest-coverage figure - confirm intent.
    ratios['Interest_Coverage'] = ratios['EBIT'] / ratios['Total Long-term Debt']

    # A zero denominator yields +/-inf (x/0) or NaN (0/0); both become 0.
    return ratios.replace([np.inf, -np.inf], 0).fillna(0)

# Apply financial ratios
df = create_financial_ratios(df)

# Altman Z-Score components. These are computed after the ratio clean-up in
# create_financial_ratios, so they need their own inf/NaN handling below.
df['Working_Capital'] = df['Current assets'] - df['Total Current Liabilities']
df['Working_Capital_Ratio'] = df['Working_Capital'] / df['Total Assets']
df['Retained_Earnings_Ratio'] = df['Retained Earnings'] / df['Total Assets']
df['EBIT_Ratio'] = df['EBIT'] / df['Total Assets']
df['Market_Equity_Ratio'] = df['Market Value'] / df['Total Liabilities']
# Same formula as 'Asset_Turnover' (Net Sales / Total Assets); kept so the
# feature set stays unchanged.
df['Sales_Ratio'] = df['Net Sales'] / df['Total Assets']

# A zero 'Total Assets' or 'Total Liabilities' produces +/-inf or NaN here.
# Map those to 0 (the same convention create_financial_ratios uses) before
# they reach the Z-score and the feature matrix, where non-finite values would
# break the downstream scaling / resampling steps.
altman_components = [
    'Working_Capital_Ratio',
    'Retained_Earnings_Ratio',
    'EBIT_Ratio',
    'Market_Equity_Ratio',
    'Sales_Ratio',
]
df[altman_components] = df[altman_components].replace([np.inf, -np.inf], 0).fillna(0)

# Weighted sum of the five components.
df['Altman_Z_Score'] = (
    1.2 * df['Working_Capital_Ratio'] +
    1.4 * df['Retained_Earnings_Ratio'] +
    3.3 * df['EBIT_Ratio'] +
    0.6 * df['Market_Equity_Ratio'] +
    1.0 * df['Sales_Ratio']
)

# Convert status_label to numeric: 1 for 'failed', 0 for anything else.
# Guarded so that re-running this cell does not compare the already-encoded
# integers against 'failed' and silently reset every label to 0.
if not pd.api.types.is_numeric_dtype(df['status_label']):
    df['status_label'] = (df['status_label'] == 'failed').astype('int64')

# Prepare features and target: identifiers and the label are not features.
X = df.drop(columns=['company_name', 'status_label', 'year'])
y = df['status_label']

# Hold out 20% of the rows as a test set; stratifying on y keeps the class
# proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Resample the (scaled) training data with SMOTETomek; the test set is left
# as it is.
smote_tomek = SMOTETomek(random_state=42)
X_train_balanced, y_train_balanced = smote_tomek.fit_resample(
    X_train_scaled,
    y_train,
)

# Create and train base models
def train_base_models(X_train, y_train, X_test, y_test):
    """Fit three classifiers and evaluate each on the supplied test data.

    The classifiers are a random forest, a gradient-boosting model and a
    logistic regression, each built with fixed hyper-parameters and
    ``random_state=42``.

    Parameters
    ----------
    X_train, y_train
        Features and labels every model is fitted on.
    X_test, y_test
        Features and labels used for evaluation.

    Returns
    -------
    dict
        Maps the model's display name to a dict with the keys 'model' (the
        fitted estimator), 'predictions', 'probabilities' (second column of
        ``predict_proba``), 'confusion_matrix', 'classification_report' and
        'roc_auc'.
    """
    def _fit_and_score(estimator):
        # Train one estimator, then collect its test-set diagnostics.
        estimator.fit(X_train, y_train)
        hard_labels = estimator.predict(X_test)
        positive_scores = estimator.predict_proba(X_test)[:, 1]
        return {
            'model': estimator,
            'predictions': hard_labels,
            'probabilities': positive_scores,
            'confusion_matrix': confusion_matrix(y_test, hard_labels),
            'classification_report': classification_report(y_test, hard_labels),
            'roc_auc': roc_auc_score(y_test, positive_scores)
        }

    # All estimators are constructed up front, then fitted in this order.
    candidates = [
        ('Random Forest', RandomForestClassifier(
            n_estimators=200,
            max_depth=10,
            class_weight='balanced',
            random_state=42
        )),
        ('Gradient Boosting', GradientBoostingClassifier(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=5,
            random_state=42
        )),
        ('Logistic Regression', LogisticRegression(
            class_weight='balanced',
            random_state=42,
            max_iter=1000
        )),
    ]

    return {label: _fit_and_score(estimator) for label, estimator in candidates}

# Fit every base model on the resampled training data and score it on the
# scaled test set.
model_results = train_base_models(X_train_balanced, y_train_balanced, X_test_scaled, y_test)

# Emit one report per model, with all sections joined by newlines in a single
# print call.
for name, results in model_results.items():
    print(
        f"\nResults for {name}:",
        "\nConfusion Matrix:",
        results['confusion_matrix'],
        "\nClassification Report:",
        results['classification_report'],
        "\nROC AUC Score: " + str(results['roc_auc']),
        "-" * 60,
        sep="\n",
    )

# Function to make predictions with threshold optimization
def predict_with_optimal_threshold(model, X, y_true, threshold=0.5):
    """Classify ``X`` at a given probability cut-off and score the result.

    Despite the name, no threshold search happens here: the caller supplies
    ``threshold`` and the function only applies it.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict_proba``.
    X
        Feature matrix to score.
    y_true
        Ground-truth labels for ``X``.
    threshold : float, default 0.5
        A row is labelled 1 when the second ``predict_proba`` column is
        greater than or equal to this value.

    Returns
    -------
    dict
        Keys 'predictions', 'probabilities', 'confusion_matrix',
        'classification_report' and 'roc_auc'.
    """
    positive_scores = model.predict_proba(X)[:, 1]
    hard_labels = (positive_scores >= threshold).astype(int)

    summary = dict(
        predictions=hard_labels,
        probabilities=positive_scores,
        confusion_matrix=confusion_matrix(y_true, hard_labels),
        classification_report=classification_report(y_true, hard_labels),
        # ROC AUC is computed from the scores, so it does not vary with threshold.
        roc_auc=roc_auc_score(y_true, positive_scores),
    )
    return summary

# Pick the model whose test-set ROC AUC is highest.
best_model_name = max(model_results, key=lambda label: model_results[label]['roc_auc'])
best_model = model_results[best_model_name]['model']

print(f"\nBest Model: {best_model_name}")

# Score the best model on the test set at several fixed cut-offs.
thresholds = [0.3, 0.4, 0.5, 0.6, 0.7]
threshold_results = {
    cutoff: predict_with_optimal_threshold(best_model, X_test_scaled, y_test, cutoff)
    for cutoff in thresholds
}

# Report each cut-off in the order it was evaluated.
print("\nResults with different prediction thresholds for the best model:")
for threshold, results in threshold_results.items():
    print(
        f"\nThreshold: {threshold}",
        "Confusion Matrix:",
        results['confusion_matrix'],
        "\nClassification Report:",
        results['classification_report'],
        "ROC AUC Score: " + str(results['roc_auc']),
        "-" * 60,
        sep="\n",
    )

# Function to make predictions on new data
def predict_bankruptcy(model, new_data, threshold=0.5, feature_scaler=None):
    """Score new observations and turn the scores into 0/1 predictions.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict_proba``.
    new_data
        Unscaled feature matrix. It is passed straight to the scaler's
        ``transform``, so it presumably has to contain the same features, in
        the same order, that the scaler was fitted on - verify against the
        training frame.
    threshold : float, default 0.5
        A row is predicted 1 when its score is greater than or equal to this
        value.
    feature_scaler : optional
        Fitted transformer exposing ``transform``. When omitted, the
        notebook-level ``scaler`` is used, which keeps existing
        three-argument calls working.

    Returns
    -------
    tuple
        ``(predictions, probabilities)``: the integer 0/1 labels and the
        second column of ``predict_proba``.
    """
    # Prefer an explicitly supplied scaler so the function does not depend on
    # whatever happens to be bound to the global name at call time.
    active_scaler = scaler if feature_scaler is None else feature_scaler

    # Scale the features
    new_data_scaled = active_scaler.transform(new_data)

    # Get probability predictions
    probabilities = model.predict_proba(new_data_scaled)[:, 1]

    # Apply threshold
    predictions = (probabilities >= threshold).astype(int)

    return predictions, probabilities


Results for Random Forest:

Confusion Matrix:
[[11257  3436]
 [  392   652]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.77      0.85     14693
           1       0.16      0.62      0.25      1044

    accuracy                           0.76     15737
   macro avg       0.56      0.70      0.55     15737
weighted avg       0.91      0.76      0.81     15737


ROC AUC Score: 0.778501726132782
------------------------------------------------------------

Results for Gradient Boosting:

Confusion Matrix:
[[11957  2736]
 [  401   643]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.81      0.88     14693
           1       0.19      0.62      0.29      1044

    accuracy                           0.80     15737
   macro avg       0.58      0.71      0.59     15737
weighted avg       0.92      0.80      0.84     15737


ROC AUC Score: 0.7978079717372648
--------