# Company Bankruptcy Prediction

## Overview
This notebook predicts company bankruptcy using financial indicators.

### Problem Type: Classification
- **Target Variable**: Bankrupt? (Y) - Binary classification (0: Not Bankrupt, 1: Bankrupt)
- **Features**: 95 financial indicators (X1-X95)
- **Models**:
  1. Logistic Regression (with configurations)
  2. Random Forest (with configurations)
  3. Neural Network (with configurations)


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score, f1_score, precision_score, recall_score
import warnings
# NOTE(review): this silences every warning for the whole notebook (including
# convergence and deprecation notices); narrow the filter if those messages
# are needed while debugging.
warnings.filterwarnings('ignore')

# Set style for plots
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'.
# Use whichever spelling this installation provides instead of failing on an
# older matplotlib, and fall back to the default style if neither exists.
PLOT_STYLE = next(
    (style for style in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
     if style in plt.style.available),
    'default',
)
plt.style.use(PLOT_STYLE)
sns.set_palette("husl")


### Feature Documentation Reference

The dataset includes 95 financial features (X1-X95) as documented in `CompanyBankruptcyData Documentation.txt`. Each feature represents various financial ratios and indicators that can help predict bankruptcy risk, such as:

- Return on Assets (ROA) metrics
- Operating margins and profit rates
- Current ratios and quick ratios
- Debt-to-equity ratios
- Cash flow indicators
- Growth rates (asset, equity, profit)
- Turnover ratios
- Financial leverage indicators


## 1. Data Loading and Exploration


In [None]:
# Load the dataset (path is relative to the notebook's working directory)
df = pd.read_csv('CompanyBankruptcyData.csv')

# Display basic information: overall shape, then columns and rows separately
n_rows, n_cols = df.shape
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {n_cols}")
print(f"Rows: {n_rows}")

# Last expression of the cell -> rendered as a rich table preview
df.head()


In [None]:
# Check for missing values: count nulls per column and list only the columns
# that actually contain any (an empty Series means nothing is missing)
print("Missing values per column:")
missing_values = df.isna().sum()
print(missing_values.loc[missing_values.gt(0)])

# Data info: dtypes, non-null counts and memory usage
df.info()


In [None]:
# Separate features and target: 'Bankrupt?' is the label, every other column is a feature
y = df['Bankrupt?']
X = df.drop(columns=['Bankrupt?'])

# Absolute class counts, then the same distribution as percentages
print("Target distribution:")
print(y.value_counts())
print("\nTarget distribution (%):")
print(y.value_counts(normalize=True).mul(100))


## 2. Visualizations


In [None]:
# Visualize target distribution (bar chart of counts next to a pie chart of shares)
# value_counts() orders classes by frequency, not by label. Index the counts
# explicitly by class label so they always line up with the hard-coded
# 0 -> "Not Bankrupt" / 1 -> "Bankrupt" labels and colours below.
target_counts = y.value_counts().reindex([0, 1], fill_value=0)

fig, (ax_bar, ax_pie) = plt.subplots(1, 2, figsize=(10, 6))

ax_bar.bar(['Not Bankrupt (0)', 'Bankrupt (1)'], target_counts.values, color=['green', 'red'], alpha=0.7)
ax_bar.set_title('Target Class Distribution (Count)', fontsize=14, fontweight='bold')
ax_bar.set_ylabel('Count')
# Write each count just above its bar
for i, v in enumerate(target_counts.values):
    ax_bar.text(i, v + 50, str(v), ha='center', va='bottom', fontweight='bold')

ax_pie.pie(target_counts.values, labels=['Not Bankrupt', 'Bankrupt'], autopct='%1.2f%%',
           colors=['green', 'red'], startangle=90, textprops={'fontsize': 12, 'fontweight': 'bold'})
ax_pie.set_title('Target Class Distribution (%)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()


In [None]:
# Display feature information: total count plus the first and last ten column names,
# each numbered with its 1-based position in X
feature_names = list(X.columns)
n_features = len(feature_names)
print(f"Total features: {n_features}")

print("\nFirst 10 features:")
for position, name in enumerate(feature_names[:10], start=1):
    print(f"{position}. {name}")

print("\nLast 10 features:")
for position, name in enumerate(feature_names[-10:], start=n_features - 9):
    print(f"{position}. {name}")


### Correlation Heatmap Analysis


In [None]:
# Create correlation heatmap for top features
correlation_matrix = df.corr()

# Rank the features by absolute correlation with the target, straight from the
# matrix computed above. This cell previously read `correlation_with_target`,
# which is only created in a later cell, so a fresh top-to-bottom run failed
# here with a NameError.
top_features = (
    correlation_matrix['Bankrupt?']
    .drop('Bankrupt?')
    .abs()
    .sort_values(ascending=False)
    .head(20)
    .index
)

# Target first, then the 20 strongest features, on both axes
heatmap_labels = ['Bankrupt?'] + list(top_features)
correlation_subset = correlation_matrix.loc[heatmap_labels, heatmap_labels]

plt.figure(figsize=(14, 12))
sns.heatmap(correlation_subset, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=1, cbar_kws={"shrink": 0.8}, fmt='.2f')
plt.title('Correlation Heatmap: Bankruptcy Target vs Top 20 Features',
          fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()


### Feature Distribution Analysis


In [None]:
# Visualize distributions of top predictive features
# The ranking is computed inside this cell: it previously read
# `correlation_with_target`, which is only created in a later cell, so a fresh
# top-to-bottom run failed here with a NameError.
top_features_list = (
    df.corr()['Bankrupt?']
    .drop('Bankrupt?')
    .abs()
    .sort_values(ascending=False)
    .head(12)
    .index
)

# The class masks do not change between features, so build them once
is_bankrupt = df['Bankrupt?'] == 1
is_not_bankrupt = df['Bankrupt?'] == 0

fig, axes = plt.subplots(4, 3, figsize=(18, 16))
axes = axes.ravel()

# One panel per feature: overlaid histograms of the two classes
for ax, feature in zip(axes, top_features_list):
    data_bankrupt = df.loc[is_bankrupt, feature]
    data_non_bankrupt = df.loc[is_not_bankrupt, feature]

    ax.hist(data_non_bankrupt, bins=30, alpha=0.6, label='Not Bankrupt',
            color='green', edgecolor='black')
    ax.hist(data_bankrupt, bins=30, alpha=0.6, label='Bankrupt',
            color='red', edgecolor='black')
    # Long column names are truncated so the title fits above the panel
    ax.set_title(f'{feature[:45]}...' if len(feature) > 45 else feature,
                 fontsize=9, fontweight='bold')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Define evaluation function with confusion matrix
def evaluate_model(y_true, y_pred, y_pred_proba, model_name):
    """Print an evaluation report for one classifier and return its scores.

    NOTE: a later cell in this notebook defines ``evaluate_model`` again
    (without the ``Confusion_Matrix`` entry). On a top-to-bottom run that
    later definition replaces this one before any model is evaluated.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.
    y_pred_proba : 2-D array
        Per-class scores; column 1 is passed to ``roc_auc_score``.
    model_name : str
        Label used in the printed header and in the returned ``'Model'`` entry.

    Returns
    -------
    dict
        ``'Model'``, ``'Accuracy'``, ``'Precision'``, ``'Recall'``,
        ``'F1-Score'``, ``'ROC-AUC'`` and ``'Confusion_Matrix'``.
    """
    heavy_rule = '=' * 60
    light_rule = '-' * 60

    print(f"\n{heavy_rule}")
    print(f"{model_name} - Evaluation Results")
    print(f"{heavy_rule}")

    # Precision / recall / F1 share the same averaging options
    averaging = {'average': 'weighted', 'zero_division': 0}
    scores = {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, **averaging),
        'Recall': recall_score(y_true, y_pred, **averaging),
        'F1-Score': f1_score(y_true, y_pred, **averaging),
        'ROC-AUC': roc_auc_score(y_true, y_pred_proba[:, 1]),
    }

    # One "<name>: <value>" line per metric, preceded by a blank line
    print("\n" + "\n".join(f"{label}: {value:.4f}" for label, value in scores.items()))

    print(f"\n{light_rule}")
    print("Classification Report:")
    print(f"{light_rule}")
    print(classification_report(y_true, y_pred))

    # Confusion Matrix
    cm = confusion_matrix(y_true, y_pred)
    print(f"\n{light_rule}")
    print("Confusion Matrix:")
    print(f"{light_rule}")
    print(cm)

    return {'Model': model_name, **scores, 'Confusion_Matrix': cm}


In [None]:
# Correlation analysis with target
# Features whose correlation is undefined (pandas returns NaN, e.g. for a
# constant column) are dropped first: sort_values() places NaN last, so they
# would otherwise be picked up by tail(10) and plotted as "most negative".
correlation_with_target = (
    df.corr()['Bankrupt?']
    .drop('Bankrupt?')
    .dropna()
    .sort_values(ascending=False)
)

# 10 most positive followed by 10 most negative correlations
top_correlations = pd.concat([correlation_with_target.head(10), correlation_with_target.tail(10)])
bar_positions = range(len(top_correlations))

plt.figure(figsize=(12, 8))
plt.barh(bar_positions, top_correlations.values, color='steelblue', alpha=0.7)
plt.yticks(bar_positions, top_correlations.index)
plt.xlabel('Correlation with Bankrupt?', fontsize=12, fontweight='bold')
plt.title('Top 20 Features Most Correlated with Bankruptcy', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()


## 3. Data Preprocessing


In [None]:
# Split data into training and testing sets (80/20, stratified on the target
# so both parts keep the original class ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

# Class counts per split, as a check on the stratification
for split_name, split_labels in (("Training", y_train), ("Test", y_test)):
    print(f"\n{split_name} set class distribution:")
    print(split_labels.value_counts())


In [None]:
# Feature scaling: learn the scaling statistics from the training split only,
# then apply the same transformation to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data scaled successfully!")


In [None]:
# Define evaluation function
def evaluate_model(y_true, y_pred, y_pred_proba, model_name):
    """Print a metric report for one classifier and return the scores as a dict.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.
    y_pred_proba : array-like
        Predicted scores. A 2-D array with at least two columns is read as
        per-class probabilities and column 1 is used (the layout returned by
        ``predict_proba``). A 1-D array or a single-column array is used
        directly as the positive-class score.
    model_name : str
        Label used in the printed header and in the returned ``'Model'`` entry.

    Returns
    -------
    dict
        Keys ``'Model'``, ``'Accuracy'``, ``'Precision'``, ``'Recall'``,
        ``'F1-Score'`` and ``'ROC-AUC'``.

    Notes
    -----
    Precision, recall and F1 are computed with ``average='weighted'``, so they
    are not the scores of the positive class alone.
    """
    print(f"\n{'='*60}")
    print(f"{model_name} - Evaluation Results")
    print(f"{'='*60}")

    # Accept both predict_proba-style (n, 2) arrays and plain positive-class
    # score vectors, so callers do not have to pad a two-column array first.
    proba = np.asarray(y_pred_proba)
    if proba.ndim == 2 and proba.shape[1] >= 2:
        positive_scores = proba[:, 1]
    else:
        positive_scores = proba.ravel()

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    auc = roc_auc_score(y_true, positive_scores)

    print(f"\nAccuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"ROC-AUC: {auc:.4f}")

    print(f"\n{'-'*60}")
    print("Classification Report:")
    print(f"{'-'*60}")
    print(classification_report(y_true, y_pred))

    return {
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC-AUC': auc
    }


### 4.1 Logistic Regression - Configuration 1


In [None]:
# Logistic Regression - Config 1: Default settings
# (no C / penalty passed, so the estimator's own default regularization applies)
lr_model1 = LogisticRegression(
    solver='liblinear',
    max_iter=1000,
    random_state=42,
).fit(X_train_scaled, y_train)

# Class probabilities feed ROC-AUC; hard labels feed the other metrics
y_pred_proba_lr1 = lr_model1.predict_proba(X_test_scaled)
y_pred_lr1 = lr_model1.predict(X_test_scaled)

results_lr1 = evaluate_model(
    y_test,
    y_pred_lr1,
    y_pred_proba_lr1,
    "Logistic Regression - Config 1 (Default)",
)


### 4.2 Logistic Regression - Configuration 2


In [None]:
# Logistic Regression - Config 2: L2 regularization with C=0.1
lr_model2 = LogisticRegression(
    penalty='l2',
    C=0.1,
    solver='liblinear',
    max_iter=1000,
    random_state=42,
).fit(X_train_scaled, y_train)

# Class probabilities feed ROC-AUC; hard labels feed the other metrics
y_pred_proba_lr2 = lr_model2.predict_proba(X_test_scaled)
y_pred_lr2 = lr_model2.predict(X_test_scaled)

results_lr2 = evaluate_model(
    y_test,
    y_pred_lr2,
    y_pred_proba_lr2,
    "Logistic Regression - Config 2 (C=0.1, L2)",
)


### 4.3 Logistic Regression - Configuration 3


In [None]:
# Logistic Regression - Config 3: L1 regularization with C=10
lr_model3 = LogisticRegression(
    penalty='l1',
    C=10,
    solver='liblinear',
    max_iter=1000,
    random_state=42,
).fit(X_train_scaled, y_train)

# Class probabilities feed ROC-AUC; hard labels feed the other metrics
y_pred_proba_lr3 = lr_model3.predict_proba(X_test_scaled)
y_pred_lr3 = lr_model3.predict(X_test_scaled)

results_lr3 = evaluate_model(
    y_test,
    y_pred_lr3,
    y_pred_proba_lr3,
    "Logistic Regression - Config 3 (C=10, L1)",
)


### 4.4 Random Forest - Configuration 1


In [None]:
# Random Forest - Config 1: Default settings
# Trained on the unscaled features (X_train / X_test), unlike the linear models
rf_model1 = RandomForestClassifier(n_jobs=-1, random_state=42).fit(X_train, y_train)

# Class probabilities feed ROC-AUC; hard labels feed the other metrics
y_pred_proba_rf1 = rf_model1.predict_proba(X_test)
y_pred_rf1 = rf_model1.predict(X_test)

results_rf1 = evaluate_model(
    y_test,
    y_pred_rf1,
    y_pred_proba_rf1,
    "Random Forest - Config 1 (Default)",
)


### 4.5 Random Forest - Configuration 2


In [None]:
# Random Forest - Config 2: Deeper trees with more estimators
rf_model2 = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=5,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Class probabilities feed ROC-AUC; hard labels feed the other metrics
y_pred_proba_rf2 = rf_model2.predict_proba(X_test)
y_pred_rf2 = rf_model2.predict(X_test)

results_rf2 = evaluate_model(
    y_test,
    y_pred_rf2,
    y_pred_proba_rf2,
    "Random Forest - Config 2 (n_estimators=200, max_depth=20)",
)


### 4.6 Random Forest - Configuration 3


In [None]:
# Random Forest - Config 3: Shallow trees with class balancing
rf_model3 = RandomForestClassifier(
    n_estimators=150,
    max_depth=10,
    min_samples_split=10,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Class probabilities feed ROC-AUC; hard labels feed the other metrics
y_pred_proba_rf3 = rf_model3.predict_proba(X_test)
y_pred_rf3 = rf_model3.predict(X_test)

results_rf3 = evaluate_model(
    y_test,
    y_pred_rf3,
    y_pred_proba_rf3,
    "Random Forest - Config 3 (Class Weight Balanced)",
)


### 4.7 Neural Network - Configuration 1


In [None]:
# Import TensorFlow/Keras for Neural Network
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

# Record the TensorFlow version in the cell output
print("TensorFlow version: {}".format(tf.__version__))


In [None]:
# Neural Network - Config 1: Simple architecture
def create_nn1(input_dim):
    """Build and compile the small 64-32 fully connected binary classifier.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.

    Returns
    -------
    keras.Sequential
        Model compiled with the Adam optimizer, binary cross-entropy loss and
        accuracy as the tracked metric. The single sigmoid output unit yields
        one score per sample.
    """
    model = keras.Sequential([
        # Declare the input with an explicit Input object; passing
        # `input_shape=` to the first layer triggers a warning in Keras 3.
        keras.Input(shape=(input_dim,)),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(32, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Seed Python, NumPy and the Keras backend before the model is built so that
# weight initialisation, dropout and batch shuffling are as repeatable as the
# backend allows, in line with the random_state=42 used for the scikit-learn
# models.
keras.utils.set_random_seed(42)

# Create and train model
nn_model1 = create_nn1(X_train_scaled.shape[1])
nn_model1.summary()

# Early stopping callback: stop once val_loss has not improved for 10 epochs
# and restore the best weights seen
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train model
history1 = nn_model1.fit(X_train_scaled, y_train,
                         validation_split=0.2,
                         epochs=100,
                         batch_size=32,
                         callbacks=[early_stopping],
                         verbose=1)

# Evaluate
y_pred_proba_nn1 = nn_model1.predict(X_test_scaled)
# Threshold the sigmoid output at 0.5 to obtain hard class labels
y_pred_nn1 = (y_pred_proba_nn1 > 0.5).astype(int).ravel()
# evaluate_model reads column 1 as the positive-class score, so build a
# two-column [1 - p, p] array from the single sigmoid output
y_pred_proba_nn1_formatted = np.column_stack([1 - y_pred_proba_nn1.ravel(), y_pred_proba_nn1.ravel()])

results_nn1 = evaluate_model(y_test, y_pred_nn1, y_pred_proba_nn1_formatted,
                            "Neural Network - Config 1 (64-32)")


### 4.8 Neural Network - Configuration 2


In [None]:
# Neural Network - Config 2: Deeper architecture with RMSprop
def create_nn2(input_dim):
    """Build and compile the deeper 128-64-32 fully connected binary classifier.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.

    Returns
    -------
    keras.Sequential
        Model compiled with the RMSprop optimizer, binary cross-entropy loss
        and accuracy as the tracked metric. Dropout decreases from 0.4 to 0.2
        across the three hidden layers.
    """
    model = keras.Sequential([
        # Declare the input with an explicit Input object; passing
        # `input_shape=` to the first layer triggers a warning in Keras 3.
        keras.Input(shape=(input_dim,)),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.4),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(32, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Seed Python, NumPy and the Keras backend before the model is built so that
# weight initialisation, dropout and batch shuffling are as repeatable as the
# backend allows, in line with the random_state=42 used for the scikit-learn
# models.
keras.utils.set_random_seed(42)

# Create and train model
nn_model2 = create_nn2(X_train_scaled.shape[1])
nn_model2.summary()

# Build this run's own early-stopping callback so the cell does not depend on
# the callback object created in the Config 1 cell
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train model
history2 = nn_model2.fit(X_train_scaled, y_train,
                         validation_split=0.2,
                         epochs=100,
                         batch_size=16,
                         callbacks=[early_stopping],
                         verbose=1)

# Evaluate
y_pred_proba_nn2 = nn_model2.predict(X_test_scaled)
# Threshold the sigmoid output at 0.5 to obtain hard class labels
y_pred_nn2 = (y_pred_proba_nn2 > 0.5).astype(int).ravel()
# evaluate_model reads column 1 as the positive-class score, so build a
# two-column [1 - p, p] array from the single sigmoid output
y_pred_proba_nn2_formatted = np.column_stack([1 - y_pred_proba_nn2.ravel(), y_pred_proba_nn2.ravel()])

results_nn2 = evaluate_model(y_test, y_pred_nn2, y_pred_proba_nn2_formatted,
                            "Neural Network - Config 2 (128-64-32)")


### 4.9 Neural Network - Configuration 3


In [None]:
# Neural Network - Config 3: Wide shallow network with SGD
def create_nn3(input_dim):
    """Build and compile the wide 256-128 tanh network with batch normalization.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.

    Returns
    -------
    keras.Sequential
        Model compiled with the SGD optimizer, binary cross-entropy loss and
        accuracy as the tracked metric. Each hidden Dense layer is followed by
        BatchNormalization and Dropout.
    """
    model = keras.Sequential([
        # Declare the input with an explicit Input object; passing
        # `input_shape=` to the first layer triggers a warning in Keras 3.
        keras.Input(shape=(input_dim,)),
        layers.Dense(256, activation='tanh'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(128, activation='tanh'),
        layers.BatchNormalization(),
        layers.Dropout(0.4),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Seed Python, NumPy and the Keras backend before the model is built so that
# weight initialisation, dropout and batch shuffling are as repeatable as the
# backend allows, in line with the random_state=42 used for the scikit-learn
# models.
keras.utils.set_random_seed(42)

# Create and train model
nn_model3 = create_nn3(X_train_scaled.shape[1])
nn_model3.summary()

# Build this run's own early-stopping callback so the cell does not depend on
# the callback object created in the Config 1 cell
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train model
history3 = nn_model3.fit(X_train_scaled, y_train,
                         validation_split=0.2,
                         epochs=100,
                         batch_size=64,
                         callbacks=[early_stopping],
                         verbose=1)

# Evaluate
y_pred_proba_nn3 = nn_model3.predict(X_test_scaled)
# Threshold the sigmoid output at 0.5 to obtain hard class labels
y_pred_nn3 = (y_pred_proba_nn3 > 0.5).astype(int).ravel()
# evaluate_model reads column 1 as the positive-class score, so build a
# two-column [1 - p, p] array from the single sigmoid output
y_pred_proba_nn3_formatted = np.column_stack([1 - y_pred_proba_nn3.ravel(), y_pred_proba_nn3.ravel()])

results_nn3 = evaluate_model(y_test, y_pred_nn3, y_pred_proba_nn3_formatted,
                            "Neural Network - Config 3 (256-128, SGD)")


## 5. Model Comparison and Visualization


In [None]:
# Compile all results: one dict per evaluated configuration, grouped by model family
all_results = [
    results_lr1, results_lr2, results_lr3,
    results_rf1, results_rf2, results_rf3,
    results_nn1, results_nn2, results_nn3,
]

# Best ROC-AUC first; later cells read row 0 as the winning model
results_df = pd.DataFrame(all_results).sort_values('ROC-AUC', ascending=False)

banner = "=" * 60
print("\n" + banner)
print("FINAL MODEL COMPARISON")
print(banner)
print(results_df.to_string(index=False))


### Visualization of Training and Validation Loss Curves


In [None]:
# Plot training history for all neural network models
# Grid layout: one row per network, loss in the left column, accuracy in the right
fig, axes = plt.subplots(3, 2, figsize=(16, 14))

# (fit history, architecture label, style of training curve, style of validation curve)
# Config 1 passes no colour, so it uses the default colour cycle.
nn_runs = [
    (history1, '64-32', {}, {}),
    (history2, '128-64-32', {'color': 'orange'}, {'color': 'red'}),
    (history3, '256-128', {'color': 'green'}, {'color': 'darkgreen'}),
]
# (key in history.history, human-readable metric name)
panels = [('loss', 'Loss'), ('accuracy', 'Accuracy')]

for row, (run_history, architecture, train_style, val_style) in enumerate(nn_runs):
    for col, (metric_key, metric_label) in enumerate(panels):
        ax = axes[row, col]
        ax.plot(run_history.history[metric_key],
                label=f'Training {metric_label}', linewidth=2, **train_style)
        ax.plot(run_history.history[f'val_{metric_key}'],
                label=f'Validation {metric_label}', linewidth=2, **val_style)
        ax.set_title(f'NN Config {row + 1} ({architecture}): {metric_label} Curves',
                     fontsize=12, fontweight='bold')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(metric_label)
        ax.legend()
        ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


### Confusion Matrices for All Models


In [None]:
# Visualize confusion matrices for all models
# Grid layout: one row per model family, one column per configuration
fig, axes = plt.subplots(3, 3, figsize=(18, 16))

predictions_by_family = [
    ("Logistic Regression", [y_pred_lr1, y_pred_lr2, y_pred_lr3]),
    ("Random Forest", [y_pred_rf1, y_pred_rf2, y_pred_rf3]),
    ("Neural Network", [y_pred_nn1, y_pred_nn2, y_pred_nn3]),
]

# Flatten to (predictions, panel title, grid row, grid column) tuples
models_confusion = [
    (family_preds, f"{family} Config {col + 1}", row, col)
    for row, (family, all_family_preds) in enumerate(predictions_by_family)
    for col, family_preds in enumerate(all_family_preds)
]

for y_pred, name, row, col in models_confusion:
    ax = axes[row, col]
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                cbar_kws={"shrink": 0.8}, square=True, linewidths=2)
    ax.set_title(name, fontsize=10, fontweight='bold')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

plt.tight_layout()
plt.show()


### Cross-Validation Analysis


In [None]:
# Perform cross-validation on best models
# NOTE(review): the two "best" models are hard-coded as lr_model1 and rf_model2,
# not selected from results_df — confirm they match the comparison table.
from sklearn.base import clone
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline

print("="*70)
print("CROSS-VALIDATION RESULTS (5-Fold)")
print("="*70)

# Score AUC and accuracy in a single pass per model instead of refitting every
# fold once per metric.
cv_metrics = ['roc_auc', 'accuracy']

# Cross-validation for best logistic regression
# The scaler sits inside the pipeline and receives the *unscaled* X_train, so
# it is re-fit on the training part of every fold. Cross-validating on
# X_train_scaled would let each validation fold contribute to the scaling
# statistics (data leakage).
lr_cv_pipeline = make_pipeline(StandardScaler(), clone(lr_model1))
lr_cv_scores = cross_validate(lr_cv_pipeline, X_train, y_train, cv=5,
                              scoring=cv_metrics, n_jobs=-1)
cv_lr = lr_cv_scores['test_roc_auc']
print(f"\nLogistic Regression Config 1:")
print(f"  Mean AUC: {cv_lr.mean():.4f} (±{cv_lr.std():.4f})")

# Cross-validation for best random forest (no scaling step, as in its training cell)
rf_cv_scores = cross_validate(rf_model2, X_train, y_train, cv=5,
                              scoring=cv_metrics, n_jobs=-1)
cv_rf = rf_cv_scores['test_roc_auc']
print(f"\nRandom Forest Config 2:")
print(f"  Mean AUC: {cv_rf.mean():.4f} (±{cv_rf.std():.4f})")

# Cross-validation accuracy (taken from the same folds as the AUC scores)
cv_acc_lr = lr_cv_scores['test_accuracy']
cv_acc_rf = rf_cv_scores['test_accuracy']

print(f"\nAccuracy Scores:")
print(f"  Logistic Regression: {cv_acc_lr.mean():.4f} (±{cv_acc_lr.std():.4f})")
print(f"  Random Forest: {cv_acc_rf.mean():.4f} (±{cv_acc_rf.std():.4f})")

# Visualize cross-validation results
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
model_labels = ['Logistic Regression', 'Random Forest']

# AUC comparison
# Tick labels are set on the axis rather than through boxplot(labels=...),
# which newer matplotlib releases deprecate.
axes[0].boxplot([cv_lr, cv_rf])
axes[0].set_xticklabels(model_labels)
axes[0].set_title('Cross-Validation AUC Score Distribution', fontsize=12, fontweight='bold')
axes[0].set_ylabel('AUC Score')
axes[0].grid(True, alpha=0.3)

# Accuracy comparison
axes[1].boxplot([cv_acc_lr, cv_acc_rf])
axes[1].set_xticklabels(model_labels)
axes[1].set_title('Cross-Validation Accuracy Score Distribution', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Accuracy Score')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


## Model Justification and Theoretical Background

### 1. Logistic Regression
**Theoretical Justification:**
- Logistic Regression is appropriate for binary classification problems
- Uses the sigmoid function to model the probability of bankruptcy
- **Regularization:** 
  - L1 (Lasso): Promotes feature selection by driving coefficients to zero
  - L2 (Ridge): Prevents overfitting by penalizing large coefficients
- **C Parameter:** Controls regularization strength (lower C = stronger regularization)
- Reference: Hastie et al. (2009) - Elements of Statistical Learning

**Configuration Choices:**
- Config 1: Default regularization (scikit-learn defaults: L2 penalty, C=1.0) as a baseline
- Config 2: C=0.1 with L2 (stronger regularization for overfitting prevention)
- Config 3: C=10 with L1 (feature selection for interpretability)

### 2. Random Forest
**Theoretical Justification:**
- Ensemble method combining multiple decision trees via bagging
- Reduces overfitting through averaging multiple trees
- Handles non-linear relationships and feature interactions
- **Hyperparameter Impact:**
  - `n_estimators`: More trees reduce variance (200 provides good bias-variance tradeoff)
  - `max_depth`: Controls overfitting (deeper trees capture more patterns, depth=20 balances complexity)
  - `min_samples_split`: Prevents overfitting (higher values = simpler trees)
  - `class_weight='balanced'`: Addresses imbalanced classes by weighting samples inversely to their class frequency, so errors on the minority class count more when the trees are grown
- Reference: Breiman (2001) - Random Forests

**Configuration Choices:**
- Config 1: Default (quick baseline)
- Config 2: Deeper trees (max_depth=20) with more estimators (200) for better performance
- Config 3: Balanced class weights to handle minority class (bankrupt companies)

### 3. Neural Network
**Theoretical Justification:**
- Deep learning captures complex non-linear patterns in financial data
- **Architecture Choices:**
  - ReLU activation: Non-saturating, prevents vanishing gradients (Glorot et al., 2011)
  - Dropout: Regularization technique to prevent co-adaptation (Srivastava et al., 2014)
  - Batch Normalization: Stabilizes training and allows higher learning rates (Ioffe & Szegedy, 2015)
- **Optimizer Selection:**
  - Adam: Adaptive learning rate, combines advantages of RMSprop and AdaGrad
  - RMSprop: Good for non-stationary objectives
  - SGD: Plain stochastic gradient descent; used here with the Keras defaults (no momentum), although momentum can be added
- **Loss Function:** Binary cross-entropy for classification

**Configuration Choices:**
- Config 1: Simple architecture (64-32) with Adam for stable convergence
- Config 2: Deeper network (128-64-32) with RMSprop for non-stationary financial patterns
- Config 3: Wide shallow network (256-128) with tanh and BatchNorm for different representational capacity


## Data Preprocessing Justification

### 1. Stratified Train-Test Split (80-20)
**Reasoning:** 
- Maintains class distribution in both training and test sets
- Prevents bias towards majority class (non-bankrupt companies)
- Standard practice for imbalanced datasets

### 2. Feature Scaling (StandardScaler)
**Reasoning:**
- Financial features have different scales and units (ratios, percentages, absolute values)
- Required for logistic regression (coefficient interpretation and convergence)
- Essential for neural networks (prevents features with larger scales from dominating)
- StandardScaler: Z-score normalization → mean=0, std=1
- Random Forest doesn't require scaling (tree-based, scale-invariant)

### 3. Handling Class Imbalance
**Strategies Applied:**
- Stratified splitting preserves the minority-class proportion in both the training and test sets
- Balanced class weights in Random Forest Config 3
- Early stopping in Neural Networks limits overfitting in general; note that it does not by itself correct for class imbalance


## Improvement Suggestions

### 1. **Feature Engineering & Selection**
**Current Gap:** Using all 95 features without dimensionality reduction
**Improvement:** 
- Apply PCA or Factor Analysis to reduce multicollinearity among financial ratios
- Use feature importance scores to select top 20-30 most predictive features
- Create interaction features (e.g., ROA × Debt Ratio)
- Literature: Guyon & Elisseeff (2003) - An Introduction to Variable and Feature Selection

### 2. **Advanced Handling of Class Imbalance**
**Current Gap:** Limited imbalance strategies
**Improvements:**
- Apply SMOTE (Synthetic Minority Oversampling Technique) to create synthetic bankrupt examples
- Use weighted loss function in neural networks
- Implement focal loss (Lin et al., 2017) to focus on hard examples
- Try ADASYN (Adaptive Synthetic Sampling) for adaptive oversampling
- Literature: Chawla et al. (2002) - SMOTE

### 3. **Hyperparameter Optimization**
**Current Gap:** Manual hyperparameter selection
**Improvements:**
- Implement Grid Search or Random Search with cross-validation
- Use Bayesian Optimization (e.g., Optuna) for efficient hyperparameter tuning
- Apply Early Stopping with learning rate scheduling
- Use automated ML frameworks (AutoML)
- Literature: Bergstra & Bengio (2012) - Random Search for Hyperparameter Optimization

### 4. **Ensemble Methods**
**Current Gap:** Models evaluated separately
**Improvements:**
- Stack multiple models (meta-learner approach)
- Implement voting classifier (hard/soft voting)
- Use gradient boosting (XGBoost, LightGBM) as additional model
- Bagging with different algorithms
- Literature: Wolpert (1992) - Stacked Generalization

### 5. **Interpretability & Explainability**
**Current Gap:** Limited model interpretability
**Improvements:**
- Apply SHAP (SHapley Additive exPlanations) values for feature importance
- Use LIME (Local Interpretable Model-agnostic Explanations) for local explanations
- Generate partial dependence plots for feature effects
- Implement permutation importance
- Literature: Lundberg & Lee (2017) - SHAP

### 6. **Domain-Specific Feature Engineering**
**Current Gap:** Using raw financial ratios only
**Improvements:**
- Engineer bankruptcy-specific features (Altman Z-score, Ohlson O-score)
- Create temporal features if time-series data available
- Industry-specific ratios and benchmarks
- Macroeconomic indicators (market conditions, economic cycles)
- Literature: Altman (1968) - Financial Ratios, Discriminant Analysis and the Prediction of Corporate Bankruptcy

### References:
1. Hastie, T., Tibshirani, R., & Friedman, J. (2009). *The Elements of Statistical Learning*
2. Breiman, L. (2001). "Random Forests". *Machine Learning*, 45(1)
3. Glorot, X., et al. (2011). "Deep sparse rectifier neural networks"
4. Lin, T. Y., et al. (2017). "Focal Loss for Dense Object Detection"
5. Chawla, N. V., et al. (2002). "SMOTE: Synthetic Minority Over-sampling Technique"


In [None]:
# Visualize model performance comparison
# One horizontal bar chart per metric, laid out on a 2x2 grid in row-major order
fig, axes = plt.subplots(2, 2, figsize=(18, 12))

# (column in results_df, bar colour)
metric_panels = [
    ('Accuracy', 'steelblue'),
    ('F1-Score', 'orange'),
    ('ROC-AUC', 'green'),
    ('Precision', 'red'),
]

for ax, (metric, bar_color) in zip(axes.ravel(), metric_panels):
    ax.barh(results_df['Model'], results_df[metric],
            color=bar_color, alpha=0.7, edgecolor='black')
    ax.set_xlabel(metric, fontsize=12, fontweight='bold')
    ax.set_title(f'Model {metric} Comparison', fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# ROC Curves for all models (one panel per configuration on a 3x3 grid)
fig, axes = plt.subplots(3, 3, figsize=(20, 20))
axes = axes.ravel()

# (results dict, two-column class-probability array, short panel label)
models_and_predictions = [
    (results_lr1, y_pred_proba_lr1, "LR Config 1"),
    (results_lr2, y_pred_proba_lr2, "LR Config 2"),
    (results_lr3, y_pred_proba_lr3, "LR Config 3"),
    (results_rf1, y_pred_proba_rf1, "RF Config 1"),
    (results_rf2, y_pred_proba_rf2, "RF Config 2"),
    (results_rf3, y_pred_proba_rf3, "RF Config 3"),
    (results_nn1, y_pred_proba_nn1_formatted, "NN Config 1"),
    (results_nn2, y_pred_proba_nn2_formatted, "NN Config 2"),
    (results_nn3, y_pred_proba_nn3_formatted, "NN Config 3"),
]

for ax, (_results, class_proba, name) in zip(axes, models_and_predictions):
    # Column 1 holds the positive-class score
    positive_scores = class_proba[:, 1]
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    auc_score = roc_auc_score(y_test, positive_scores)

    ax.plot(fpr, tpr, linewidth=2, label=f"AUC = {auc_score:.3f}")
    # Diagonal reference line
    ax.plot([0, 1], [0, 1], 'k--', linewidth=1)
    ax.set_xlabel('False Positive Rate', fontsize=10)
    ax.set_ylabel('True Positive Rate', fontsize=10)
    ax.set_title(f"ROC Curve - {name}", fontsize=11, fontweight='bold')
    ax.legend(loc='lower right')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


## 6. Feature Importance Analysis


In [None]:
# Feature importance from best Random Forest model
# (the guard skips the plot if the model exposes no importances)
if hasattr(rf_model2, 'feature_importances_'):
    # Pair every feature name with its importance, most important first
    feature_importance = pd.DataFrame(
        {'Feature': X.columns, 'Importance': rf_model2.feature_importances_}
    ).sort_values('Importance', ascending=False)

    # Plot top 20 most important features
    top_features = feature_importance.head(20)
    bar_positions = range(len(top_features))

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.barh(bar_positions, top_features['Importance'],
            color='crimson', alpha=0.7, edgecolor='black')
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_features['Feature'])
    ax.set_xlabel('Importance Score', fontsize=12, fontweight='bold')
    ax.set_title('Top 20 Most Important Features (Random Forest)', fontsize=14, fontweight='bold')
    # Put the most important feature at the top of the chart
    ax.invert_yaxis()
    plt.tight_layout()
    plt.show()


## 7. Summary and Conclusions


In [None]:
# Text summary of the run: dataset facts, the two top-ranked models and key findings
rule = "=" * 80


def _print_section(title):
    """Print a section banner: blank line, rule, title, rule."""
    print("\n" + rule)
    print(title)
    print(rule)


# results_df is sorted by ROC-AUC (descending), so rows 0 and 1 are the top two models
best_model = results_df.iloc[0]
second_model = results_df.iloc[1]

_print_section("SUMMARY AND CONCLUSIONS")

print("\nDataset: Company Bankruptcy Prediction")
print("Problem Type: Binary Classification")
print(f"Total Samples: {len(df)}")
print(f"Features: {df.shape[1] - 1}")
print("Target Variable: Bankrupt? (0 = Not Bankrupt, 1 = Bankrupt)")

_print_section("BEST PERFORMING MODELS")

for heading, row in (
    ("Best Overall Model (By ROC-AUC)", best_model),
    ("Second Best Model", second_model),
):
    print(f"\n{heading}: {row['Model']}")
    print(f"  - ROC-AUC: {row['ROC-AUC']:.4f}")
    print(f"  - Accuracy: {row['Accuracy']:.4f}")
    print(f"  - F1-Score: {row['F1-Score']:.4f}")

_print_section("KEY FINDINGS")

key_findings = [
    "\n1. All models were tested with multiple configurations:",
    "   - Logistic Regression: 3 configurations (different regularization and C values)",
    "   - Random Forest: 3 configurations (different tree depths, estimators, and class weights)",
    "   - Neural Network: 3 configurations (different architectures, optimizers, and activations)",
    "\n2. The dataset shows class imbalance which was addressed through:",
    "   - Stratified train/test splitting",
    "   - Class weight balancing in some Random Forest configurations",
    "\n3. Feature scaling was applied to ensure optimal performance",
    "   for Logistic Regression and Neural Network models.",
    "\n4. Performance metrics show that:",
    f"   - Highest ROC-AUC achieved: {best_model['ROC-AUC']:.4f}",
    "   - The models are effective in predicting company bankruptcy risk.",
]
for line in key_findings:
    print(line)

print("\n" + rule)
