In [7]:
# Import core libraries
import pandas as pd
import numpy as np

# Load the dataset 
file_path = '/Users/daniel/Northwestern Local/MSDS-422/Module 5/data.csv'
df = pd.read_csv(file_path)

# Inspect the dataset shape and column info
print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")

# Check for missing values
missing = df.isnull().sum().sort_values(ascending=False)
print("\nTop 10 columns with missing values:")
print(missing.head(10))

# Review the target variable distribution
target = 'Bankrupt?'
print("\nTarget distribution (count and %):")
print(df[target].value_counts())
print((df[target].value_counts(normalize=True) * 100).round(2))

# Show the first few rows
df.head()

Rows: 6819, Columns: 96

Top 10 columns with missing values:
Bankrupt?                                                   0
 ROA(C) before interest and depreciation before interest    0
 Total expense/Assets                                       0
 Total income/Total expense                                 0
 Retained Earnings to Total Assets                          0
 Long-term Liability to Current Assets                      0
 Current Liabilities/Equity                                 0
 Working Capital/Equity                                     0
 Current Liabilities/Liability                              0
 Inventory/Current Liability                                0
dtype: int64

Target distribution (count and %):
Bankrupt?
0    6599
1     220
Name: count, dtype: int64
Bankrupt?
0    96.77
1     3.23
Name: proportion, dtype: float64


Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,1,0.370594,0.424389,0.40575,0.601457,0.601457,0.998969,0.796887,0.808809,0.302646,...,0.716845,0.009219,0.622879,0.601453,0.82789,0.290202,0.026601,0.56405,1,0.016469
1,1,0.464291,0.538214,0.51673,0.610235,0.610235,0.998946,0.79738,0.809301,0.303556,...,0.795297,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1,0.020794
2,1,0.426071,0.499019,0.472295,0.60145,0.601364,0.998857,0.796403,0.808388,0.302035,...,0.77467,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1,0.016474
3,1,0.399844,0.451265,0.457733,0.583541,0.583541,0.9987,0.796967,0.808966,0.30335,...,0.739555,0.003252,0.622929,0.583538,0.834697,0.281721,0.026697,0.564663,1,0.023982
4,1,0.465022,0.538432,0.522298,0.598783,0.598783,0.998973,0.797366,0.809304,0.303475,...,0.795016,0.003878,0.623521,0.598782,0.839973,0.278514,0.024752,0.575617,1,0.03549


In [9]:
# --- Train/test split and preprocessing setup ---

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.metrics import f1_score, classification_report

# Define features (X) and target (y)
target = 'Bankrupt?'
X = df.drop(columns=[target])
y = df[target]

# Split data (80% train / 20% validation), keep class balance
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Preprocessing pipeline
numeric_features = X.columns.tolist()
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())  # optional but fine for GB models
])

preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)]
)

# Define CV strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [11]:
# --- Model training, hyperparameter tuning, and evaluation ---

# Define models and hyperparameter grids
models_and_grids = [
    (
        "RandomForest",
        RandomForestClassifier(random_state=42, class_weight='balanced'),
        {
            "model__n_estimators": [200, 500],
            "model__max_depth": [None, 10, 20],
            "model__max_features": ["sqrt", 0.5],
            "model__criterion": ["gini", "entropy"]
        },
    ),
    (
        "GradientBoosting",
        GradientBoostingClassifier(random_state=42),
        {
            "model__n_estimators": [200, 500],
            "model__max_depth": [2, 3],
            "model__max_features": ["sqrt", 0.5],
        },
    ),
    (
        "ExtraTrees",
        ExtraTreesClassifier(random_state=42, class_weight='balanced'),
        {
            "model__n_estimators": [300, 600],
            "model__max_depth": [None, 10, 20],
            "model__max_features": ["sqrt", 0.5],
            "model__criterion": ["gini", "entropy"]
        },
    ),
]

# Container to store results
results = []
best_models = {}

# Train each model using GridSearchCV
for name, model, grid in models_and_grids:
    print(f"\n Training {name}...")
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("model", model)
    ])
    
    grid_search = GridSearchCV(
        pipeline,
        param_grid=grid,
        scoring="f1",
        cv=cv,
        n_jobs=-1,
        verbose=1
    )
    
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    best_models[name] = best_model

    # Evaluate on validation set
    y_pred = best_model.predict(X_val)
    f1 = f1_score(y_val, y_pred)
    
    print(f"\nBest params for {name}: {grid_search.best_params_}")
    print(f"Best CV F1-score: {grid_search.best_score_:.4f}")
    print(f"Validation F1-score: {f1:.4f}")
    print(classification_report(y_val, y_pred, digits=4))
    
    results.append({
        "Model": name,
        "Best CV F1": grid_search.best_score_,
        "Validation F1": f1,
        "Best Params": grid_search.best_params_,
    })

# Display all results in descending order of validation F1
results_df = pd.DataFrame(results).sort_values(by="Validation F1", ascending=False)
print("\n=== Model Comparison ===")
print(results_df)


 Training RandomForest...
Fitting 5 folds for each of 24 candidates, totalling 120 fits

Best params for RandomForest: {'model__criterion': 'entropy', 'model__max_depth': 10, 'model__max_features': 0.5, 'model__n_estimators': 500}
Best CV F1-score: 0.4170
Validation F1-score: 0.4516
              precision    recall  f1-score   support

           0     0.9825    0.9788    0.9806      1320
           1     0.4286    0.4773    0.4516        44

    accuracy                         0.9626      1364
   macro avg     0.7055    0.7280    0.7161      1364
weighted avg     0.9646    0.9626    0.9636      1364


 Training GradientBoosting...
Fitting 5 folds for each of 8 candidates, totalling 40 fits

Best params for GradientBoosting: {'model__max_depth': 2, 'model__max_features': 'sqrt', 'model__n_estimators': 500}
Best CV F1-score: 0.3745
Validation F1-score: 0.4675
              precision    recall  f1-score   support

           0     0.9805    0.9886    0.9845      1320
           1     

In [13]:
# --- Generate predictions and export results ---

# Select best model
best_model_name = "ExtraTrees"
best_model = best_models[best_model_name]

# Generate predictions on your validation set
y_pred_val = best_model.predict(X_val)

# Combine results into a DataFrame
predictions_df = pd.DataFrame({
    "Actual": y_val.values,
    "Predicted": y_pred_val
})

# Display the first few predictions
print(predictions_df.head(10))

# Save to CSV 
output_path = "/Users/daniel/Northwestern Local/MSDS-422/Module 5/validation_predictions.csv"
predictions_df.to_csv(output_path, index=False)
print(f"\nPredictions saved to: {output_path}")

   Actual  Predicted
0       0          0
1       0          0
2       0          0
3       0          0
4       0          0
5       0          0
6       0          0
7       0          1
8       0          0
9       0          0

Predictions saved to: /Users/daniel/Northwestern Local/MSDS-422/Module 5/validation_predictions.csv


In [15]:
import numpy as np

# Check how many bankruptcies were predicted in total
unique, counts = np.unique(y_pred_val, return_counts=True)
print(dict(zip(unique, counts)))

# Compare to actual bankruptcies
print("Actual bankruptcies in validation set:", y_val.sum())

{0: 1272, 1: 92}
Actual bankruptcies in validation set: 44


In [19]:
print(df.columns.tolist())

['Bankrupt?', ' ROA(C) before interest and depreciation before interest', ' ROA(A) before interest and % after tax', ' ROA(B) before interest and depreciation after tax', ' Operating Gross Margin', ' Realized Sales Gross Margin', ' Operating Profit Rate', ' Pre-tax net Interest Rate', ' After-tax net Interest Rate', ' Non-industry income and expenditure/revenue', ' Continuous interest rate (after tax)', ' Operating Expense Rate', ' Research and development expense rate', ' Cash flow rate', ' Interest-bearing debt interest rate', ' Tax rate (A)', ' Net Value Per Share (B)', ' Net Value Per Share (A)', ' Net Value Per Share (C)', ' Persistent EPS in the Last Four Seasons', ' Cash Flow Per Share', ' Revenue Per Share (Yuan ¥)', ' Operating Profit Per Share (Yuan ¥)', ' Per Share Net profit before tax (Yuan ¥)', ' Realized Sales Gross Profit Growth Rate', ' Operating Profit Growth Rate', ' After-tax Net Profit Growth Rate', ' Regular Net Profit Growth Rate', ' Continuous Net Profit Growth 

In [21]:
# Clean column names (remove leading spaces)
df.columns = df.columns.str.strip()

# Define target and features
y = df['Bankrupt?'].astype(int)
X = df.drop(columns=['Bankrupt?'])

# Quick sanity checks
print("Shape:", X.shape, "Target dtype:", y.dtype)
print("Class balance (counts):")
print(y.value_counts())
print("\nClass balance (percent):")
print((y.value_counts(normalize=True) * 100).round(2))

Shape: (6819, 95) Target dtype: int64
Class balance (counts):
Bankrupt?
0    6599
1     220
Name: count, dtype: int64

Class balance (percent):
Bankrupt?
0    96.77
1     3.23
Name: proportion, dtype: float64


In [23]:
from sklearn.model_selection import train_test_split

# Split data: 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)
print("Bankruptcy ratio in training set:")
print(y_train.value_counts(normalize=True).round(3))

Training set size: (5455, 95)
Test set size: (1364, 95)
Bankruptcy ratio in training set:
Bankrupt?
0    0.968
1    0.032
Name: proportion, dtype: float64
