## Subgroup 0
Dhruv Prasanna

In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import (StackingClassifier, RandomForestClassifier, GradientBoostingClassifier, 
                               ExtraTreesClassifier, AdaBoostClassifier, BaggingClassifier)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, QuantileTransformer, RobustScaler
from sklearn.feature_selection import SelectPercentile, f_classif, SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from joblib import dump

In [2]:
df = pd.read_csv('artifacts/cluster_0_train.csv')
X = df.drop(columns=['Bankrupt?', 'Index']).to_numpy()
y = df['Bankrupt?'].to_numpy()

In [3]:
# Check class distribution
print("Class distribution:")
print(f"Non-bankrupt (0): {np.sum(y == 0)} ({100*np.mean(y == 0):.1f}%)")
print(f"Bankrupt (1): {np.sum(y == 1)} ({100*np.mean(y == 1):.1f}%)")

Class distribution:
Non-bankrupt (0): 1860 (94.4%)
Bankrupt (1): 110 (5.6%)


In [4]:
def subgroup0(X, y):
    random_state = 67
    
    # Optimal preprocessing pipeline for precision
    preproc_pipe = Pipeline(steps=[
        ('scaler', RobustScaler()),
        ('quantile', QuantileTransformer(output_distribution='normal', n_quantiles=200)),
        ('selector', SelectKBest(score_func=mutual_info_classif, k=10))
    ])
    X_selected = preproc_pipe.fit_transform(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, stratify=y, random_state=random_state)

    # Adjust class weight so bankrupt doesnt get ignored
    pos_class_weight = len(y) / len(y[y == 1])

    model = StackingClassifier(
        estimators=[
            ('rf', RandomForestClassifier(
                n_estimators=500,
                max_depth=28,
                min_samples_split=2,
                min_samples_leaf=1,
                class_weight={0: 1, 1: pos_class_weight},
                criterion='gini',
                max_features='sqrt',
                random_state=random_state,
                n_jobs=-1
            )),
            ('et', ExtraTreesClassifier(
                n_estimators=500,
                max_depth=28,
                min_samples_split=2,
                min_samples_leaf=1,
                class_weight={0: 1, 1: pos_class_weight},
                criterion='gini',
                max_features='sqrt',
                random_state=random_state,
                n_jobs=-1
            )),
            ('gb', GradientBoostingClassifier(
                n_estimators=300,
                learning_rate=0.03,
                max_depth=8,
                subsample=0.85,
                min_samples_split=2,
                random_state=random_state
            ))
        ],
        final_estimator=GradientBoostingClassifier(
            n_estimators=200,
            learning_rate=0.05,
            max_depth=8,
            random_state=random_state
        ),
        cv=5,
        n_jobs=-1
    )

    # Train and evaluate
    model.fit(X_train, y_train)
    test_out = model.predict(X_test)
    cm = confusion_matrix(y_test, test_out)
    
    tn, fp, fn, tp = cm.ravel()
    overall_acc = (tn + tp) / (tn + fp + fn + tp)
    
    print("Model Evaluation on Subgroup 0 Test Set")
    print("-"*60)
    print("Confusion Matrix:")
    print(cm)
    print(f"\nOverall Accuracy: {100*overall_acc:.2f}%")
    print(f"Project Accuracy TT/(TT+TF): {100*tp/(fp+tp) if (fp+tp) > 0 else 0:.2f}%")
    
    # Fit on full data for final model
    model.fit(X_selected, y)
    
    return model, preproc_pipe

final_model, final_pipe = subgroup0(X, y)

Model Evaluation on Subgroup 0 Test Set
------------------------------------------------------------
Confusion Matrix:
[[370   2]
 [ 14   8]]

Overall Accuracy: 95.94%
Project Accuracy TT/(TT+TF): 80.00%


In [5]:
# Save both model and pipeline together in a single file
subgroup0_bundle = {
    'model': final_model,
    'pipeline': final_pipe,
    'random_state': 67
}
dump(subgroup0_bundle, './artifacts/subgroup0_complete.joblib')
print("Saved complete model bundle (model + pipeline) to artifacts/subgroup0_complete.joblib")

Saved complete model bundle (model + pipeline) to artifacts/subgroup0_complete.joblib
