In [1]:
import warnings

import numpy as np
import pandas as pd
from imblearn.over_sampling import ADASYN, SMOTE, BorderlineSMOTE
from joblib import dump
from sklearn.ensemble import StackingClassifier
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
warnings.filterwarnings('ignore')


Cluster 3 size: 1792 companies
Bankruptcies: 6 (0.33%)

5 features selected
Shape: (1792, 5)

8 features selected
Shape: (1792, 8)

10 features selected
Shape: (1792, 10)

After SMOTE:
Total samples: 3572
Bankruptcies: 1786 (50.0%)

Confusion Matrix:
[[1769   17]
 [   0    6]]

Table 3 Values:
TT (correct bankruptcies): 6
TF (missed bankruptcies): 0
Accuracy: 100.0%
N_features: 10

Total predicted bankruptcies: 23

BASE MODEL CONFUSION MATRICES 

LR - LogisticRegression
----------------------------------------
                 Predicted
                 Non-B  Bankrupt
Actual Non-B      1694      92
Actual Bankrupt      0       6
Accuracy: 100.0% (6/6 bankruptcies caught)

DT - DecisionTreeClassifier
----------------------------------------
                 Predicted
                 Non-B  Bankrupt
Actual Non-B      1769      17
Actual Bankrupt      0       6
Accuracy: 100.0% (6/6 bankruptcies caught)

SVC - SVC
----------------------------------------
                 Predicted
     

Exception ignored in: <function ResourceTracker.__del__ at 0x104f5dbc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x102969bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x105771bc0>
Traceback (most recent call last

In [None]:

# Seed shared by every stochastic step in this notebook.
RANDOM_STATE = 67

# Read the cluster-3 training split and separate the target from the features.
# 'Index' and 'cluster_id' are dropped together with the target column, so
# X holds only the remaining columns.
df = pd.read_csv('artifacts/cluster_3_train.csv')
y = df['Bankrupt?'].to_numpy()
X = df.drop(columns=['Bankrupt?', 'Index', 'cluster_id']).to_numpy()

# Quick look at how imbalanced the cluster is.
n_companies = len(X)
n_bankrupt = np.sum(y)
bankrupt_pct = 100 * np.mean(y)
print(f"Cluster 3 size: {n_companies} companies")
print(f"Bankruptcies: {n_bankrupt} ({bankrupt_pct:.2f}%)")

# Preprocessing pipeline
preproc_pipe = Pipeline([
    ('scaler', RobustScaler()),  # Robust to outliers
])

X_scaled = preproc_pipe.fit_transform(X)

# Feature counts to compare, and the one the downstream model is trained on.
# The final choice is stated explicitly instead of being "whatever the last
# loop iteration left behind", so reordering FEATURE_COUNTS cannot silently
# change the selector that is later saved to disk.
FEATURE_COUNTS = [5, 8, 10]
N_FEATURES = 10

# Fit one univariate (ANOVA F-test) selector per candidate k and report the
# shape each one produces.
fitted_selectors = {}
for k in FEATURE_COUNTS:
    fitted_selectors[k] = SelectKBest(score_func=f_classif, k=k).fit(X_scaled, y)
    print(f"\n{k} features selected")
    print(f"Shape: {fitted_selectors[k].transform(X_scaled).shape}")

# NOTE(review): the scaler and the selector are both fitted on every row of
# this cluster, including the rows the model is later scored on — confirm that
# a held-out evaluation is done elsewhere.
selector = fitted_selectors[N_FEATURES]  # raises KeyError if N_FEATURES was not tried
X_selected = selector.transform(X_scaled)


# SMOTE oversampling of the minority (bankrupt) class.
# SMOTE is already imported at the top of the notebook; no re-import needed.
SMOTE_K_NEIGHBORS = 3

# SMOTE interpolates between a minority sample and its k nearest minority
# neighbours, so it needs at least k_neighbors + 1 minority samples. Cap k for
# clusters with very few positives rather than failing inside the
# nearest-neighbour search with an opaque message.
minority_count = int(np.unique(y, return_counts=True)[1].min())
if minority_count < 2:
    raise ValueError(
        f"SMOTE needs at least 2 minority samples, found {minority_count}"
    )

smote = SMOTE(
    random_state=RANDOM_STATE,
    k_neighbors=min(SMOTE_K_NEIGHBORS, minority_count - 1),
)
X_res, y_res = smote.fit_resample(X_selected, y)

print("\nAfter SMOTE:")
print(f"Total samples: {len(X_res)}")
print(f"Bankruptcies: {np.sum(y_res)} ({100*np.mean(y_res):.1f}%)")

# --- Level-0 learners -------------------------------------------------------
# Deliberately low-capacity models; each one re-weights classes with
# class_weight='balanced' and shares the notebook-wide seed.
lr_base = LogisticRegression(
    penalty='l2',
    C=1.0,
    class_weight='balanced',
    max_iter=2000,
    random_state=RANDOM_STATE,
)

# Depth-limited tree that will not split nodes holding fewer than 20 samples.
dt_base = DecisionTreeClassifier(
    max_depth=4,
    min_samples_split=20,
    class_weight='balanced',
    random_state=RANDOM_STATE,
)

# RBF-kernel SVM; probability=True enables predict_proba on the fitted model.
svc_base = SVC(
    kernel='rbf',
    C=1.0,
    class_weight='balanced',
    probability=True,
    random_state=RANDOM_STATE,
)

base_models = [
    ('lr', lr_base),
    ('dt', dt_base),
    ('svc', svc_base),
]

# --- Level-1 (meta) learner and the stack itself ----------------------------
meta_learner = LogisticRegression(
    class_weight='balanced',
    random_state=RANDOM_STATE,
)

# cv=3: only three folds because the positive class is tiny.
# NOTE(review): X_res already contains SMOTE-generated rows, so the internal
# folds used to build the meta-features can hold synthetic neighbours of the
# held-out positives — confirm this is acceptable for the reported numbers.
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_learner,
    cv=3,
    n_jobs=-1,
)

# Fit the whole stack on the resampled training data.
stacking_model.fit(X_res, y_res)

# Predict on ORIGINAL (non-resampled) data.
# NOTE(review): the model was fitted on X_res, which SMOTE built from these
# same X_selected rows, so the figures below are in-sample, not held-out —
# confirm a separate test evaluation exists before quoting them.
y_pred = stacking_model.predict(X_selected)

# Confusion matrix. Labels are pinned so the layout is always
# [[TN, FP], [FN, TP]] and the 4-way unpack below cannot break or shift if a
# class happens to be absent from both y and y_pred.
cm = confusion_matrix(y, y_pred, labels=[0, 1])
print("\nConfusion Matrix:")
print(cm)

# Extract TT and TF for Table 3
TN, FP, FN, TP = cm.ravel()
TT = TP  # True positives (bankruptcies correctly identified)
TF = FN  # False negatives (bankruptcies missed)

# "Accuracy" in Table 3 is TT / (TT + TF), i.e. recall on the bankrupt class;
# false positives (FP) do not lower it.
print("\nTable 3 Values:")
print(f"TT (correct bankruptcies): {TT}")
print(f"TF (missed bankruptcies): {TF}")
print(f"Accuracy: {TT/(TT+TF)*100:.1f}%" if (TT+TF) > 0 else "N/A")
print(f"N_features: {X_selected.shape[1]}")

# Check predictions: how many companies the stack flags in total (TP + FP)
print(f"\nTotal predicted bankruptcies: {np.sum(y_pred)}")

# Evaluate the individual base models on the original (non-resampled) data
print("\n" + "="*60)
print("BASE MODEL CONFUSION MATRICES ")
print("="*60)

for name, model in base_models:
    print(f"\n{name.upper()} - {model.__class__.__name__}")
    print("-" * 40)

    # Reuse the copy of this estimator that StackingClassifier already fitted
    # on the full resampled set (X_res, y_res) instead of calling .fit() again
    # on `model`. Re-fitting would repeat the training work and would mutate
    # the objects that stacking_model.estimators still references (and that
    # get pickled along with the stack).
    # NOTE(review): the stack label-encodes y before fitting its base
    # estimators; this comparison assumes 'Bankrupt?' is already coded 0/1.
    fitted_model = stacking_model.named_estimators_[name]
    y_pred_base = fitted_model.predict(X_selected)

    # Labels pinned so the matrix is always [[TN, FP], [FN, TP]].
    cm_base = confusion_matrix(y, y_pred_base, labels=[0, 1])
    TN_b, FP_b, FN_b, TP_b = cm_base.ravel()

    print("                 Predicted")
    print("                 Non-B  Bankrupt")
    print(f"Actual Non-B      {cm_base[0,0]:4d}    {cm_base[0,1]:4d}")
    print(f"Actual Bankrupt   {cm_base[1,0]:4d}    {cm_base[1,1]:4d}")

    # "Accuracy" here is TP / (TP + FN): the share of true bankruptcies caught.
    acc = TP_b / (TP_b + FN_b) * 100 if (TP_b + FN_b) > 0 else 0
    print(f"Accuracy: {acc:.1f}% ({TP_b}/{TP_b+FN_b} bankruptcies caught)")


In [None]:

# Persist the fitted scaler pipeline, feature selector and stacked model.
# Written in this order: preprocessing -> selector -> model.
artifacts_to_save = {
    'artifacts/preprocessing_pipeline_subgroup3.joblib': preproc_pipe,
    'artifacts/feature_selector_subgroup3.joblib': selector,
    'artifacts/stacking_model_subgroup3.joblib': stacking_model,
}
for artifact_path, fitted_object in artifacts_to_save.items():
    dump(fitted_object, artifact_path)

print("\nAll artifacts saved!")
