In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.model_selection import StratifiedKFold, cross_val_score
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from joblib import dump
import warnings
warnings.filterwarnings('ignore')

# Seed handed to SMOTE and to every estimator later in the script.
RANDOM_STATE = 67

# Read the cluster's training table, then split it into the label vector and
# the predictor matrix. 'Index' is dropped together with the label column, so
# it is not used as a feature.
df = pd.read_csv('Cluster 3 train.csv')
y = df['Bankrupt?'].to_numpy()
X = df.drop(columns=['Bankrupt?', 'Index']).to_numpy()

# Report the cluster size and how rare the positive class is.
n_companies = len(X)
n_bankrupt = np.sum(y)
pct_bankrupt = 100*np.mean(y)
print(f"Cluster 3 size: {n_companies} companies")
print(f"Bankruptcies: {n_bankrupt} ({pct_bankrupt:.2f}%)")

from sklearn.pipeline import Pipeline

# Preprocessing is intentionally minimal: one RobustScaler step (chosen for
# its tolerance of outliers), wrapped in a Pipeline so the fitted transform
# is a single object.
scaling_steps = [('scaler', RobustScaler())]
preproc_pipe = Pipeline(steps=scaling_steps)

# Fit the scaler on the full feature matrix and keep the scaled copy.
X_scaled = preproc_pipe.fit_transform(X)

# Fit one univariate (ANOVA F-test) selector per candidate feature count and
# report the resulting matrix shape. Every fitted selector is kept so that the
# one used downstream is picked explicitly via FINAL_K, rather than being
# whichever k happened to come last in the loop.
# NOTE(review): each selector is fit on all rows and their labels, so any
# score later computed on these same rows is optimistic.
CANDIDATE_K = [5, 8, 10]
FINAL_K = 10  # feature count used for resampling, training and the saved selector

fitted_selectors = {}
for k in CANDIDATE_K:
    candidate_selector = SelectKBest(score_func=f_classif, k=k)
    candidate_X = candidate_selector.fit_transform(X_scaled, y)
    fitted_selectors[k] = (candidate_selector, candidate_X)
    print(f"\n{k} features selected")
    print(f"Shape: {candidate_X.shape}")

# Names consumed by the rest of the script.
selector, X_selected = fitted_selectors[FINAL_K]


# SMOTE builds synthetic minority samples from each minority point's nearest
# minority neighbours, so k_neighbors must be strictly smaller than the
# minority class size. Prefer 3 neighbours, but derive the cap from the data
# instead of relying on a hand-counted class size.
from imblearn.over_sampling import SMOTE, ADASYN

_, class_counts = np.unique(y, return_counts=True)
minority_count = int(class_counts.min())
if minority_count < 2:
    raise ValueError(
        f"SMOTE needs at least 2 minority samples, found {minority_count}"
    )
smote_k_neighbors = min(3, minority_count - 1)

# Option 1: SMOTE (safer)
smote = SMOTE(random_state=RANDOM_STATE, k_neighbors=smote_k_neighbors)
X_res, y_res = smote.fit_resample(X_selected, y)

# Report the class balance after oversampling.
print(f"\nAfter SMOTE:")
print(f"Total samples: {len(X_res)}")
print(f"Bankruptcies: {np.sum(y_res)} ({100*np.mean(y_res):.1f}%)")

# Base learners are kept deliberately simple to limit overfitting. All of
# them, and the meta-learner, share balanced class weights and the same seed.
balanced_seeded = {'class_weight': 'balanced', 'random_state': RANDOM_STATE}

logreg_base = LogisticRegression(
    penalty='l2',
    C=1.0,
    max_iter=2000,
    **balanced_seeded,
)
shallow_tree = DecisionTreeClassifier(
    max_depth=4,  # very shallow tree
    min_samples_split=20,
    **balanced_seeded,
)
rbf_svc = SVC(
    kernel='rbf',
    C=1.0,
    probability=True,
    **balanced_seeded,
)

base_models = [
    ('lr', logreg_base),
    ('dt', shallow_tree),
    ('svc', rbf_svc),
]

# Logistic-regression meta-learner stacked on top of the base models.
meta_learner = LogisticRegression(**balanced_seeded)

stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_learner,
    cv=3,  # fewer folds because the positive class is tiny
    n_jobs=-1,
)

# Train on resampled data
stacking_model.fit(X_res, y_res)

from sklearn.metrics import confusion_matrix, classification_report

# Predict on ORIGINAL (non-resampled) data.
# NOTE(review): these are the same rows the scaler, selector and SMOTE were
# fit on, so the figures below are in-sample, not held-out estimates.
y_pred = stacking_model.predict(X_selected)

# Confusion matrix. The label order is pinned so the matrix is always 2x2,
# even when y or y_pred contains a single class; without it the four-way
# unpack below raises on a 1x1 matrix.
# Assumes the target is coded 0 = healthy, 1 = bankrupt, as the counts
# printed with np.sum(y) imply; confirm against the dataset.
cm = confusion_matrix(y, y_pred, labels=[0, 1])
print("\nConfusion Matrix:")
print(cm)

# Extract TT and TF for Table 3
TN, FP, FN, TP = cm.ravel()
TT = TP  # True positives (bankruptcies correctly identified)
TF = FN  # False negatives (bankruptcies missed)

print(f"\nTable 3 Values:")
print(f"TT (correct bankruptcies): {TT}")
print(f"TF (missed bankruptcies): {TF}")
# The "Accuracy" printed here is TT / (TT + TF), i.e. the share of actual
# bankruptcies that were caught (recall on the positive class). False alarms
# (FP) do not enter this figure.
print(f"Accuracy: {TT/(TT+TF)*100:.1f}%" if (TT+TF) > 0 else "N/A")
print(f"N_features: {X_selected.shape[1]}")

# Check predictions
print(f"\nTotal predicted bankruptcies: {np.sum(y_pred)}")

import os

# Save preprocessing and model.
# The target directory is created first: dump() opens the destination file
# directly and fails with FileNotFoundError if 'artifacts/' does not exist.
os.makedirs('artifacts', exist_ok=True)

# The three objects must be applied in this order at inference time:
# scale -> select features -> predict.
dump(preproc_pipe, 'artifacts/preprocessing_pipeline_subgroup3.joblib')
dump(selector, 'artifacts/feature_selector_subgroup3.joblib')
dump(stacking_model, 'artifacts/stacking_model_subgroup3.joblib')

print("\nAll artifacts saved!")


Cluster 3 size: 1792 companies
Bankruptcies: 6 (0.33%)

5 features selected
Shape: (1792, 5)

8 features selected
Shape: (1792, 8)

10 features selected
Shape: (1792, 10)

After SMOTE:
Total samples: 3572
Bankruptcies: 1786 (50.0%)

Confusion Matrix:
[[1769   17]
 [   0    6]]

Table 3 Values:
TT (correct bankruptcies): 6
TF (missed bankruptcies): 0
Accuracy: 100.0%
N_features: 10

Total predicted bankruptcies: 23

All artifacts saved!
