In [1]:
# Install required packages
# !pip install imbalanced-learn xgboost --quiet

# Import libraries
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTENC
from imblearn.combine import SMOTEENN
from xgboost import XGBClassifier

# Configuration
RANDOM_STATE = 42
TARGET_COL = "renewal"

In [2]:
# Data Import - Choose one option below

# Option 1: Google Colab with Drive (uncomment to use)
# from google.colab import drive
# drive.mount('/content/drive')
# TRAIN_CSV_PATH = "/content/drive/MyDrive/Capstone Project/train_ZoGVYWq.csv"

# Option 2: Local file path (uncomment to use)
TRAIN_CSV_PATH = "train_ZoGVYWq.csv"  # Update with your local file path

# Load data
df = pd.read_csv(TRAIN_CSV_PATH)
print(f"Dataset shape: {df.shape}")
print(f"Target column '{TARGET_COL}' found: {TARGET_COL in df.columns}")

# Separate features and target
y = df[TARGET_COL].astype(int)
X = df.drop(columns=[TARGET_COL]).copy()

# Identify column types
cat_cols = X.select_dtypes(include=["object","category"]).columns.tolist()
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_idx = [X.columns.get_loc(c) for c in cat_cols]  # for SMOTENC

print(f"Categorical columns ({len(cat_cols)}): {cat_cols}")
print(f"Numerical columns ({len(num_cols)}): {num_cols}")
print(f"Class distribution: {Counter(y)}")


Dataset shape: (79853, 13)
Target column 'renewal' found: True
Categorical columns (2): ['sourcing_channel', 'residence_area_type']
Numerical columns (10): ['id', 'perc_premium_paid_by_cash_credit', 'age_in_days', 'Income', 'Count_3-6_months_late', 'Count_6-12_months_late', 'Count_more_than_12_months_late', 'application_underwriting_score', 'no_of_premiums_paid', 'premium']
Class distribution: Counter({1: 74855, 0: 4998})


In [3]:
# Train/Validation Split
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_valid.shape}")
print(f"Training class distribution: {Counter(y_train)}")
print(f"Validation class distribution: {Counter(y_valid)}")


Training set shape: (63882, 12)
Validation set shape: (15971, 12)
Training class distribution: Counter({1: 59884, 0: 3998})
Validation class distribution: Counter({1: 14971, 0: 1000})


In [4]:
# Data Preprocessing - Handle Missing Values

# Impute NaNs (NO OHE)
# Numeric: median (fit on train), Categorical: most frequent (fit on train)
num_medians = X_train[num_cols].median()
X_train[num_cols] = X_train[num_cols].fillna(num_medians)
X_valid[num_cols] = X_valid[num_cols].fillna(num_medians)

cat_modes = {c: X_train[c].mode(dropna=True)[0] for c in cat_cols}
X_train[cat_cols] = X_train[cat_cols].fillna(pd.Series(cat_modes))
X_valid[cat_cols] = X_valid[cat_cols].fillna(pd.Series(cat_modes))

# Ensure category dtype (needed for clean round-trip)
for c in cat_cols:
    X_train[c] = X_train[c].astype("category")
    # lock validation to same category set (unknown -> make category; fallback to string then category)
    X_valid[c] = pd.Categorical(X_valid[c].astype(str), categories=X_train[c].cat.categories)

print("Missing values after imputation:")
print(f"Training set: {X_train.isnull().sum().sum()}")
print(f"Validation set: {X_valid.isnull().sum().sum()}")


Missing values after imputation:
Training set: 0
Validation set: 0


In [5]:
# Encoding Functions for Resampling

def to_encoded_array(df_cat_num, cat_columns):
    """Convert DataFrame to encoded array for SMOTE resampling"""
    arr = df_cat_num.copy()
    # integer codes for categoricals; continuous stay numeric
    for c in cat_columns:
        # use existing 'category' codes; unseen become -1 -> fix to 0 by adding a 'UNK' if needed
        codes = arr[c].cat.codes.values
        # if -1 exists (unknown), add an extra code
        if (codes == -1).any():
            # extend categories
            new_cats = list(arr[c].cat.categories) + ["__UNK__"]
            arr[c] = pd.Categorical(arr[c].astype(str), categories=new_cats)
            codes = arr[c].cat.codes.values
        arr[c] = codes.astype(np.int64)
    return arr.values

def from_encoded_array(arr, ref_df, cat_columns):
    """Convert encoded array back to DataFrame with correct dtypes"""
    # rebuild DataFrame with original column names
    out = pd.DataFrame(arr, columns=ref_df.columns)
    # cast dtypes back: numeric as float, categoricals as pandas 'category' using original categories if possible
    for c in num_cols:
        out[c] = pd.to_numeric(out[c], errors="coerce")
    for c in cat_columns:
        # original categories (plus optional UNK if we added earlier)
        ref_cats = list(ref_df[c].cat.categories)
        # if any code >= len(ref_cats), add UNK
        max_code = int(out[c].max())
        if max_code >= len(ref_cats):
            ref_cats = ref_cats + ["__UNK__"]
        out[c] = pd.Categorical.from_codes(out[c].astype(int), categories=ref_cats)
    return out

# Prepare encoded arrays for resampling
Xtr_enc = to_encoded_array(X_train, cat_cols)
Xva_for_model = X_valid.copy()  # model expects category dtype, already set

print("Train class distribution BEFORE resampling:", Counter(y_train))


Train class distribution BEFORE resampling: Counter({1: 59884, 0: 3998})


In [6]:
# Resampling Methods Configuration

# Define resampling methods
resamplers = [
    ("SMOTENC 1:1", SMOTENC(categorical_features=cat_idx, sampling_strategy="auto",
                            k_neighbors=5, random_state=RANDOM_STATE)),
    ("SMOTEENN (SMOTENC+ENN)", SMOTEENN(
        smote=SMOTENC(categorical_features=cat_idx, sampling_strategy="auto",
                      k_neighbors=5, random_state=RANDOM_STATE),
        random_state=RANDOM_STATE
    )),
]

# Initialize storage for results
summary = []
conf_mats = {}

print("Resampling methods configured:")
for name, _ in resamplers:
    print(f"- {name}")


Resampling methods configured:
- SMOTENC 1:1
- SMOTEENN (SMOTENC+ENN)


In [7]:
# Model Training and Evaluation

for name, resampler in resamplers:
    print(f"\n=== {name} ===")
    
    # Resample on encoded arrays (handles NaNs already)
    X_res_arr, y_res = resampler.fit_resample(Xtr_enc, y_train.values)
    print("AFTER resample:", Counter(y_res))

    # Convert back to DataFrame with correct dtypes for model (categoricals as 'category')
    X_res = from_encoded_array(X_res_arr, X_train, cat_cols)

    # XGBoost with native categorical handling (no OHE)
    clf = XGBClassifier(
        eval_metric="logloss",
        random_state=RANDOM_STATE,
        tree_method="approx",
        enable_categorical=True,
        n_estimators=400,
        learning_rate=0.1,
        max_depth=5,
        subsample=1.0,
        colsample_bytree=1.0
    )
    clf.fit(X_res, y_res)

    # Evaluate
    y_prob = clf.predict_proba(Xva_for_model)  # Get probabilities for all classes
    y_pred = (y_prob[:, 1] >= 0.51).astype(int)

    # Calculate metrics
    acc = accuracy_score(y_valid, y_pred)
    prec = precision_score(y_valid, y_pred, zero_division=0, average=None)  # Per-class precision
    rec = recall_score(y_valid, y_pred, zero_division=0, average=None)      # Per-class recall
    f1 = f1_score(y_valid, y_pred, zero_division=0, average=None)           # Per-class F1
    
    # Per-class ROC_AUC
    roc_class_0 = roc_auc_score(1 - y_valid, y_prob[:, 0])
    roc_class_1 = roc_auc_score(y_valid, y_prob[:, 1])
    roc = [roc_class_0, roc_class_1]

    cm = confusion_matrix(y_valid, y_pred)

    # Store results
    conf_mats[name] = cm
    summary.append({
        "Method": name,
        "Accuracy": round(acc, 4),
        "Precision (Class 0)": round(prec[0], 4),
        "Precision (Class 1)": round(prec[1], 4),
        "Recall (Class 0)": round(rec[0], 4),
        "Recall (Class 1)": round(rec[1], 4),
        "F1 (Class 0)": round(f1[0], 4),
        "F1 (Class 1)": round(f1[1], 4),
        "ROC_AUC (Class 0)": round(roc[0], 4),
        "ROC_AUC (Class 1)": round(roc[1], 4)
    })



=== SMOTENC 1:1 ===
AFTER resample: Counter({np.int64(1): 59884, np.int64(0): 59884})

=== SMOTEENN (SMOTENC+ENN) ===
AFTER resample: Counter({np.int64(0): 52200, np.int64(1): 39607})


In [8]:
# Results Summary

# Create summary DataFrame
summary_df = pd.DataFrame(summary)

print("\n===== Summary (including per-class metrics) =====")
print(summary_df)

print("\n===== Confusion Matrices (rows=true, cols=pred) =====")
for k, v in conf_mats.items():
    print(f"\n{k}\n{v}")

# Optional: Save results to CSV
# summary_df.to_csv('smote_results.csv', index=False)
# print("\nResults saved to 'smote_results.csv'")



===== Summary (including per-class metrics) =====
                   Method  Accuracy  Precision (Class 0)  Precision (Class 1)  \
0             SMOTENC 1:1    0.9371               0.4916               0.9456   
1  SMOTEENN (SMOTENC+ENN)    0.9302               0.4050               0.9508   

   Recall (Class 0)  Recall (Class 1)  F1 (Class 0)  F1 (Class 1)  \
0             0.147            0.9898        0.2263        0.9672   
1             0.243            0.9762        0.3038        0.9633   

   ROC_AUC (Class 0)  ROC_AUC (Class 1)  
0             0.8406             0.8406  
1             0.8302             0.8302  

===== Confusion Matrices (rows=true, cols=pred) =====

SMOTENC 1:1
[[  147   853]
 [  152 14819]]

SMOTEENN (SMOTENC+ENN)
[[  243   757]
 [  357 14614]]


In [9]:
# All Available Classifiers with Default Parameters

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# All classifiers with default parameters
ALL_CLASSIFIERS = {
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),
    'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE, max_iter=1000),
    'Ridge Classifier': RidgeClassifier(random_state=RANDOM_STATE),
    'SVM': SVC(random_state=RANDOM_STATE, probability=True),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'Linear Discriminant': LinearDiscriminantAnalysis(),
    'Quadratic Discriminant': QuadraticDiscriminantAnalysis(),
    'SGD Classifier': SGDClassifier(random_state=RANDOM_STATE),
    'Extra Trees': ExtraTreesClassifier(random_state=RANDOM_STATE),
    'XGBoost': XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss', enable_categorical=True)
}

print(f"✅ {len(ALL_CLASSIFIERS)} classifiers loaded with default parameters")
print("Available classifiers:")
for name in ALL_CLASSIFIERS.keys():
    print(f"  - {name}")


✅ 14 classifiers loaded with default parameters
Available classifiers:
  - Random Forest
  - Gradient Boosting
  - AdaBoost
  - Logistic Regression
  - Ridge Classifier
  - SVM
  - Decision Tree
  - K-Nearest Neighbors
  - Naive Bayes
  - Linear Discriminant
  - Quadratic Discriminant
  - SGD Classifier
  - Extra Trees
  - XGBoost


In [11]:
# Model Comparison Function

def compare_models(X_train, X_test, y_train, y_test, models_dict):
    """
    Compare multiple classification models and print results.
    
    Parameters:
    -----------
    X_train, X_test : DataFrame
        Training and testing features
    y_train, y_test : Series
        Training and testing target variables
    models_dict : dict
        Dictionary with model names as keys and model instances as values
    """
    from sklearn.preprocessing import LabelEncoder
    import time
    
    results = []
    
    print("=" * 80)
    print("MODEL COMPARISON RESULTS")
    print("=" * 80)
    
    # Prepare data for models that don't handle categorical features natively
    X_train_encoded = X_train.copy()
    X_test_encoded = X_test.copy()
    
    # Encode categorical variables for non-XGBoost models
    label_encoders = {}
    for col in cat_cols:
        if col in X_train_encoded.columns:
            le = LabelEncoder()
            # Fit on training data and transform both train and test
            X_train_encoded[col] = le.fit_transform(X_train_encoded[col].astype(str))
            X_test_encoded[col] = le.transform(X_test_encoded[col].astype(str))
            label_encoders[col] = le
    
    for model_name, model in models_dict.items():
        print(f"\n🔄 Training {model_name}...")
        start_time = time.time()
        
        try:
            # Use original data for XGBoost (handles categorical natively with enable_categorical=True)
            if 'XGBoost' in model_name:
                # Ensure categorical columns are properly formatted for XGBoost
                X_train_xgb = X_train.copy()
                X_test_xgb = X_test.copy()
                
                # Convert categorical columns to category type for XGBoost
                for col in cat_cols:
                    if col in X_train_xgb.columns:
                        X_train_xgb[col] = X_train_xgb[col].astype('category')
                        X_test_xgb[col] = X_test_xgb[col].astype('category')
                
                model.fit(X_train_xgb, y_train)
                y_pred = model.predict(X_test_xgb)
                y_pred_proba = model.predict_proba(X_test_xgb)[:, 1] if hasattr(model, 'predict_proba') else None
            else:
                # Use encoded data for other models
                model.fit(X_train_encoded, y_train)
                y_pred = model.predict(X_test_encoded)
                y_pred_proba = model.predict_proba(X_test_encoded)[:, 1] if hasattr(model, 'predict_proba') else None
            
            training_time = time.time() - start_time
            
            # Calculate metrics - per class and overall
            accuracy = accuracy_score(y_test, y_pred)
            
            # Per-class metrics
            precision_per_class = precision_score(y_test, y_pred, average=None, zero_division=0)
            recall_per_class = recall_score(y_test, y_pred, average=None, zero_division=0)
            f1_per_class = f1_score(y_test, y_pred, average=None, zero_division=0)
            
            # Overall weighted metrics
            precision_weighted = precision_score(y_test, y_pred, average='weighted', zero_division=0)
            recall_weighted = recall_score(y_test, y_pred, average='weighted', zero_division=0)
            f1_weighted = f1_score(y_test, y_pred, average='weighted', zero_division=0)
            
            # ROC AUC (only if model supports probability prediction)
            roc_auc = None
            if y_pred_proba is not None:
                try:
                    roc_auc = roc_auc_score(y_test, y_pred_proba)
                except:
                    roc_auc = None
            
            # Store results with per-class metrics
            result = {
                'Model': model_name,
                'Accuracy': round(accuracy, 4),
                'Precision (Class 0)': round(precision_per_class[0], 4),
                'Precision (Class 1)': round(precision_per_class[1], 4),
                'Precision (Weighted)': round(precision_weighted, 4),
                'Recall (Class 0)': round(recall_per_class[0], 4),
                'Recall (Class 1)': round(recall_per_class[1], 4),
                'Recall (Weighted)': round(recall_weighted, 4),
                'F1-Score (Class 0)': round(f1_per_class[0], 4),
                'F1-Score (Class 1)': round(f1_per_class[1], 4),
                'F1-Score (Weighted)': round(f1_weighted, 4),
                'ROC-AUC': round(roc_auc, 4) if roc_auc else 'N/A',
                'Training Time (s)': round(training_time, 4)
            }
            
            results.append(result)
            
            # Print individual results
            print(f"   ✅ {model_name} completed in {training_time:.2f}s")
            print(f"   📊 Accuracy: {accuracy:.4f}")
            print(f"   📈 F1-Score - Class 0: {f1_per_class[0]:.4f}, Class 1: {f1_per_class[1]:.4f}, Weighted: {f1_weighted:.4f}")
            print(f"   🎯 Precision - Class 0: {precision_per_class[0]:.4f}, Class 1: {precision_per_class[1]:.4f}")
            print(f"   🔍 Recall - Class 0: {recall_per_class[0]:.4f}, Class 1: {recall_per_class[1]:.4f}")
            if roc_auc:
                print(f"   📈 ROC-AUC: {roc_auc:.4f}")
            
        except Exception as e:
            print(f"   ❌ Error training {model_name}: {str(e)}")
            continue
    
    # Create results DataFrame
    results_df = pd.DataFrame(results)
    
    # Sort by F1-Score (Weighted) (descending)
    if not results_df.empty:
        results_df = results_df.sort_values('F1-Score (Weighted)', ascending=False).reset_index(drop=True)
        
        print("\n" + "=" * 120)
        print("📊 COMPARISON SUMMARY - PER CLASS METRICS")
        print("=" * 120)
        
        # Display key metrics in a readable format
        display_cols = ['Model', 'Accuracy', 'Precision (Class 0)', 'Precision (Class 1)', 
                       'Recall (Class 0)', 'Recall (Class 1)', 'F1-Score (Class 0)', 
                       'F1-Score (Class 1)', 'F1-Score (Weighted)', 'ROC-AUC', 'Training Time (s)']
        print(results_df[display_cols].to_string(index=False))
        
        # Best model
        best_model = results_df.iloc[0]
        print(f"\n🏆 BEST MODEL: {best_model['Model']}")
        print(f"   Accuracy: {best_model['Accuracy']}")
        print(f"   F1-Score (Class 0 - No Renewal): {best_model['F1-Score (Class 0)']}")
        print(f"   F1-Score (Class 1 - Renewal): {best_model['F1-Score (Class 1)']}")
        print(f"   F1-Score (Weighted): {best_model['F1-Score (Weighted)']}")
        if best_model['ROC-AUC'] != 'N/A':
            print(f"   ROC-AUC: {best_model['ROC-AUC']}")
        
        # Additional insights
        print(f"\n📋 CLASS INTERPRETATION:")
        print(f"   Class 0: No Renewal (Minority class)")
        print(f"   Class 1: Renewal (Majority class)")
        print(f"   Focus on Class 0 metrics for detecting non-renewals!")
    
    return results_df

print("✅ Model comparison function ready!")
print("\nUsage: results = compare_models(X_train, X_test, y_train, y_test, ALL_CLASSIFIERS)")
print("Or use a subset: results = compare_models(X_train, X_test, y_train, y_test, {'Random Forest': RandomForestClassifier(), 'XGBoost': XGBClassifier()})")


✅ Model comparison function ready!

Usage: results = compare_models(X_train, X_test, y_train, y_test, ALL_CLASSIFIERS)
Or use a subset: results = compare_models(X_train, X_test, y_train, y_test, {'Random Forest': RandomForestClassifier(), 'XGBoost': XGBClassifier()})


In [12]:
# Example Usage - Run this cell to compare all models

# Compare all available classifiers
results = compare_models(X_train, X_valid, y_train, y_valid, ALL_CLASSIFIERS)

# Or compare specific models only
# selected_models = {
#     'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
#     'XGBoost': XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'),
#     'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)
# }
# results = compare_models(X_train, X_valid, y_train, y_valid, selected_models)


MODEL COMPARISON RESULTS

🔄 Training Random Forest...
   ✅ Random Forest completed in 12.41s
   📊 Accuracy: 0.9376
   📈 F1-Score - Class 0: 0.1821, Class 1: 0.9675, Weighted: 0.9184
   🎯 Precision - Class 0: 0.5068, Class 1: 0.9436
   🔍 Recall - Class 0: 0.1110, Class 1: 0.9928
   📈 ROC-AUC: 0.8260

🔄 Training Gradient Boosting...
   ✅ Gradient Boosting completed in 12.50s
   📊 Accuracy: 0.9376
   📈 F1-Score - Class 0: 0.1994, Class 1: 0.9676, Weighted: 0.9195
   🎯 Precision - Class 0: 0.5082, Class 1: 0.9443
   🔍 Recall - Class 0: 0.1240, Class 1: 0.9920
   📈 ROC-AUC: 0.8452

🔄 Training AdaBoost...
   ✅ AdaBoost completed in 4.21s
   📊 Accuracy: 0.9381
   📈 F1-Score - Class 0: 0.1821, Class 1: 0.9679, Weighted: 0.9187
   🎯 Precision - Class 0: 0.5288, Class 1: 0.9435
   🔍 Recall - Class 0: 0.1100, Class 1: 0.9935
   📈 ROC-AUC: 0.8352

🔄 Training Logistic Regression...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


   ✅ Logistic Regression completed in 3.88s
   📊 Accuracy: 0.9383
   📈 F1-Score - Class 0: 0.2010, Class 1: 0.9679, Weighted: 0.9199
   🎯 Precision - Class 0: 0.5299, Class 1: 0.9443
   🔍 Recall - Class 0: 0.1240, Class 1: 0.9927
   📈 ROC-AUC: 0.8322

🔄 Training Ridge Classifier...
   ✅ Ridge Classifier completed in 0.03s
   📊 Accuracy: 0.9384
   📈 F1-Score - Class 0: 0.1183, Class 1: 0.9681, Weighted: 0.9149
   🎯 Precision - Class 0: 0.5690, Class 1: 0.9411
   🔍 Recall - Class 0: 0.0660, Class 1: 0.9967

🔄 Training SVM...
   ✅ SVM completed in 247.51s
   📊 Accuracy: 0.9374
   📈 F1-Score - Class 0: 0.0000, Class 1: 0.9677, Weighted: 0.9071
   🎯 Precision - Class 0: 0.0000, Class 1: 0.9374
   🔍 Recall - Class 0: 0.0000, Class 1: 1.0000
   📈 ROC-AUC: 0.5066

🔄 Training Decision Tree...
   ✅ Decision Tree completed in 0.67s
   📊 Accuracy: 0.8937
   📈 F1-Score - Class 0: 0.2205, Class 1: 0.9430, Weighted: 0.8977
   🎯 Precision - Class 0: 0.2039, Class 1: 0.9486
   🔍 Recall - Class 0: 0.240

In [None]:
# Practical Example: Using the Model Comparison Function

# Step 1: Define models to compare
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Define your models dictionary
models_to_compare = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE, max_iter=1000),
    'SVM': SVC(random_state=RANDOM_STATE, probability=True),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    'Naive Bayes': GaussianNB(),
    'XGBoost': XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss')
}

print("📋 Models defined for comparison:")
for name in models_to_compare.keys():
    print(f"   - {name}")

print(f"\n💡 To run the comparison, use:")
print(f"   results = compare_models(X_train, X_valid, y_train, y_valid, models_to_compare)")
print(f"\n💡 For resampling comparison, use:")
print(f"   resampler = SMOTENC(categorical_features=cat_idx, sampling_strategy='auto', k_neighbors=5, random_state={RANDOM_STATE})")
print(f"   results = compare_models(X_train, X_valid, y_train, y_valid, models_to_compare, resampler=resampler, cat_cols=cat_cols)")

# Uncomment the lines below to run the actual comparison
results = compare_models(X_train, X_valid, y_train, y_valid, models_to_compare)
# print("\nResults saved in 'results' variable")


In [13]:
# Summary Analysis for models_to_compare Results

print("=" * 100)
print("📊 SUMMARY FOR MODELS_TO_COMPARE - PER CLASS ANALYSIS")
print("=" * 100)

# Check if results variable exists from the previous cell
if 'results' in locals():
    print(f"\n🎯 FOCUS: CLASS 0 (NO RENEWAL) PERFORMANCE")
    print("-" * 80)
    
    # Sort by Class 0 F1-Score
    class0_focus = results.sort_values('F1-Score (Class 0)', ascending=False)
    print("Ranking by F1-Score (Class 0 - No Renewal Detection):")
    print("=" * 80)
    
    for i, (idx, row) in enumerate(class0_focus.iterrows(), 1):
        print(f"{i:2d}. {row['Model']:<20} | F1: {row['F1-Score (Class 0)']:.4f} | "
              f"Precision: {row['Precision (Class 0)']:.4f} | Recall: {row['Recall (Class 0)']:.4f}")
    
    # Best performers
    best_class0 = class0_focus.iloc[0]
    best_overall = results.iloc[0]
    
    print(f"\n🏆 BEST FOR DETECTING NON-RENEWALS (Class 0):")
    print(f"   Model: {best_class0['Model']}")
    print(f"   F1-Score: {best_class0['F1-Score (Class 0)']:.4f}")
    print(f"   Precision: {best_class0['Precision (Class 0)']:.4f}")
    print(f"   Recall: {best_class0['Recall (Class 0)']:.4f}")
    
    print(f"\n🥇 BEST OVERALL MODEL (Weighted F1):")
    print(f"   Model: {best_overall['Model']}")
    print(f"   Weighted F1-Score: {best_overall['F1-Score (Weighted)']:.4f}")
    print(f"   Class 0 F1-Score: {best_overall['F1-Score (Class 0)']:.4f}")
    print(f"   Class 1 F1-Score: {best_overall['F1-Score (Class 1)']:.4f}")
    
    # Performance categories
    print(f"\n📊 PERFORMANCE CATEGORIES:")
    print("-" * 50)
    
    high_class0 = results[results['F1-Score (Class 0)'] > 0.25]
    print(f"\n🟢 EXCELLENT at detecting non-renewals (F1 > 0.25):")
    for _, row in high_class0.iterrows():
        print(f"   • {row['Model']}: F1={row['F1-Score (Class 0)']:.4f}")
    
    medium_class0 = results[(results['F1-Score (Class 0)'] > 0.15) & (results['F1-Score (Class 0)'] <= 0.25)]
    print(f"\n🟡 GOOD at detecting non-renewals (0.15 < F1 ≤ 0.25):")
    for _, row in medium_class0.iterrows():
        print(f"   • {row['Model']}: F1={row['F1-Score (Class 0)']:.4f}")
    
    poor_class0 = results[results['F1-Score (Class 0)'] <= 0.15]
    print(f"\n🔴 POOR at detecting non-renewals (F1 ≤ 0.15):")
    for _, row in poor_class0.iterrows():
        print(f"   • {row['Model']}: F1={row['F1-Score (Class 0)']:.4f}")
    
    print(f"\n" + "=" * 100)
    print("✅ SUMMARY COMPLETE - Focus on Class 0 metrics for business success!")
    print("=" * 100)
    
else:
    print("❌ No results found. Please run the previous cell first.")
    print("💡 Run: results = compare_models(X_train, X_valid, y_train, y_valid, models_to_compare)")


📊 SUMMARY FOR MODELS_TO_COMPARE - PER CLASS ANALYSIS

🎯 FOCUS: CLASS 0 (NO RENEWAL) PERFORMANCE
--------------------------------------------------------------------------------
Ranking by F1-Score (Class 0 - No Renewal Detection):
 1. Quadratic Discriminant | F1: 0.3585 | Precision: 0.3039 | Recall: 0.4370
 2. Linear Discriminant  | F1: 0.3013 | Precision: 0.4103 | Recall: 0.2380
 3. Decision Tree        | F1: 0.2205 | Precision: 0.2039 | Recall: 0.2400
 4. XGBoost              | F1: 0.2195 | Precision: 0.4517 | Recall: 0.1450
 5. Logistic Regression  | F1: 0.2010 | Precision: 0.5299 | Recall: 0.1240
 6. Gradient Boosting    | F1: 0.1994 | Precision: 0.5082 | Recall: 0.1240
 7. Random Forest        | F1: 0.1821 | Precision: 0.5068 | Recall: 0.1110
 8. AdaBoost             | F1: 0.1821 | Precision: 0.5288 | Recall: 0.1100
 9. Extra Trees          | F1: 0.1797 | Precision: 0.4911 | Recall: 0.1100
10. Ridge Classifier     | F1: 0.1183 | Precision: 0.5690 | Recall: 0.0660
11. SGD Classifie