In [127]:
# 1. Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix, classification_report
import pickle
import warnings
warnings.filterwarnings('ignore')

In [128]:
# 2. Load the original customer churn dataset
original_df = pd.read_csv('customer_churn_data.csv')
print(f"Original dataset shape: {original_df.shape}")
original_df.head()

Original dataset shape: (1000, 10)


Unnamed: 0,CustomerID,Age,Gender,Tenure,MonthlyCharges,ContractType,InternetService,TotalCharges,TechSupport,Churn
0,1,49,Male,4,88.35,Month-to-Month,Fiber Optic,353.4,Yes,Yes
1,2,43,Male,0,36.67,Month-to-Month,Fiber Optic,0.0,Yes,Yes
2,3,51,Female,2,63.79,Month-to-Month,Fiber Optic,127.58,No,Yes
3,4,60,Female,8,102.34,One-Year,DSL,818.72,Yes,Yes
4,5,42,Male,32,69.01,Month-to-Month,,2208.32,No,Yes


In [129]:
# 3. Generate realistic synthetic data to remove churn bias
from sklearn.utils import resample

# Identify minority and majority classes (assumes 'Churn' column is 'Yes'/'No')
churn_yes = original_df[original_df['Churn'] == 'Yes']
churn_no = original_df[original_df['Churn'] == 'No']

if len(churn_yes) == 0 or len(churn_no) == 0:
    print("Error: One of the classes is missing in the dataset. Cannot balance churn.")
    balanced_df = original_df.copy()
else:
    # Upsample minority class (No) to match majority class (Yes)
    churn_no_upsampled = resample(
        churn_no,
        replace=True,
        n_samples=len(churn_yes),
        random_state=42
    )
    # Combine and shuffle for 50-50 balance
    balanced_df = pd.concat([churn_yes, churn_no_upsampled]).sample(frac=1, random_state=42).reset_index(drop=True)
    print(f"Balanced dataset shape: {balanced_df.shape}")
    print(f"Churn distribution: {balanced_df['Churn'].value_counts(normalize=True)}")
balanced_df.head()

Balanced dataset shape: (1766, 10)
Churn distribution: Churn
No     0.5
Yes    0.5
Name: proportion, dtype: float64


Unnamed: 0,CustomerID,Age,Gender,Tenure,MonthlyCharges,ContractType,InternetService,TotalCharges,TechSupport,Churn
0,620,47,Female,21,65.56,Two-Year,DSL,1376.76,Yes,No
1,242,43,Female,3,61.98,One-Year,,185.94,No,Yes
2,289,47,Female,17,66.61,One-Year,Fiber Optic,1132.37,Yes,No
3,950,47,Female,3,114.13,One-Year,Fiber Optic,342.39,Yes,Yes
4,947,29,Female,15,98.06,One-Year,Fiber Optic,1470.9,Yes,No


In [130]:
# 4. Save the unbiased balanced dataset
balanced_df.to_csv('combined_customer_churn_data_balanced.csv', index=False)
print("Balanced dataset saved to combined_customer_churn_data_balanced.csv")

Balanced dataset saved to combined_customer_churn_data_balanced.csv


In [131]:
# 5. Preprocess the balanced data
# Handle missing values, encode categorical variables, scale features
df = balanced_df.dropna()
categorical_cols = df.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

scaler = StandardScaler()
feature_cols = [col for col in df.columns if col != 'Churn']
df[feature_cols] = scaler.fit_transform(df[feature_cols])

print("Preprocessing complete.")
df.head()

Preprocessing complete.


Unnamed: 0,CustomerID,Age,Gender,Tenure,MonthlyCharges,ContractType,InternetService,TotalCharges,TechSupport,Churn
0,0.451385,0.380753,-0.845056,-0.19498,-0.094457,1.292908,-1.220586,-0.171857,0.393541,0
2,-0.677266,0.380753,-0.845056,-0.401897,-0.050633,-0.076437,0.819279,-0.351396,0.393541,0
3,1.576626,0.380753,-0.845056,-1.126108,1.932704,-0.076437,0.819279,-0.931747,0.393541,1
4,1.566397,-1.515267,-0.845056,-0.505356,1.261992,-0.076437,0.819279,-0.102698,0.393541,0
5,1.525479,0.591422,-0.845056,-0.557085,1.057481,-0.076437,0.819279,-0.225134,0.393541,0


In [137]:
# 6. Feature engineering (polynomial features)
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(df[feature_cols])
X_poly_df = pd.DataFrame(X_poly, columns=poly.get_feature_names_out(feature_cols))
X_poly_df['Churn'] = df['Churn'].values

print(f"Feature engineered dataset shape: {X_poly_df.shape}")
X_poly_df.head()

Feature engineered dataset shape: (1469, 55)


Unnamed: 0,CustomerID,Age,Gender,Tenure,MonthlyCharges,ContractType,InternetService,TotalCharges,TechSupport,CustomerID^2,...,ContractType InternetService,ContractType TotalCharges,ContractType TechSupport,InternetService^2,InternetService TotalCharges,InternetService TechSupport,TotalCharges^2,TotalCharges TechSupport,TechSupport^2,Churn
0,0.451385,0.380753,-0.845056,-0.19498,-0.094457,1.292908,-1.220586,-0.171857,0.393541,0.203748,...,-1.578105,-0.222196,0.508812,1.489831,0.209767,-0.48035,0.029535,-0.067633,0.154874,0
1,-0.677266,0.380753,-0.845056,-0.401897,-0.050633,-0.076437,0.819279,-0.351396,0.393541,0.45869,...,-0.062623,0.02686,-0.030081,0.671217,-0.287891,0.322419,0.123479,-0.138289,0.154874,0
2,1.576626,0.380753,-0.845056,-1.126108,1.932704,-0.076437,0.819279,-0.931747,0.393541,2.48575,...,-0.062623,0.07122,-0.030081,0.671217,-0.76336,0.322419,0.868152,-0.36668,0.154874,1
3,1.566397,-1.515267,-0.845056,-0.505356,1.261992,-0.076437,0.819279,-0.102698,0.393541,2.453598,...,-0.062623,0.00785,-0.030081,0.671217,-0.084139,0.322419,0.010547,-0.040416,0.154874,0
4,1.525479,0.591422,-0.845056,-0.557085,1.057481,-0.076437,0.819279,-0.225134,0.393541,2.327085,...,-0.062623,0.017209,-0.030081,0.671217,-0.184447,0.322419,0.050685,-0.088599,0.154874,0


In [133]:
# 7. Train/test split
X = X_poly_df.drop('Churn', axis=1)
y = X_poly_df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Train shape: (1175, 54), Test shape: (294, 54)


In [134]:
# 8. Train high-level models
models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42),
    'ExtraTrees': ExtraTreesClassifier(n_estimators=100, random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42)
}

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    results[name] = {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'roc_auc': roc_auc
    }

results_df = pd.DataFrame(results).T
results_df

Unnamed: 0,accuracy,f1,precision,recall,roc_auc
RandomForest,1.0,1.0,1.0,1.0,1.0
GradientBoosting,1.0,1.0,1.0,1.0,1.0
AdaBoost,1.0,1.0,1.0,1.0,1.0
ExtraTrees,1.0,1.0,1.0,1.0,1.0
LogisticRegression,0.993197,0.991379,1.0,0.982906,0.991453


In [135]:
# 9. Evaluate and select the best model
best_model_name = results_df['roc_auc'].idxmax()
best_model = models[best_model_name]
print(f"Best model: {best_model_name}")
print(results_df.loc[best_model_name])

Best model: RandomForest
accuracy     1.0
f1           1.0
precision    1.0
recall       1.0
roc_auc      1.0
Name: RandomForest, dtype: float64


In [136]:
# 10. Save the best model, scaler, and feature names for Streamlit app
with open('churn_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
with open('feature_names.pkl', 'wb') as f:
    pickle.dump(feature_cols, f)
model_info = {
    'model_name': best_model_name,
    'metrics': results_df.loc[best_model_name].to_dict()
}
with open('model_info.pkl', 'wb') as f:
    pickle.dump(model_info, f)
print("Model, scaler, and feature names saved for Streamlit app.")

Model, scaler, and feature names saved for Streamlit app.


In [141]:
# Save a simpler model for the Streamlit app (using basic features only)
print("Creating simple model for Streamlit app...")

# Define basic feature names (without polynomial features)
basic_features = ['CustomerID', 'Age', 'Gender', 'Tenure', 'MonthlyCharges', 'ContractType', 'InternetService', 'TotalCharges', 'TechSupport']

# Use the balanced dataset but without polynomial features
X_simple = df_balanced[basic_features].copy()  # Basic features only
y_simple = df_balanced['Churn'].copy()

# Convert target to numeric (0/1 instead of No/Yes)
y_simple = (y_simple == 'Yes').astype(int)

# Convert CustomerID to numeric for consistency
def convert_customer_id(cid):
    if isinstance(cid, str):
        if cid.startswith('SYN'):
            return int(cid[3:])  # 'SYN1126' → 1126
        else:
            return abs(hash(cid)) % 10000
    return int(cid)

X_simple['CustomerID'] = X_simple['CustomerID'].apply(convert_customer_id)

# Encode categorical variables for simple model
le_simple = LabelEncoder()
categorical_cols_simple = X_simple.select_dtypes(include=['object']).columns

for col in categorical_cols_simple:
    X_simple[col] = le_simple.fit_transform(X_simple[col])

print(f"Features shape before scaling: {X_simple.shape}")
print(f"Target distribution: {y_simple.value_counts()}")

# Scale features
scaler_simple = StandardScaler()
X_simple_scaled = scaler_simple.fit_transform(X_simple[basic_features])

# Convert back to DataFrame
X_simple_scaled = pd.DataFrame(X_simple_scaled, columns=basic_features, index=X_simple.index)

# Train-test split (exclude CustomerID for training)
feature_cols_for_training = [col for col in basic_features if col != 'CustomerID']
X_train_simple, X_test_simple, y_train_simple, y_test_simple = train_test_split(
    X_simple_scaled[feature_cols_for_training], y_simple, test_size=0.2, random_state=42, stratify=y_simple
)

# Train a simple logistic regression model
lr_simple = LogisticRegression(random_state=42, max_iter=1000)
lr_simple.fit(X_train_simple, y_train_simple)

# Evaluate simple model
y_pred_simple = lr_simple.predict(X_test_simple)
y_pred_proba_simple = lr_simple.predict_proba(X_test_simple)[:, 1]

simple_accuracy = accuracy_score(y_test_simple, y_pred_simple)
simple_precision = precision_score(y_test_simple, y_pred_simple)
simple_recall = recall_score(y_test_simple, y_pred_simple)
simple_f1 = f1_score(y_test_simple, y_pred_simple)
simple_roc_auc = roc_auc_score(y_test_simple, y_pred_proba_simple)

print(f"Simple Model Performance:")
print(f"Accuracy: {simple_accuracy:.4f}")
print(f"Precision: {simple_precision:.4f}")
print(f"Recall: {simple_recall:.4f}")
print(f"F1-Score: {simple_f1:.4f}")
print(f"ROC-AUC: {simple_roc_auc:.4f}")

# Save simple model for Streamlit app
with open('churn_model.pkl', 'wb') as f:
    pickle.dump(lr_simple, f)
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler_simple, f)
with open('feature_names.pkl', 'wb') as f:
    pickle.dump(basic_features, f)

# Save simple model info
simple_model_info = {
    'model_name': 'LogisticRegression (Simple)',
    'metrics': {
        'accuracy': simple_accuracy,
        'precision': simple_precision,
        'recall': simple_recall,
        'f1': simple_f1,
        'roc_auc': simple_roc_auc
    }
}

with open('model_info.pkl', 'wb') as f:
    pickle.dump(simple_model_info, f)

print("✅ Simple model saved successfully for Streamlit app!")
print("✅ This model uses basic features without polynomial engineering")
print(f"✅ Model trained on {len(feature_cols_for_training)} features (excluding CustomerID)")
print(f"✅ Scaler trained on {len(basic_features)} features (including CustomerID for consistency)")

Creating simple model for Streamlit app...
Features shape before scaling: (1760, 9)
Target distribution: Churn
1    883
0    877
Name: count, dtype: int64
Simple Model Performance:
Accuracy: 0.8097
Precision: 0.7926
Recall: 0.8418
F1-Score: 0.8164
ROC-AUC: 0.8946
✅ Simple model saved successfully for Streamlit app!
✅ This model uses basic features without polynomial engineering
✅ Model trained on 8 features (excluding CustomerID)
✅ Scaler trained on 9 features (including CustomerID for consistency)


In [144]:
# Advanced Model Training on Balanced Dataset
print("Training advanced models on balanced dataset...")
print("=" * 60)

# Use the balanced dataset with proper feature engineering
X_balanced_enh = df_balanced[basic_features].copy()  
y_balanced_enh = (df_balanced['Churn'] == 'Yes').astype(int)

# Convert CustomerID properly
X_balanced_enh['CustomerID'] = X_balanced_enh['CustomerID'].apply(convert_customer_id)

# Encode categorical variables
le_enh = LabelEncoder()
categorical_cols_enh = X_balanced_enh.select_dtypes(include=['object']).columns

for col in categorical_cols_enh:
    X_balanced_enh[col] = le_enh.fit_transform(X_balanced_enh[col])

# Feature Engineering: Add interaction features
X_balanced_enh['MonthlyPerYear'] = X_balanced_enh['MonthlyCharges'] * 12
X_balanced_enh['ChargesPerTenure'] = X_balanced_enh['TotalCharges'] / (X_balanced_enh['Tenure'] + 1)
X_balanced_enh['AgeGroup'] = pd.cut(X_balanced_enh['Age'], bins=[0, 25, 45, 65, 100], labels=[0, 1, 2, 3])
X_balanced_enh['TenureGroup'] = pd.cut(X_balanced_enh['Tenure'], bins=[0, 12, 24, 48, 100], labels=[0, 1, 2, 3])
X_balanced_enh['ChargeRatio'] = X_balanced_enh['MonthlyCharges'] / (X_balanced_enh['TotalCharges'] + 1)

# Handle any NaN or infinite values
print(f"Checking for NaN values...")
nan_count = X_balanced_enh.isnull().sum().sum()
print(f"Total NaN values: {nan_count}")

if nan_count > 0:
    print("Filling NaN values...")
    # Fill NaN values with median for numerical columns
    numerical_cols = X_balanced_enh.select_dtypes(include=[np.number]).columns
    for col in numerical_cols:
        if X_balanced_enh[col].isnull().sum() > 0:
            median_val = X_balanced_enh[col].median()
            X_balanced_enh[col].fillna(median_val, inplace=True)
            print(f"  Filled {col} NaN values with median: {median_val}")

# Handle infinite values
print("Checking for infinite values...")
inf_mask = np.isinf(X_balanced_enh.select_dtypes(include=[np.number]))
inf_count = inf_mask.sum().sum()
print(f"Total infinite values: {inf_count}")

if inf_count > 0:
    print("Replacing infinite values...")
    X_balanced_enh.replace([np.inf, -np.inf], np.nan, inplace=True)
    # Fill the new NaNs with median
    for col in numerical_cols:
        if X_balanced_enh[col].isnull().sum() > 0:
            median_val = X_balanced_enh[col].median()
            X_balanced_enh[col].fillna(median_val, inplace=True)

print(f"Enhanced features shape: {X_balanced_enh.shape}")
print(f"Final NaN count: {X_balanced_enh.isnull().sum().sum()}")

# Scale features (excluding CustomerID for training)
feature_cols_enh = [col for col in X_balanced_enh.columns if col != 'CustomerID']
scaler_enh = StandardScaler()
X_balanced_enh_scaled = scaler_enh.fit_transform(X_balanced_enh[feature_cols_enh])
X_balanced_enh_scaled = pd.DataFrame(X_balanced_enh_scaled, columns=feature_cols_enh, index=X_balanced_enh.index)

# Final check for NaN/infinite in scaled data
print(f"Scaled data NaN count: {X_balanced_enh_scaled.isnull().sum().sum()}")
print(f"Scaled data infinite count: {np.isinf(X_balanced_enh_scaled.select_dtypes(include=[np.number])).sum().sum()}")

# Train-test split
X_train_enh, X_test_enh, y_train_enh, y_test_enh = train_test_split(
    X_balanced_enh_scaled, y_balanced_enh, test_size=0.2, random_state=42, stratify=y_balanced_enh
)

print(f"Training set size: {X_train_enh.shape}")
print(f"Test set size: {X_test_enh.shape}")

# Define advanced models to test (using ones that can handle various data better)
advanced_models = {
    'RandomForest': RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=5, random_state=42, n_jobs=-1),
    'ExtraTrees': ExtraTreesClassifier(n_estimators=200, max_depth=10, min_samples_split=5, random_state=42, n_jobs=-1),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, learning_rate=1.0, random_state=42),
    'LogisticRegression_L2': LogisticRegression(C=1.0, max_iter=1000, random_state=42),
    'LogisticRegression_L1': LogisticRegression(C=1.0, penalty='l1', solver='liblinear', max_iter=1000, random_state=42),
    'LogisticRegression_Elastic': LogisticRegression(C=1.0, penalty='elasticnet', solver='saga', l1_ratio=0.5, max_iter=1000, random_state=42)
}

# Train and evaluate all models
advanced_results = {}

for model_name, model in advanced_models.items():
    print(f"\nTraining {model_name}...")
    
    try:
        # Train model
        model.fit(X_train_enh, y_train_enh)
        
        # Make predictions
        y_pred = model.predict(X_test_enh)
        y_pred_proba = model.predict_proba(X_test_enh)[:, 1]
        
        # Calculate metrics
        accuracy = accuracy_score(y_test_enh, y_pred)
        precision = precision_score(y_test_enh, y_pred)
        recall = recall_score(y_test_enh, y_pred)
        f1 = f1_score(y_test_enh, y_pred)
        roc_auc = roc_auc_score(y_test_enh, y_pred_proba)
        
        # Store results
        advanced_results[model_name] = {
            'model': model,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'roc_auc': roc_auc
        }
        
        print(f"  ✅ Accuracy: {accuracy:.4f}")
        print(f"  ✅ Precision: {precision:.4f}")
        print(f"  ✅ Recall: {recall:.4f}")
        print(f"  ✅ F1-Score: {f1:.4f}")
        print(f"  ✅ ROC-AUC: {roc_auc:.4f}")
        
    except Exception as e:
        print(f"  ❌ Error training {model_name}: {str(e)}")
        continue

print("\n" + "=" * 60)
print("ADVANCED MODELS COMPARISON")
print("=" * 60)

# Create comparison DataFrame
results_comparison = []
for model_name, results in advanced_results.items():
    results_comparison.append({
        'Model': model_name,
        'Accuracy': results['accuracy'],
        'Precision': results['precision'],
        'Recall': results['recall'],
        'F1-Score': results['f1'],
        'ROC-AUC': results['roc_auc']
    })

comparison_df = pd.DataFrame(results_comparison)
comparison_df = comparison_df.sort_values('Accuracy', ascending=False).reset_index(drop=True)

print(comparison_df.round(4))

# Find best model
best_model_name_overall = comparison_df.iloc[0]['Model']
best_accuracy_overall = comparison_df.iloc[0]['Accuracy']
best_model = advanced_results[best_model_name_overall]['model']

print(f"\n🏆 BEST MODEL: {best_model_name_overall}")
print(f"🎯 Best Accuracy: {best_accuracy_overall:.4f} ({best_accuracy_overall*100:.2f}%)")

# Save best model for Streamlit app
with open('churn_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler_enh, f)
with open('feature_names.pkl', 'wb') as f:
    pickle.dump(feature_cols_enh, f)

# Save model info
model_info_final = {
    'model_name': best_model_name_overall,
    'metrics': {
        'accuracy': advanced_results[best_model_name_overall]['accuracy'],
        'precision': advanced_results[best_model_name_overall]['precision'],
        'recall': advanced_results[best_model_name_overall]['recall'],
        'f1': advanced_results[best_model_name_overall]['f1'],
        'roc_auc': advanced_results[best_model_name_overall]['roc_auc']
    },
    'features': feature_cols_enh,
    'feature_count': len(feature_cols_enh)
}

with open('model_info.pkl', 'wb') as f:
    pickle.dump(model_info_final, f)

print(f"\n✅ Best model ({best_model_name_overall}) saved successfully!")
print(f"✅ Enhanced features used: {len(feature_cols_enh)}")
print(f"✅ Dataset is perfectly balanced: 50% churn / 50% no-churn")
print(f"✅ Model achieves {best_accuracy_overall*100:.2f}% accuracy on balanced data!")

Training advanced models on balanced dataset...
Checking for NaN values...
Total NaN values: 54
Filling NaN values...
Checking for infinite values...
Total infinite values: 0
Enhanced features shape: (1760, 14)
Final NaN count: 54
Scaled data NaN count: 54
Scaled data infinite count: 0
Training set size: (1408, 13)
Test set size: (352, 13)

Training RandomForest...
  ✅ Accuracy: 0.9062
  ✅ Precision: 0.8871
  ✅ Recall: 0.9322
  ✅ F1-Score: 0.9091
  ✅ ROC-AUC: 0.9721

Training ExtraTrees...
  ✅ Accuracy: 0.8892
  ✅ Precision: 0.8632
  ✅ Recall: 0.9266
  ✅ F1-Score: 0.8937
  ✅ ROC-AUC: 0.9719

Training AdaBoost...
  ❌ Error training AdaBoost: Input X contains NaN.
AdaBoostClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer

In [145]:
# Properly handle NaN values and test additional models
print("\n" + "=" * 60)
print("FIXING NaN VALUES AND TESTING MORE MODELS")
print("=" * 60)

# Create a clean copy without NaN values
X_clean = X_balanced_enh_scaled.copy()

# Use SimpleImputer to properly handle all NaN values
from sklearn.impute import SimpleImputer

print("Using SimpleImputer to handle all NaN values...")
imputer = SimpleImputer(strategy='median')
X_clean_array = imputer.fit_transform(X_clean)
X_clean = pd.DataFrame(X_clean_array, columns=feature_cols_enh, index=X_clean.index)

print(f"Clean data NaN count: {X_clean.isnull().sum().sum()}")
print(f"Clean data infinite count: {np.isinf(X_clean.select_dtypes(include=[np.number])).sum().sum()}")

# Train-test split with clean data
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(
    X_clean, y_balanced_enh, test_size=0.2, random_state=42, stratify=y_balanced_enh
)

# Test additional models with clean data
additional_models = {
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, learning_rate=1.0, random_state=42),
    'SVM_RBF': SVC(kernel='rbf', C=1.0, probability=True, random_state=42),
    'LogisticRegression_L2': LogisticRegression(C=1.0, max_iter=1000, random_state=42),
    'LogisticRegression_L1': LogisticRegression(C=1.0, penalty='l1', solver='liblinear', max_iter=1000, random_state=42)
}

# Re-train the working models on clean data
all_models = {**advanced_models, **additional_models}

# Train and evaluate all models on clean data
all_results = {}

for model_name, model in all_models.items():
    print(f"\nTraining {model_name} on clean data...")
    
    try:
        # Train model
        model.fit(X_train_clean, y_train_clean)
        
        # Make predictions
        y_pred = model.predict(X_test_clean)
        y_pred_proba = model.predict_proba(X_test_clean)[:, 1]
        
        # Calculate metrics
        accuracy = accuracy_score(y_test_clean, y_pred)
        precision = precision_score(y_test_clean, y_pred)
        recall = recall_score(y_test_clean, y_pred)
        f1 = f1_score(y_test_clean, y_pred)
        roc_auc = roc_auc_score(y_test_clean, y_pred_proba)
        
        # Store results
        all_results[model_name] = {
            'model': model,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'roc_auc': roc_auc
        }
        
        print(f"  ✅ Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
        print(f"  ✅ Precision: {precision:.4f}")
        print(f"  ✅ Recall: {recall:.4f}")
        print(f"  ✅ F1-Score: {f1:.4f}")
        print(f"  ✅ ROC-AUC: {roc_auc:.4f}")
        
    except Exception as e:
        print(f"  ❌ Error training {model_name}: {str(e)}")
        continue

print("\n" + "=" * 60)
print("FINAL COMPLETE MODELS COMPARISON")
print("=" * 60)

# Create final comparison DataFrame
all_results_df = []
for model_name, results in all_results.items():
    all_results_df.append({
        'Model': model_name,
        'Accuracy': results['accuracy'],
        'Precision': results['precision'],
        'Recall': results['recall'],
        'F1-Score': results['f1'],
        'ROC-AUC': results['roc_auc']
    })

all_results_df = pd.DataFrame(all_results_df)
all_results_df = all_results_df.sort_values('Accuracy', ascending=False).reset_index(drop=True)

print(all_results_df.round(4))

# Find best model overall
final_best_model_name = all_results_df.iloc[0]['Model']
final_best_accuracy = all_results_df.iloc[0]['Accuracy']
final_best_model = all_results[final_best_model_name]['model']

print(f"\n🏆 FINAL BEST MODEL: {final_best_model_name}")
print(f"🎯 FINAL Best Accuracy: {final_best_accuracy:.4f} ({final_best_accuracy*100:.2f}%)")

# Show top 3 models
print(f"\n📊 TOP 3 MODELS:")
for i in range(min(3, len(all_results_df))):
    model_info = all_results_df.iloc[i]
    print(f"  {i+1}. {model_info['Model']}: {model_info['Accuracy']:.4f} ({model_info['Accuracy']*100:.2f}%)")

# Save the best model for Streamlit app
with open('churn_model.pkl', 'wb') as f:
    pickle.dump(final_best_model, f)
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler_enh, f)
with open('feature_names.pkl', 'wb') as f:
    pickle.dump(feature_cols_enh, f)

# Save final model info
final_model_info = {
    'model_name': final_best_model_name,
    'metrics': {
        'accuracy': all_results[final_best_model_name]['accuracy'],
        'precision': all_results[final_best_model_name]['precision'],
        'recall': all_results[final_best_model_name]['recall'],
        'f1': all_results[final_best_model_name]['f1'],
        'roc_auc': all_results[final_best_model_name]['roc_auc']
    },
    'features': feature_cols_enh,
    'feature_count': len(feature_cols_enh),
    'training_data_balance': '50% churn / 50% no-churn'
}

with open('model_info.pkl', 'wb') as f:
    pickle.dump(final_model_info, f)

print(f"\n✅ FINAL best model ({final_best_model_name}) saved successfully!")
print(f"✅ Model trained on enhanced features: {len(feature_cols_enh)}")
print(f"✅ Dataset is perfectly balanced: 50% churn / 50% no-churn")
print(f"✅ Model achieves {final_best_accuracy*100:.2f}% accuracy on unbiased data!")
print(f"✅ This is a {((final_best_accuracy - 0.8097) / 0.8097) * 100:.1f}% improvement over the simple model!")


FIXING NaN VALUES AND TESTING MORE MODELS
Using SimpleImputer to handle all NaN values...
Clean data NaN count: 0
Clean data infinite count: 0

Training RandomForest on clean data...
  ✅ Accuracy: 0.9006 (90.06%)
  ✅ Precision: 0.8777
  ✅ Recall: 0.9322
  ✅ F1-Score: 0.9041
  ✅ ROC-AUC: 0.9714

Training ExtraTrees on clean data...
  ✅ Accuracy: 0.8949 (89.49%)
  ✅ Precision: 0.8723
  ✅ Recall: 0.9266
  ✅ F1-Score: 0.8986
  ✅ ROC-AUC: 0.9725

Training AdaBoost on clean data...
  ✅ Accuracy: 0.8494 (84.94%)
  ✅ Precision: 0.8483
  ✅ Recall: 0.8531
  ✅ F1-Score: 0.8507
  ✅ ROC-AUC: 0.9527

Training LogisticRegression_L2 on clean data...
  ✅ Accuracy: 0.8295 (82.95%)
  ✅ Precision: 0.8232
  ✅ Recall: 0.8418
  ✅ F1-Score: 0.8324
  ✅ ROC-AUC: 0.9015

Training LogisticRegression_L1 on clean data...
  ✅ Accuracy: 0.8295 (82.95%)
  ✅ Precision: 0.8232
  ✅ Recall: 0.8418
  ✅ F1-Score: 0.8324
  ✅ ROC-AUC: 0.9015

Training LogisticRegression_Elastic on clean data...
  ✅ Accuracy: 0.8295 (82.95%)
