## 🎛️ Hyperparameter Tuning (Full Performance Mode)

*Note: This section is optional. Enabling it adds roughly 20-60 minutes of runtime, depending on your hardware.*

In [None]:
# Full hyperparameter tuning (opt-in: set ENABLE_HYPERPARAMETER_TUNING = True)
# WARNING: This will take 20-60 minutes depending on your hardware
#
# NOTE: when enabled, this cell reads best_model_name, trained_models,
# model_results, X_train, y_train, OPTIM_CONFIG, np and time, so it must be
# run after the cells that define them.

ENABLE_HYPERPARAMETER_TUNING = False  # Set to True for full tuning

if ENABLE_HYPERPARAMETER_TUNING and best_model_name:
    print(f"🔧 Full hyperparameter tuning for {best_model_name}...")

    from sklearn.model_selection import GridSearchCV

    # Full parameter grids, keyed by the base model name
    param_grids = {
        'Logistic Regression': {
            'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
            'penalty': ['l1', 'l2'],
            'solver': ['liblinear', 'saga'],
            'max_iter': [1000, 2000]
        },
        'Random Forest': {
            'n_estimators': [50, 100, 200, 300],
            'max_depth': [10, 20, 30, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'bootstrap': [True, False]
        },
        'XGBoost': {
            'n_estimators': [50, 100, 200, 300],
            'max_depth': [3, 4, 5, 6, 8],
            'learning_rate': [0.01, 0.05, 0.1, 0.2],
            'subsample': [0.8, 0.9, 1.0],
            'colsample_bytree': [0.8, 0.9, 1.0]
        }
    }

    # Trained model names may carry a suffix (e.g. 'XGBoost (Lite)'), so the
    # grid is looked up by prefix rather than by exact name.
    grid_key = next(
        (key for key in param_grids if best_model_name.startswith(key)),
        None
    )

    if grid_key is not None:
        base_model = trained_models[best_model_name]
        param_grid = param_grids[grid_key]

        print(f"Grid search with {len(param_grid)} parameters...")
        print(f"Estimated combinations: {np.prod([len(v) for v in param_grid.values()])}")

        # The resource limits come from OPTIM_CONFIG, the only configuration
        # dict this notebook defines.
        grid_search = GridSearchCV(
            base_model,
            param_grid,
            cv=OPTIM_CONFIG['grid_cv'],
            scoring='roc_auc',
            n_jobs=OPTIM_CONFIG['n_jobs'],
            verbose=2,
            return_train_score=True
        )

        start_time = time.time()
        grid_search.fit(X_train, y_train)
        tuning_time = time.time() - start_time

        print(f"\n✅ Hyperparameter tuning completed in {tuning_time/60:.1f} minutes")
        print(f"✅ Best parameters: {grid_search.best_params_}")
        print(f"✅ Best CV score: {grid_search.best_score_:.4f}")
        # Difference against the CV mean recorded during the fast training run
        print(f"✅ Improvement: {grid_search.best_score_ - model_results[best_model_name]['cv_mean']:.4f}")

        # Keep the tuned estimator alongside the untuned one
        best_tuned_model = grid_search.best_estimator_
        trained_models[f"{best_model_name} (Tuned)"] = best_tuned_model

    else:
        print(f"No parameter grid defined for {best_model_name}")

else:
    print("⏭️ Hyperparameter tuning skipped (set ENABLE_HYPERPARAMETER_TUNING = True to enable)")
    print("💡 This saves 20-60 minutes of computation time")

# 💻 Customer Churn Prediction - Reduced Edition (8GB RAM Optimized)

This notebook is tuned to run quickly on a laptop with 8 GB of RAM.

## 💻 Laptop Configuration & Memory Setup

In [None]:
# Laptop-optimized settings
import psutil
import gc

def check_memory():
    """Print the percentage of system memory in use, with used/total in GB."""
    stats = psutil.virtual_memory()
    bytes_per_gb = 1024**3
    used_gb = stats.used / bytes_per_gb
    total_gb = stats.total / bytes_per_gb
    print(f"Memory: {stats.percent:.1f}% used ({used_gb:.1f}GB/{total_gb:.1f}GB)")
    
def cleanup_memory():
    """Run a garbage-collection pass and print a confirmation message."""
    gc.collect()  # the return value of collect() is not used
    confirmation = " Memory cleaned up"
    print(confirmation)


In [None]:
# Resource budget read by the model-setup, training and tuning cells.
OPTIM_CONFIG = dict(
    cv_folds=3,        # Reduced CV folds
    n_jobs=2,          # Limited parallel processing
    rf_estimators=50,  # Fewer trees
    max_iter=500,      # Fewer iterations
    grid_cv=2,         # Smaller grid search
    verbose=True,      # Progress tracking
)

print(" Laptop mode configuration loaded!")
check_memory()

##  Import Libraries

In [None]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import time
warnings.filterwarnings('ignore')

# Scikit-learn
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score

# XGBoost is optional: record whether it imports so later cells can skip it
try:
    from xgboost import XGBClassifier
except ImportError:
    XGBOOST_AVAILABLE = False
    print("XGBoost not available (skipping)")
else:
    XGBOOST_AVAILABLE = True
    print("XGBoost available (will use minimal settings)")

# Custom preprocessing
from src.preprocessing import ChurnDataPreprocessor

# Simple plotting style
# Apply matplotlib's 'default' style sheet
plt.style.use('default')

print("Lightweight imports complete!")
# Report memory usage now that the imports above have run
check_memory()

## Load and Preprocess Data

In [None]:
# Time the whole load + preprocess stage
start_time = time.time()

# Read the raw churn CSV
print("Loading dataset...")
df = pd.read_csv('data/WA_Fn-UseC_-Telco-Customer-Churn.csv')
print(f"Dataset shape: {df.shape}")

# Build the project preprocessor
print("\n Initializing preprocessor...")
preprocessor = ChurnDataPreprocessor()

# Run the pipeline; it returns the train/test features and targets
print("Running FAST preprocessing pipeline...")
X_train, X_test, y_train, y_test = preprocessor.full_preprocessing_pipeline(
    df,
    target_col='Churn',
    test_size=0.2,
    random_state=42,
)

preprocessing_time = time.time() - start_time
print(f"\n Preprocessing completed in {preprocessing_time:.1f} seconds!")
for split_label, split_features in (("Training set", X_train), ("Test set", X_test)):
    print(f"{split_label}: {split_features.shape}")
class_counts = np.bincount(y_train)
print(f"Class distribution: {class_counts}")

# Report memory, then collect garbage
check_memory()
cleanup_memory()

## 🚀 Laptop-Optimized Model Setup

In [None]:
# Build the candidate models using the limits in OPTIM_CONFIG
print("🔧 Setting up laptop-optimized models...")

logistic_regression = LogisticRegression(
    solver='liblinear',  # Faster solver
    max_iter=OPTIM_CONFIG['max_iter'],
    n_jobs=1,  # Single core for stability
    random_state=42,
)

random_forest = RandomForestClassifier(
    n_estimators=OPTIM_CONFIG['rf_estimators'],  # Reduced trees
    max_depth=15,  # Limit depth
    min_samples_split=5,  # Faster training
    min_samples_leaf=2,
    n_jobs=OPTIM_CONFIG['n_jobs'],  # Limited parallel
    random_state=42,
)

models = {
    'Logistic Regression': logistic_regression,
    'Random Forest': random_forest,
}

# XGBoost is added only when its import succeeded
if XGBOOST_AVAILABLE:
    xgb_lite_params = dict(
        random_state=42,
        n_estimators=30,  # Very few estimators
        max_depth=4,      # Shallow trees
        learning_rate=0.1,
        eval_metric='logloss',
        enable_categorical=False,
        use_label_encoder=False,
        verbosity=0,
        n_jobs=1,  # Single thread
    )
    models['XGBoost (Lite)'] = XGBClassifier(**xgb_lite_params)
    print("➕ XGBoost Lite added (minimal resource usage)")

print(f"🎯 {len(models)} models ready for training")
print(f"Models: {list(models.keys())}")
check_memory()

## ⚡ Fast Model Training (3-Fold CV)

In [None]:
# Quick training with progress tracking.
# Each model is fitted on the full training set and scored with k-fold
# ROC-AUC. The three dicts below are filled together, and only once both the
# fit and the cross-validation have succeeded, so they always share keys.
model_results = {}
trained_models = {}
training_times = {}

# Fast cross-validation setup
cv = StratifiedKFold(n_splits=OPTIM_CONFIG['cv_folds'], shuffle=True, random_state=42)

print(f"⚡ FAST training mode: {OPTIM_CONFIG['cv_folds']}-fold CV")
print("=" * 50)

overall_start = time.time()

for i, (name, model) in enumerate(models.items(), 1):
    print(f"\n[{i}/{len(models)}] Training {name}...")
    model_start = time.time()

    try:
        # Fit model on the full training set
        model.fit(X_train, y_train)

        # Fast cross-validation
        if 'XGBoost' in name:
            # Manual CV for XGBoost compatibility.
            # NOTE(review): .iloc assumes X_train / y_train are pandas
            # objects - confirm against the preprocessor's return types.
            cv_scores = []
            for train_idx, val_idx in cv.split(X_train, y_train):
                X_fold_train = X_train.iloc[train_idx]
                X_fold_val = X_train.iloc[val_idx]
                y_fold_train = y_train.iloc[train_idx]
                y_fold_val = y_train.iloc[val_idx]

                # Fresh estimator per fold, using the same settings as the
                # 'XGBoost (Lite)' model in the setup cell
                fold_model = XGBClassifier(
                    random_state=42, n_estimators=30, max_depth=4,
                    learning_rate=0.1, eval_metric='logloss',
                    enable_categorical=False, use_label_encoder=False,
                    verbosity=0, n_jobs=1
                )
                fold_model.fit(X_fold_train, y_fold_train)
                y_pred_proba = fold_model.predict_proba(X_fold_val)[:, 1]
                score = roc_auc_score(y_fold_val, y_pred_proba)
                cv_scores.append(score)
            cv_scores = np.array(cv_scores)
        else:
            # Standard CV for other models
            cv_scores = cross_val_score(
                model, X_train, y_train,
                cv=cv, scoring='roc_auc',
                n_jobs=1  # Single job for laptop
            )

        model_time = time.time() - model_start

        # Register the model only now: a failure above leaves no partial
        # entry behind, so the "Successfully trained" count stays accurate.
        trained_models[name] = model
        model_results[name] = {
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'cv_scores': cv_scores
        }
        training_times[name] = model_time

        print(f" ROC-AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
        print(f" Time: {model_time:.1f}s")

        # Memory check after each model
        if OPTIM_CONFIG['verbose']:
            check_memory()

    except Exception as e:
        # Deliberate best-effort: one failing model must not stop the others
        print(f" Error training {name}: {str(e)[:100]}...")
        print(f" Skipping and continuing...")

total_time = time.time() - overall_start
print(f"\n Training completed in {total_time:.1f} seconds!")
print(f" Successfully trained {len(trained_models)} models")

cleanup_memory()

## 📊 Quick Performance Summary

In [None]:
# Fast performance overview: tabulate the CV results and pick the best model.
print(" LAPTOP PERFORMANCE SUMMARY")
print("=" * 50)

if model_results:
    # Rank on the numeric CV mean, not on its formatted string. Sorting the
    # strings would put a NaN score ("nan") above every "0.xxxx" value and
    # select that model as the best one; na_position='last' keeps NaN at the
    # bottom of the ranking.
    ranking = pd.DataFrame(
        [
            {
                'Model': name,
                'ROC-AUC': results['cv_mean'],
                'Std': results['cv_std'],
                'Time (s)': training_times[name],
            }
            for name, results in model_results.items()
        ]
    ).sort_values('ROC-AUC', ascending=False, na_position='last')

    # Format for display only after ranking; performance_df keeps the same
    # string-valued columns as before.
    performance_df = ranking.copy()
    performance_df['ROC-AUC'] = ranking['ROC-AUC'].map("{:.4f}".format)
    performance_df['Std'] = ranking['Std'].map("{:.4f}".format)
    performance_df['Time (s)'] = ranking['Time (s)'].map("{:.1f}".format)

    print(performance_df.to_string(index=False))

    # Best model is the first row of the ranked table
    best_model_name = performance_df.iloc[0]['Model']
    best_score = performance_df.iloc[0]['ROC-AUC']
    print(f"\n Best model: {best_model_name} (ROC-AUC: {best_score})")

else:
    print("No models trained successfully")
    best_model_name = None

check_memory()

## 🎯 Test Set Evaluation (Best Model Only)

In [None]:
# Score only the selected best model on the held-out test split
has_best_model = bool(best_model_name) and best_model_name in trained_models

if not has_best_model:
    print(" No model available for evaluation")
else:
    print(f" Evaluating {best_model_name} on test set...")

    best_model = trained_models[best_model_name]

    # Hard labels for accuracy and the confusion matrix; column 1 of
    # predict_proba is used as the score for ROC-AUC
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]

    test_accuracy = accuracy_score(y_test, y_pred)
    test_roc_auc = roc_auc_score(y_test, y_pred_proba)

    print(f"\n TEST SET RESULTS:")
    print(f"Accuracy: {test_accuracy:.4f}")
    print(f"ROC-AUC:  {test_roc_auc:.4f}")

    # Confusion matrix printed as two labelled rows
    cm = confusion_matrix(y_test, y_pred)
    print(f"\n Confusion Matrix:")
    true_neg, false_pos = cm[0, 0], cm[0, 1]
    print(f"True Neg: {true_neg:4d} | False Pos: {false_pos:4d}")
    false_neg, true_pos = cm[1, 0], cm[1, 1]
    print(f"False Neg: {false_neg:4d} | True Pos:  {true_pos:4d}")

cleanup_memory()

## 💾 Quick Model Save (Essential Only)

In [None]:
# Persist the best model, the preprocessor and a small JSON summary
import joblib
import os

can_save = bool(best_model_name) and best_model_name in trained_models

if not can_save:
    print("❌ No model to save")
else:
    print(" Saving best model and preprocessor...")

    # Make sure the output directory exists
    os.makedirs('models', exist_ok=True)

    # Best model
    model_filename = 'models/laptop_best_churn_model.pkl'
    joblib.dump(trained_models[best_model_name], model_filename)
    print(f"Model saved: {model_filename}")

    # Preprocessor, saved through its own save method
    preprocessor_filename = 'models/laptop_churn_preprocessor.pkl'
    preprocessor.save_preprocessor(preprocessor_filename)

    # Model name, test metrics, training time and the configuration used
    model_info = {
        'model_name': best_model_name,
        'test_accuracy': test_accuracy,
        'test_roc_auc': test_roc_auc,
        'training_time': training_times[best_model_name],
        'config': OPTIM_CONFIG,
    }

    import json
    info_path = 'models/laptop_model_info.json'
    with open(info_path, 'w') as info_file:
        json.dump(model_info, info_file, indent=2)

    print(f" Model info saved: {info_path}")

check_memory()

## 🎉 Laptop Training Summary

In [None]:
# Closing report: best model, test metrics, configuration and runtime
banner = "=" * 60
print("\n" + banner)
print("LAPTOP TRAINING COMPLETE!")
print(banner)

if not best_model_name:
    print(" No models were successfully trained")
else:
    print(f" Best Model: {best_model_name}")
    print(f" Test Accuracy: {test_accuracy:.4f}")
    print(f" Test ROC-AUC: {test_roc_auc:.4f}")
    best_training_time = training_times[best_model_name]
    print(f" Training Time: {best_training_time:.1f}s")
    print(f" Model saved and ready for Streamlit app!")

# Echo every configuration entry that was in effect
print(f"\n Optimised/reduced Configuration Used:")
for setting in OPTIM_CONFIG:
    print(f"  {setting}: {OPTIM_CONFIG[setting]}")

# Runtime is measured from the start of the training cell
elapsed_seconds = time.time() - overall_start
print(f"\n Total Runtime: {elapsed_seconds:.1f} seconds")
print(f" Optimized for 8GB RAM laptops ✅")

final_memory = psutil.virtual_memory()
print(f" Final Memory Usage: {final_memory.percent:.1f}%")
print("\n Ready for deployment!")