# Customer Churn Prediction
### Production-ready Customer Churn Prediction Model
**Author:** Ashish Jha

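This notebook builds a customer churn prediction workflow: it preprocesses the customer data, balances the training set with SMOTE, scales the features, and trains and compares three classifiers (logistic regression, random forest, and XGBoost) before saving a model and its scaler to disk.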

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib
import warnings
warnings.filterwarnings('ignore')

## 2. Load and Preprocess Data

In [None]:
def load_and_preprocess_data(filepath):
    """Read a CSV file and apply the notebook's preprocessing steps.

    Steps, in order:
      1. Missing values in numeric columns are replaced with that column's
         median.
      2. Every object-dtype column is cast to ``str`` and label-encoded to
         integer codes. The fitted encoders are not kept.
      3. If both ``tenure`` and ``MonthlyCharges`` are present, their product
         is stored in a new column, ``TotalCharges_Tenure_Ratio``.

    Args:
        filepath: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The preprocessed ``pandas.DataFrame``.
    """
    frame = pd.read_csv(filepath)

    # Impute numeric gaps with per-column medians; non-numeric columns are
    # not touched by this step.
    numeric_medians = frame.median(numeric_only=True)
    frame = frame.fillna(numeric_medians)

    # Swap each text column for integer codes. The column list is captured
    # once, before any column is overwritten.
    text_columns = frame.select_dtypes(include=['object']).columns
    for name in text_columns:
        as_text = frame[name].astype(str)
        frame[name] = LabelEncoder().fit_transform(as_text)

    # Feature engineering
    # NOTE(review): the column is named "...Ratio" but holds a product
    # (MonthlyCharges * tenure) - confirm which one is intended.
    if {'tenure', 'MonthlyCharges'}.issubset(frame.columns):
        frame['TotalCharges_Tenure_Ratio'] = frame['MonthlyCharges'] * frame['tenure']

    return frame

## 3. Split, Balance, and Scale the Data

In [None]:
# Load data
df = load_and_preprocess_data('customer_data.csv')

# Prepare features and target: 'Churn' is the label, every other column is a feature
X = df.drop('Churn', axis=1)
y = df['Churn']

# Split data (80% train / 20% test).
# Stratify on the target so both partitions keep the same churn rate as the
# full data set; with an imbalanced label, a purely random split can leave the
# test set with a distorted share of churners and make the metrics unreliable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Handle class imbalance.
# Oversampling is applied to the training partition only, so the test set
# contains no synthetic rows.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Scale features.
# The scaler is fitted on the (balanced) training data and then reused as-is
# on the test data, so no test-set statistics enter the fit.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)

## 4. Train Multiple Models

In [None]:
def _fit_and_report(model, heading):
    """Fit ``model`` on the balanced, scaled training data and print test metrics.

    Prints ``heading``, the classification report for the test-set
    predictions, and the ROC-AUC computed from the second column of
    ``predict_proba``.

    Returns:
        The predicted labels for ``X_test_scaled``.
    """
    model.fit(X_train_scaled, y_train_balanced)
    predictions = model.predict(X_test_scaled)
    print(heading)
    print(classification_report(y_test, predictions))
    positive_scores = model.predict_proba(X_test_scaled)[:, 1]
    print(f'ROC-AUC Score: {roc_auc_score(y_test, positive_scores):.4f}')
    return predictions


# Logistic Regression
lr = LogisticRegression(random_state=42, max_iter=1000)
lr_pred = _fit_and_report(lr, 'Logistic Regression Results:')

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_pred = _fit_and_report(rf, '\nRandom Forest Results:')

# XGBoost
xgb = XGBClassifier(random_state=42, eval_metric='logloss')
xgb_pred = _fit_and_report(xgb, '\nXGBoost Results:')

## 5. Save Best Model

In [None]:
# Save the best performing model.
# "Best" is decided by ROC-AUC on the held-out test set, the same metric
# printed for each model during evaluation, instead of always saving XGBoost.
# On a tie, the first candidate listed wins.
candidates = {'Logistic Regression': lr, 'Random Forest': rf, 'XGBoost': xgb}
test_auc = {
    name: roc_auc_score(y_test, model.predict_proba(X_test_scaled)[:, 1])
    for name, model in candidates.items()
}
best_name = max(test_auc, key=test_auc.get)
print(f'Best model: {best_name} (ROC-AUC {test_auc[best_name]:.4f})')

# The fitted scaler is saved alongside the model because every model was
# trained on scaled features; new inputs need the same transform.
joblib.dump(candidates[best_name], 'churn_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
print('Models saved successfully!')