# Telecom Customer Churn Prediction

## 1. Problem Statement
**Goal:** Predict which customers are likely to churn (leave) the telecom provider based on demographic and usage data. This helps the business identify at-risk customers for retention campaigns.

**Dataset:** Telco Customer Churn Dataset (Kaggle)
**Target Column:** `Churn` (Yes/No)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, f1_score, accuracy_score, precision_score, recall_score
import joblib

# Set visual style: seaborn whitegrid theme and a default 10x6 inch figure size.
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

# Create the output folders for plots and serialized models.
# exist_ok=True makes the call idempotent and removes the check-then-create
# race of testing os.path.exists() before calling os.makedirs().
for output_dir in ('eda_plots', 'models'):
    os.makedirs(output_dir, exist_ok=True)

## 2. Data Loading & Cleaning

In [None]:
# Read the raw CSV and discard the customerID column in one step.
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv').drop(columns='customerID')

# TotalCharges: parse to numbers (unparseable entries such as " " become NaN),
# then fill the resulting gaps with the column median.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges.fillna(total_charges.median())

# SeniorCitizen is stored as object dtype so it is one-hot encoded together
# with the other categorical columns later on.
df['SeniorCitizen'] = df['SeniorCitizen'].astype(object)

# Binary target: 1 = Yes (churned), 0 = No.
churn_codes = {'Yes': 1, 'No': 0}
df['Churn'] = df['Churn'].map(churn_codes)

remaining_missing = df['TotalCharges'].isnull().sum()
print("Data cleaning completed. Missing values in TotalCharges:", remaining_missing)
df.head()

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Overall churn rate.
# value_counts(normalize=True) returns fractions in [0, 1] ordered by frequency.
# Scale to percent so the values agree with the 'Percentage' axis label, and
# sort by class label so the bars are always drawn as 0 then 1, matching the
# title and the colour order regardless of which class is more common.
plt.figure()
churn_pct = df['Churn'].value_counts(normalize=True).sort_index() * 100
churn_pct.plot(kind='bar', color=['skyblue', 'salmon'])
plt.title('Overall Churn Rate (0: No, 1: Yes)')
plt.ylabel('Percentage')
plt.savefig('eda_plots/overall_churn_rate.png')
plt.close()

# Churn rate by key categorical predictors.
# Churn is coded 0/1, so the per-category mean that barplot draws is the
# churn rate of that category. One PNG is written per column.
categorical_cols = ['Contract', 'InternetService', 'PaymentMethod', 'gender', 'SeniorCitizen']
for col in categorical_cols:
    plt.figure()
    sns.barplot(x=col, y='Churn', data=df, palette='viridis')
    plt.title(f'Churn Rate by {col}')
    plt.savefig(f'eda_plots/churn_by_{col}.png')
    plt.close()

# MonthlyCharges distribution for churned vs. retained customers (boxplot)
plt.figure()
sns.boxplot(x='Churn', y='MonthlyCharges', data=df)
plt.title('Monthly Charges vs Churn')
plt.savefig('eda_plots/churn_monthlycharges_boxplot.png')
plt.close()

# Tenure distribution split by churn (histogram + KDE)
plt.figure()
sns.histplot(data=df, x='tenure', hue='Churn', kde=True, element="step")
plt.title('Tenure Distribution by Churn')
plt.savefig('eda_plots/churn_tenure_hist.png')
plt.close()

# Correlation heatmap of the numeric columns only (select_dtypes filters out
# the object-typed categorical columns).
plt.figure(figsize=(10, 8))
numeric_df = df.select_dtypes(include=[np.number])
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.savefig('eda_plots/correlation_heatmap.png')
plt.close()

# Split violin plot: MonthlyCharges per contract type, one half per churn class
plt.figure()
sns.violinplot(x='Contract', y='MonthlyCharges', hue='Churn', data=df, split=True)
plt.title('Monthly Charges vs Contract type by Churn')
plt.savefig('eda_plots/violin_contract_monthly_churn.png')
plt.close()

## 4. Feature Engineering & Preprocessing

In [None]:
# One-hot encode all categorical columns; drop_first=True drops one level per
# feature to avoid redundant (perfectly collinear) dummy columns.
df_encoded = pd.get_dummies(df, drop_first=True)

# Separate features and target
X = df_encoded.drop('Churn', axis=1)
y = df_encoded['Churn']

# Train-test split (80/20, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Take explicit copies before writing the scaled columns back. The frames
# returned by train_test_split are derived from X, and assigning into them
# directly can raise pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Scale numerical features (tenure, MonthlyCharges, TotalCharges).
# The scaler is fitted on the training split only and then applied to the test
# split, so no test-set statistics leak into training.
scaler = StandardScaler()
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

print("Preprocessing completed. Shape of X_train:", X_train.shape)

## 5. Model Training

In [None]:
# Baseline 1: decision tree with balanced class weights.
# fit() returns the estimator itself, so construction and training are chained.
dt_model = DecisionTreeClassifier(class_weight='balanced', random_state=42).fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)
print("Decision Tree Performance:", classification_report(y_test, dt_pred), sep="\n")

# Baseline 2: gradient boosting with default hyperparameters (preliminary,
# before the grid search).
gb_model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)
print("Gradient Boosting (Baseline) Performance:", classification_report(y_test, gb_pred), sep="\n")

## 6. Model Optimization

In [None]:
# Search space for the Gradient Boosting model:
# 2 * 3 * 3 * 2 * 2 = 72 candidate settings, each scored with 5-fold CV.
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
    subsample=[0.8, 1.0],
    min_samples_split=[2, 5],
)

# Stratified, shuffled folds with a fixed seed for reproducibility.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search scored by F1, using all available cores.
base_gb = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=base_gb,
    param_grid=param_grid,
    cv=skf,
    scoring='f1',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Keep the best model found by the search for the evaluation step.
best_gb_model = grid_search.best_estimator_

print("Best Parameters:", grid_search.best_params_)
print("Best F1 Score from CV:", grid_search.best_score_)

## 7. Evaluation

In [None]:
# Final predictions on the held-out test split: hard labels for the
# classification metrics, positive-class probabilities for the ROC curve.
y_pred = best_gb_model.predict(X_test)
y_prob = best_gb_model.predict_proba(X_test)[:, 1]

# Reports
print("--- Final Model: Gradient Boosting Classification Report ---")
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred):.4f}")

# Confusion Matrix.
# The figure is written to disk before being closed; closing it without
# savefig()/show() would discard the plot without ever outputting it.
plt.figure()
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig('eda_plots/confusion_matrix.png')
plt.close()

# ROC Curve (the dashed diagonal is the chance-level reference line).
# Saved to disk for the same reason as the confusion matrix.
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.savefig('eda_plots/roc_curve.png')
plt.close()
print(f"ROC-AUC: {roc_auc:.4f}")

# Save the best model
joblib.dump(best_gb_model, 'models/best_gradient_boosting_model.pkl')
print("Model saved to models/best_gradient_boosting_model.pkl")

## 8. Visualization: Feature Importance

In [None]:
# Rank the features by the tuned model's importance scores and keep the 15
# highest-scoring ones.
importances = best_gb_model.feature_importances_
feature_names = X.columns
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
    .head(15)
)

# Horizontal bar chart of the ranking, written to the plots folder.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df, palette='magma', ax=ax)
ax.set_title('Top 15 Important Features (Gradient Boosting)')
ax.set_xlabel('Relative Importance')
ax.set_ylabel('Feature')
fig.savefig('eda_plots/feature_importance.png')
plt.close(fig)

## 9. Conclusion
The tuned Gradient Boosting model performed well at predicting customer churn (see the evaluation metrics in Section 7).
Key predictors identified include:
1. **Contract type (Month-to-month)**
2. **Tenure**
3. **Monthly Charges**
4. **Internet Service (Fiber Optic)**

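Business recommendations focus on increasing customer loyalty by moving month-to-month customers onto longer contracts, and on proactively monitoring new (low-tenure) customers with high monthly charges, particularly those on fiber-optic internet service.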