# Customer Churn Prediction using Machine Learning

This notebook demonstrates a complete machine learning pipeline for predicting customer churn using scikit-learn and pandas. The workflow includes data loading, preprocessing, exploratory data analysis (EDA), model training, evaluation, and model export.

**Goals:**
1. Load and preprocess the dataset.
2. Handle missing or categorical data appropriately.
3. Perform exploratory data analysis (EDA) with plots (Matplotlib or Seaborn).
4. Encode categorical variables and scale numeric features.
5. Split the data into training and testing sets.
6. Train the following classification models (XGBoost is trained only when the package is installed):
   - Logistic Regression
   - Random Forest
   - XGBoost (optional)
7. Evaluate all models using accuracy, precision, recall, F1 score, and ROC AUC.
8. Plot confusion matrix and ROC curves for each model.
9. Select the best model based on performance.
10. Optionally, export the final model as a .pkl file using joblib.

In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import joblib


: 

In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import joblib


: 

: 

: 

In [None]:
# Read the raw dataset from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='dataset.csv')
df.head(n=5)

: 

In [None]:
import pandas as pd

# Read the raw dataset from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='dataset.csv')
df.head(n=5)

## Data Overview and Initial Exploration

Let's explore the dataset structure, check for missing values, and understand the data types.

In [None]:
# Column dtypes and non-null counts.
df.info()
# Per-column count of missing values, printed under a heading.
print("\nMissing values per column:", df.isna().sum(), sep="\n")
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include='all')

## Exploratory Data Analysis (EDA)

Let's visualize the distribution of key features and the target variable (Churn).

In [None]:
# Class balance of the target column.
plt.figure(figsize=(6, 4))
sns.countplot(data=df, x='Churn').set_title('Churn Distribution')
plt.show()

# Histograms for every int64/float64 column, 20 bins each.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[num_cols].hist(bins=20, figsize=(12, 8))
plt.tight_layout()
plt.show()

## Data Preprocessing

We will handle missing values, encode categorical variables, and scale numeric features.

In [None]:
def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean, encode and scale a raw churn DataFrame for modelling.

    Steps, in order:
      1. Drop the ``customerID`` column when present.
      2. Fill missing int64/float64 values with the column median.
      3. Fill missing object-typed values with the column mode.
      4. Label-encode every object column except ``Churn``.
      5. Map ``Churn`` from ``'Yes'``/``'No'`` to ``1``/``0``.
      6. Standardise every int64/float64 feature column. The ``Churn``
         target is excluded so it stays a 0/1 class label.

    Args:
        df: Raw data containing a ``Churn`` column holding 'Yes'/'No'.

    Returns:
        A new, processed DataFrame. The input frame is not modified.

    Raises:
        KeyError: If ``df`` has no ``Churn`` column.
    """
    # Work on a copy so the caller's frame is never mutated, even when there
    # is no customerID column to drop (drop() alone would only copy in that
    # one branch).
    df = df.copy()

    # Drop customerID if present
    if 'customerID' in df.columns:
        df = df.drop('customerID', axis=1)

    # Fill missing values for numeric columns with median
    for col in df.select_dtypes(include=['int64', 'float64']).columns:
        df[col] = df[col].fillna(df[col].median())

    # Fill missing values for categorical columns with mode
    # (this includes Churn itself while it is still an object column)
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].fillna(df[col].mode()[0])

    # Encode categorical variables; the target is handled separately below
    for col in df.select_dtypes(include=['object']).columns:
        if col != 'Churn':
            df[col] = LabelEncoder().fit_transform(df[col])

    # Encode target; any value other than 'Yes'/'No' becomes NaN
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

    # Scale numeric features. Churn is numeric after the mapping above, so it
    # must be removed from the selection explicitly: scaling it would replace
    # the 0/1 labels with non-integer floats that classifiers reject.
    feature_cols = (
        df.select_dtypes(include=['int64', 'float64'])
        .columns
        .drop('Churn', errors='ignore')
    )
    # StandardScaler raises on an empty column set, so skip it in that case.
    if len(feature_cols) > 0:
        # NOTE(review): encoders and scaler are fit on whatever rows are
        # passed in. If this runs before a train/test split, test-set
        # statistics leak into training -- consider fitting on the training
        # split only.
        df[feature_cols] = StandardScaler().fit_transform(df[feature_cols])
    return df

# Preprocess a deep copy so the raw frame stays available for inspection.
df_processed = preprocess_data(df.copy(deep=True))
df_processed.head(n=5)

## Train-Test Split and Model Training

We will split the data and train Logistic Regression, Random Forest, and (optionally) XGBoost models.

In [None]:
# Split data and train models
# Features are every processed column except the target.
X = df_processed.drop(columns='Churn')
y = df_processed['Churn']

# 80/20 split, stratified on the target so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Logistic Regression (max_iter raised above the default to help convergence)
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# XGBoost is optional. Only the import is guarded, so that an ImportError
# raised while fitting is not misreported as "XGBoost not installed".
try:
    from xgboost import XGBClassifier
except ImportError:
    xgb_available = False
    print('XGBoost not installed. Skipping XGBoost model.')
else:
    # NOTE(review): use_label_encoder is deprecated in recent XGBoost
    # releases -- confirm against the installed version before removing it.
    xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    xgb.fit(X_train, y_train)
    xgb_available = True

## Model Evaluation

We will evaluate all models using accuracy, precision, recall, F1 score, ROC AUC, confusion matrix, and ROC curves.

In [None]:
# Evaluate a model: print metrics, then plot its ROC curve and confusion matrix
def evaluate_model(model, X_test, y_test, model_name):
    """Print test-set metrics for a fitted classifier and plot its diagnostics.

    Prints the classification report and the confusion matrix. When the model
    exposes ``predict_proba`` it also prints the ROC AUC and draws the ROC
    curve. The ROC curve and the confusion-matrix heatmap are drawn on
    separate figures so that neither is painted over the other.

    Args:
        model: Fitted classifier providing ``predict`` (and optionally
            ``predict_proba``).
        X_test: Feature matrix to evaluate on.
        y_test: True labels matching ``X_test``.
        model_name: Label used in printed headings, plot titles and legend.

    Returns:
        None. Output is printed and plotted.
    """
    y_pred = model.predict(X_test)
    # Column 1 is taken as the positive-class probability
    # (assumes a binary target -- TODO confirm for other datasets).
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None
    # Computed once and reused for both the printout and the heatmap.
    cm = confusion_matrix(y_test, y_pred)

    print(f"\n{model_name} Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"Confusion Matrix:\n{cm}")

    if y_proba is not None:
        auc = roc_auc_score(y_test, y_proba)
        print(f"ROC AUC: {auc:.4f}")
        fpr, tpr, _ = roc_curve(y_test, y_proba)
        # Dedicated figure: drawing on the current axes would put the curve
        # on the same axes as the heatmap below.
        _, roc_ax = plt.subplots()
        roc_ax.plot(fpr, tpr, label=f'{model_name} (AUC = {auc:.2f})')
        roc_ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
        roc_ax.set_title(f'{model_name} ROC Curve')
        roc_ax.set_xlabel('False Positive Rate')
        roc_ax.set_ylabel('True Positive Rate')
        roc_ax.legend()
    else:
        print("No probability scores available for ROC curve.")

    # Plot confusion matrix on its own axes
    _, cm_ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
    cm_ax.set_title(f'{model_name} Confusion Matrix')
    cm_ax.set_xlabel('Predicted')
    cm_ax.set_ylabel('Actual')
    plt.show()

# Models to evaluate; XGBoost is included only if it was trained successfully.
models_to_evaluate = [('Logistic Regression', logreg), ('Random Forest', rf)]
if globals().get('xgb_available', False):
    models_to_evaluate.append(('XGBoost', xgb))

for eval_name, eval_model in models_to_evaluate:
    evaluate_model(eval_model, X_test, y_test, eval_name)

# Combined ROC comparison. evaluate_model() calls plt.show() after each model,
# so nothing it draws survives to this point; the curves are recomputed here
# and drawn together on a fresh figure.
plt.figure()
for eval_name, eval_model in models_to_evaluate:
    if not hasattr(eval_model, 'predict_proba'):
        continue  # no probability scores, so no ROC curve for this model
    eval_proba = eval_model.predict_proba(X_test)[:, 1]
    eval_fpr, eval_tpr, _thresholds = roc_curve(y_test, eval_proba)
    eval_auc = roc_auc_score(y_test, eval_proba)
    plt.plot(eval_fpr, eval_tpr, label=f'{eval_name} (AUC = {eval_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
plt.title('ROC Curves')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

## Model Selection and Export

Select the best model based on evaluation metrics and export it as a .pkl file using joblib.

In [None]:
# Candidate models trained earlier; XGBoost only if it was fitted.
candidate_models = {'Logistic Regression': logreg, 'Random Forest': rf}
if globals().get('xgb_available', False):
    candidate_models['XGBoost'] = xgb

# Pick the model with the highest ROC AUC on the held-out test set instead of
# hard-coding one, so the exported model follows the measured performance.
candidate_scores = {
    name: roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    for name, model in candidate_models.items()
}
best_name = max(candidate_scores, key=candidate_scores.get)
best_model = candidate_models[best_name]
print(f'Selected {best_name} (test ROC AUC = {candidate_scores[best_name]:.4f})')

# NOTE: only the estimator is saved. The encoders and scaler created inside
# preprocess_data are not returned or persisted, so new data must be
# preprocessed the same way before calling predict on the loaded model.
joblib.dump(best_model, 'best_churn_model.pkl')
print('Best model exported as best_churn_model.pkl')

: 