# Customer Churn Prediction - Machine Learning Model

## Project Overview
This notebook implements a complete Machine Learning pipeline to predict customer churn for an e-commerce business. The model identifies customers who are likely to discontinue using the company's services, enabling proactive retention strategies.

## Objectives
- Build a machine learning model to predict customer churn
- Handle imbalanced datasets using SMOTE
- Evaluate models using appropriate metrics
- Generate Confusion Matrix and ROC Curve visualizations
- Explain evaluation metrics and visualizations


In [None]:
# Import required libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, classification_report
)
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
import kagglehub
import warnings
# Keep cell output uncluttered by suppressing warnings.
# NOTE(review): no category is given, so this hides every warning raised after
# this point (deprecations included) - consider narrowing it.
warnings.filterwarnings('ignore')

# Global plotting defaults: seaborn grid theme and a wide default figure size
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

print("✓ All libraries imported successfully!")


## Step 1: Download Dataset

Download the dataset from Kaggle using `kagglehub`.


In [None]:
# Step 1: download the Kaggle dataset and choose the data file to load.
#
# Produces (for later cells):
#   path       - local directory returned by kagglehub for the dataset
#   data_files - sorted list of every CSV/Excel file found under `path`
#   file_path  - the file the loading cell will read (first entry of data_files)
#
# Raises FileNotFoundError when the download contains no CSV/Excel file; any
# exception raised by the download itself is reported and re-raised.
print("=" * 60)
print("STEP 1: Downloading Dataset")
print("=" * 60)

# Suffix matching is case-sensitive, consistent with the suffix checks in the
# loading cell that consumes `file_path`.
DATA_EXTENSIONS = ('.csv', '.xlsx', '.xls')

# Only the download is wrapped: a failure while searching the files is not a
# download error and should not be reported as one.
try:
    # Download latest version
    path = kagglehub.dataset_download("ankitverma2010/ecommerce-customer-churn-analysis-and-prediction")
except Exception as e:
    print(f"Error downloading dataset: {e}")
    raise

print("✓ Dataset downloaded successfully!")
print(f"Path to dataset files: {path}")

# os.walk yields entries in filesystem-dependent order, so the candidates are
# sorted to make the choice of "first data file" reproducible across machines.
data_files = sorted(
    os.path.join(root, name)
    for root, _dirs, names in os.walk(path)
    for name in names
    if name.endswith(DATA_EXTENSIONS)
)

if not data_files:
    raise FileNotFoundError("No CSV or Excel files found in downloaded dataset")

print("\nFound data files:")
for data_file in data_files:
    print(f"  - {data_file}")

file_path = data_files[0]  # Use first data file (in sorted order)


## Step 2: Load and Explore Data

Load the dataset and explore its structure, missing values, and basic statistics.


In [None]:
# Step 2: load the file chosen in Step 1 (`file_path`) into the DataFrame `df`.
#
# - CSV files are read directly.
# - Excel workbooks are opened once; the sheet to load is chosen by:
#     1. the first sheet whose name does not contain 'dict', 'meta' or 'info'
#        and whose first rows have more than two columns;
#     2. otherwise the sheet with the most rows;
#     3. otherwise the first sheet.
# - Any other extension raises ValueError.
print("\n" + "=" * 60)
print("STEP 2: Loading and Exploring Data")
print("=" * 60)

if file_path.endswith('.csv'):
    df = pd.read_csv(file_path)
elif file_path.endswith(('.xlsx', '.xls')):
    # Open the workbook a single time and parse every sheet from that handle.
    # The context manager closes the underlying file even if parsing fails.
    with pd.ExcelFile(file_path) as xl_file:
        sheet_names = xl_file.sheet_names

        # Pass 1: first non-metadata sheet that looks like a data table
        data_sheet = None
        for sheet in sheet_names:
            sheet_lower = sheet.lower()
            # Skip metadata/dictionary sheets
            if 'dict' in sheet_lower or 'meta' in sheet_lower or 'info' in sheet_lower:
                continue
            # Only a few rows are needed to count the columns
            test_df = xl_file.parse(sheet_name=sheet, nrows=5)
            if len(test_df.columns) > 2:  # Has multiple columns (likely data)
                data_sheet = sheet
                break

        # Pass 2: no suitable sheet found, so fall back to the largest sheet.
        # The winning frame is kept so it does not have to be read again.
        largest_df = None
        if data_sheet is None:
            max_rows = 0
            for sheet in sheet_names:
                test_df = xl_file.parse(sheet_name=sheet)
                if len(test_df) > max_rows:
                    max_rows = len(test_df)
                    data_sheet = sheet
                    largest_df = test_df

        # Load the data sheet
        if data_sheet is not None:
            print(f"\nLoading data from sheet: '{data_sheet}'")
            if largest_df is not None:
                df = largest_df
            else:
                df = xl_file.parse(sheet_name=data_sheet)
        else:
            # Every sheet was empty: fall back to the first sheet
            print(f"\nWarning: Could not determine data sheet, using first sheet: '{sheet_names[0]}'")
            df = xl_file.parse(sheet_name=sheet_names[0])
else:
    raise ValueError(f"Unsupported file format: {file_path}")

print("\n✓ Data loaded successfully!")
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
df.head()


In [None]:
# Structural overview of the loaded frame (df.info() prints its own report)
print("Dataset Info:")
df.info()

# Number of missing entries in each column
missing_per_column = df.isna().sum()
print("\nMissing values:")
print(missing_per_column)

# Summary statistics; kept as the final expression so the notebook renders the table
print("\nStatistical Summary:")
df.describe()
