# 📊 Customer Churn Data Exploration

This notebook explores the Telco Customer Churn dataset from Kaggle to understand the data characteristics and relationships.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence library warnings so they do not bury the exploration output.
# NOTE(review): this suppresses every warning category, including deprecation
# notices — consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*';
# older releases only ship 'seaborn'. Apply whichever one is installed instead
# of raising OSError on a version mismatch. If neither exists, the default
# Matplotlib style is left in place.
for style_name in ('seaborn-v0_8', 'seaborn'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
sns.set_palette("husl")

# Configure display: show every column and let pandas pick the output width.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

print("📊 Data Exploration Setup Complete")

## 📁 Load Dataset

In [None]:
# Read the customer churn CSV. When the file is missing, `df` is set to None
# so that the cells below (which all check `df is not None`) skip their work.
data_path = "../data/raw/customer_data.csv"

try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    df = None
    print("❌ Dataset not found. Please run the download script first:")
    print("   python scripts/download_kaggle_data.py")
else:
    # Reached only when the read succeeded.
    print(f"✅ Dataset loaded successfully: {df.shape}")
    is_churned = df['churn'] == 'Yes'
    print(f"📈 Churn rate: {is_churned.mean():.2%}")

## 🔍 Dataset Overview

In [None]:
if df is not None:
    # Total in-memory size in MiB; deep=True asks pandas to also measure the
    # contents of object-dtype columns rather than just the pointers.
    footprint_mb = df.memory_usage(deep=True).sum() / 1024**2

    print("📋 Dataset Info:")
    print(f"Shape: {df.shape}")
    print(f"Memory usage: {footprint_mb:.2f} MB")
    print("\n📊 Column Info:")
    df.info()  # prints its own report; returns None

In [None]:
if df is not None:
    # Class balance of the target column: raw counts, overall churn share,
    # and a count plot next to a pie chart.
    print("🎯 Target Variable Distribution:")
    churn_counts = df['churn'].value_counts()
    print(churn_counts)

    # .get() keeps the cell working on a sample with no churned customers,
    # where indexing churn_counts['Yes'] would raise KeyError. The length
    # check avoids dividing by zero on an empty frame.
    churned_total = churn_counts.get('Yes', 0)
    overall_rate = churned_total / len(df) if len(df) else 0.0
    print(f"\nChurn Rate: {overall_rate:.2%}")

    # Visualize target distribution
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

    # Count plot
    sns.countplot(data=df, x='churn', ax=ax1)
    ax1.set_title('Churn Distribution (Count)')
    ax1.set_xlabel('Churn')
    ax1.set_ylabel('Count')

    # Pie chart. value_counts() orders labels by frequency, so colours are
    # looked up by label instead of by position: "Yes" is always red and
    # "No" always teal, whichever class happens to be larger. Any other
    # label falls back to grey.
    label_colors = {'Yes': '#FF6B6B', 'No': '#4ECDC4'}
    colors = [label_colors.get(label, '#B0B0B0') for label in churn_counts.index]
    ax2.pie(churn_counts.values, labels=churn_counts.index, autopct='%1.1f%%', colors=colors)
    ax2.set_title('Churn Distribution (Percentage)')

    plt.tight_layout()
    plt.show()

## 📈 Numerical Features Analysis

In [None]:
if df is not None:
    # Numeric columns are the numerical features. 'customer_id' is excluded
    # when present (presumably an identifier rather than a feature).
    # `numerical_cols` is reused by the plotting and correlation cells below.
    numerical_cols = [
        col
        for col in df.select_dtypes(include=[np.number]).columns.tolist()
        if col != 'customer_id'
    ]

    print(f"📊 Numerical Features ({len(numerical_cols)}): {numerical_cols}")

    # DataFrame.describe() raises ValueError on a frame with no columns, so
    # the summary is only printed when at least one numeric feature exists.
    if numerical_cols:
        print("\n📈 Descriptive Statistics:")
        print(df[numerical_cols].describe())

In [None]:
if df is not None and len(numerical_cols) > 0:
    # One histogram panel per numeric feature, with the 'No' and 'Yes' churn
    # groups overlaid, on a grid of at most three columns.
    n_cols = min(3, len(numerical_cols))
    n_rows = (len(numerical_cols) + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False always returns a 2-D array of Axes, so a single ravel()
    # handles the 1x1, 1xN and MxN layouts without special cases.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, numerical_cols):
        # Shared bin edges computed from the whole column. If each hist() call
        # chose its own 30 bins from its own subset, the two groups would get
        # different bin widths and the overlaid bar heights would not be
        # comparable. Missing values are dropped so the edges are well defined.
        bin_edges = np.histogram_bin_edges(df[col].dropna().to_numpy(dtype=float), bins=30)

        # Histogram with churn overlay
        for churn_val in ['No', 'Yes']:
            values = df.loc[df['churn'] == churn_val, col].dropna()
            ax.hist(values, alpha=0.7, label=f'Churn={churn_val}', bins=bin_edges)

        ax.set_title(f'{col} Distribution by Churn')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')
        ax.legend()

    # Hide unused subplots (the grid can have more cells than features)
    for ax in axes[len(numerical_cols):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## 🏷️ Categorical Features Analysis

In [None]:
if df is not None:
    # Object-dtype columns are treated as the categorical features; the
    # 'customer_id' and 'churn' columns are removed from the list if present.
    # `categorical_cols` is reused by the churn-rate plotting cell below.
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    for non_feature in ('customer_id', 'churn'):
        if non_feature in categorical_cols:
            categorical_cols.remove(non_feature)

    print(f"🏷️ Categorical Features ({len(categorical_cols)}): {categorical_cols}")

    # Preview only the first 5 features, and at most 10 distinct values of
    # each, to keep the output short.
    print("\n🔍 Unique Values per Feature:")
    for feature in categorical_cols[:5]:
        distinct = df[feature].unique()
        print(f"{feature}: {len(distinct)} unique values - {distinct[:10]}")

In [None]:
if df is not None and len(categorical_cols) > 0:
    # Churn rate per category for the first six categorical features, drawn
    # as bar charts on a two-column grid.
    important_categoricals = categorical_cols[:6]

    n_cols = 2
    n_rows = (len(important_categoricals) + n_cols - 1) // n_cols  # ceiling division

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows))
    # With two columns plt.subplots returns a 1-D or 2-D array of Axes;
    # ravel() gives a flat sequence either way.
    axes = axes.ravel()

    # Boolean churn indicator, computed once and reused for every feature.
    churned = df['churn'] == 'Yes'

    for ax, col in zip(axes, important_categoricals):
        # Share of churned customers within each category, highest first.
        churn_rate = churned.groupby(df[col]).mean().sort_values(ascending=False)
        positions = range(len(churn_rate))

        # Bars above a 30% churn rate are drawn red, the rest teal.
        bar_colors = ['#FF6B6B' if rate > 0.3 else '#4ECDC4' for rate in churn_rate.values]
        bars = ax.bar(positions, churn_rate.values, color=bar_colors)

        ax.set_title(f'Churn Rate by {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Churn Rate')
        ax.set_xticks(positions)
        ax.set_xticklabels(churn_rate.index, rotation=45)

        # Percentage label just above the top of each bar.
        for bar, rate in zip(bars, churn_rate.values):
            label_x = bar.get_x() + bar.get_width()/2
            label_y = bar.get_height() + 0.01
            ax.text(label_x, label_y, f'{rate:.2%}', ha='center', va='bottom', fontsize=9)

    # Hide the grid cell left over when the feature count is odd.
    for ax in axes[len(important_categoricals):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## 🔗 Feature Correlations

In [None]:
if df is not None and len(numerical_cols) > 1:
    # Pairwise correlations between the numerical features, shown as a
    # heatmap and as a ranked list of the strongest pairs.
    corr_matrix = df[numerical_cols].corr()

    # Heatmap on an explicit Axes. The mask hides the upper triangle and the
    # diagonal, so each pair appears exactly once.
    fig, ax = plt.subplots(figsize=(10, 8))
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='RdYlBu_r', center=0,
                square=True, linewidths=0.5, cbar_kws={"shrink": 0.8}, ax=ax)

    ax.set_title('Feature Correlation Matrix')
    fig.tight_layout()
    plt.show()

    # Show strongest correlations
    print("\n🔗 Strongest Correlations:")
    features = corr_matrix.columns
    # Index pairs strictly above the diagonal, in row-major order.
    upper_rows, upper_cols = np.triu_indices(len(features), k=1)
    # Undefined correlations (NaN, e.g. from a constant column) are skipped:
    # every comparison against NaN is False, so leaving them in would make the
    # ranking below unreliable.
    corr_pairs = [
        (features[i], features[j], corr_matrix.iloc[i, j])
        for i, j in zip(upper_rows, upper_cols)
        if pd.notna(corr_matrix.iloc[i, j])
    ]

    # Rank by absolute strength so strong negative correlations count too.
    corr_pairs.sort(key=lambda pair: abs(pair[2]), reverse=True)
    for feat1, feat2, corr in corr_pairs[:5]:
        print(f"{feat1} ↔ {feat2}: {corr:.3f}")

## 💡 Key Insights

In [None]:
if df is not None:
    # Text summary of the exploration: overall churn rate, the riskiest
    # category of selected features, and churned-vs-retained averages.
    # Every feature-specific line is skipped when its column is absent.
    print("💡 Key Insights from Data Exploration:")
    print("=" * 50)

    # Row masks for the two target classes, computed once and reused below.
    churned = df['churn'] == 'Yes'
    retained = df['churn'] == 'No'

    # Overall churn rate
    print(f"📈 Overall churn rate: {churned.mean():.1%}")

    # High-risk segments: for each listed column, report the category with
    # the highest share of churned customers.
    segment_headlines = (
        ('contract', "📋 Highest risk contract type"),
        ('internet_service', "🌐 Highest risk internet service"),
    )
    for column, headline in segment_headlines:
        if column in df.columns:
            segment_rates = churned.groupby(df[column]).mean()
            riskiest = segment_rates.idxmax()
            print(f"{headline}: {riskiest} ({segment_rates[riskiest]:.1%} churn rate)")

    # Tenure analysis
    if 'tenure' in df.columns:
        tenure_churned = df.loc[churned, 'tenure'].mean()
        tenure_retained = df.loc[retained, 'tenure'].mean()
        print(f"📊 Average tenure - Churned: {tenure_churned:.1f} months, Retained: {tenure_retained:.1f} months")

    # Monthly charges analysis
    if 'monthly_charges' in df.columns:
        charges_churned = df.loc[churned, 'monthly_charges'].mean()
        charges_retained = df.loc[retained, 'monthly_charges'].mean()
        print(f"💰 Average monthly charges - Churned: ${charges_churned:.2f}, Retained: ${charges_retained:.2f}")

    # Static recommendations; these are not derived from the values above.
    print("\n🎯 Recommendations for Model Training:")
    for recommendation in (
        "• Focus on contract type and tenure as key predictive features",
        "• Consider feature engineering: tenure categories, charges ratios",
        "• Handle class imbalance in churn prediction",
        "• Monitor model performance on high-risk customer segments",
    ):
        print(recommendation)

## 🚀 Next Steps

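After exploring the data, you can continue with the steps below (the commands use paths relative to the project root, so run them from there):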

1. **Train the model**: `python src/models/train.py --data-path data/raw/customer_data.csv`
2. **Start the API**: `python scripts/start_api.py`
3. **Monitor the model**: Access Grafana at http://localhost:3000
4. **Run A/B tests**: `python scripts/ab_test_cli.py create --help`

The insights from this exploration will help inform feature engineering and model selection decisions.