In [None]:
# %% [markdown]
# # Exploratory Data Analysis (EDA) - Customer Churn Prediction
# 
# **Objective:** Understand the data, identify patterns, and discover insights about customer churn.
# 
# **Business Context:**
# - Goal: Predict customers likely to churn in the next 30 days
# - Churn Definition: Inactive 60+ days OR subscription cancelled
# - Dataset: Telecom customer data

# %% [markdown]
# ## 1. Setup and Data Loading

# %%
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Silence library warnings so they do not clutter the rendered notebook.
# NOTE(review): this blanket filter also hides deprecation notices; consider narrowing it.
warnings.filterwarnings('ignore')

# Set plotting style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Later cells save figures under ../reports/figures and a text report under
# ../reports. savefig()/open() do not create missing folders, so create them
# once here to keep "Restart & Run All" working on a fresh checkout.
os.makedirs('../reports/figures', exist_ok=True)

# %%
# Read the preprocessed churn table from disk
df = pd.read_csv('../data/processed/churn_processed.csv')

# Report the dimensions, then preview the leading rows as the cell output
print("Dataset Shape: {}".format(df.shape))
print("\nFirst few rows:")
df.head()

# %% [markdown]
# ## 2. Basic Data Overview

# %%
# Data info
print("=== DATA INFORMATION ===\n")
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called directly; wrapping it in print() would append a stray "None" line.
df.info()

# %%
# Summary statistics for the numeric columns, shown as the cell's rich output
print("\n=== NUMERICAL FEATURES STATISTICS ===\n")
numeric_summary = df.describe()
numeric_summary

# %%
# Check missing values
print("\n=== MISSING VALUES ===\n")
# Per-column count of null entries
missing = df.isnull().sum()
if missing.sum() == 0:
    print("✅ No missing values!")
else:
    # List only the columns that actually contain nulls
    print(missing[missing > 0])

# %% [markdown]
# ## 3. Target Variable Analysis (CHURN)

# %%
# Class balance of the target column: absolute counts and percentages
print("=== CHURN DISTRIBUTION ===\n")

churn_counts = df['Churn'].value_counts()
churn_pct = df['Churn'].value_counts(normalize=True) * 100

# One summary line per class label (0 = stayed, 1 = churned)
for class_label, line_prefix in ((0, "No Churn (0): "), (1, "Churn (1):    ")):
    print(f"{line_prefix}{churn_counts[class_label]:,} ({churn_pct[class_label]:.2f}%)")
print(f"\nChurn Rate: {churn_pct[1]:.2f}%")

# %%
# Visualize churn distribution: class counts (left) and class shares (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Count plot
sns.countplot(data=df, x='Churn', ax=axes[0])
axes[0].set_title('Churn Distribution (Count)', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Churn')
axes[0].set_ylabel('Count')
axes[0].set_xticklabels(['No Churn', 'Churn'])

# Pie chart
# value_counts() orders rows by frequency, not by class label, so pin the order
# to [0, 1]; otherwise the labels and colours would swap whenever churners are
# the majority class. A class that is absent gets a zero-sized wedge.
pie_counts = df['Churn'].value_counts().reindex([0, 1], fill_value=0)
colors = ['#2ecc71', '#e74c3c']
axes[1].pie(pie_counts, labels=['No Churn', 'Churn'], autopct='%1.1f%%',
            colors=colors, startangle=90)
axes[1].set_title('Churn Distribution (%)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('../reports/figures/churn_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n💡 Insight: This is a class imbalance problem. We'll need to handle this during modeling.")

# %% [markdown]
# ## 4. Tenure Analysis (Most Important Feature)

# %%
# Descriptive statistics of tenure, split by churn label
print("=== TENURE STATISTICS BY CHURN ===\n")
tenure_by_churn = df.groupby('Churn')['tenure']
print(tenure_by_churn.describe())

# %%
# Visualize tenure distribution: box plot per class (left) and overlaid histograms (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Box plot
sns.boxplot(data=df, x='Churn', y='tenure', ax=axes[0])
axes[0].set_title('Tenure Distribution by Churn', fontsize=14, fontweight='bold')
axes[0].set_xticklabels(['No Churn', 'Churn'])
axes[0].set_ylabel('Tenure (months)')

# Histogram
# Compute one set of bin edges from the full tenure column and reuse it for
# both classes, so the overlaid bars cover the same intervals and can be
# compared directly (an integer `bins` would bin each class over its own range).
tenure_bins = np.histogram_bin_edges(df['tenure'].dropna(), bins=30)
for churn_val, label, color in [(0, 'No Churn', '#2ecc71'), (1, 'Churn', '#e74c3c')]:
    axes[1].hist(df.loc[df['Churn'] == churn_val, 'tenure'], bins=tenure_bins, alpha=0.6,
                 label=label, color=color)
axes[1].set_title('Tenure Distribution by Churn', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Tenure (months)')
axes[1].set_ylabel('Frequency')
axes[1].legend()

plt.tight_layout()
plt.savefig('../reports/figures/tenure_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n💡 Insight: New customers (low tenure) have much higher churn rates!")

# %%
# Churn rate by tenure groups
# pd.cut builds right-closed intervals, so without include_lowest=True a tenure
# of exactly 0 falls outside (0, 12] and becomes NaN, silently dropping those
# customers from every group. The open upper edge matches the "36+" label, so
# tenures above 72 months are kept as well.
df['TenureGroup'] = pd.cut(
    df['tenure'],
    bins=[0, 12, 36, np.inf],
    labels=['New (0-12)', 'Growing (12-36)', 'Loyal (36+)'],
    include_lowest=True,
)

# Churned customers, group size and churn share per tenure group.
# observed=False is stated explicitly so empty groups stay in the table.
tenure_churn = df.groupby('TenureGroup', observed=False)['Churn'].agg(['sum', 'count', 'mean'])
tenure_churn.columns = ['Churned', 'Total', 'ChurnRate']
tenure_churn['ChurnRate'] = tenure_churn['ChurnRate'] * 100

print("\n=== CHURN RATE BY TENURE GROUP ===\n")
print(tenure_churn)

# Visualize: churn percentage per group, with the overall rate as a reference line
plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='TenureGroup', y='Churn', estimator=lambda x: x.mean()*100)
plt.title('Churn Rate by Tenure Group', fontsize=14, fontweight='bold')
plt.ylabel('Churn Rate (%)')
plt.xlabel('Tenure Group')
plt.axhline(y=df['Churn'].mean()*100, color='red', linestyle='--', label='Overall Churn Rate')
plt.legend()
plt.savefig('../reports/figures/tenure_group_churn.png', dpi=300, bbox_inches='tight')
plt.show()

# %% [markdown]
# ## 5. Contract Type Analysis

# %%
# Churn by contract type (skipped when the column is not in the dataset)
if 'Contract' in df.columns:
    print("=== CHURN BY CONTRACT TYPE ===\n")
    # Churned customers, group size and churn share per contract type
    contract_churn = df.groupby('Contract')['Churn'].agg(['sum', 'count', 'mean'])
    contract_churn.columns = ['Churned', 'Total', 'ChurnRate']
    contract_churn['ChurnRate'] = contract_churn['ChurnRate'] * 100
    print(contract_churn)

    # Visualize: churn percentage per contract type, with the overall rate as a reference line
    plt.figure(figsize=(10, 6))
    sns.barplot(data=df, x='Contract', y='Churn', estimator=lambda x: x.mean()*100)
    plt.title('Churn Rate by Contract Type', fontsize=14, fontweight='bold')
    plt.ylabel('Churn Rate (%)')
    plt.xlabel('Contract Type')
    plt.axhline(y=df['Churn'].mean()*100, color='red', linestyle='--', label='Overall')
    plt.legend()
    plt.savefig('../reports/figures/contract_churn.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("\n💡 Insight: Month-to-month customers have MUCH higher churn rates!")

# %% [markdown]
# ## 6. Monthly Charges Analysis

# %%
# Descriptive statistics of the monthly charge, split by churn label
print("=== MONTHLY CHARGES BY CHURN ===\n")
charges_by_churn = df.groupby('Churn')['MonthlyCharges']
print(charges_by_churn.describe())

# %%
# Visualize monthly charges: box plot (left) and violin plot (right) per churn class
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Box plot
sns.boxplot(data=df, x='Churn', y='MonthlyCharges', ax=axes[0])
axes[0].set_title('Monthly Charges by Churn', fontsize=14, fontweight='bold')
axes[0].set_xticklabels(['No Churn', 'Churn'])
axes[0].set_ylabel('Monthly Charges (₹)')

# Violin plot
sns.violinplot(data=df, x='Churn', y='MonthlyCharges', ax=axes[1])
axes[1].set_title('Monthly Charges Distribution', fontsize=14, fontweight='bold')
axes[1].set_xticklabels(['No Churn', 'Churn'])
axes[1].set_ylabel('Monthly Charges (₹)')

plt.tight_layout()
plt.savefig('../reports/figures/charges_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n💡 Insight: Churned customers tend to have higher monthly charges!")

# %% [markdown]
# ## 7. Internet Service Analysis

# %%
if 'InternetService' in df.columns:
    print("=== CHURN BY INTERNET SERVICE ===\n")
    # Churned customers, group size and churn share per internet-service type
    internet_churn = df.groupby('InternetService')['Churn'].agg(
        Churned='sum', Total='count', ChurnRate='mean'
    )
    internet_churn['ChurnRate'] *= 100
    print(internet_churn)

    # Bar chart of churn percentage per service, with the overall rate as a dashed reference
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=df, x='InternetService', y='Churn',
                estimator=lambda values: values.mean()*100, ax=ax)
    ax.set_title('Churn Rate by Internet Service Type', fontsize=14, fontweight='bold')
    ax.set_ylabel('Churn Rate (%)')
    ax.set_xlabel('Internet Service')
    overall_rate = df['Churn'].mean()*100
    ax.axhline(y=overall_rate, color='red', linestyle='--', label='Overall')
    ax.legend()
    fig.savefig('../reports/figures/internet_churn.png', dpi=300, bbox_inches='tight')
    plt.show()

# %% [markdown]
# ## 8. Payment Method Analysis

# %%
# Churn by payment method (skipped when the column is not in the dataset)
if 'PaymentMethod' in df.columns:
    print("=== CHURN BY PAYMENT METHOD ===\n")
    # Churned customers, group size and churn share per payment method
    payment_churn = df.groupby('PaymentMethod')['Churn'].agg(['sum', 'count', 'mean'])
    payment_churn.columns = ['Churned', 'Total', 'ChurnRate']
    payment_churn['ChurnRate'] = payment_churn['ChurnRate'] * 100
    print(payment_churn)

    # Visualize: churn percentage per payment method, with the overall rate as a reference line
    plt.figure(figsize=(12, 6))
    sns.barplot(data=df, x='PaymentMethod', y='Churn', estimator=lambda x: x.mean()*100)
    plt.title('Churn Rate by Payment Method', fontsize=14, fontweight='bold')
    plt.ylabel('Churn Rate (%)')
    plt.xlabel('Payment Method')
    # Rotate the tick labels so the category names do not overlap
    plt.xticks(rotation=45, ha='right')
    plt.axhline(y=df['Churn'].mean()*100, color='red', linestyle='--', label='Overall')
    plt.legend()
    plt.tight_layout()
    plt.savefig('../reports/figures/payment_churn.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("\n💡 Insight: Electronic check users have highest churn rate!")

# %% [markdown]
# ## 9. Correlation Analysis

# %%
# Pairwise correlations across every numeric column of the frame
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df.loc[:, numeric_cols].corr()

# Annotated heatmap on a diverging palette centred at zero
heatmap_options = dict(annot=True, fmt='.2f', cmap='coolwarm',
                       center=0, square=True, linewidths=1)
plt.figure(figsize=(14, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig('../reports/figures/correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

# %%
# Correlation with Churn
churn_corr = correlation_matrix['Churn'].sort_values(ascending=False)

# Remove the target's own entry by label rather than filtering on "!= 1.0":
# that test relies on exact float equality and would also discard any feature
# that happens to be perfectly correlated. Undefined (NaN) correlations are dropped.
feature_corr = churn_corr.drop('Churn').dropna()

# Rank by absolute strength so strong negative relationships are not pushed
# out of the top 10 by weak positive ones; the sign is kept for display.
strength_order = feature_corr.abs().sort_values(ascending=False).index
top_churn_corr = feature_corr.reindex(strength_order).head(10)

print("\n=== TOP CORRELATIONS WITH CHURN ===\n")
print(top_churn_corr)

# Visualize
plt.figure(figsize=(10, 8))
# barh draws the first row at the bottom, so reverse to put the strongest on top
top_churn_corr.iloc[::-1].plot(kind='barh')
plt.title('Top 10 Features Correlated with Churn', fontsize=14, fontweight='bold')
plt.xlabel('Correlation')
plt.tight_layout()
plt.savefig('../reports/figures/churn_correlations.png', dpi=300, bbox_inches='tight')
plt.show()

# %% [markdown]
# ## 10. Key Findings Summary

# %%
# Print the narrative summary. The churn rate is computed from the loaded data
# so the headline figure cannot drift from the dataset; the remaining findings
# are hand-written conclusions, not computed values.
overall_churn_rate = df['Churn'].mean() * 100

print("="*70)
print("KEY FINDINGS FROM EDA")
print("="*70)
print(f"""
1. CLASS IMBALANCE:
   - Churn rate: {overall_churn_rate:.1f}%
   - Need to handle imbalance in modeling (SMOTE or class weights)

2. TENURE IS CRITICAL:
   - New customers (0-12 months): Highest churn rate
   - Loyal customers (36+ months): Lowest churn rate
   - Action: Target retention efforts at new customers

3. CONTRACT TYPE MATTERS:
   - Month-to-month: Very high churn
   - Long-term contracts: Much lower churn
   - Action: Incentivize annual/biennial contracts

4. PRICE SENSITIVITY:
   - Higher monthly charges → Higher churn
   - Need to balance price with perceived value
   - Action: Bundle services to justify pricing

5. PAYMENT METHOD:
   - Electronic check: Highest churn risk
   - Automatic payments: Lower churn
   - Action: Encourage automatic payment setup

6. INTERNET SERVICE:
   - Fiber optic customers churn more (higher price)
   - Action: Ensure fiber optic delivers strong value

NEXT STEPS:
1. Feature Engineering (create tenure groups, service counts, etc.)
2. Model Training (Logistic Regression + XGBoost)
3. Business Cost Analysis
""")
print("="*70)

# %% [markdown]
# ## Save EDA Report

# %%
# Create EDA summary report
report_path = '../reports/eda_summary.txt'

# The report contains non-ASCII characters, so the encoding is set explicitly;
# the platform default encoding may be unable to represent them.
with open(report_path, 'w', encoding='utf-8') as f:
    f.write("="*70 + "\n")
    f.write("EXPLORATORY DATA ANALYSIS - SUMMARY REPORT\n")
    f.write("="*70 + "\n\n")

    # Figures computed from the loaded data
    f.write(f"Dataset Shape: {df.shape}\n")
    f.write(f"Churn Rate: {df['Churn'].mean()*100:.2f}%\n\n")

    # Hand-written conclusions (fixed text, not derived from df at run time)
    f.write("KEY INSIGHTS:\n")
    f.write("-"*70 + "\n")
    f.write("1. New customers (0-12 months) have highest churn risk\n")
    f.write("2. Month-to-month contracts show 3x higher churn than long-term\n")
    f.write("3. Electronic check payment method is highest risk\n")
    f.write("4. Higher monthly charges correlate with churn\n")
    f.write("5. Fiber optic customers churn more despite premium service\n\n")

    f.write("RECOMMENDATIONS:\n")
    f.write("-"*70 + "\n")
    f.write("1. Early intervention program for new customers (0-6 months)\n")
    f.write("2. Contract upgrade incentives (month-to-month → annual)\n")
    f.write("3. Automatic payment promotion\n")
    f.write("4. Service bundling to increase perceived value\n")
    f.write("5. Fiber optic customer experience improvement\n")

print(f"\n✅ EDA report saved to: {report_path}")
print("\n🎯 Next: Run 02_feature_engineering.ipynb")