## Notebook 1: Data Exploration
Explore the customer churn dataset to understand the patterns behind churn.

# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings(action='ignore')

# Plot defaults shared by every figure below: the dark-grid matplotlib style
# sheet and seaborn's "husl" colour palette.
plt.style.use(style='seaborn-v0_8-darkgrid')
sns.set_palette(palette="husl")

# %%
# Load the dataset
# `df` is the working DataFrame that the later cells read (and some modify).
raw_csv_path = '../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(raw_csv_path)

# Report the (rows, columns) shape, then preview the first five rows as the
# cell's output.
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
df.head(n=5)

# %%
# Dataset Information
# Heading and rule line first, then pandas' own column summary
# (DataFrame.info() prints directly rather than returning a value).
print("Dataset Info:", "="*60, sep="\n")
df.info()

# %%
# Basic Statistics
# Heading and rule line, followed by the describe() table as the cell's output.
print("\nBasic Statistics:", "="*60, sep="\n")
df.describe()

# %%
# Check for missing values
# NOTE: this counts true nulls (NaN/None) only; blank or whitespace-only
# strings in a text column are not reported here.
print("\nMissing Values:", "="*60, sep="\n")

# Null count per column, keeping only the columns that have at least one.
missing = df.isna().sum()
missing.loc[missing.gt(0)]

# %%
# Target variable distribution
print("\nChurn Distribution:", "="*60, sep="\n")

# Row count per Churn class; `churn_dist` is reused by the plotting cell below.
churn_dist = df['Churn'].value_counts()
print(churn_dist)

# Share of all rows whose Churn value is 'Yes', as a percentage.
churn_rate_pct = churn_dist['Yes'] / len(df) * 100
print(f"\nChurn Rate: {churn_rate_pct:.2f}%")

# %%
# Visualize churn distribution
# Side-by-side pie and bar charts of the class counts in `churn_dist`,
# saved as a PNG and then shown.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Colour each class by its label rather than by its position: value_counts()
# orders classes by frequency, so positional colours would silently swap if
# 'Yes' ever outnumbered 'No'. Any unexpected label falls back to grey.
churn_palette = {'No': '#2ecc71', 'Yes': '#e74c3c'}
churn_colors = [churn_palette.get(label, '#95a5a6') for label in churn_dist.index]

# Pie chart
axes[0].pie(churn_dist.values, labels=churn_dist.index, autopct='%1.1f%%',
            colors=churn_colors, startangle=90)
axes[0].set_title('Churn Distribution', fontsize=14, fontweight='bold')

# Bar chart
axes[1].bar(churn_dist.index, churn_dist.values, color=churn_colors)
axes[1].set_xlabel('Churn Status')
axes[1].set_ylabel('Count')
axes[1].set_title('Customer Count by Churn Status', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('../outputs/figures/churn_distribution.png', dpi=300)
plt.show()

# %%
# Numerical features analysis
# Prints mean, median and standard deviation for each numerical column.
print("\nNumerical Features:")
print("="*60)
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

for col in numerical_cols:
    # TotalCharges is only converted to numeric by later cells, so it may
    # still be text here and mean()/median()/std() would fail on it. Coerce
    # into a local Series (df itself is left untouched); entries that cannot
    # be parsed become NaN and are skipped by the statistics below.
    values = pd.to_numeric(df[col], errors='coerce')

    print(f"\n{col}:")
    print(f"  Mean: {values.mean():.2f}")
    print(f"  Median: {values.median():.2f}")
    print(f"  Std: {values.std():.2f}")

# %%
# Visualize numerical features by churn
# One panel per numerical column, with the histogram for Churn == 'Yes'
# drawn over the histogram for Churn == 'No'.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# (value in the Churn column, legend label, bar colour), in drawing order.
churn_groups = (('No', 'No Churn', 'green'), ('Yes', 'Churn', 'red'))

for idx, col in enumerate(numerical_cols):
    ax = axes[idx]

    # TotalCharges is converted in place on df; values that cannot be parsed
    # as numbers become NaN.
    if col == 'TotalCharges':
        df[col] = pd.to_numeric(df[col], errors='coerce')

    for churn_value, legend_label, bar_color in churn_groups:
        subset = df.loc[df['Churn'] == churn_value, col]
        subset.hist(bins=30, alpha=0.7, label=legend_label,
                    color=bar_color, ax=ax)

    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.set_title(f'{col} Distribution by Churn')
    ax.legend()

plt.tight_layout()
plt.savefig('../outputs/figures/numerical_features.png', dpi=300)
plt.show()

# %%
# Categorical features analysis
# One bar chart per categorical column showing the percentage of rows with
# Churn == 'Yes' inside each category.
categorical_cols = ['gender', 'Contract', 'InternetService', 'PaymentMethod']

fig, axes = plt.subplots(2, 2, figsize=(16, 10))
axes = axes.ravel()  # flatten the 2x2 grid so panels can be indexed 0..3

for idx, col in enumerate(categorical_cols):
    ax = axes[idx]

    # The mean of a boolean mask within each group is the fraction of 'Yes'
    # rows in that group; scale it to a percentage.
    churn_rate = (df['Churn'] == 'Yes').groupby(df[col]).mean() * 100
    churn_rate.plot.bar(ax=ax, color='steelblue')

    ax.set_title(f'Churn Rate by {col}', fontweight='bold')
    ax.set_ylabel('Churn Rate (%)')
    ax.set_xlabel(col)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('../outputs/figures/categorical_features.png', dpi=300)
plt.show()

# %%
# Correlation heatmap
# Both columns below are written back onto df: TotalCharges as numeric
# (unparseable entries become NaN) and Churn_Binary as a 0/1 encoding of Churn.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['Churn_Binary'] = df['Churn'].eq('Yes').astype(int)

# Pairwise correlation between the numeric columns and the 0/1 churn flag.
correlation_cols = ['tenure', 'MonthlyCharges', 'TotalCharges', 'Churn_Binary']
correlation = df.loc[:, correlation_cols].corr()

# Annotated, square-celled heatmap with the colour scale centred on zero.
heatmap_style = dict(annot=True, fmt='.3f', cmap='coolwarm',
                     center=0, square=True, linewidths=1)

plt.figure(figsize=(10, 8))
sns.heatmap(correlation, **heatmap_style)
plt.title('Correlation Matrix', fontsize=16, fontweight='bold')
plt.savefig('../outputs/figures/correlation_heatmap.png', dpi=300)
plt.show()

# %%
# Key insights summary
# Prints a recap of the exploration. The "Churn Patterns" bullets and the last
# two "Data Quality" bullets are fixed text; only the counts and the churn
# rate are computed from df in this cell.
print("\n" + "="*60)
print("KEY INSIGHTS FROM EXPLORATION")
print("="*60)

# Churn_Binary is a helper column added to df for the correlation heatmap,
# not a column of the dataset, so leave it out of the count when present.
feature_count = df.shape[1] - int('Churn_Binary' in df.columns)

print("\n1. Dataset Overview:")
print(f"   - Total customers: {len(df)}")
print(f"   - Features: {feature_count}")
print(f"   - Churn rate: {(df['Churn']=='Yes').sum() / len(df) * 100:.1f}%")

print("\n2. Churn Patterns:")
print("   - Month-to-month contracts have highest churn")
print("   - New customers (low tenure) churn more")
print("   - Fiber optic users show higher churn")

# Coerce locally so the count is right even if the cells that convert
# TotalCharges were skipped; entries that cannot be parsed count as missing.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')

print("\n3. Data Quality:")
print(f"   - Missing values in TotalCharges: {total_charges.isna().sum()}")
print("   - Need to handle categorical encoding")
print("   - Feature engineering opportunities identified")

print("\n✓ Exploration complete! Ready for preprocessing.")

# %%