# Customer Churn Prediction: Exploratory Data Analysis & Feature Engineering

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Global plotting defaults: white-grid seaborn theme and a (10, 6) default figure size
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

# Location of the Telco churn CSV, relative to the notebooks/ folder
# (adjust the path if the notebook lives somewhere else)
DATA_PATH = '../data/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows. Wrapped in print() because a notebook cell only
# auto-displays the value of its final expression, so a bare df.head() here
# would be silently discarded.
print(df.head())
# Column dtypes and non-null counts (info() prints its report itself)
df.info()
# Summary statistics; include='all' adds the categorical summaries too.
# Left as the final expression so the notebook renders it as a table.
df.describe(include='all')

In [None]:
# Convert TotalCharges to numeric; values that cannot be parsed become NaN
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Handle missing TotalCharges (filled with the median for simplicity in EDA).
# In your actual script, you might drop these rows or use a more sophisticated imputation.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selection: the chained in-place form warns in
# recent pandas 2.x releases and does not update df under Copy-on-Write.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

# Drop customerID as it's an identifier, not a useful modeling feature
df.drop(columns='customerID', inplace=True)

In [None]:
# Side-by-side histograms (with KDE overlay) of the three continuous features
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
panels = [
    ('tenure', 'Distribution of Tenure'),
    ('MonthlyCharges', 'Distribution of Monthly Charges'),
    ('TotalCharges', 'Distribution of Total Charges'),
]
for ax, (column, title) in zip(axes, panels):
    sns.histplot(df[column], kde=True, ax=ax)
    ax.set_title(title)
fig.tight_layout()
plt.show()

In [None]:
# Number of customers in each contract type
# NOTE(review): seaborn >= 0.13 emits a deprecation warning when `palette` is
# passed without `hue`; confirm the installed version before changing the call.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Contract', palette='viridis', ax=ax)
ax.set_title('Customer Count by Contract Type')
plt.show()
# Repeat for other categorical features like 'InternetService', 'PaymentMethod', 'gender', etc.

In [None]:
# Customer counts per contract type, split by churn outcome
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='Contract', hue='Churn', palette='coolwarm', ax=ax)
ax.set_title('Churn by Contract Type')
ax.set_xlabel('Contract Type')
ax.set_ylabel('Number of Customers')
plt.show()

In [None]:
# Distribution of MonthlyCharges for churned vs. retained customers
# NOTE(review): seaborn >= 0.13 emits a deprecation warning when `palette` is
# passed without `hue`; confirm the installed version before changing the call.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='Churn', y='MonthlyCharges', palette='muted', ax=ax)
ax.set_title('Monthly Charges vs. Churn')
plt.show()

In [None]:
# Work on a separate frame in which 'Churn' is encoded as 0/1 so that it can
# take part in the correlation matrix; df itself is left untouched.
churn_codes = {'Yes': 1, 'No': 0}
df_encoded_for_corr = df.assign(Churn=df['Churn'].map(churn_codes))

# Restrict the matrix to the numerical columns
numerical_cols = df_encoded_for_corr.select_dtypes(include=np.number).columns
corr_matrix = df_encoded_for_corr[numerical_cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix of Numerical Features (and Churn)')
plt.show()

In [None]:
# Example from your script (simplified for notebook):
# ratio of the monthly charge to the cumulative total charge.
# Division by zero yields +/-inf, which is turned into NaN and then filled
# with 0 (median/mean would be alternative fill values).
# Built as one expression and assigned to the column, rather than calling
# replace/fillna with inplace=True on df['MonthlyToTotalRatio']: that chained
# in-place form warns in recent pandas 2.x releases and does not update df
# under Copy-on-Write.
df['MonthlyToTotalRatio'] = (
    (df['MonthlyCharges'] / df['TotalCharges'])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# Distribution of the engineered ratio for churned vs. retained customers
# NOTE(review): seaborn >= 0.13 emits a deprecation warning when `palette` is
# passed without `hue`; confirm the installed version before changing the call.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='Churn', y='MonthlyToTotalRatio', palette='pastel', ax=ax)
ax.set_title('Monthly to Total Charges Ratio vs. Churn')
plt.show()

## Data Preparation for Modeling

Before feeding the data into machine learning models, the following final preparation steps were performed:
- **One-Hot Encoding:** All remaining categorical features were converted into numerical format using one-hot encoding.
- **Feature Scaling:** Numerical features were scaled using `StandardScaler` to ensure that no single feature dominates the model due to its scale.

## Conclusion: Key Insights for Churn Prediction

The Exploratory Data Analysis has provided crucial insights into the factors driving customer churn:
1.  **Contract type (Month-to-month)** is a primary indicator of churn risk.
2.  **Internet service type (Fiber optic)** is also strongly associated with churn.
3.  Absence of **value-added services** like online security and tech support increases churn likelihood.
4.  **Tenure** and **Monthly Charges** also play significant roles.

These insights will be vital in guiding feature selection, model interpretation, and the formulation of targeted retention strategies.