
# Exploratory Data Analysis – Customer Churn


In [None]:
## Objective
#The objective of this analysis is to explore customer churn data in order to identify patterns, trends, and variables potentially associated with customer retention and churn.

In [None]:
## Data Loading and Initial Inspection

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set(color_codes=True)

df = pd.read_csv("../data/churn_dataset.csv")
df.head()

In [None]:
# Structural overview of the loaded frame (pandas prints column dtypes and non-null counts).
df.info()
# Summary statistics; as the last expression of the cell this is what gets rendered.
df.describe()

In [None]:
## Data Cleaning

# 1) Drop irrelevant columns (only if clearly identifiers or empty)
cols_to_drop = ["Column_1", "Column_2"]  # e.g., IDs, unnamed index columns

# 2) Standardize column names
rename_map = {"Column_1": "C1", "Column_2": "C2"}

# A column cannot be both discarded and renamed: dropping it first turns its
# rename into a silent no-op, and the "C1" column that the bivariate scatterplot
# relies on would never be created. A column listed in both places is therefore
# kept and renamed; only columns that are not rename sources are dropped.
conflicting = [c for c in cols_to_drop if c in rename_map]
if conflicting:
    print(f"Listed in both cols_to_drop and rename_map, keeping and renaming: {conflicting}")

# errors="ignore" skips names that are absent from the frame, and `rename`
# ignores unknown keys by default, so no membership pre-filtering is needed.
df = df.drop(columns=[c for c in cols_to_drop if c not in rename_map], errors="ignore")
df = df.rename(columns=rename_map)

# 3) Remove duplicates
# Compare the row count before and after to report how many exact repeats went away.
n_before = df.shape[0]
df = df.drop_duplicates()
n_removed = n_before - df.shape[0]
print(f"Removed duplicates: {n_removed}")

# 4) Missing values
# Fraction of missing entries per column, worst offenders first.
missing_share = df.isna().mean()
missing = missing_share.sort_values(ascending=False)
display(missing.head(15))

# Example strategy:
# - drop rows only if target is missing (replace 'Churn' with your target)
target_col = "Churn"
if target_col in df.columns:
    n_before = len(df)
    df = df.dropna(subset=[target_col])
    print(f"Dropped rows with missing target: {n_before - len(df)}")

# - for numerical columns: impute with median (simple, robust baseline)
num_cols = df.select_dtypes(include="number").columns
df[num_cols] = df[num_cols].apply(lambda s: s.fillna(s.median()))

# - for categorical columns: impute with mode
cat_cols = df.select_dtypes(exclude="number").columns
for c in cat_cols:
    if not df[c].isna().any():
        continue
    # `mode()` ignores missing values, so a column that is entirely missing
    # yields an empty result; indexing it with `.iloc[0]` would raise IndexError.
    # Such a column has nothing to impute from and is left untouched.
    modes = df[c].mode()
    if modes.empty:
        print(f"Skipping imputation for '{c}': column has no non-missing values")
        continue
    df[c] = df[c].fillna(modes.iloc[0])

# 5) Type conversions
# df["C3"] = pd.to_numeric(df["C3"], errors="coerce")

# 6) Outliers (IQR method) — apply per feature
def iqr_filter(dataframe, col, k=1.5):
    """Return the rows of `dataframe` whose `col` value lies within the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``; values exactly on a
    fence are kept. Rows where `col` is NaN fail the range check and are
    therefore not returned. The original index labels are preserved.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to filter.
    col : str
        Name of the column the fences are computed on.
    k : float, default 1.5
        Multiplier applied to the interquartile range.
    """
    values = dataframe[col]
    first_quartile, third_quartile = (values.quantile(q) for q in (0.25, 0.75))
    spread = third_quartile - first_quartile
    # `between` is inclusive on both ends by default, matching >= lower and <= upper.
    inside_fences = values.between(first_quartile - k * spread, third_quartile + k * spread)
    return dataframe[inside_fences]

# Choose only the columns where outliers are truly problematic
outlier_cols = [c for c in ["Price"] if c in df.columns]

# Row removal is opt-in. The notebook's stated approach is to flag outliers and
# keep them; trimming the same column here would discard exactly the extreme
# observations the flag is meant to expose, and the flag would then be computed
# on already-trimmed data. Set to True only if deletion is really intended.
REMOVE_OUTLIERS = False

n_before = len(df)
if REMOVE_OUTLIERS:
    for col in outlier_cols:
        df = iqr_filter(df, col, k=1.5)
print(f"Removed rows due to outliers: {n_before - len(df)}")

print("Final shape:", df.shape)


In [None]:
## Outlier Identification (IQR Method – Flagging, Not Removal)

#Outliers are identified using the Interquartile Range (IQR) method and **flagged rather than removed**. This approach preserves potentially meaningful extreme observations (e.g. high-value customers), while still allowing their impact to be analyzed explicitly in subsequent steps.

def iqr_flags(s, k=1.5):
    """Return a boolean mask marking the values of `s` outside the IQR fences.

    A value is flagged when it is strictly below ``Q1 - k * IQR`` or strictly
    above ``Q3 + k * IQR``. Missing values compare False on both sides and are
    therefore not flagged.

    Parameters
    ----------
    s : pandas.Series
        Numeric series to inspect.
    k : float, default 1.5
        Multiplier applied to the interquartile range.
    """
    q1 = s.quantile(0.25)
    q3 = s.quantile(0.75)
    margin = k * (q3 - q1)
    too_low = s.lt(q1 - margin)
    too_high = s.gt(q3 + margin)
    return too_low | too_high

# Store the flag as 0/1 so it can be summed or averaged directly.
if "Price" in df.columns:
    price_outlier_mask = iqr_flags(df["Price"])
    df["Price_is_outlier"] = price_outlier_mask.astype(int)


In [None]:
## Univariate Analysis

# The distribution of individual variables is explored to understand the overall data structure and identify potential skewness, imbalance or anomalous values.

# 1) Bar plot
# Frequency of the 20 most common "Make" categories, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 5))
top_make_counts = df["Make"].value_counts().nlargest(20)
top_make_counts.plot(kind="bar", ax=ax)
ax.set_title("Distribution of Make (Top 20 Categories)")
ax.set_xlabel("Make")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

# 2) Histogram
# 30-bin histogram of "Tenure", drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df["Tenure"], bins=30, ax=ax)
ax.set_title("Distribution of Customer Tenure")
ax.set_xlabel("Tenure")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

# 3) Target variable – Class balance
# Relative frequency of each "Churn" class (normalize=True gives proportions, not counts).
fig, ax = plt.subplots(figsize=(4, 3))
churn_share = df["Churn"].value_counts(normalize=True)
churn_share.plot(kind="bar", ax=ax)
ax.set_title("Churn Class Distribution")
ax.set_xlabel("Churn")
ax.set_ylabel("Proportion")
fig.tight_layout()
plt.show()


In [None]:
## Bivariate Analysis

# Relationships between pairs of variables are explored to identify potential associations with customer churn and to highlight patterns useful for further analysis.

# 1) Scatterplot
# "Tenure" against "C1", with points coloured by "Churn"; alpha keeps overlapping points readable.
fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(data=df, x="Tenure", y="C1", hue="Churn", alpha=0.6, ax=ax)
ax.set_title("Tenure vs C1 by Churn Status")
ax.set_xlabel("Tenure")
ax.set_ylabel("C1")
fig.tight_layout()
plt.show()


# 2) Heatmap
# Correlations are computed on the numeric columns only.
num_features = df.select_dtypes(include="number")
correlations = num_features.corr()

# Diverging palette centred on zero, so the colour encodes the sign of the correlation.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlations, cmap="coolwarm", center=0, annot=False, ax=ax)
ax.set_title("Correlation Matrix of Numerical Features")
fig.tight_layout()
plt.show()


In [None]:
### Bivariate Insights

# - Tenure shows different distribution patterns between churned and non-churned customers.
# - Certain numerical features appear moderately correlated, suggesting potential redundancy.
# - These relationships guide feature selection for subsequent analysis.