# 01 - Exploratory Data Analysis

Quick EDA for the **Customer Personality Analysis** dataset to validate cleaning rules and derive baseline visualizations aligned with the methodology (outlier checks, RFM distributions, response lift by segment).

In [None]:
from pathlib import Path
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Resolve the project root (the parent of the directory holding this file) and
# put it on sys.path so project-local packages can be imported.
try:
    _NOTEBOOK_DIR = Path(__file__).resolve().parent
except NameError:
    # Jupyter/IPython kernels do not define __file__. Fall back to the working
    # directory, which is assumed to be this notebook's folder — TODO confirm
    # if the kernel is launched from somewhere else.
    _NOTEBOOK_DIR = Path.cwd().resolve()
PROJECT_ROOT = _NOTEBOOK_DIR.parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Plot styling applied to the figures drawn in the cells below.
sns.set_style(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6)})

# File name and directory of the raw CSV, relative to the project root.
CSV_NAME = "marketing_campaign.csv"
DATA_DIR = PROJECT_ROOT.joinpath("data", "raw")


In [None]:
from customer_segmentation.src.data.load import load_raw_data
from customer_segmentation.src.data.preprocess import clean_data
from customer_segmentation.src.data.features import build_rfm_features, add_structural_features, add_response_label

# Load the raw CSV. When the file is absent, report the error and continue
# with an empty frame so the following cells can detect it and skip their work.
try:
    raw_df = load_raw_data(DATA_DIR, filename=CSV_NAME, parse_dates=["Dt_Customer"])
except FileNotFoundError as not_found:
    raw_df = pd.DataFrame()
    print(not_found)
else:
    print(f"Loaded raw dataset with shape: {raw_df.shape}")


## Basic structure

- Schema inspection
- Missing value ratios
- Preview of numeric distributions

In [None]:
# Structural overview of the raw data: first rows, summary statistics for the
# first 15 columns, and the share of missing values per column (highest first).
if raw_df.empty:
    print("Raw dataframe is empty; please place marketing_campaign.csv in data/raw.")
else:
    display(raw_df.head())
    display(raw_df.describe(include="all").T.head(15))
    missing = raw_df.isna().mean().sort_values(ascending=False)
    display(missing)


## Cleaned data with derived Age/Income clipping

Using the shared preprocessing helper to mirror the experiment pipeline.

In [None]:
# Run the shared cleaning helper and summarize whichever of the key
# demographic columns are present; without raw data, keep an empty frame.
if raw_df.empty:
    cleaned_df = pd.DataFrame()
else:
    cleaned_df = clean_data(raw_df)
    print(f"Cleaned shape: {cleaned_df.shape}")
    preview_cols = [c for c in ("Income", "Age", "Year_Birth") if c in cleaned_df]
    display(cleaned_df[preview_cols].describe())


## Feature engineering snapshot

Construct RFM, structural features, and the promotion-response label.

In [None]:
# Build the engineered frame: RFM features first, then structural features,
# then the response label. A KeyError from the label step is printed and the
# frame is kept without the label.
if cleaned_df.empty:
    engineered = pd.DataFrame()
else:
    rfm_df = build_rfm_features(cleaned_df)
    engineered = add_structural_features(rfm_df)
    try:
        engineered = add_response_label(engineered)
    except KeyError as err:
        print(err)
    display(engineered.head())


## Univariate distributions

Core numeric variables after cleaning.

In [None]:
# Faceted histograms (with KDE overlay) for whichever of the core numeric
# features are present in the engineered frame.
numeric_cols = [
    col
    for col in ["Income", "Age", "recency", "frequency", "monetary"]
    if col in engineered
]
# Also skip plotting when none of the expected columns exist, instead of
# handing seaborn an empty long-form frame.
if not engineered.empty and numeric_cols:
    melted = engineered[numeric_cols].melt(var_name="feature", value_name="value")
    g = sns.displot(
        data=melted,
        x="value",
        col="feature",
        col_wrap=3,
        kde=True,
        # Compute bins per facet; the features are presumably on very
        # different scales (e.g. Income vs. recency), so shared bins would
        # flatten the small-range ones.
        common_bins=False,
        # Axis sharing is a FacetGrid option. Passed as a bare keyword it is
        # forwarded to the histogram artists rather than to the grid.
        facet_kws={"sharex": False},
    )
    plt.tight_layout()
    plt.show()
else:
    print("No engineered data to plot.")


## Response rate by marital status / education

Illustrates segmentation relevance of demographic factors.

In [None]:
# Bar plot of the mean response per category, one figure per available
# demographic column, ordered from highest to lowest rate.
demo_cols = [col for col in ["Marital_Status", "Education"] if col in engineered]
# demo_cols is part of the condition so that missing demographic columns reach
# the else branch and are reported, rather than silently plotting nothing.
if not engineered.empty and "response" in engineered and demo_cols:
    for col in demo_cols:
        rates = engineered.groupby(col)["response"].mean().sort_values(ascending=False)
        sns.barplot(x=rates.index, y=rates.values)
        plt.title(f"Response rate by {col}")
        plt.ylabel("Response rate")
        plt.xticks(rotation=30)
        plt.show()
else:
    print("Response label or demographic columns missing; skipping bar plots.")


## Bivariate views

Income vs. monetary spend and recency vs. frequency to spot heterogeneity.

In [None]:
# Scatter plots for two feature pairs, each drawn only when both of its
# columns exist. Points are colored by the response column when it is present
# (DataFrame.get returns None otherwise, which disables the hue).
if engineered.empty:
    print("Insufficient data for bivariate plots.")
else:
    present = set(engineered.columns)
    response_hue = engineered.get("response")
    if {"Income", "monetary"} <= present:
        sns.scatterplot(data=engineered, x="Income", y="monetary", hue=response_hue)
        plt.title("Income vs. Monetary (colored by response)")
        plt.show()
    if {"recency", "frequency"} <= present:
        sns.scatterplot(data=engineered, x="recency", y="frequency", hue=response_hue)
        plt.title("Recency vs. Frequency")
        plt.show()


## Correlations

Heatmap over engineered numeric features to spot collinearity before modeling.

In [None]:
# Correlation heatmap across the numeric columns of the engineered frame,
# with the diverging colormap centered at zero.
if not engineered.empty:
    numeric_view = engineered.select_dtypes(include=["number"])
    num_cols = numeric_view.columns
    corr = numeric_view.corr()
    sns.heatmap(data=corr, center=0, cmap="coolwarm")
    plt.title("Numeric correlation heatmap")
    plt.show()
