# 02 - Debug Prototype

Fast, end-to-end smoke tests for clustering and downstream response modeling on a small sample. This notebook mirrors the experiment scripts with lighter settings to validate data flow and metric calculations.

In [None]:
from pathlib import Path
import sys

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Locate the project root (two levels above this file) and make it importable.
# Notebook kernels do not define ``__file__``, so the lookup would raise
# NameError there; in that case fall back to the parent of the working
# directory.
try:
    PROJECT_ROOT = Path(__file__).resolve().parents[1]
except NameError:
    # assumes the kernel was started in the notebook's own directory, so that
    # cwd stands in for the file's parent folder -- TODO confirm
    PROJECT_ROOT = Path.cwd().resolve().parent

# Append only once so that re-running the cell does not grow sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))


In [None]:
from customer_segmentation.src.data.load import load_raw_data
from customer_segmentation.src.data.preprocess import clean_data
from customer_segmentation.src.data.features import assemble_feature_table
from customer_segmentation.src.models.kmeans_baseline import run_kmeans
from customer_segmentation.src.models.gmm_baseline import run_gmm
from customer_segmentation.src.models.rajc import RAJCConfig, RAJCModel
from customer_segmentation.src.utils.seed_utils import seed_everything
from customer_segmentation.src.utils.metrics_utils import response_rate_by_cluster, classification_summary

# Seed first so every later cell runs from the same seeded state.
seed_everything(42)

# Location of the raw campaign export, relative to the project root.
DATA_DIR = PROJECT_ROOT.joinpath("data", "raw")
CSV_NAME = "marketing_campaign.csv"


In [None]:
# Load and clean the raw CSV. A missing file is tolerated: the error is
# printed and an empty frame is substituted so that the later cells, which
# all guard on emptiness, skip their work instead of crashing.
try:
    raw_df = load_raw_data(DATA_DIR, filename=CSV_NAME, parse_dates=["Dt_Customer"])
    cleaned_df = clean_data(raw_df)
except FileNotFoundError as missing_file:
    print(missing_file)
    cleaned_df = pd.DataFrame()
else:
    # Reached only when both loading and cleaning succeeded.
    print(f"Loaded and cleaned: {cleaned_df.shape}")


## Feature assembly and quick train/val split

The engineered feature table matches the experiment scripts; we keep a small hold-out for downstream checks.

In [None]:
# Build the feature table and carve out a 20% stratified validation split.
# When no cleaned data is available, leave an empty feature frame behind so
# the later cells (which test ``features_df.empty``) skip themselves.
if not cleaned_df.empty:
    features_df, labels, transformer = assemble_feature_table(cleaned_df)
    # Stratify on the labels so both splits keep the same class balance.
    X_train, X_val, y_train, y_val = train_test_split(
        features_df, labels, test_size=0.2, random_state=42, stratify=labels
    )
    print(f"Features: {features_df.shape}, train: {X_train.shape}, val: {X_val.shape}")
else:
    features_df = pd.DataFrame()
    # Report the skip explicitly, like every other guarded cell in this notebook.
    print("No cleaned data available; skipping feature assembly.")


## Baseline clustering sanity checks

Run lightweight K-Means and GMM on the full feature set and inspect response heterogeneity per cluster.

In [None]:
# Baseline sanity check: fit K-Means and GMM with four groups each on the
# full feature table and print the per-cluster response summary for both.
if not features_df.empty:
    km_model, km_labels = run_kmeans(features_df, n_clusters=4, random_state=42)
    # scikit-learn's fitted KMeans exposes its objective as the ``inertia_``
    # attribute; it has no ``inertia()`` method. run_kmeans is defined outside
    # this notebook, so the original call is kept as a fallback in case it
    # returns a wrapper that really offers ``inertia()`` -- TODO confirm and
    # simplify once the return type is known.
    km_inertia = getattr(km_model, "inertia_", None)
    if km_inertia is None:
        km_inertia = km_model.inertia()
    print(f"K-Means inertia: {km_inertia:.2f}")
    print(response_rate_by_cluster(km_labels, labels))

    gmm_model, gmm_labels = run_gmm(features_df, n_components=4, random_state=42)
    print(response_rate_by_cluster(gmm_labels, labels))
else:
    print("No features available; skipping clustering checks.")


## RAJC small run

A brief alternating optimization pass to validate the joint objective wiring.

In [None]:
# Short RAJC smoke run: fit on the full feature table with a small iteration
# budget, then print the per-cluster response summary of its assignments.
if features_df.empty:
    print("No features available; skipping RAJC run.")
else:
    config = RAJCConfig(
        n_clusters=4,
        lambda_response=0.5,
        max_iter=5,
        random_state=42,
    )
    rajc = RAJCModel(config=config)
    rajc.fit(features_df, labels)
    # Assignments are computed on the same rows the model was fitted on.
    rajc_labels = rajc.predict(features_df)
    print(response_rate_by_cluster(rajc_labels, labels))


## Downstream logistic regression

Compare base features vs. RAJC cluster IDs concatenated as additional signals.

In [None]:
# Downstream check: fit a logistic regression on the training split and
# summarise it on the validation split, once with the base features only and
# once with one-hot RAJC cluster IDs appended.
if not features_df.empty:
    def evaluate_with_extra_clusters(cluster_labels=None):
        """Fit a logistic regression and summarise it on the validation split.

        Args:
            cluster_labels: Optional cluster assignments, one per row of
                ``features_df`` and in the same row order. When given, they
                are one-hot encoded and appended to the base features.

        Returns:
            Whatever ``classification_summary`` returns for the validation
            labels and the predicted probabilities of the second class.
        """
        if cluster_labels is None:
            # The classifier only reads its inputs, so no copies are needed.
            X_train_use, X_val_use = X_train, X_val
        else:
            # Attach the labels to the feature index by position before
            # encoding. Array-like labels would otherwise get a fresh
            # 0..n-1 index, and the column-wise concat (which aligns on the
            # index) would produce NaN rows whenever ``features_df`` does not
            # carry a default range index.
            cluster_series = pd.Series(
                np.asarray(cluster_labels), index=features_df.index
            )
            cluster_dummies = pd.get_dummies(cluster_series, prefix="cluster")
            X_full = pd.concat([features_df, cluster_dummies], axis=1)
            X_train_use = X_full.loc[X_train.index]
            X_val_use = X_full.loc[X_val.index]

        clf = LogisticRegression(max_iter=200)
        clf.fit(X_train_use, y_train)
        # Column 1 of predict_proba is the second class in ``clf.classes_``.
        val_proba = pd.Series(clf.predict_proba(X_val_use)[:, 1], index=X_val.index)
        return classification_summary(y_val, val_proba, top_fracs=(0.1, 0.2, 0.3))

    print("Base only:")
    display(evaluate_with_extra_clusters())

    print("+ RAJC clusters:")
    # This branch only runs when features exist, so no extra emptiness test
    # is needed before passing the RAJC labels.
    display(evaluate_with_extra_clusters(rajc_labels))
else:
    print("No features available; skipping downstream logistic regression.")
