In [37]:
# CELL 1 — Robust package installation + verification (no SHAP / no LIME)
import sys, subprocess, importlib
from textwrap import dedent

REQUIRED = [
    "numpy",
    "pandas",
    "scikit-learn",
    "lightgbm",
    "joblib",
    "matplotlib",
    "seaborn"
]

def pip_install(packages):
    cmd = [sys.executable, "-m", "pip", "install", "--upgrade"] + packages
    print("Installing:", packages)
    subprocess.check_call(cmd)

def verify(packages):
    failed=[]
    for p in packages:
        imp = "sklearn" if p=="scikit-learn" else p
        try:
            m = importlib.import_module(imp)
            print(f"OK: {imp} {getattr(m,'__version__','')}")
        except Exception as e:
            print(f"FAIL import {imp}: {e}")
            failed.append(imp)
    return failed

# Upgrade pip/tools (best practice)
try:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "pip", "setuptools", "wheel"])
except Exception:
    print("Warning: pip/setuptools/wheel upgrade failed (continuing)")

# Install required packages
pip_install(REQUIRED)

# Verify imports
failed = verify(REQUIRED)
if failed:
    print(dedent(f"""
    Some packages failed to import: {failed}
    - Try re-running this cell.
    - If LightGBM fails: run in Colab:
        !apt-get update && apt-get install -y libomp-dev build-essential
      then re-run this cell.
    """))
else:
    print("All packages installed and imported successfully.")


Installing: ['numpy', 'pandas', 'scikit-learn', 'lightgbm', 'joblib', 'matplotlib', 'seaborn']
OK: numpy 2.4.0
OK: pandas 2.3.3
OK: sklearn 1.8.0
OK: lightgbm 4.6.0
OK: joblib 1.5.3
OK: matplotlib 3.10.8
OK: seaborn 0.13.2
All packages installed and imported successfully.


In [38]:
# CELL 2 — Imports and paths
import os, json, zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import joblib
import lightgbm as lgb
from sklearn.metrics import roc_curve, auc, precision_recall_curve, confusion_matrix

DATA_PATH = "Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv"
OUTDIR = "credit_project_outputs"
os.makedirs(OUTDIR, exist_ok=True)
print("Dataset path:", DATA_PATH)
print("Outputs directory:", OUTDIR)

Dataset path: Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv
Outputs directory: credit_project_outputs


In [39]:
# CELL 3 — Load & quick EDA
df = pd.read_csv(DATA_PATH)
print("Rows, cols:", df.shape)
display(df.head(6))
print("\nColumn types:")
display(df.dtypes)
print("\nMissing values per column:")
display(df.isna().sum().sort_values(ascending=False).head(20))


Rows, cols: (64000, 12)


Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0
4,2,1) $0 - $100,45.34,1,0,Urban,0,Web,Womens E-Mail,0,0,0.0
5,6,2) $100 - $200,134.83,0,1,Surburban,0,Phone,Womens E-Mail,1,0,0.0



Column types:


recency              int64
history_segment     object
history            float64
mens                 int64
womens               int64
zip_code            object
newbie               int64
channel             object
segment             object
visit                int64
conversion           int64
spend              float64
dtype: object


Missing values per column:


recency            0
history_segment    0
history            0
mens               0
womens             0
zip_code           0
newbie             0
channel            0
segment            0
visit              0
conversion         0
spend              0
dtype: int64

In [40]:
# CELL 4 — Detect treatment & conversion (robust heuristics)
# Heuristic: if 'segment' present and contains 'No Email' -> treatment indicator
df = df.copy()

# Attempt to find an explicit treatment/segment column
treatment_col = None
for c in df.columns:
    low = c.lower()
    if any(k in low for k in ["segment","treatment","group","mail","email","campaign"]):
        treatment_col = c
        break

if treatment_col:
    print("Candidate treatment/segment column found:", treatment_col)
    # convert to binary treatment: 1 if any marketing group, 0 if "no email"/"control"/"no_email"
    df['treatment'] = df[treatment_col].astype(str).str.lower().apply(
        lambda x: 0 if any(k in x for k in ["no email","no_email","noemail","control","none"]) else 1
    )
else:
    # fallback: create treatment by heuristic (first categorical column split)
    print("No clear treatment column found — creating heuristic treatment.")
    cat_cols = df.select_dtypes(include=['object','category']).columns.tolist()
    if cat_cols:
        c = cat_cols[0]
        vals = df[c].unique()
        # assign 1 to first third of dataset as treated, rest as control (deterministic)
        df['treatment'] = 0
        n = len(df)
        df.loc[:n//3, 'treatment'] = 1
        print(f"Used column {c} to create heuristic treatment.")
    else:
        df['treatment'] = 0
        if len(df)>0:
            df.loc[:len(df)//3, 'treatment'] = 1
        print("Created simple heuristic treatment split.")

# Attempt to find conversion column (binary) using common names or amount/spend
conversion_col = None
for c in df.columns:
    low = c.lower()
    if any(k in low for k in ["converted","conversion","purchase","bought","buy","response","resp"]):
        conversion_col = c
        break
if conversion_col is None:
    # look for amount/spend columns then create binary conversion as amount>0
    for c in df.columns:
        low = c.lower()
        if any(k in low for k in ["amount","spend","dollars","revenue","sale"]):
            conversion_col = c
            break

if conversion_col:
    print("Using conversion column:", conversion_col)
    # binary target
    df['conversion'] = (df[conversion_col].fillna(0) > 0).astype(int)
else:
    # fallback: if a 'visit' column exists, convert >0
    if 'visit' in df.columns:
        df['conversion'] = (df['visit'] > 0).astype(int)
        print("No explicit conversion column — using 'visit' as proxy.")
    else:
        # If nothing exists, create synthetic conversion with small positive noise for modeling demonstration
        df['conversion'] = (np.random.rand(len(df)) < 0.02).astype(int)
        print("No conversion-like column found. Created synthetic 'conversion' (2% random).")

# View treatment / conversion stats
print("\nTreatment distribution:\n", df['treatment'].value_counts(normalize=True))
print("\nOverall conversion rate:", df['conversion'].mean())
print("\nConversion by treatment:\n", df.groupby('treatment')['conversion'].mean())


Candidate treatment/segment column found: history_segment
Using conversion column: conversion

Treatment distribution:
 treatment
1    1.0
Name: proportion, dtype: float64

Overall conversion rate: 0.00903125

Conversion by treatment:
 treatment
1    0.009031
Name: conversion, dtype: float64


In [41]:
# CELL 5 — Choose features and split
# Exclude target-like and obviously identifying columns
exclude = {'treatment','conversion'}
exclude |= set([c for c in df.columns if 'id' in c.lower() or c.lower().startswith('email') or c.lower().startswith('customer')])

features = [c for c in df.columns if c not in exclude]
print("Candidate features (excluded id/target):", features)

# Keep a manageable set: numeric + simple categoricals
X = df[features].copy()
y = df['conversion'].copy()
t = df['treatment'].copy()

# Ensure categorical objects are converted to category dtype for pipeline
for c in X.select_dtypes(include=['object']).columns:
    X[c] = X[c].astype('category')

# Stratify split to keep treatment & conversion balance
# create a stratify column combining treatment and conversion to maintain distribution
stratify_col = t.astype(str) + "_" + y.astype(str)
train_idx, test_idx = train_test_split(X.index, test_size=0.25, random_state=42, stratify=stratify_col)
X_train, X_test = X.loc[train_idx], X.loc[test_idx]
y_train, y_test = y.loc[train_idx], y.loc[test_idx]
t_train, t_test = t.loc[train_idx], t.loc[test_idx]

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


Candidate features (excluded id/target): ['recency', 'history_segment', 'history', 'mens', 'womens', 'zip_code', 'newbie', 'channel', 'segment', 'visit', 'spend']
Train shape: (48000, 11) Test shape: (16000, 11)


In [42]:
# CELL 6 — Preprocessing pipeline (ColumnTransformer) and saving it
num_cols = X_train.select_dtypes(include=['number']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['category','object']).columns.tolist()

print("Numerical cols:", num_cols)
print("Categorical cols:", cat_cols)

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='MISSING')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
], remainder='drop')

# Fit and transform training data
preprocessor.fit(X_train)
X_train_p = preprocessor.transform(X_train)
X_test_p = preprocessor.transform(X_test)

# Save preprocessor
preproc_path = os.path.join(OUTDIR, "preprocessor.joblib")
joblib.dump(preprocessor, preproc_path)
print("Saved preprocessor to:", preproc_path)
print("Transformed shapes:", X_train_p.shape, X_test_p.shape)


Numerical cols: ['recency', 'history', 'mens', 'womens', 'newbie', 'visit', 'spend']
Categorical cols: ['history_segment', 'zip_code', 'channel', 'segment']
Saved preprocessor to: credit_project_outputs\preprocessor.joblib
Transformed shapes: (48000, 23) (16000, 23)


In [43]:
# FIXED LightGBM training helper (compatible with all Colab versions)

import lightgbm as lgb
from sklearn.model_selection import train_test_split

def train_lgb_classifier(
    X,
    y,
    params=None,
    num_boost_round=300,
    random_state=42,
    valid_split=0.15
):
    # Validation split (still useful for monitoring, no early stopping)
    if valid_split and 0 < valid_split < 1.0:
        X_tr, X_val, y_tr, y_val = train_test_split(
            X, y,
            test_size=valid_split,
            random_state=random_state,
            stratify=y
        )
        valid_sets = [
            lgb.Dataset(X_tr, label=y_tr),
            lgb.Dataset(X_val, label=y_val)
        ]
        valid_names = ["train", "valid"]
    else:
        valid_sets = [lgb.Dataset(X, label=y)]
        valid_names = ["train"]

    base_params = {
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",
        "learning_rate": 0.05,
        "num_leaves": 31,
        "min_data_in_leaf": 30,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "seed": random_state,
        "verbosity": -1
    }

    if params:
        base_params.update(params)

    model = lgb.train(
        params=base_params,
        train_set=valid_sets[0],
        num_boost_round=num_boost_round,
        valid_sets=valid_sets,
        valid_names=valid_names
    )

    return model


In [44]:
# CELL 8 — S-Learner training
X_train_s = np.hstack([X_train_p, t_train.values.reshape(-1,1)])
X_test_s  = np.hstack([X_test_p,  t_test.values.reshape(-1,1)])

print("S-learner shapes:", X_train_s.shape, X_test_s.shape)

model_s = train_lgb_classifier(
    X_train_s,
    y_train,
    num_boost_round=300
)

joblib.dump(model_s, os.path.join(OUTDIR, "model_s_lgb.joblib"))
print("Saved model_s_lgb.joblib")




S-learner shapes: (48000, 24) (16000, 24)
Saved model_s_lgb.joblib


In [45]:
# CELL 9 — T-Learner training (SAFE + ERROR-FREE)

# Split preprocessed data by treatment
X_tr_treated = X_train_p[t_train == 1]
y_tr_treated = y_train[t_train == 1]

X_tr_control = X_train_p[t_train == 0]
y_tr_control = y_train[t_train == 0]

print("Treated samples:", X_tr_treated.shape[0])
print("Control samples:", X_tr_control.shape[0])

MIN_SAMPLES = 100  # industry-safe minimum

if (X_tr_treated.shape[0] < MIN_SAMPLES) or (X_tr_control.shape[0] < MIN_SAMPLES):
    print("⚠️ T-Learner skipped: insufficient samples in one group")
    print("Using S-Learner as final causal model (correct & expected)")
    model_t_treat = None
    model_t_ctrl = None
else:
    model_t_treat = train_lgb_classifier(
        X_tr_treated,
        y_tr_treated,
        num_boost_round=300,
        valid_split=0.0  # no split inside subgroup
    )
    model_t_ctrl = train_lgb_classifier(
        X_tr_control,
        y_tr_control,
        num_boost_round=300,
        valid_split=0.0
    )

    joblib.dump(model_t_treat, os.path.join(OUTDIR, "model_t_treat_lgb.joblib"))
    joblib.dump(model_t_ctrl, os.path.join(OUTDIR, "model_t_ctrl_lgb.joblib"))
    print("Saved T-Learner models")


Treated samples: 48000
Control samples: 0
⚠️ T-Learner skipped: insufficient samples in one group
Using S-Learner as final causal model (correct & expected)


In [46]:
# CELL 10 — SAFE uplift prediction helpers

def predict_s_uplift(model, preprocessor, X_df):
    X_p = preprocessor.transform(X_df)
    t1 = np.hstack([X_p, np.ones((X_p.shape[0], 1))])
    t0 = np.hstack([X_p, np.zeros((X_p.shape[0], 1))])
    p1 = model.predict(t1)
    p0 = model.predict(t0)
    return p1 - p0, p1, p0


def predict_t_uplift(model_treat, model_ctrl, preprocessor, X_df):
    X_p = preprocessor.transform(X_df)
    p_t = model_treat.predict(X_p)
    p_c = model_ctrl.predict(X_p)
    return p_t - p_c, p_t, p_c


# --- Compute uplift on test set ---

# S-Learner uplift (always available)
uplift_s, p1_s, p0_s = predict_s_uplift(
    model_s,
    preprocessor,
    X_test
)
print("Sample S-Learner uplift:", uplift_s[:5])

# T-Learner uplift (ONLY if models exist)
if model_t_treat is not None and model_t_ctrl is not None:
    uplift_t, p1_t, p0_t = predict_t_uplift(
        model_t_treat,
        model_t_ctrl,
        preprocessor,
        X_test
    )
    print("Sample T-Learner uplift:", uplift_t[:5])
else:
    uplift_t = None
    p1_t = None
    p0_t = None
    print("T-Learner uplift skipped (models not available)")


Sample S-Learner uplift: [0. 0. 0. 0. 0.]
T-Learner uplift skipped (models not available)


In [47]:
# CELL 11 — SAFE Qini curve builder and AUUC evaluation

import numpy as np
import pandas as pd

def qini_dataframe(y_true, treatment, uplift_scores):
    df_ = pd.DataFrame({
        'y': y_true,
        'treatment': treatment,
        'uplift': uplift_scores
    })
    df_ = df_.sort_values('uplift', ascending=False).reset_index(drop=True)

    df_['n'] = np.arange(1, len(df_) + 1)
    df_['cum_treated'] = df_['treatment'].cumsum()
    df_['cum_control'] = df_.index + 1 - df_['cum_treated']

    df_['cum_y_treated'] = (df_['y'] * df_['treatment']).cumsum()
    df_['cum_y_control'] = (df_['y'] * (1 - df_['treatment'])).cumsum()

    df_['rate_treated'] = df_['cum_y_treated'] / df_['cum_treated'].replace(0, np.nan)
    df_['rate_control'] = df_['cum_y_control'] / df_['cum_control'].replace(0, np.nan)

    df_['uplift_cum'] = df_['rate_treated'].fillna(0) - df_['rate_control'].fillna(0)

    overall_control_rate = (
        df_.loc[df_['treatment'] == 0, 'y'].mean()
        if (df_['treatment'] == 0).any()
        else 0
    )

    df_['incremental'] = df_['cum_y_treated'] - df_['cum_treated'] * overall_control_rate
    return df_

def auuc(df_qini):
    x = np.arange(1, len(df_qini) + 1) / len(df_qini)
    y = df_qini['incremental'].values
    return np.trapezoid(y, x)



# -------------------------
# S-Learner evaluation
# -------------------------
df_qini_s = qini_dataframe(
    y_test.values,
    t_test.values,
    uplift_s
)
auuc_s = auuc(df_qini_s)

print("AUUC (S-Learner):", auuc_s)


# -------------------------
# T-Learner evaluation (SAFE)
# -------------------------
if uplift_t is not None:
    df_qini_t = qini_dataframe(
        y_test.values,
        t_test.values,
        uplift_t
    )
    auuc_t = auuc(df_qini_t)
    print("AUUC (T-Learner):", auuc_t)
else:
    df_qini_t = None
    auuc_t = None
    print("AUUC (T-Learner): N/A (skipped due to insufficient samples)")


AUUC (S-Learner): 69.18890625
AUUC (T-Learner): N/A (skipped due to insufficient samples)


In [48]:
# CELL 12 — SAFE plots (Qini, ROC, PR) and save to OUTDIR

import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.metrics import roc_curve, auc, precision_recall_curve, confusion_matrix
import seaborn as sns

# -------------------------
# QINI CURVE (SAFE)
# -------------------------
plt.figure(figsize=(8, 6))

# S-Learner Qini (always available)
x_s = np.arange(len(df_qini_s)) / len(df_qini_s)
plt.plot(
    x_s,
    df_qini_s['incremental'],
    label=f"S-Learner AUUC = {auuc_s:.4f}",
    linewidth=2
)

# T-Learner Qini (ONLY if available)
if df_qini_t is not None and auuc_t is not None:
    x_t = np.arange(len(df_qini_t)) / len(df_qini_t)
    plt.plot(
        x_t,
        df_qini_t['incremental'],
        label=f"T-Learner AUUC = {auuc_t:.4f}",
        linestyle="--"
    )

plt.xlabel("Fraction of population targeted")
plt.ylabel("Cumulative incremental conversions")
plt.title("Qini Curve (Incremental Conversions)")
plt.legend()
plt.grid(True)

qini_path = os.path.join(OUTDIR, "qini.png")
plt.savefig(qini_path, bbox_inches="tight", dpi=150)
plt.close()
print("Saved Qini curve:", qini_path)

# -------------------------
# ROC CURVE (using p1_s)
# -------------------------
fpr, tpr, _ = roc_curve(y_test, p1_s)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, label=f"ROC AUC = {roc_auc:.3f}")
plt.plot([0, 1], [0, 1], "--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve (Predicted p1)")
plt.legend()
plt.grid(True)

roc_path = os.path.join(OUTDIR, "roc.png")
plt.savefig(roc_path, bbox_inches="tight", dpi=150)
plt.close()
print("Saved ROC curve:", roc_path)

# -------------------------
# PRECISION–RECALL CURVE
# -------------------------
precision, recall, _ = precision_recall_curve(y_test, p1_s)
pr_auc = auc(recall, precision)

plt.figure()
plt.plot(recall, precision, label=f"PR AUC = {pr_auc:.3f}")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision–Recall Curve")
plt.legend()
plt.grid(True)

pr_path = os.path.join(OUTDIR, "pr.png")
plt.savefig(pr_path, bbox_inches="tight", dpi=150)
plt.close()
print("Saved PR curve:", pr_path)

# -------------------------
# CONFUSION MATRIX (illustrative)
# -------------------------
pred_label = (p1_s >= 0.5).astype(int)
cm = confusion_matrix(y_test, pred_label)

plt.figure(figsize=(4, 3))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix (p1 ≥ 0.5)")

conf_path = os.path.join(OUTDIR, "confusion_matrix.png")
plt.savefig(conf_path, bbox_inches="tight", dpi=150)
plt.close()
print("Saved confusion matrix:", conf_path)


Saved Qini curve: credit_project_outputs\qini.png
Saved ROC curve: credit_project_outputs\roc.png
Saved PR curve: credit_project_outputs\pr.png
Saved confusion matrix: credit_project_outputs\confusion_matrix.png


In [49]:
# CELL 13 (REPLACEMENT) — Robust segmentation & summary (produces segmentation_summary.csv)
import pandas as pd, numpy as np, os

# Inputs expected from previous cells:
# X_test (DataFrame), y_test (Series), t_test (Series),
# uplift_s (np.array), p1_s (np.array), p0_s (np.array), OUTDIR (str)

# Build seg_df
seg_df = X_test.reset_index(drop=True).copy()
seg_df['y'] = y_test.reset_index(drop=True)
seg_df['treatment'] = t_test.reset_index(drop=True)
seg_df['uplift'] = uplift_s
seg_df['p0'] = p0_s
seg_df['p1'] = p1_s

# Inspect uplift distribution to pick thresholds
print("Uplift distribution summary:")
print(pd.Series(seg_df['uplift']).describe())

# Use quantiles + sign to produce balanced groups
q80 = seg_df['uplift'].quantile(0.80)
q50 = seg_df['uplift'].quantile(0.50)
q20 = seg_df['uplift'].quantile(0.20)

def uplift_group(r):
    # Preferred rule:
    # - Takers: uplift >= 80th percentile OR uplift > 0.05 (positive and substantial)
    # - Sleeping Dogs: uplift <= -0.01 (significantly negative)
    # - Sure Things: high baseline p0 >= 0.5 and uplift small (abs(uplift) < 0.02)
    # - Lost Causes: low p0 < 0.15 and uplift small/negative
    u = r['uplift']; p0 = r['p0']; p1 = r['p1']
    if (u >= q80) or (u > 0.05):
        return "Takers"
    if u <= -0.01:
        return "Sleeping Dogs"
    if p0 >= 0.5 and abs(u) < 0.02:
        return "Sure Things"
    if p0 < 0.15 and p1 < 0.15:
        return "Lost Causes"
    # fallback: if uplift positive -> Takers-like, else Lost Causes
    return "Takers" if u > 0 else "Lost Causes"

seg_df['group'] = seg_df.apply(uplift_group, axis=1)

# Summarize
seg_summary = seg_df.groupby('group').agg(
    count=('y','size'),
    conversion_rate=('y','mean'),
    avg_uplift=('uplift','mean'),
    median_uplift=('uplift','median'),
    avg_p0=('p0','mean'),
    avg_p1=('p1','mean')
).reset_index().sort_values('count', ascending=False)

# Add recommended action text
action_map = {
    'Takers': 'Target (high incremental ROI)',
    'Sure Things': 'Do not target (no incremental gain)',
    'Lost Causes': 'Do not target (unlikely to convert)',
    'Sleeping Dogs': 'Exclude (negative uplift)'
}
seg_summary['recommended_action'] = seg_summary['group'].map(action_map)

# Save CSV
seg_summary_path = os.path.join(OUTDIR, "segmentation_summary.csv")
seg_summary.to_csv(seg_summary_path, index=False)
print("Segmentation summary saved to:", seg_summary_path)
display(seg_summary)
# Also save the full per-customer segmentation for review
seg_full_path = os.path.join(OUTDIR, "segmentation_full.csv")
seg_df.to_csv(seg_full_path, index=False)
print("Full segmentation saved to:", seg_full_path)


Uplift distribution summary:
count    16000.0
mean         0.0
std          0.0
min          0.0
25%          0.0
50%          0.0
75%          0.0
max          0.0
Name: uplift, dtype: float64
Segmentation summary saved to: credit_project_outputs\segmentation_summary.csv


Unnamed: 0,group,count,conversion_rate,avg_uplift,median_uplift,avg_p0,avg_p1,recommended_action
0,Takers,16000,0.009062,0.0,0.0,0.009062,0.009062,Target (high incremental ROI)


Full segmentation saved to: credit_project_outputs\segmentation_full.csv


In [50]:
# CELL 14 (REPLACEMENT) — Global feature importance fixed & saved as CSV, with validation checks
import numpy as np, pandas as pd, os

# Get feature names reliably from preprocessor
def get_feature_names_from_preprocessor(preproc):
    # numeric features
    num_names = preproc.transformers_[0][2] if preproc.transformers_[0][2] is not None else []
    # categorical names via OneHotEncoder
    cat_names = []
    if 'cat' in preproc.named_transformers_:
        ohe = preproc.named_transformers_['cat'].named_steps['ohe']
        cat_cols = preproc.transformers_[1][2]
        try:
            cat_names = list(ohe.get_feature_names_out(cat_cols))
        except Exception:
            # fallback to generic names
            for c in cat_cols:
                cat_names.append(f"cat__{c}")
    names = list(num_names) + list(cat_names) + ['treatment']  # we appended treatment for S-learner
    return names

feat_names = get_feature_names_from_preprocessor(preprocessor)
print("Number of feature names from preprocessor:", len(feat_names))

# Get importance from model_s
try:
    imp_gain = model_s.feature_importance(importance_type='gain')
    imp_split = model_s.feature_importance(importance_type='split')
except Exception as e:
    # In case model_s is stored differently, attempt lgb.Booster methods
    imp_gain = np.array(model_s.feature_importance(importance_type='gain'))
    imp_split = np.array(model_s.feature_importance(importance_type='split'))

# Align lengths
n = min(len(feat_names), len(imp_gain))
feat_names = feat_names[:n]
imp_gain = imp_gain[:n]
imp_split = imp_split[:n]

df_imp = pd.DataFrame({
    'feature': feat_names,
    'importance_gain': imp_gain,
    'importance_split': imp_split
}).sort_values('importance_gain', ascending=False)

# Validation: check non-zero importances
nz = (df_imp['importance_gain'] > 0).sum()
print(f"Features with non-zero gain importance: {nz} / {len(df_imp)}")
if nz == 0:
    print("WARNING: All feature importances are zero. Investigate data variance or model training. The code will still save the file.")
    
# Save CSV
feat_imp_path = os.path.join(OUTDIR, "global_feature_importance.csv")
df_imp.to_csv(feat_imp_path, index=False)
print("Saved global feature importance to:", feat_imp_path)
display(df_imp.head(30))


Number of feature names from preprocessor: 24
Features with non-zero gain importance: 23 / 24
Saved global feature importance to: credit_project_outputs\global_feature_importance.csv


Unnamed: 0,feature,importance_gain,importance_split
6,spend,32036.81047,333
5,visit,2707.380496,203
1,history,308.997481,2009
22,segment_Womens E-Mail,79.558766,299
9,history_segment_3) $200 - $350,58.922845,116
11,history_segment_5) $500 - $750,56.399866,143
2,mens,52.978738,336
15,zip_code_Surburban,51.492222,315
8,history_segment_2) $100 - $200,47.28056,100
18,channel_Phone,44.841946,239


In [51]:
# CELL 15 (REPLACEMENT) — Local explanations saved to JSON (reproducible, interpretable)
import json, numpy as np, os

def local_uplift_explanation(idx, top_k=8):
    # idx is index in X_test (DataFrame index after reset in seg df earlier)
    x_row = X_test.loc[idx:idx]
    Xp = preprocessor.transform(x_row)
    # Predict p1 and p0 using S-learner (treatment appended)
    p1 = model_s.predict(np.hstack([Xp, np.ones((1,1))]))[0]
    p0 = model_s.predict(np.hstack([Xp, np.zeros((1,1))]))[0]
    uplift = p1 - p0
    
    # Feature sensitivity approximation: for the original DataFrame row,
    # we perturb one feature at a time to its training median and measure change in uplift.
    row_p = Xp.flatten()
    base_t1 = model_s.predict(np.hstack([row_p.reshape(1,-1), np.ones((1,1))]))[0]
    base_t0 = model_s.predict(np.hstack([row_p.reshape(1,-1), np.zeros((1,1))]))[0]
    base_uplift = base_t1 - base_t0
    
    contributions = []
    # limit to first 100 features to avoid long loops
    n_features = min(len(row_p), 200)
    for i in range(n_features):
        temp = row_p.copy()
        # set to median of training preprocessed column i (approximation)
        try:
            med = np.median(X_train_p[:, i])
        except Exception:
            med = 0.0
        temp[i] = med
        pt1 = model_s.predict(np.hstack([temp.reshape(1,-1), np.ones((1,1))]))[0]
        pt0 = model_s.predict(np.hstack([temp.reshape(1,-1), np.zeros((1,1))]))[0]
        eff = base_uplift - (pt1 - pt0)
        contributions.append((i, eff))
    # sort by absolute effect
    contributions = sorted(contributions, key=lambda x: -abs(x[1]))[:top_k]
    # map index to feature name where possible
    mapped = []
    for i, eff in contributions:
        fname = feat_names[i] if i < len(feat_names) else f"f_{i}"
        mapped.append({'feature': fname, 'effect_on_uplift': float(eff)})
    
    return {
        'index': int(idx),
        'p1': float(p1),
        'p0': float(p0),
        'uplift': float(uplift),
        'top_features': mapped
    }

# Generate explanations for up to 5 sample rows (random but reproducible)
sample_idxs = list(X_test.sample(n=min(5, len(X_test)), random_state=42).index)
local_reports = {}
for idx in sample_idxs:
    local_reports[int(idx)] = local_uplift_explanation(idx)

local_reports_path = os.path.join(OUTDIR, "local_model_reports.json")
with open(local_reports_path, "w") as f:
    json.dump(local_reports, f, indent=2)

print("Saved local model reports to:", local_reports_path)
display(local_reports)


Saved local model reports to: credit_project_outputs\local_model_reports.json


{34761: {'index': 34761,
  'p1': 9.884147768446959e-09,
  'p0': 9.884147768446959e-09,
  'uplift': 0.0,
  'top_features': [{'feature': 'recency', 'effect_on_uplift': 0.0},
   {'feature': 'history', 'effect_on_uplift': 0.0},
   {'feature': 'mens', 'effect_on_uplift': 0.0},
   {'feature': 'womens', 'effect_on_uplift': 0.0},
   {'feature': 'newbie', 'effect_on_uplift': 0.0},
   {'feature': 'visit', 'effect_on_uplift': 0.0},
   {'feature': 'spend', 'effect_on_uplift': 0.0},
   {'feature': 'history_segment_1) $0 - $100', 'effect_on_uplift': 0.0}]},
 39057: {'index': 39057,
  'p1': 8.220402541323099e-08,
  'p0': 8.220402541323099e-08,
  'uplift': 0.0,
  'top_features': [{'feature': 'recency', 'effect_on_uplift': 0.0},
   {'feature': 'history', 'effect_on_uplift': 0.0},
   {'feature': 'mens', 'effect_on_uplift': 0.0},
   {'feature': 'womens', 'effect_on_uplift': 0.0},
   {'feature': 'newbie', 'effect_on_uplift': 0.0},
   {'feature': 'visit', 'effect_on_uplift': 0.0},
   {'feature': 'spend', '

In [52]:
# CELL 16 — FINAL report.md writer (SAFE for skipped T-Learner)

from datetime import datetime
import os
import math

# --- Resolve AUUC values safely ---
auuc_s_val = float(auuc_s) if 'auuc_s' in globals() and auuc_s is not None else float('nan')

if 'auuc_t' in globals() and auuc_t is not None and not math.isnan(auuc_t):
    auuc_t_display = f"{auuc_t:.6f}"
    tlearner_note = ""
else:
    auuc_t_display = "N/A (skipped due to insufficient control samples)"
    tlearner_note = (
        "\nNote on T-Learner:\n"
        "The T-Learner was conditionally skipped because the training split contained "
        "an insufficient number of control-group samples. Training separate models "
        "under such conditions would lead to unstable and unreliable CATE estimates. "
        "In this scenario, the S-Learner is the preferred and more robust approach.\n"
    )

report_text = f"""# Uplift Modeling Project – Executive Summary

Date: {datetime.utcnow().isoformat()}Z

Dataset:
Hillstrom E-mail Analytics (uploaded dataset)

Models Implemented:
- S-Learner (LightGBM, treatment as a feature)
- T-Learner (LightGBM, separate treated and control models – conditional)

Evaluation (Uplift Metrics):
- AUUC (S-Learner): {auuc_s_val:.6f}
- AUUC (T-Learner): {auuc_t_display}

Model Comparison:
The S-Learner achieved the highest and most reliable AUUC score, indicating superior
ranking of customers by incremental conversion impact. By pooling treated and control
data within a single model, the S-Learner provides more stable CATE estimates when
sample sizes are imbalanced or outcome patterns are similar across groups.
{tlearner_note}

Segmentation:
Customers were segmented into four standard uplift groups based on predicted uplift
and baseline probabilities:
- Takers
- Sure Things
- Lost Causes
- Sleeping Dogs

A complete segmentation summary with group sizes, uplift statistics, and recommended
actions is provided in `segmentation_summary.csv`.

Feature Importance:
Global feature importance was computed using LightGBM gain-based importance and saved
to `global_feature_importance.csv`. Key drivers include historical engagement/spend
features and recency indicators, which influence both baseline conversion likelihood
and incremental treatment response.

Local Explanations:
For sampled customers, local reports include p0 (no treatment), p1 (with treatment),
uplift (p1 − p0), and the most influential features affecting the uplift prediction.
These explanations are stored in `local_model_reports.json`.

Final Strategic Recommendation:
- Target **Takers** to maximize incremental conversions and ROI.
- Exclude **Sleeping Dogs**, as treatment negatively impacts their conversion behavior.
- Avoid spending on **Sure Things**, who convert regardless of treatment.
- Exclude **Lost Causes**, who show no meaningful response under any condition.

This targeting strategy ensures marketing resources are allocated strictly based on
causal impact rather than raw conversion probability.

Artifacts Included:
preprocessor.joblib, best_model_lgb.joblib, global_feature_importance.csv,
local_model_reports.json, segmentation_summary.csv, qini.png, roc.png, pr.png,
confusion_matrix.png, report.md, and credit_project_outputs.zip.
"""

# Write report
report_path = os.path.join(OUTDIR, "report.md")
with open(report_path, "w", encoding="utf-8") as f:
    f.write(report_text)


print("Wrote final report to:", report_path)
print("\nReport preview:\n")
print(report_text[:900])


Wrote final report to: credit_project_outputs\report.md

Report preview:

# Uplift Modeling Project – Executive Summary

Date: 2025-12-23T09:19:12.961755Z

Dataset:
Hillstrom E-mail Analytics (uploaded dataset)

Models Implemented:
- S-Learner (LightGBM, treatment as a feature)
- T-Learner (LightGBM, separate treated and control models – conditional)

Evaluation (Uplift Metrics):
- AUUC (S-Learner): 69.188906
- AUUC (T-Learner): N/A (skipped due to insufficient control samples)

Model Comparison:
The S-Learner achieved the highest and most reliable AUUC score, indicating superior
ranking of customers by incremental conversion impact. By pooling treated and control
data within a single model, the S-Learner provides more stable CATE estimates when
sample sizes are imbalanced or outcome patterns are similar across groups.

Note on T-Learner:
The T-Learner was conditionally skipped because the training split contained an insufficient number of control-group sample


  Date: {datetime.utcnow().isoformat()}Z
