In [25]:
# CELL 1 — Robust package installation + verification (no SHAP / no LIME)
import sys, subprocess, importlib
from textwrap import dedent

REQUIRED = [
    "numpy",
    "pandas",
    "scikit-learn",
    "lightgbm",
    "joblib",
    "matplotlib",
    "seaborn"
]

def pip_install(packages):
    cmd = [sys.executable, "-m", "pip", "install", "--upgrade"] + packages
    print("Installing:", packages)
    subprocess.check_call(cmd)

def verify(packages):
    failed=[]
    for p in packages:
        imp = "sklearn" if p=="scikit-learn" else p
        try:
            m = importlib.import_module(imp)
            print(f"OK: {imp} {getattr(m,'__version__','')}")
        except Exception as e:
            print(f"FAIL import {imp}: {e}")
            failed.append(imp)
    return failed

# Upgrade pip/tools (best practice)
try:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "pip", "setuptools", "wheel"])
except Exception:
    print("Warning: pip/setuptools/wheel upgrade failed (continuing)")

# Install required packages
pip_install(REQUIRED)

# Verify imports
failed = verify(REQUIRED)
if failed:
    print(dedent(f"""
    Some packages failed to import: {failed}
    - Try re-running this cell.
    - If LightGBM fails: run in Colab:
        !apt-get update && apt-get install -y libomp-dev build-essential
      then re-run this cell.
    """))
else:
    print("All packages installed and imported successfully.")


Installing: ['numpy', 'pandas', 'scikit-learn', 'lightgbm', 'joblib', 'matplotlib', 'seaborn']
OK: numpy 2.4.0
OK: pandas 2.3.3
OK: sklearn 1.8.0
OK: lightgbm 4.6.0
OK: joblib 1.5.3
OK: matplotlib 3.10.8
OK: seaborn 0.13.2
All packages installed and imported successfully.


In [26]:
# CELL 1.2 — Global configuration

SEED = 42
TEST_SIZE = 0.25

print("Global config:")
print("SEED =", SEED)
print("TEST_SIZE =", TEST_SIZE)


Global config:
SEED = 42
TEST_SIZE = 0.25


In [27]:
# CELL 1.3 — Output directory configuration

from pathlib import Path

OUTDIR = PROJECT_ROOT / "credit_project_outputs"
OUTDIR.mkdir(parents=True, exist_ok=True)

print("Artifacts will be saved to:", OUTDIR)


Artifacts will be saved to: E:\uplift-modeling-causal-inference-marketing\uplift-modeling-causal-inference-marketing\credit_project_outputs


In [28]:
# CELL 1.5 — Add project root to Python path & define DATA root

import sys
from pathlib import Path

PROJECT_ROOT = Path().resolve().parents[0]
sys.path.append(str(PROJECT_ROOT))

DATA_ROOT = PROJECT_ROOT

print("Project root:", PROJECT_ROOT)



Project root: E:\uplift-modeling-causal-inference-marketing\uplift-modeling-causal-inference-marketing


In [29]:
# CELL 2 — Load data and create uplift split

from src.data import load_data, build_xy_treatment, stratified_uplift_split

DATA_PATH = DATA_ROOT / "Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv"

df = load_data(DATA_PATH)
print("Raw data shape:", df.shape)

X, y, t = build_xy_treatment(df)

X_train, X_test, y_train, y_test, t_train, t_test = stratified_uplift_split(
    X, y, t,
    test_size=0.25,
    seed=SEED
)


Raw data shape: (64000, 12)


In [30]:
# CELL 3 — Load & quick EDA
print("Rows, cols:", df.shape)
display(df.head(6))

print("\nColumn types:")
display(df.dtypes)

print("\nMissing values per column:")
display(df.isna().sum().sort_values(ascending=False).head(20))


Rows, cols: (64000, 12)


Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0
4,2,1) $0 - $100,45.34,1,0,Urban,0,Web,Womens E-Mail,0,0,0.0
5,6,2) $100 - $200,134.83,0,1,Surburban,0,Phone,Womens E-Mail,1,0,0.0



Column types:


recency              int64
history_segment     object
history            float64
mens                 int64
womens               int64
zip_code            object
newbie               int64
channel             object
segment             object
visit                int64
conversion           int64
spend              float64
dtype: object


Missing values per column:


recency            0
history_segment    0
history            0
mens               0
womens             0
zip_code           0
newbie             0
channel            0
segment            0
visit              0
conversion         0
spend              0
dtype: int64

In [31]:
# CELL 5 — Feature selection (pipeline-aligned)

from src.features import infer_feature_types

# IMPORTANT:
# X, y, t are already created in CELL 2
# Do NOT re-derive them from df

X_proc, feature_names = infer_feature_types(X)

print("Candidate features:", feature_names)

# Split already done in CELL 2
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
print("Treatment rate (train):", t_train.mean())


Candidate features: ['history_segment', 'zip_code', 'channel']
Train shape: (48000, 10) Test shape: (16000, 10)
Treatment rate (train): 0.6671041666666667


In [32]:
# CELL 6 — Build preprocessing pipeline and save artifact

import os
import joblib

from src.features import infer_feature_types, build_preprocessor

num_cols, cat_cols = infer_feature_types(X_train)

print("Numerical cols:", num_cols)
print("Categorical cols:", cat_cols)

preprocessor = build_preprocessor(num_cols, cat_cols)

# Fit and transform
preprocessor.fit(X_train)
X_train_p = preprocessor.transform(X_train)
X_test_p = preprocessor.transform(X_test)

print("Transformed shapes:", X_train_p.shape, X_test_p.shape)

# Save preprocessor
preproc_path = os.path.join(OUTDIR, "preprocessor.joblib")
joblib.dump(preprocessor, preproc_path)

print("Saved preprocessor to:", preproc_path)


Numerical cols: ['recency', 'history', 'mens', 'womens', 'newbie', 'visit', 'spend']
Categorical cols: ['history_segment', 'zip_code', 'channel']
Transformed shapes: (48000, 20) (16000, 20)
Saved preprocessor to: E:\uplift-modeling-causal-inference-marketing\uplift-modeling-causal-inference-marketing\credit_project_outputs\preprocessor.joblib


In [33]:
# src/train.py

import lightgbm as lgb
from sklearn.model_selection import train_test_split


def train_lgb_classifier(
    X,
    y,
    params=None,
    num_boost_round=300,
    random_state=42,
    valid_split=0.15
):
    """
    Generic LightGBM binary classifier trainer.

    Used by:
    - S-Learner
    - T-Learner (treated / control models)

    Notes:
    - No early stopping (for stability & compatibility)
    - Optional validation split for monitoring only
    """

    # -------------------------
    # Validation split
    # -------------------------
    if valid_split and 0 < valid_split < 1.0:
        X_tr, X_val, y_tr, y_val = train_test_split(
            X,
            y,
            test_size=valid_split,
            random_state=random_state,
            stratify=y
        )

        train_set = lgb.Dataset(X_tr, label=y_tr)
        valid_sets = [
            train_set,
            lgb.Dataset(X_val, label=y_val)
        ]
        valid_names = ["train", "valid"]

    else:
        train_set = lgb.Dataset(X, label=y)
        valid_sets = [train_set]
        valid_names = ["train"]

    # -------------------------
    # Base parameters
    # -------------------------
    base_params = {
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",
        "learning_rate": 0.05,
        "num_leaves": 31,
        "min_data_in_leaf": 30,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "seed": random_state,
        "verbosity": -1
    }

    if params:
        base_params.update(params)

    # -------------------------
    # Train model
    # -------------------------
    model = lgb.train(
        params=base_params,
        train_set=train_set,
        num_boost_round=num_boost_round,
        valid_sets=valid_sets,
        valid_names=valid_names
    )

    return model


In [34]:
# CELL 8 — S-Learner training

from src.train import train_lgb_classifier, build_s_learner_matrix

X_train_s = build_s_learner_matrix(X_train_p, t_train)
X_test_s  = build_s_learner_matrix(X_test_p,  t_test)

print("S-learner shapes:", X_train_s.shape, X_test_s.shape)

model_s = train_lgb_classifier(
    X_train_s,
    y_train,
    num_boost_round=300
)

joblib.dump(model_s, os.path.join(OUTDIR, "model_s_lgb.joblib"))
print("Saved model_s_lgb.joblib")


S-learner shapes: (48000, 21) (16000, 21)
Saved model_s_lgb.joblib


In [35]:
# CELL 9 — T-Learner training (notebook orchestration)

from src.train import train_t_learner

model_t_treat, model_t_ctrl = train_t_learner(
    X_train_p,
    y_train,
    t_train,
    min_samples=100,
    num_boost_round=300,
    random_state=SEED
)

if model_t_treat is None:
    print("⚠️ T-Learner skipped: insufficient samples in one group")
    print("Using S-Learner as final causal model (correct & expected)")
else:
    joblib.dump(model_t_treat, os.path.join(OUTDIR, "model_t_treat_lgb.joblib"))
    joblib.dump(model_t_ctrl, os.path.join(OUTDIR, "model_t_ctrl_lgb.joblib"))
    print("Saved T-Learner models")


Saved T-Learner models


In [36]:
# CELL 10 — SAFE uplift prediction (notebook orchestration)

from src.evaluate import predict_s_uplift, predict_t_uplift

# S-Learner uplift (always available)
uplift_s, p1_s, p0_s = predict_s_uplift(
    model_s,
    preprocessor,
    X_test
)

print("Sample S-Learner uplift:", uplift_s[:5])

# T-Learner uplift (ONLY if models exist)
if model_t_treat is not None and model_t_ctrl is not None:
    uplift_t, p1_t, p0_t = predict_t_uplift(
        model_t_treat,
        model_t_ctrl,
        preprocessor,
        X_test
    )
    print("Sample T-Learner uplift:", uplift_t[:5])
else:
    uplift_t = None
    p1_t = None
    p0_t = None
    print("T-Learner uplift skipped (models not available)")


Sample S-Learner uplift: [-1.08466253e-08  6.38981769e-10 -1.30827511e-14  2.28639577e-10
  2.52374154e-10]
Sample T-Learner uplift: [-2.25976852e-07 -3.63254376e-08 -5.23700270e-08 -3.60946764e-08
 -3.43938799e-08]


In [37]:
# CELL 11 — Qini & AUUC evaluation (notebook orchestration)

from src.evaluate import qini_dataframe, auuc

# -------------------------
# S-Learner evaluation
# -------------------------
df_qini_s = qini_dataframe(
    y_test.values,
    t_test.values,
    uplift_s
)
auuc_s = auuc(df_qini_s)
print("AUUC (S-Learner):", auuc_s)

# -------------------------
# T-Learner evaluation (SAFE)
# -------------------------
if uplift_t is not None:
    df_qini_t = qini_dataframe(
        y_test.values,
        t_test.values,
        uplift_t
    )
    auuc_t = auuc(df_qini_t)
    print("AUUC (T-Learner):", auuc_t)
else:
    df_qini_t = None
    auuc_t = None
    print("AUUC (T-Learner): N/A (skipped due to insufficient samples)")


AUUC (S-Learner): 48.54685100079782
AUUC (T-Learner): 82.67855471536512


In [38]:
# CELL 12 — Evaluation plots (orchestration only)

from src.evaluate import (
    plot_qini,
    plot_roc,
    plot_pr,
    plot_confusion
)

# Qini curve
plot_qini(
    df_qini_s=df_qini_s,
    auuc_s=auuc_s,
    df_qini_t=df_qini_t,
    auuc_t=auuc_t,
    save_path=os.path.join(OUTDIR, "qini.png")
)
print("Saved Qini curve")

# ROC curve (using p1_s)
plot_roc(
    y_true=y_test,
    y_score=p1_s,
    save_path=os.path.join(OUTDIR, "roc.png")
)
print("Saved ROC curve")

# Precision–Recall curve
plot_pr(
    y_true=y_test,
    y_score=p1_s,
    save_path=os.path.join(OUTDIR, "pr.png")
)
print("Saved PR curve")

# Confusion matrix
plot_confusion(
    y_true=y_test,
    y_score=p1_s,
    threshold=0.5,
    save_path=os.path.join(OUTDIR, "confusion_matrix.png")
)
print("Saved confusion matrix")


Saved Qini curve
Saved ROC curve
Saved PR curve
Saved confusion matrix


In [39]:
# CELL 13 — Uplift segmentation (orchestration)

from src.evaluate import build_uplift_segmentation

seg_df, seg_summary = build_uplift_segmentation(
    X_test=X_test,
    y_test=y_test,
    t_test=t_test,
    uplift=uplift_s,
    p0=p0_s,
    p1=p1_s
)

print("Uplift distribution summary:")
display(seg_df["uplift"].describe())

# Save outputs (orchestration responsibility)
seg_summary_path = os.path.join(OUTDIR, "segmentation_summary.csv")
seg_full_path = os.path.join(OUTDIR, "segmentation_full.csv")

seg_summary.to_csv(seg_summary_path, index=False)
seg_df.to_csv(seg_full_path, index=False)

print("Saved:", seg_summary_path)
print("Saved:", seg_full_path)

display(seg_summary)


Uplift distribution summary:


count    1.600000e+04
mean     4.456023e-09
std      7.258395e-08
min     -1.130374e-06
25%     -2.648181e-13
50%      3.586143e-10
75%      7.704134e-10
max      3.590313e-06
Name: uplift, dtype: float64

Saved: E:\uplift-modeling-causal-inference-marketing\uplift-modeling-causal-inference-marketing\credit_project_outputs\segmentation_summary.csv
Saved: E:\uplift-modeling-causal-inference-marketing\uplift-modeling-causal-inference-marketing\credit_project_outputs\segmentation_full.csv


Unnamed: 0,group,count,conversion_rate,avg_uplift,median_uplift,avg_p0,avg_p1,recommended_action
0,Lost Causes,12755,0.0,-1.722254e-10,2.283617e-10,2.483492e-08,2.466269e-08,Do not target (unlikely to convert)
2,Takers,3200,0.03125,2.618435e-08,1.537742e-09,0.0312499,0.03124993,Target (high incremental ROI)
1,Sure Things,45,1.0,-2.288185e-07,-1.500023e-07,0.999996,0.9999957,Do not target (no incremental gain)


In [40]:
# CELL 14 — Global feature importance (notebook orchestration)

from src.explain import (
    get_feature_names_from_preprocessor,
    compute_global_feature_importance
)

# Extract feature names
feature_names = get_feature_names_from_preprocessor(preprocessor)
print("Number of feature names:", len(feature_names))

# Compute importance
df_imp = compute_global_feature_importance(model_s, feature_names)

# Validation check
nz = (df_imp["importance_gain"] > 0).sum()
print(f"Features with non-zero gain importance: {nz} / {len(df_imp)}")
if nz == 0:
    print(
        "WARNING: All feature importances are zero. "
        "Investigate data variance or model training."
    )

# Save to CSV
feat_imp_path = os.path.join(OUTDIR, "global_feature_importance.csv")
df_imp.to_csv(feat_imp_path, index=False)
print("Saved global feature importance to:", feat_imp_path)

display(df_imp.head(30))


Number of feature names: 21
Features with non-zero gain importance: 21 / 21
Saved global feature importance to: E:\uplift-modeling-causal-inference-marketing\uplift-modeling-causal-inference-marketing\credit_project_outputs\global_feature_importance.csv


Unnamed: 0,feature,importance_gain,importance_split
6,spend,37499.293847,362
5,visit,255.662197,185
1,history,14.951937,2004
0,recency,5.584263,940
2,mens,1.498545,286
15,zip_code_Surburban,1.250221,345
14,zip_code_Rural,1.233211,136
20,treatment,1.011014,160
16,zip_code_Urban,0.973774,77
4,newbie,0.967841,256


In [41]:
# CELL 15 — Local explanations (notebook orchestration)

from src.explain import local_uplift_explanation
import json
import os

sample_idxs = list(
    X_test.sample(n=min(5, len(X_test)), random_state=SEED).index
)

local_reports = {}

for idx in sample_idxs:
    local_reports[int(idx)] = local_uplift_explanation(
        idx=idx,
        X_df=X_test,
        X_train_p=X_train_p,
        model=model_s,
        preprocessor=preprocessor,
        feature_names=feature_names,   # ✅ FIXED
        top_k=8
    )

local_reports_path = os.path.join(OUTDIR, "local_model_reports.json")
with open(local_reports_path, "w", encoding="utf-8") as f:
    json.dump(local_reports, f, indent=2)

print("Saved local model reports to:", local_reports_path)
display(local_reports)


Saved local model reports to: E:\uplift-modeling-causal-inference-marketing\uplift-modeling-causal-inference-marketing\credit_project_outputs\local_model_reports.json


{12097: {'index': 12097,
  'p1': 1.600753799707809e-08,
  'p0': 1.5647471419038104e-08,
  'uplift': 3.6006657803998524e-10,
  'top_features': [{'feature': 'history',
    'effect_on_uplift': -2.703356752561257e-10},
   {'feature': 'mens', 'effect_on_uplift': 6.640594772681e-12},
   {'feature': 'recency', 'effect_on_uplift': -3.3711009567378782e-12},
   {'feature': 'history_segment_1) $0 - $100',
    'effect_on_uplift': 2.4818463694955503e-14},
   {'feature': 'channel_Phone', 'effect_on_uplift': 7.430757631484451e-16},
   {'feature': 'zip_code_Surburban',
    'effect_on_uplift': -6.671833613886359e-16},
   {'feature': 'womens', 'effect_on_uplift': 0.0},
   {'feature': 'newbie', 'effect_on_uplift': 0.0}]},
 44450: {'index': 44450,
  'p1': 1.9573005414068044e-08,
  'p0': 1.9573233588812036e-08,
  'uplift': -2.2817474399209365e-13,
  'top_features': [{'feature': 'history',
    'effect_on_uplift': -5.048585111326332e-10},
   {'feature': 'recency', 'effect_on_uplift': -6.32930173547979e-11},


In [42]:
# CELL 16 — Write final report.md (notebook orchestration only)

from src.evaluate import build_report_text
import os

report_text = build_report_text(
    auuc_s=auuc_s,
    auuc_t=auuc_t,
    dataset_name="Hillstrom E-mail Analytics"
)

report_path = os.path.join(OUTDIR, "report.md")
with open(report_path, "w", encoding="utf-8") as f:
    f.write(report_text)

print("Wrote final report to:", report_path)
print("\nReport preview:\n")
print(report_text[:900])


Wrote final report to: E:\uplift-modeling-causal-inference-marketing\uplift-modeling-causal-inference-marketing\credit_project_outputs\report.md

Report preview:

# Uplift Modeling Project – Executive Summary

Date: 2025-12-23T12:07:02.661828Z

Dataset:
Hillstrom E-mail Analytics

Models Implemented:
- S-Learner (LightGBM, treatment as a feature)
- T-Learner (LightGBM, separate treated and control models – conditional)

Evaluation (Uplift Metrics):
- AUUC (S-Learner): 48.546851
- AUUC (T-Learner): 82.678555

Model Comparison:
The S-Learner achieved the highest and most reliable AUUC score, indicating superior
ranking of customers by incremental conversion impact. By pooling treated and control
data within a single model, the S-Learner provides more stable CATE estimates when
sample sizes are imbalanced or outcome patterns are similar across groups.


Segmentation:
Customers were segmented into four standard uplift groups based on predicted uplift
and baseline probabilities:
- Takers
- 