# 04 Uplift Modeling (S/T/X-Learner)

This notebook follows Phase 2 MVP 2.4: CATE estimation via S-Learner / T-Learner / X-Learner.

In [12]:
# ======================================================
# Section 0 (Cell 1/1): Setup
# ======================================================

import os
import sys
from pathlib import Path
from datetime import datetime, timezone, timedelta

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
from IPython.display import Markdown, display

from sklearn.model_selection import train_test_split

# Stamp the run with a fixed-offset (UTC+8) wall-clock time so logs from
# different machines can be compared directly.
TZ_UTC8 = timezone(timedelta(hours=8))
run_ts_utc8 = f"{datetime.now(tz=TZ_UTC8):%Y-%m-%d %H:%M:%S %z}"
print(f"[UTC+8] Run timestamp: {run_ts_utc8}")

# Resolve the project root without hardcoding a path: when the kernel was
# started inside a `notebooks` directory, the root is one level up.
launch_dir = Path.cwd()
project_root = launch_dir.parent if launch_dir.name == 'notebooks' else launch_dir
project_root_str = str(project_root)
if project_root_str not in sys.path:
    # Front of sys.path so project-local packages are importable.
    sys.path.insert(0, project_root_str)
# Make the project root the working directory for the rest of the session.
os.chdir(project_root)

# Global matplotlib/seaborn defaults: figure size, base font size, colour palette.
plt.rcParams.update({'figure.figsize': (10, 6), 'font.size': 13})
sns.set_palette('Set2')

# Locate the project config under `configs/`, preferring `config.yaml` over `config.yml`.
config_candidates = [project_root / 'configs' / fname for fname in ('config.yaml', 'config.yml')]
config_path = None
for candidate in config_candidates:
    if candidate.exists():
        config_path = candidate
        break
if config_path is None:
    raise FileNotFoundError(f"No config file found. Searched: {[str(p) for p in config_candidates]}")
with config_path.open('r', encoding='utf-8') as config_file:
    config = yaml.safe_load(config_file)

paths_cfg = config['paths']

# The feature table is a required input: stop here if it is not on disk.
features_path = Path(paths_cfg['features_data'])
assert features_path.exists(), f"Feature file not found: {features_path}"

# Ensure the figure output directory exists before anything is written to it.
figures_dir = Path(paths_cfg.get('figures_dir', 'outputs/figures/'))
figures_dir.mkdir(parents=True, exist_ok=True)
print(f"Figures will be saved to: {figures_dir}")

# Read the feature table and fail fast when it has no rows.
df = pd.read_csv(features_path)
assert isinstance(df, pd.DataFrame) and len(df) > 0, "Loaded features DataFrame is empty"
print(f"Loaded: {features_path}")

# Column roles are declared in the `data` section of the config.
data_cfg = config['data']
treatment_col, outcome_col, spend_col, covariates = (
    data_cfg[key] for key in ('treatment_col', 'outcome_col', 'spend_col', 'covariates')
)

# Every configured column must be present in the loaded table.
available_cols = df.columns
missing_covs = list(filter(lambda col: col not in available_cols, covariates))
assert not missing_covs, f"Missing covariates in df: {missing_covs}"
for required_col in (treatment_col, outcome_col, spend_col):
    assert required_col in available_cols, f"Missing required column: {required_col}"

# Split X, T, Y
X = df[covariates].copy()

# Coerce treatment/outcome to numeric, but validate BEFORE casting to int.
# Casting first would truncate fractional values (e.g. 0.9 -> 0), letting
# non-binary data slip past the 0/1 checks, and would raise an opaque
# "cannot convert non-finite values" error when coercion produced NaN.
T_numeric = pd.to_numeric(df[treatment_col], errors='coerce')
Y_numeric = pd.to_numeric(df[outcome_col], errors='coerce')

assert X.isnull().sum().sum() == 0, "X contains NaN"
assert T_numeric.notna().all(), f"T contains missing/non-numeric values (column: {treatment_col})"
assert Y_numeric.notna().all(), f"Y contains missing/non-numeric values (column: {outcome_col})"
assert set(pd.unique(T_numeric)).issubset({0, 1}), "T must be binary (0/1)"
assert set(pd.unique(Y_numeric)).issubset({0, 1}), "Y must be binary (0/1)"

# Safe to cast now: every value is exactly 0 or 1.
T = T_numeric.astype(int)
Y = Y_numeric.astype(int)

# Load or compute the propensity-score (PS) vector.
# A persisted PS artifact is reused when present; otherwise PS is estimated via
# `src.causal.estimate_ps` and written to `ps_path` so later runs reuse the same values.
# NOTE(review): a loaded PS file is aligned to `df` by row position only; the length
# check below is the sole guard against a stale artifact — confirm this is sufficient.
ps_path = Path(config['paths'].get('ps_data', 'data/processed/hillstrom_ps.csv'))
ps = None
ps_loaded_from_disk = ps_path.exists()
if ps_loaded_from_disk:
    ps_df = pd.read_csv(ps_path)
    if 'ps' not in ps_df.columns:
        raise ValueError(f"PS file missing 'ps' column: {ps_path}")
    # Unparseable entries become NaN here and are rejected by the finiteness check below.
    ps = pd.to_numeric(ps_df['ps'], errors='coerce').to_numpy(dtype=float)
else:
    from src.causal import estimate_ps
    ps, _ = estimate_ps(X, T, random_state=int(config.get('general', {}).get('random_state', 42)))

# Normalise to a flat float array and validate it in ONE place for both branches,
# so a freshly computed vector is checked before it can be cached to disk.
ps = np.asarray(ps, dtype=float).reshape(-1)
if len(ps) != len(df):
    raise ValueError(f"PS length mismatch: len(ps)={len(ps)} vs len(df)={len(df)}")
assert np.isfinite(ps).all(), "ps contains NaN/inf"
assert ps.min() >= 0.0 and ps.max() <= 1.0, "ps must be within [0, 1]"

if ps_loaded_from_disk:
    print(f"Loaded PS vector from: {ps_path}")
else:
    # Persist only after validation: an invalid artifact would break every later run.
    ps_out = pd.DataFrame({'ps': ps})
    ps_path.parent.mkdir(parents=True, exist_ok=True)
    ps_out.to_csv(ps_path, index=False)
    print(f"Computed PS vector and saved to: {ps_path}")

# Train/Test split (architecture review adjustment #6):
# hold out 30% of rows, stratified on T, with the seed taken from the config.
split_seed = int(config.get('general', {}).get('random_state', 42))
split_parts = train_test_split(X, T, Y, ps, test_size=0.3, random_state=split_seed, stratify=T)
X_train, X_test, T_train, T_test, Y_train, Y_test, ps_train, ps_test = split_parts

# All four pieces of each split must have the same number of rows.
train_sizes = {len(part) for part in (X_train, T_train, Y_train, ps_train)}
test_sizes = {len(part) for part in (X_test, T_test, Y_test, ps_test)}
assert len(train_sizes) == 1, "Train split length mismatch"
assert len(test_sizes) == 1, "Test split length mismatch"

# One-line summary: shapes plus the treated share in the full data and each split.
summary_fields = [
    f"df.shape={df.shape}",
    f"X.shape={X.shape}",
    f"train={X_train.shape}",
    f"test={X_test.shape}",
    f"T.mean(full)={T.mean():.4f}",
    f"T.mean(train)={np.mean(T_train):.4f}",
    f"T.mean(test)={np.mean(T_test):.4f}",
]
print(" | ".join(summary_fields))
print(f"ps.mean(full)={ps.mean():.4f} | ps.std(full)={ps.std():.4f}")
print(f"Covariates (n={len(covariates)}): {covariates}")


[UTC+8] Run timestamp: 2026-02-23 21:56:39 +0800
Figures will be saved to: outputs\figures
Loaded: data\processed\hillstrom_features.csv
Loaded PS vector from: data\processed\hillstrom_ps.csv
df.shape=(64000, 16) | X.shape=(64000, 9) | train=(44800, 9) | test=(19200, 9) | T.mean(full)=0.6671 | T.mean(train)=0.6671 | T.mean(test)=0.6671
ps.mean(full)=0.6671 | ps.std(full)=0.0039
Covariates (n=9): ['recency', 'history', 'mens', 'womens', 'newbie', 'channel_Phone', 'channel_Web', 'zip_Surburban', 'zip_Urban']


## Section 1: S-Learner

In [13]:
# ======================================================
# Section 1 (Cell 1/1): Fit S-Learner on train, predict CATE on test
# ======================================================

# Reload `src.uplift` before importing from it so that edits made to the module
# during this session take effect without restarting the kernel.
import importlib
import src.uplift
importlib.reload(src.uplift)
from src.uplift import fit_s_learner

# Hyperparameters come from the optional `uplift` config section
# (defaults: 100 estimators, depth 5); the seed comes from
# `general.random_state` (default 42).
uplift_cfg = config.get('uplift', {})
n_estimators = int(uplift_cfg.get('n_estimators', 100))
max_depth = int(uplift_cfg.get('max_depth', 5))
random_state = int(config.get('general', {}).get('random_state', 42))

# Fit on the training split only and request predictions for the held-out test rows.
# NOTE(review): presumably returns one CATE estimate per row of X_pred as a
# numpy array — confirm against src.uplift.fit_s_learner.
cate_s = fit_s_learner(
    X_train,
    T_train,
    Y_train,
    X_pred=X_test,
    n_estimators=n_estimators,
    max_depth=max_depth,
    random_state=random_state,
)

# Sanity-check the S-Learner output: type, one value per test row, all finite.
assert isinstance(cate_s, np.ndarray), 'cate_s must be a numpy.ndarray'
assert len(cate_s) == len(X_test), 'CATE length must match test sample size'
assert np.all(np.isfinite(cate_s)), 'CATE contains NaN/inf'

# Distribution summary, including tail percentiles, shown as a single-row table.
summary_percentiles = [0.01, 0.05, 0.5, 0.95, 0.99]
cate_s_series = pd.Series(cate_s, name='cate_s')
cate_s_summary = cate_s_series.describe(percentiles=summary_percentiles).to_frame().T
display(cate_s_summary)

# Data-quality flag: print a warning when any estimate exceeds the magnitude threshold.
dq_threshold = 0.10
max_abs = float(np.abs(cate_s).max())
print(f"max(|cate_s|) = {max_abs:.6f}")
if max_abs > dq_threshold:
    print('[DQ] CATE magnitude > 0.10. Check model calibration / leakage / feature pipeline.')

# Expected scale (Hillstrom RCT): roughly within [-0.02, 0.02]
observed_min, observed_max = cate_s.min(), cate_s.max()
print(f"Expected (RCT) rough range: [-0.02, 0.02] | observed min={observed_min:.6f}, max={observed_max:.6f}")


Unnamed: 0,count,mean,std,min,1%,5%,50%,95%,99%,max
cate_s,19200.0,0.004402,0.004021,-0.011254,-0.003266,-0.000601,0.00377,0.011537,0.018345,0.036114


max(|cate_s|) = 0.036114
Expected (RCT) rough range: [-0.02, 0.02] | observed min=-0.011254, max=0.036114
