# Uplift EDA & Models (T-/X-learner, Trees)

This is a starter notebook. We'll fill it in Step 2.


In [23]:
# Step 3a — Load Hillstrom reliably and save to data/raw/hillstrom.csv
from pathlib import Path
import sys, subprocess, importlib.util as iutil
import pandas as pd

PROJ = Path.cwd().parents[0] if Path.cwd().name == "notebooks" else Path.cwd()
RAW = PROJ / "data" / "raw"
RAW.mkdir(parents=True, exist_ok=True)

def ensure(pkg, pip_name=None):
    if iutil.find_spec(pkg) is None:
        subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name or pkg])

ensure("sklift", "scikit-uplift")

from sklift.datasets import fetch_hillstrom

try:
    # Correct signature per scikit-uplift docs
    X, y, t = fetch_hillstrom(target_col="spend", return_X_y_t=True)
    hill = pd.concat(
        [pd.DataFrame(X).reset_index(drop=True),
         pd.Series(y, name="spend").reset_index(drop=True),
         pd.Series(t, name="treatment").reset_index(drop=True)],
        axis=1
    )
    source = "sklift.fetch_hillstrom"
except Exception as e:
    # Fallback: read the dataset from scikit-uplift’s S3 mirror (per docs)
    url = "https://hillstorm1.s3.us-east-2.amazonaws.com/hillstorm_no_indices.csv.gz"
    hill = pd.read_csv(url, compression="gzip")
    if "treatment" not in hill.columns and "segment" in hill.columns:
        hill["treatment"] = (hill["segment"].astype(str) != "No E-Mail").astype(int)
    if "spend" not in hill.columns:
        # fallback to visit/conversion if needed
        if "conversion" in hill.columns:
            hill["spend"] = pd.to_numeric(hill["conversion"], errors="coerce").fillna(0.0)
        elif "visit" in hill.columns:
            hill["spend"] = pd.to_numeric(hill["visit"], errors="coerce").fillna(0.0)
        else:
            raise
    source = "S3 direct"

out = RAW / "hillstrom.csv"
hill.to_csv(out, index=False)

print("Saved:", out, "| Source:", source)
print("Shape:", hill.shape)
print("Treatment counts:", hill["treatment"].value_counts().to_dict())
print("Mean spend by treatment:", hill.groupby("treatment")["spend"].mean().round(3).to_dict())
hill.head()


Saved: C:\Users\balla\Downloads\clv-uplift-optimizer-starter\clv-uplift-optimizer\data\raw\hillstrom.csv | Source: sklift.fetch_hillstrom
Shape: (64000, 10)
Treatment counts: {'Womens E-Mail': 21387, 'Mens E-Mail': 21307, 'No E-Mail': 21306}
Mean spend by treatment: {'Mens E-Mail': 1.423, 'No E-Mail': 0.653, 'Womens E-Mail': 1.077}


Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,spend,treatment
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,0.0,Womens E-Mail
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,0.0,No E-Mail
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,0.0,Womens E-Mail
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,0.0,Mens E-Mail
4,2,1) $0 - $100,45.34,1,0,Urban,0,Web,0.0,Womens E-Mail


In [24]:
import numpy as np, pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

PROJ = Path.cwd().parents[0] if Path.cwd().name == "notebooks" else Path.cwd()
RAW  = PROJ / "data" / "raw"
PROC = PROJ / "data" / "processed"
PROC.mkdir(parents=True, exist_ok=True)

hill = pd.read_csv(RAW / "hillstrom.csv")

# Binary treatment: 1 if emailed (Mens/Womens), 0 if No E-Mail
treat = (hill["treatment"].astype(str) != "No E-Mail").astype(int)

# Target = spend (numeric)
y = pd.to_numeric(hill["spend"], errors="coerce").fillna(0.0)

# Features: drop current-treatment label & any explicit “segment”
X = hill.drop(columns=["spend","treatment","segment"], errors="ignore")

# Train/test split (stratify by treatment keeps the randomization ratio)
X_tr, X_te, y_tr, y_te, t_tr, t_te = train_test_split(
    X, y, treat, test_size=0.25, random_state=42, stratify=treat
)

X_tr.head()


Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel
26859,1,1) $0 - $100,29.99,1,0,Urban,1,Web
44631,1,4) $350 - $500,424.1,0,1,Urban,1,Multichannel
29165,1,1) $0 - $100,29.99,0,1,Urban,0,Web
62255,1,5) $500 - $750,669.66,0,1,Surburban,1,Web
43677,5,2) $100 - $200,134.29,1,0,Surburban,0,Phone


In [25]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

# Separate numeric vs categorical columns
num_cols = X_tr.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X_tr.columns if c not in num_cols]

pre = ColumnTransformer([
    ("num", "passthrough", num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
])

# Fit encoder on train only, then transform
pre.fit(X_tr)
A_tr = pre.transform(X_tr)
A_te = pre.transform(X_te)

# Two models: one for treated, one for control
reg_t = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
reg_c = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)

reg_t.fit(A_tr[t_tr==1], y_tr[t_tr==1])
reg_c.fit(A_tr[t_tr==0], y_tr[t_tr==0])

# Predict potential outcomes on test
y1_hat = reg_t.predict(A_te)   # predicted spend if treated
y0_hat = reg_c.predict(A_te)   # predicted spend if control
uplift_hat = y1_hat - y0_hat   # predicted incremental spend
pd.DataFrame({"y1_hat": y1_hat, "y0_hat": y0_hat, "uplift_hat": uplift_hat}).head()


Unnamed: 0,y1_hat,y0_hat,uplift_hat
0,3.6636,0.0,3.6636
1,0.0,0.9792,-0.9792
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0


In [26]:
# Propensity of treatment in training (randomized design)
p = float(t_tr.mean())

# Inverse propensity weighting contribution for each test row
w = y_te.to_numpy() * ( (t_te.to_numpy()/p) - ((1 - t_te.to_numpy())/(1 - p)) )

# Rank by predicted uplift desc
order = np.argsort(-uplift_hat)
w_sorted = w[order]

# Cumulative incremental value
cum_gain = np.cumsum(w_sorted)
n = np.arange(1, len(cum_gain)+1)
gain_df = pd.DataFrame({"n": n, "cum_gain": cum_gain, "cum_gain_per_customer": cum_gain / n})

# Deciles summary
k = (np.linspace(0.1, 1.0, 10) * len(gain_df)).astype(int)
deciles = gain_df.iloc[k-1][["n","cum_gain","cum_gain_per_customer"]].reset_index(drop=True)

best_idx = int(cum_gain.argmax()) + 1
best_pct = round(100 * best_idx / len(gain_df), 2)

print(f"Best policy size ≈ top {best_pct}% of customers (n={best_idx})")
deciles


Best policy size ≈ top 99.09% of customers (n=15855)


Unnamed: 0,n,cum_gain,cum_gain_per_customer
0,1600,1671.49572,1.044685
1,3200,2234.158113,0.698174
2,4800,5416.77477,1.128495
3,6400,5936.907457,0.927642
4,8000,6924.475859,0.865559
5,9600,5353.296828,0.557635
6,11200,7187.11075,0.641706
7,12800,8058.977127,0.629608
8,14400,8436.67168,0.58588
9,16000,10191.235193,0.636952


In [27]:
# Save per-customer scores on the test set
scores = pd.DataFrame({
    "row_id": X_te.reset_index(drop=True).index,
    "uplift_hat": uplift_hat,
    "y_actual": y_te.to_numpy(),
    "treated": t_te.to_numpy(),
    "ipw_contrib": w,
})
scores_sorted = scores.iloc[order].reset_index(drop=True)

scores_sorted.to_csv(PROC / "hillstrom_uplift_scores_test.csv", index=False)
gain_df.to_csv(PROC / "hillstrom_uplift_curve.csv", index=False)
deciles.to_csv(PROC / "hillstrom_uplift_deciles.csv", index=False)

print("Saved:", PROC / "hillstrom_uplift_scores_test.csv")
print("Saved:", PROC / "hillstrom_uplift_curve.csv")
print("Saved:", PROC / "hillstrom_uplift_deciles.csv")
scores_sorted.head(10)


Saved: C:\Users\balla\Downloads\clv-uplift-optimizer-starter\clv-uplift-optimizer\data\processed\hillstrom_uplift_scores_test.csv
Saved: C:\Users\balla\Downloads\clv-uplift-optimizer-starter\clv-uplift-optimizer\data\processed\hillstrom_uplift_curve.csv
Saved: C:\Users\balla\Downloads\clv-uplift-optimizer-starter\clv-uplift-optimizer\data\processed\hillstrom_uplift_deciles.csv


Unnamed: 0,row_id,uplift_hat,y_actual,treated,ipw_contrib
0,11786,267.8696,0.0,0,-0.0
1,13574,235.279467,0.0,1,0.0
2,8911,164.67,0.0,1,0.0
3,9527,146.373333,0.0,1,0.0
4,10755,136.221167,0.0,1,0.0
5,10271,122.0053,0.0,1,0.0
6,3177,120.406233,0.0,1,0.0
7,7190,116.433333,0.0,0,-0.0
8,9130,114.025333,0.0,1,0.0
9,11706,105.179967,0.0,1,0.0


In [28]:
from pathlib import Path
import json, joblib
import numpy as np
import pandas as pd

PROJ = Path.cwd().parents[0] if Path.cwd().name == "notebooks" else Path.cwd()
MODELD = PROJ / "models"
MODELD.mkdir(parents=True, exist_ok=True)

# Recreate the feature lists exactly as you trained
cat_cols = X_tr.select_dtypes(exclude=["number"]).columns.tolist()
num_cols = X_tr.select_dtypes(include=["number"]).columns.tolist()
feature_cols = list(X_tr.columns)

joblib.dump(pre,   MODELD / "preprocess.joblib")
joblib.dump(reg_t, MODELD / "rf_treated.joblib")
joblib.dump(reg_c, MODELD / "rf_control.joblib")

meta = {
    "feature_cols": feature_cols,
    "cat_cols": cat_cols,
    "num_cols": num_cols,
    "target": "spend",
    "treatment_col": "treatment",
    "id_fallback": "row_id"  # will be created at score-time if missing
}
(MODELD / "meta.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")
print("Saved to:", MODELD)


Saved to: C:\Users\balla\Downloads\clv-uplift-optimizer-starter\clv-uplift-optimizer\models


In [30]:
from pathlib import Path
import json, joblib, pandas as pd

PROJ   = Path.cwd().parents[0] if Path.cwd().name == "notebooks" else Path.cwd()
RAW    = PROJ / "data" / "raw"
PROC   = PROJ / "data" / "processed"
MODELS = PROJ / "models"

# Use the test file you wrote earlier
df = pd.read_csv(RAW / "hillstrom_test_only.csv")  # 16,000 rows

# Load the artifacts you saved in step 02
enc   = joblib.load(MODELS / "preprocess.joblib")
reg_t = joblib.load(MODELS / "rf_treated.joblib")
reg_c = joblib.load(MODELS / "rf_control.joblib")
meta  = json.load(open(MODELS / "meta.json"))

# Use exact feature order/dtypes
feat_cols = meta["feature_cols"]
X = df[feat_cols].copy()
for c in meta["cat_cols"]:
    if c in X.columns:
        X[c] = X[c].astype(str)

A  = enc.transform(X)
y1 = reg_t.predict(A)
y0 = reg_c.predict(A)

uplift_hat = y1 - y0
nb_scored = pd.DataFrame({
    "row_id": df.get("row_id", pd.RangeIndex(len(df))),
    "uplift_hat": uplift_hat
})
nb_scored.head()


Unnamed: 0,row_id,uplift_hat
0,0,0.0
1,1,-0.406867
2,2,7.312567
3,3,7.4938
4,4,0.0


In [31]:
from pathlib import Path
code = r"""
import argparse, json, joblib, numpy as np, pandas as pd
from pathlib import Path

def load_models(model_dir: Path):
    pre = joblib.load(model_dir / "preprocess.joblib")
    reg_t = joblib.load(model_dir / "rf_treated.joblib")
    reg_c = joblib.load(model_dir / "rf_control.joblib")
    meta = json.loads((model_dir / "meta.json").read_text(encoding="utf-8"))
    return pre, reg_t, reg_c, meta

def prepare_features(df: pd.DataFrame, meta: dict):
    # ensure feature columns exist
    for c in meta["feature_cols"]:
        if c not in df.columns:
            df[c] = np.nan
    X = df[meta["feature_cols"]].copy()
    # keep types friendly
    for c in X.columns:
        if X[c].dtype == "object":
            X[c] = X[c].astype("string")
    return X

def score(input_csv, output_csv, model_dir, margin, cost, policy, topn=None, id_col=None):
    model_dir = Path(model_dir)
    pre, reg_t, reg_c, meta = load_models(model_dir)

    df = pd.read_csv(input_csv)
    # Choose an ID column or create one
    if id_col and id_col in df.columns:
        ids = df[id_col].copy()
    elif meta.get("id_fallback") in df.columns:
        ids = df[meta["id_fallback"]].copy()
    else:
        ids = pd.Series(np.arange(len(df)), name=meta.get("id_fallback","row_id"))
        df[ids.name] = ids

    X = prepare_features(df, meta)
    A = pre.transform(X)

    y1_hat = reg_t.predict(A)
    y0_hat = reg_c.predict(A)
    uplift = y1_hat - y0_hat

    out = pd.DataFrame({
        "row_id": ids.values,
        "y1_hat": y1_hat,
        "y0_hat": y0_hat,
        "uplift_hat": uplift
    }).sort_values("uplift_hat", ascending=False).reset_index(drop=True)

    out["exp_incremental_revenue"] = out["uplift_hat"]
    out["exp_profit_per_contact"]  = margin*out["uplift_hat"] - cost

    # decide contacts
    if policy == "pos":
        chosen = out.loc[out["exp_profit_per_contact"] > 0].copy()
    elif policy == "bestN":
        # maximize expected profit curve
        cg = out["uplift_hat"].cumsum()
        n  = np.arange(1, len(out)+1)
        profit_curve = margin*cg - cost*n
        best_n = int(np.argmax(profit_curve) + 1)
        if topn is not None:
            best_n = int(topn)
        chosen = out.head(best_n).copy()
    else:
        raise ValueError("policy must be 'pos' or 'bestN'")

    chosen["rank"] = np.arange(1, len(chosen)+1)
    cols = ["row_id","rank","uplift_hat","y1_hat","y0_hat","exp_incremental_revenue","exp_profit_per_contact"]
    chosen[cols].to_csv(output_csv, index=False)

    total_rev  = float(chosen["exp_incremental_revenue"].sum())
    total_cost = float(cost*len(chosen))
    total_prof = float(margin*total_rev - total_cost)

    print(f"Contacts: {len(chosen)}  |  Expected revenue: {total_rev:,.2f}  |  Cost: {total_cost:,.2f}  |  Profit: {total_prof:,.2f}")
    print("Saved:", output_csv)

if __name__ == "__main__":
    p = argparse.ArgumentParser(description="Score uplift and produce a contact list.")
    p.add_argument("--in", dest="input_csv",  required=True)
    p.add_argument("--out", dest="output_csv", required=True)
    p.add_argument("--models", default="models")
    p.add_argument("--margin", type=float, default=0.30)
    p.add_argument("--cost",   type=float, default=0.05)
    p.add_argument("--policy", choices=["pos","bestN"], default="pos")
    p.add_argument("--topn", type=int, default=None, help="optional, only for policy=bestN")
    p.add_argument("--id_col", default=None)
    args = p.parse_args()
    score(args.input_csv, args.output_csv, args.models, args.margin, args.cost, args.policy, args.topn, args.id_col)
"""
proj = Path.cwd().parents[0] if Path.cwd().name == "notebooks" else Path.cwd()
tools = proj / "tools"
tools.mkdir(parents=True, exist_ok=True)
(target := tools / "score_contacts.py").write_text(code, encoding="utf-8")
print("Wrote:", target)


Wrote: C:\Users\balla\Downloads\clv-uplift-optimizer-starter\clv-uplift-optimizer\tools\score_contacts.py


In [32]:
from pathlib import Path
PROJ = Path.cwd().parents[0] if Path.cwd().name == "notebooks" else Path.cwd()
PROC = PROJ / "data" / "processed"

nb_out = PROC / "nb_scored_test.csv"
nb_scored.to_csv(nb_out, index=False)
print("Saved:", nb_out, "| rows:", len(nb_scored))


Saved: C:\Users\balla\Downloads\clv-uplift-optimizer-starter\clv-uplift-optimizer\data\processed\nb_scored_test.csv | rows: 16000
