In [None]:
import polars as pl
import yaml
from xgboost import XGBClassifier
from sklearn.preprocessing import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Read the raw dataset; the path is relative to the notebook's working directory.
raw_data = pl.read_csv("./adult.csv")

# Prefer the libyaml-backed loader for speed, but fall back to its pure-Python
# equivalent: `yaml.CLoader` only exists when PyYAML was built against libyaml,
# and referencing it unconditionally raises AttributeError on other installs.
# NOTE(review): Loader/CLoader are PyYAML's full (unsafe) loaders and can construct
# arbitrary Python objects. That is acceptable only while config.yaml is a trusted,
# locally controlled file; consider yaml.safe_load if that ever changes.
yaml_loader = getattr(yaml, "CLoader", yaml.Loader)

with open("config.yaml", "r") as f:
    config = yaml.load(f, Loader=yaml_loader)

In [None]:
# Derive the treatment flag and the binary outcome, keep only the columns named in
# the config, then shuffle the rows so the positional split below is a random split.
data = (
    raw_data.with_columns(
        **{
            # Treatment flag: 1 when educational-num is greater than 13, otherwise 0.
            "grad-degree": (pl.col("educational-num") > 13).cast(pl.Int32),
            # Outcome: True when the income label equals ">50K".
            "income": pl.col("income") == ">50K",
        }
    )
    # config["target"] is wrapped in a list here, while the other entries are
    # concatenated directly, i.e. they are already lists of column names.
    .select(config["numeric_predictors"] + config["categorical_predictors"] + [config["target"]] + config["treatment"])
    # Randomly re-order. shuffle=True is required for this: with shuffle=False polars
    # does not randomise the order of the returned rows, so the head/tail split below
    # would not be a random split. The fixed seed keeps the shuffle reproducible.
    .sample(fraction=1.0, shuffle=True, seed=7700)
)

# First 75% of the shuffled rows go to training, the remainder to validation.
n_train = int(data.height * 0.75)
train_data = data[:n_train]
validation_data = data[n_train:]

## S-Learner

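The S-learner fits a single model in which the treatment indicator is included as one of the predictors. The treatment effect is then estimated by scoring each row twice, once with the treatment set to 1 and once with it set to 0, and taking the difference between the two predictions.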

In [None]:
# The S-learner sees every predictor plus the treatment column(s) as features.
s_learner_features = (
    config["numeric_predictors"] + config["categorical_predictors"] + config["treatment"]
)

# Split each partition into a feature frame and a target series.
x_train = train_data.select(s_learner_features)
y_train = train_data.get_column(config["target"])

x_validation = validation_data.select(s_learner_features)
y_validation = validation_data.get_column(config["target"])

# Target-encode the categorical predictors; numeric predictors and the treatment
# pass through unchanged. The transformer is configured to emit polars frames.
encoder = ColumnTransformer(
    transformers=[
        ("target_encoding", TargetEncoder(target_type="binary"), config["categorical_predictors"]),
        ("passthrough", "passthrough", config["numeric_predictors"] + config["treatment"]),
    ]
).set_output(transform="polars")

# Fit the encoder on the training rows only, then apply it to the validation rows.
x_train_tr = encoder.fit_transform(x_train, y_train)
x_validation_tr = encoder.transform(x_validation)

# Fit the classifier, reporting metrics on both partitions every 10 rounds.
s_learner = XGBClassifier(**config["s_learner_params"])
eval_set = [
    (x_train_tr, y_train),
    (x_validation_tr, y_validation),
]
s_learner.fit(x_train_tr, y_train, eval_set=eval_set, verbose=10)

# Bundle the already-fitted encoder and model so raw feature frames can be scored directly.
s_learner_pipeline = Pipeline(steps=[("encoder", encoder), ("model", s_learner)])

# Counterfactual frames: every validation row with the treatment forced off / on.
x_validation_ng = x_validation.with_columns(pl.lit(0).alias("grad-degree"))
x_validation_g = x_validation.with_columns(pl.lit(1).alias("grad-degree"))

# Probability of the positive class under each counterfactual.
no_grad_p = s_learner_pipeline.predict_proba(x_validation_ng)[:, 1]
grad_p = s_learner_pipeline.predict_proba(x_validation_g)[:, 1]

# Attach both scores, their difference (the per-row effect estimate) and the
# observed outcome to the validation features for the summaries that follow.
x_validation = x_validation.with_columns(
    grad_p=grad_p,
    no_grad_p=no_grad_p,
    grad_cate=grad_p - no_grad_p,
    income=y_validation,
)

In [None]:
# Per-gender summary: observed share of positive income labels next to the mean
# of the per-row S-learner effect estimates.
x_validation.group_by("gender").agg(
    pl.col("income").mean().alias("base_rate"),
    pl.col("grad_cate").mean().alias("ate"),
)

In [None]:
# Per-race summary: observed share of positive income labels next to the mean
# of the per-row S-learner effect estimates.
x_validation.group_by("race").agg(
    pl.col("income").mean().alias("base_rate"),
    pl.col("grad_cate").mean().alias("ate"),
)

## T-Learner

The T-learner fits two separate models, one on the treated group and one on the control group, and estimates the treatment effect as the difference between the two models' predicted scores for the same row.

In [None]:
# First we train the no-treatment (control) model, using only rows where the
# treatment column equals 0. The treatment column is left out of the features.
ng_train_data = train_data.filter(pl.col(config["treatment"]) == 0)
x_ng_train = ng_train_data[config["numeric_predictors"] + config["categorical_predictors"]]
y_ng_train = ng_train_data[config["target"]]

ng_validation_data = validation_data.filter(pl.col(config["treatment"]) == 0)
x_ng_validation = ng_validation_data[config["numeric_predictors"] + config["categorical_predictors"]]
y_ng_validation = ng_validation_data[config["target"]].to_numpy()

# A dedicated encoder for this arm: target encoding is fitted on control rows only,
# and there is no treatment column to pass through.
ng_encoder = ColumnTransformer(
    transformers=[
        ("target_encoding", TargetEncoder(target_type="binary"), config["categorical_predictors"]),
        ("passthrough", "passthrough", config["numeric_predictors"])
    ]
)
ng_encoder.set_output(transform="polars")

x_ng_train_tr = ng_encoder.fit_transform(x_ng_train, y_ng_train)
x_ng_validation_tr = ng_encoder.transform(x_ng_validation)

# Hyperparameters are read from the same config key as the S-learner.
ng_learner = XGBClassifier(**config["s_learner_params"])
eval_set = [(x_ng_train_tr, y_ng_train), (x_ng_validation_tr, y_ng_validation)]
# Fit the control-arm model itself. Fitting `s_learner` here would overwrite the
# already-fitted S-learner and leave `ng_learner` untrained.
ng_learner.fit(x_ng_train_tr, y_ng_train, eval_set=eval_set, verbose=10)

# Pair the control-arm model with the encoder it was trained on. The S-learner's
# `encoder` also emits the treatment column, which this model never saw.
t_learner_ng_pipeline = Pipeline(
    [
        ("encoder", ng_encoder),
        ("model", ng_learner)
    ]
)