In [1]:
from pprint import pprint

import pandas as pd
import matplotlib.pyplot as plt
from sklift.metrics import uplift_at_k
from sklift.viz import plot_uplift_curve
from sklift.viz import plot_qini_curve
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

from causalml.inference.meta import BaseXRegressor, BaseTClassifier, BaseSClassifier, BaseRClassifier
from causalml.dataset import *
from causalml.metrics import *

from sklift.models import TwoModels
from sklift.models import SoloModel

from econml.metalearners import TLearner
from econml.metalearners import SLearner
from econml.metalearners import XLearner

from lightgbm import LGBMClassifier

# Meta-Learners by Libraries

In [3]:
def CausalML(X_train, treat_train, y_train, X_test, treat_test, y_test):
    learner_t = BaseTClassifier(learner=LGBMClassifier())
    learner_t.fit(X=X_train, treatment=treat_train, y=y_train)
    cate_t = np.squeeze(learner_t.predict(X_test))
    score_t = uplift_at_k(y_true=y_test, uplift=cate_t, treatment=treat_test, strategy='by_group', k=0.3)
    
    learner_s = BaseSClassifier(learner=LGBMClassifier())
    learner_s.fit(X=X_train, treatment=treat_train, y=y_train)
    cate_s = np.squeeze(learner_s.predict(X_test))
    score_s = uplift_at_k(y_true=y_test, uplift=cate_s, treatment=treat_test, strategy='by_group', k=0.3)
    
    learner_x = BaseXRegressor(LGBMClassifier(),LGBMClassifier())
    learner_x.fit(X=X_train, treatment=treat_train, y=y_train)
    cate_x = np.squeeze(learner_x.predict(X_test))
    score_x = uplift_at_k(y_true=y_test, uplift=cate_x, treatment=treat_test, strategy='by_group', k=0.3)
    
    return score_t, score_s, score_x

In [4]:
def SkLift(X_train, treat_train, y_train, X_test, treat_test, y_test):
    tm = TwoModels(estimator_trmnt=LGBMClassifier(), estimator_ctrl=LGBMClassifier(), method='vanilla')
    tm = tm.fit(X_train, y_train, treat_train)
    uplift_tm = tm.predict(X_test)
    score_t = uplift_at_k(y_true=y_test, uplift=uplift_tm, treatment=treat_test, strategy='by_group', k=0.3)

    sm = SoloModel(LGBMClassifier())
    sm = sm.fit(X_train, y_train, treat_train)
    uplift_sm = sm.predict(X_test)
    score_s = uplift_at_k(y_true=y_test, uplift=uplift_sm, treatment=treat_test, strategy='by_group', k=0.3)
    
    score_x = 0
    
    return score_t, score_s, score_x

In [5]:
def EconML(X_train, treat_train, y_train, X_test, treat_test, y_test):
    est = TLearner(LGBMClassifier())
    est.fit(y_train, treat_train, X_train)
    uplift = np.squeeze(est.const_marginal_effect(X_test))
    score_t = uplift_at_k(y_true=y_test, uplift=uplift, treatment=treat_test, strategy='by_group', k=0.3)

    est = SLearner(LGBMClassifier())
    est.fit(y_train, treat_train, X_train)
    uplift = np.squeeze(est.const_marginal_effect(X_test))
    score_s = uplift_at_k(y_true=y_test, uplift=uplift, treatment=treat_test, strategy='by_group', k=0.3)
    
    est = XLearner(LGBMClassifier())
    est.fit(y_train, treat_train, X_train)
    uplift = np.squeeze(est.const_marginal_effect(X_test))
    score_x = uplift_at_k(y_true=y_test, uplift=uplift, treatment=treat_test, strategy='by_group', k=0.3)
    
    return score_t, score_s, score_x

In [6]:
Libraries = [CausalML, SkLift, EconML]

# Preprocessing Datasets

In [7]:
Datasets = []

## X5 Retail Hero Dataset

In [8]:
df_clients = pd.read_csv('clients.csv', index_col='client_id')
df_train = pd.read_csv('uplift_train.csv', index_col='client_id')
df_test = pd.read_csv('uplift_test.csv', index_col='client_id')

df_features = df_clients.copy()
df_features['first_issue_time'] = \
    (pd.to_datetime(df_features['first_issue_date'])
     - pd.to_datetime(df_features['first_issue_date']).min()) / pd.Timedelta('365d')

df_features['first_redeem_time'] = \
    (pd.to_datetime(df_features['first_redeem_date'])
     - pd.to_datetime(df_features['first_redeem_date']).min()) / pd.Timedelta('365d')

df_features['issue_redeem_delay'] = df_features['first_redeem_time'] \
    - df_features['first_issue_time']

df_features = df_features.join(pd.get_dummies(df_features['gender']))
df_features['first_redeem_time'] = df_features['first_redeem_time'].fillna(df_features['first_redeem_time'].mean())
df_features['issue_redeem_delay'] = df_features['issue_redeem_delay'].fillna(df_features['issue_redeem_delay'].mean())

df_features = df_features.drop(['first_issue_date', 'first_redeem_date', 'gender'], axis=1)

indices_train = df_train.index
indices_test = df_test.index
indices_learn, indices_testid = train_test_split(df_train.index, test_size=0.3, random_state=123)

X_train = df_features.loc[indices_learn, :]
y_train = df_train.loc[indices_learn, 'target']
treat_train = df_train.loc[indices_learn, 'treatment_flg']

X_test = df_features.loc[indices_testid, :]
y_test = df_train.loc[indices_testid, 'target']
treat_test =  df_train.loc[indices_testid, 'treatment_flg']

X_train_full = df_features.loc[indices_train, :]
y_train_full = df_train.loc[:, 'target']
treat_train_full = df_train.loc[:, 'treatment_flg']

cat_features = ['gender']

Datasets.append((X_train, treat_train, y_train, X_test, treat_test, y_test))

## Hillstrom Dataset

In [9]:
df = pd.read_csv('Hillstrom.csv')
df.drop(['history_segment', "conversion", "spend"], axis=1, inplace=True)

cat_cols = ['zip_code', 'channel']
df_ohe = pd.get_dummies(df, columns=cat_cols)
df_ohe.segment = df_ohe.segment.map({'Womens E-Mail': 1, 'Mens E-Mail': 1, 'No E-Mail': 0})

X = df_ohe.drop('visit', axis=1)
y = df_ohe['visit'].astype('int')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

treat_train = X_train['segment']
treat_test = X_test['segment']

X_train.drop(['segment'], axis=1, inplace=True)
X_test.drop(['segment'], axis=1, inplace=True)

Datasets.append((X_train, treat_train, y_train, X_test, treat_test, y_test))

## Kuusito Dataset

In [10]:
df = pd.read_csv('Kuusito.csv')
df.drop(['customer_type'], axis=1, inplace=True)

df = df.replace(r'Value', '', regex=True)
df['target_control'] = df['target_control'].map({'control': 1, 'target': 0})
df['outcome'] = df['outcome'].map({'negative': 0, 'positive': 1})

df = pd.get_dummies(df,drop_first=True)

X = df.drop('outcome', axis=1).astype('int64')
y = df['outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

treat_train = X_train['target_control']
treat_test = X_test['target_control']

X_train.drop(['target_control'], axis=1, inplace=True)
X_test.drop(['target_control'], axis=1, inplace=True)
X_train.drop(['customer_id'], axis=1, inplace=True)
X_test.drop(['customer_id'], axis=1, inplace=True)

Datasets.append((X_train, treat_train, y_train, X_test, treat_test, y_test))

## Synthetic Dataset

In [11]:
y, X, treatment, tau, b, e = synthetic_data(mode=2, n=10000, p=8, sigma=1.0)
y = (y > np.median(y)).astype(int)
X_train, X_test, y_train, y_test, treat_train, treat_test= train_test_split(X, y, treatment, test_size=0.33, random_state=0)

Datasets.append((X_train, treat_train, y_train, X_test, treat_test, y_test))

# Filling in the Table

In [12]:
idx = pd.MultiIndex.from_product([['RetailHero', 'Hillstrom', 'Kuusito', 'Synthetic'],
                                  ['T', 'S', 'X']])

In [13]:
scores = np.zeros((12, 3))

for di, (X_train, treat_train, y_train, X_test, treat_test, y_test) in enumerate(Datasets):
    for fi, MetaLearners in enumerate(Libraries):
        scores[3*di:3*di+3, fi] = MetaLearners(X_train, treat_train, y_train, X_test, treat_test, y_test)

In [14]:
df = pd.DataFrame(scores,
                  columns=['CausalML', 'SkLift', 'EconML'],
                  index=idx)

df.index.names = ['Dataset', 'Learner']
df.columns.name = 'Library'
df = df.round(3)
df.replace(0, '-', inplace=True)
display(df)

with open("LearnersByLibraries.txt", "w") as text_file:
    text_file.write(df.to_latex())

Unnamed: 0_level_0,Library,CausalML,SkLift,EconML
Dataset,Learner,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RetailHero,T,0.053,0.053,0.038
RetailHero,S,0.04,0.04,0.038
RetailHero,X,0.038,-,0.038
Hillstrom,T,0.061,0.061,0.074
Hillstrom,S,0.067,0.067,0.074
Hillstrom,X,0.073,-,0.073
Kuusito,T,0.279,0.279,0.219
Kuusito,S,0.31,0.31,0.183
Kuusito,X,0.239,-,0.239
Synthetic,T,0.42,0.42,0.377
