In [1]:
from pprint import pprint

import pandas as pd
import matplotlib.pyplot as plt
from sklift.metrics import uplift_at_k
from sklift.viz import plot_uplift_curve
from sklift.viz import plot_qini_curve
import numpy as np
from sklearn.model_selection import train_test_split
import sklearn

import warnings
warnings.filterwarnings('ignore')


In [2]:
import statsmodels.api as sm
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression

from causalml.inference.tree import UpliftRandomForestClassifier
from causalml.dataset import *

plt.style.use('fivethirtyeight')

%matplotlib inline

# Preprocessing Datasets

In [3]:
Datasets = []

## X5 Retail Hero Dataset

In [4]:
df_clients = pd.read_csv('data/clients.csv', index_col='client_id')
df_train = pd.read_csv('data/uplift_train.csv', index_col='client_id')
df_test = pd.read_csv('data/uplift_test.csv', index_col='client_id')

df_features = df_clients.copy()
df_features['first_issue_time'] = \
    (pd.to_datetime(df_features['first_issue_date'])
     - pd.to_datetime(df_features['first_issue_date']).min()) / pd.Timedelta('365d')

df_features['first_redeem_time'] = \
    (pd.to_datetime(df_features['first_redeem_date'])
     - pd.to_datetime(df_features['first_redeem_date']).min()) / pd.Timedelta('365d')

df_features['issue_redeem_delay'] = df_features['first_redeem_time'] \
    - df_features['first_issue_time']

df_features = df_features.join(pd.get_dummies(df_features['gender']))
df_features['first_redeem_time'] = df_features['first_redeem_time'].fillna(df_features['first_redeem_time'].mean())
df_features['issue_redeem_delay'] = df_features['issue_redeem_delay'].fillna(df_features['issue_redeem_delay'].mean())

df_features = df_features.drop(['first_issue_date', 'first_redeem_date', 'gender'], axis=1)

indices_train = df_train.index
indices_test = df_test.index
indices_learn, indices_testid = train_test_split(df_train.index, test_size=0.3, random_state=123)

X_train = df_features.loc[indices_learn, :]
y_train = df_train.loc[indices_learn, 'target']
treat_train = df_train.loc[indices_learn, 'treatment_flg']

X_test = df_features.loc[indices_testid, :]
y_test = df_train.loc[indices_testid, 'target']
treat_test =  df_train.loc[indices_testid, 'treatment_flg']

X_train_full = df_features.loc[indices_train, :]
y_train_full = df_train.loc[:, 'target']
treat_train_full = df_train.loc[:, 'treatment_flg']

cat_features = ['gender']

Datasets.append((X_train, treat_train, y_train, X_test, treat_test, y_test))

## Hillstrom Dataset

In [5]:
df = pd.read_csv('data/Hillstrom.csv')
df.drop(['history_segment', "conversion", "spend"], axis=1, inplace=True)

cat_cols = ['zip_code', 'channel']
df_ohe = pd.get_dummies(df, columns=cat_cols)
df_ohe.segment = df_ohe.segment.map({'Womens E-Mail': 1, 'Mens E-Mail': 1, 'No E-Mail': 0})

X = df_ohe.drop('visit', axis=1)
y = df_ohe['visit'].astype('int')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

treat_train = X_train['segment']
treat_test = X_test['segment']

X_train.drop(['segment'], axis=1, inplace=True)
X_test.drop(['segment'], axis=1, inplace=True)

Datasets.append((X_train, treat_train, y_train, X_test, treat_test, y_test))

## Kuusito Dataset

In [6]:
df = pd.read_csv('data/Kuusito.csv')
df.drop(['customer_type'], axis=1, inplace=True)

df = df.replace(r'Value', '', regex=True)
df['target_control'] = df['target_control'].map({'control': 1, 'target': 0})
df['outcome'] = df['outcome'].map({'negative': 0, 'positive': 1})

df = pd.get_dummies(df,drop_first=True)

X = df.drop('outcome', axis=1).astype('int64')
y = df['outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

treat_train = X_train['target_control']
treat_test = X_test['target_control']

X_train.drop(['target_control'], axis=1, inplace=True)
X_test.drop(['target_control'], axis=1, inplace=True)
X_train.drop(['customer_id'], axis=1, inplace=True)
X_test.drop(['customer_id'], axis=1, inplace=True)

Datasets.append((X_train, treat_train, y_train, X_test, treat_test, y_test))

## Synthetic Dataset

In [7]:
y, X, treatment, tau, b, e = synthetic_data(mode=2, n=10000, p=8, sigma=1.0)
y = (y > np.median(y)).astype(int)
X_train, X_test, y_train, y_test, treat_train, treat_test= train_test_split(X, y, treatment, test_size=0.33, random_state=0)

X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
y_train = pd.Series(y_train)
y_test = pd.Series(y_test)
treat_train = pd.Series(treat_train)
treat_test = pd.Series(treat_test)

Datasets.append((X_train, treat_train, y_train, X_test, treat_test, y_test))

# Preparing Evaluation Functions

In [8]:
Functions = ['KL', 'ED', 'Chi']

# UpliftRandomForestClassifier

In [9]:
def rf_score(data, eval_func):
    (X_train, treat_train, y_train, X_test, treat_test, y_test) = data
    
    rf_clf = UpliftRandomForestClassifier(n_estimators=100, control_name='0', evaluationFunction=eval_func)

    rf_clf.fit(X_train.values,
               treatment=treat_train.map(str).values,
               y=y_train.values)

    y_pred = rf_clf.predict(X_test.values).reshape(-1)
    score = uplift_at_k(y_true=y_test, uplift=y_pred, treatment=treat_test, strategy='by_group', k=0.3)
    return round(score, 3)

In [10]:
scores = np.zeros((4, 3))

for di, data in enumerate(Datasets):
    for fi, eval_func in enumerate(Functions):
        scores[di, fi] = rf_score(data, eval_func)

In [11]:
df = pd.DataFrame(scores,
                  columns=Functions,
                  index=['RetailHero', 'Hillstrom', 'Kuusito', 'Synthetic'])

df.index.name = 'Dataset'
display(df)

with open("ForestByFunctions.txt", "w") as text_file:
    text_file.write(df.to_latex())

Unnamed: 0_level_0,KL,ED,Chi
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
RetailHero,0.059,0.059,0.053
Hillstrom,0.067,0.073,0.064
Kuusito,0.145,0.147,0.151
Synthetic,0.388,0.378,0.373
