# Uplift random forest on the Hillstrom, Kuusisto and X5 Retail Hero datasets

In [58]:
from pprint import pprint

import pandas as pd
import matplotlib.pyplot as plt
from sklift.metrics import uplift_at_k
from sklift.viz import plot_uplift_curve
from sklift.viz import plot_qini_curve
import numpy as np
from sklearn.model_selection import train_test_split
import sklearn

import warnings
# Globally silence every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation and copy warnings emitted by the
# libraries used in later cells; consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')


In [59]:
import statsmodels.api as sm
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression

from causalml.inference.tree import UpliftRandomForestClassifier
from causalml.dataset import *
from econml.orf import DMLOrthoForest, DROrthoForest
from econml.dml import CausalForestDML

# Apply matplotlib's 'fivethirtyeight' style sheet to figures created from here on.
plt.style.use('fivethirtyeight')

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## Hillstrom

In [60]:
# Load the Hillstrom data and set up a binary-treatment uplift problem with
# `visit` as the outcome column.
df = pd.read_csv('Hillstrom.csv')
# Columns that are used neither as features nor as the outcome in this experiment.
df = df.drop(columns=['history_segment', 'conversion', 'spend'])

cat_cols = ['zip_code', 'channel']
# Pin the dummy dtype explicitly: pandas >= 2.0 produces bool dummies by default,
# and a frame mixing bool with numeric columns yields an object-dtype array from
# `.values`, which is what the model cell feeds to the estimator. uint8 matches
# the pre-2.0 default, so results on older pandas are unchanged.
df_ohe = pd.get_dummies(df, columns=cat_cols, dtype=np.uint8)
# Collapse both e-mail campaigns into one treatment arm (1) versus no e-mail (0).
df_ohe['segment'] = df_ohe['segment'].map({'Womens E-Mail': 1, 'Mens E-Mail': 1, 'No E-Mail': 0})

X = df_ohe.drop(columns='visit')
y = df_ohe['visit']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# Keep the treatment flag as its own series, then remove it from the feature
# matrices. Non-inplace drops avoid mutating the frames returned by the split.
treat_train = X_train['segment']
treat_test = X_test['segment']

X_train = X_train.drop(columns='segment')
X_test = X_test.drop(columns='segment')

In [64]:
# Fit an uplift random forest on the training split. The 0/1 treatment flag is
# converted to strings so that it matches control_name='0'.
rf_clf = UpliftRandomForestClassifier(control_name='0', n_estimators=100)
rf_clf.fit(X_train.values, treatment=treat_train.map(str).values, y=y_train.values)

# Flatten the predictions into a 1-D uplift vector for the metric.
y_pred = rf_clf.predict(X_test.values).reshape(-1)

# Score the held-out split with uplift@k at k=0.3 using the 'by_group' strategy.
score = uplift_at_k(
    y_true=y_test,
    uplift=y_pred,
    treatment=treat_test,
    k=0.3,
    strategy='by_group',
)
print(f"UpliftRandomForest | Uplift at 30: {score}")

UpliftRandomForest | Uplift at 30: 0.07239353705918017


## Kuusisto

In [65]:
# Load the data and discard the `customer_type` column.
df = pd.read_csv('Kuusito.csv').drop(columns=['customer_type'])

# Remove the literal 'Value' substring from every string cell so the remaining
# text can be cast to integers further down.
df = df.replace(r'Value', '', regex=True)

# Encode the treatment flag and the outcome as 0/1.
df = df.assign(
    target_control=lambda frame: frame['target_control'].map({'control': 0, 'target': 1}),
    outcome=lambda frame: frame['outcome'].map({'negative': 0, 'positive': 1}),
)

y = df['outcome']
X = df.drop(columns='outcome').astype('int64')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# Split the treatment flag off from the feature matrices.
treat_train, treat_test = X_train['target_control'], X_test['target_control']
X_train = X_train.drop(columns=['target_control'])
X_test = X_test.drop(columns=['target_control'])

In [67]:
# Fit an uplift random forest on the training split. The 0/1 treatment flag is
# converted to strings so that it matches control_name='0'.
rf_clf = UpliftRandomForestClassifier(control_name='0', n_estimators=100)
rf_clf.fit(X_train.values, treatment=treat_train.map(str).values, y=y_train.values)

# Flatten the predictions into a 1-D uplift vector for the metric.
y_pred = rf_clf.predict(X_test.values).reshape(-1)

# Score the held-out split with uplift@k at k=0.3 using the 'by_group' strategy.
score = uplift_at_k(
    y_true=y_test,
    uplift=y_pred,
    treatment=treat_test,
    k=0.3,
    strategy='by_group',
)
print(f"UpliftRandomForest | Uplift at 30: {score}")

UpliftRandomForest | Uplift at 30: 0.19758691206543966


## X5 Retail Hero

In [68]:
# Load the three X5 tables, all indexed by client_id so they can be aligned via .loc.
# uplift_train.csv supplies `target` and `treatment_flg`; only the index of
# uplift_test.csv is used in this cell.
df_clients = pd.read_csv('clients.csv', index_col='client_id')
df_train = pd.read_csv('uplift_train.csv', index_col='client_id')
df_test = pd.read_csv('uplift_test.csv', index_col='client_id')

df_features = df_clients.copy()

# Parse each date column once, then express it as years (365-day units) elapsed
# since the earliest date found in that column.
issue_dates = pd.to_datetime(df_features['first_issue_date'])
redeem_dates = pd.to_datetime(df_features['first_redeem_date'])
one_year = pd.Timedelta('365d')

df_features['first_issue_time'] = (issue_dates - issue_dates.min()) / one_year
df_features['first_redeem_time'] = (redeem_dates - redeem_dates.min()) / one_year

# Computed before imputation, so the delay is NaN wherever the redeem time is missing.
df_features['issue_redeem_delay'] = df_features['first_redeem_time'] \
    - df_features['first_issue_time']

# One-hot encode gender with an explicit numeric dtype: pandas >= 2.0 produces bool
# dummies by default, and mixing bool with float columns makes `.values` return an
# object-dtype array, which is what the model cell passes to the estimator.
# uint8 matches the pre-2.0 default.
df_features = df_features.join(pd.get_dummies(df_features['gender'], dtype=np.uint8))

# Mean-impute the two columns that are NaN when a redeem date is missing.
# NOTE(review): the means are taken over every client in clients.csv, including
# validation and test clients; confirm this is acceptable.
df_features['first_redeem_time'] = df_features['first_redeem_time'].fillna(df_features['first_redeem_time'].mean())
df_features['issue_redeem_delay'] = df_features['issue_redeem_delay'].fillna(df_features['issue_redeem_delay'].mean())

# The raw date and gender columns are replaced by the derived features above.
df_features = df_features.drop(['first_issue_date', 'first_redeem_date', 'gender'], axis=1)

indices_train = df_train.index
indices_test = df_test.index
# Hold out 30% of the training clients for validation.
indices_learn, indices_valid = train_test_split(df_train.index, test_size=0.3, random_state=123)


X_train = df_features.loc[indices_learn, :]
y_train = df_train.loc[indices_learn, 'target']
treat_train = df_train.loc[indices_learn, 'treatment_flg']

X_val = df_features.loc[indices_valid, :]
y_val = df_train.loc[indices_valid, 'target']
treat_val =  df_train.loc[indices_valid, 'treatment_flg']

X_train_full = df_features.loc[indices_train, :]
y_train_full = df_train.loc[:, 'target']
treat_train_full = df_train.loc[:, 'treatment_flg']

X_test = df_features.loc[indices_test, :]

# NOTE(review): `gender` has already been one-hot encoded and dropped above, and
# this list is not used by the cells visible here; confirm whether it is still needed.
cat_features = ['gender']

In [71]:
# Fit an uplift random forest on the learning split. The 0/1 treatment flag is
# converted to strings so that it matches control_name='0'.
rf_clf = UpliftRandomForestClassifier(control_name='0', n_estimators=100)
rf_clf.fit(X_train.values, treatment=treat_train.map(str).values, y=y_train.values)

# Flatten the validation predictions into a 1-D uplift vector for the metric.
y_pred = rf_clf.predict(X_val.values).reshape(-1)

# Score the validation split with uplift@k at k=0.3 using the 'by_group' strategy.
score = uplift_at_k(
    y_true=y_val,
    uplift=y_pred,
    treatment=treat_val,
    k=0.3,
    strategy='by_group',
)
print(f"UpliftRandomForest | Uplift at 30: {score}")

UpliftRandomForest | Uplift at 30: 0.05308883954198007
