In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool, cv
from catboost.utils import get_roc_curve

from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

from sklift.models.models import TwoModels

from tqdm import tqdm

In [2]:
client_train = pd.read_csv('client_train.csv')
client_test = pd.read_csv('client_test.csv')

In [3]:
train = pd.read_csv('data/x5-uplift-valid/data/train.csv')
test = pd.read_csv('data/x5-uplift-valid/data/test.csv')

In [4]:
client_train.head()

Unnamed: 0,client_id,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,0_y,...,purchase_sum,price,netto,is_own_trademark,is_alcohol,transaction_id,transaction_datetime,age,first_redeem_date,first_issue_date
0,000012768d,0.35579,-1.312827,-1.805221,-1.324123,-1.482387,0.172043,1.046069,-0.853127,4.0,...,784.788462,46.403846,0.540231,0.076923,0.0,52,1543648365,45,1515094000.0,1501947648
1,000036f903,10.003441,-0.463081,3.784677,-1.122427,-1.885861,-0.687858,-1.574172,-0.539083,2.0,...,362.746914,50.374486,0.481877,0.08642,0.006173,162,1543402116,72,1492951000.0,1491832463
2,0001f552b0,-3.779056,-3.192819,-3.454442,-2.215208,0.592765,-0.186645,-0.870753,-0.806661,1.0,...,557.841163,60.972481,0.483849,0.116279,0.0,86,1543043683,33,1535461000.0,1498850438
3,00020e7b18,-10.359004,-22.527,4.226019,-0.007912,-8.342791,0.459325,-5.084318,-9.420597,39.0,...,2055.038456,62.344056,0.482176,0.161765,0.0,272,1542961286,73,1515607000.0,1511782905
4,000220a0a7,-6.280748,-5.320904,-4.380471,-6.226598,-1.636538,-3.749386,-2.394807,-2.199908,0.0,...,817.500941,66.485294,0.357918,0.188235,0.011765,85,1544547256,45,1538665000.0,1512840380


In [5]:
client_test.head()

Unnamed: 0,client_id,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,0_y,...,purchase_sum,price,netto,is_own_trademark,is_alcohol,transaction_id,transaction_datetime,age,first_redeem_date,first_issue_date
0,00010925a5,-3.286636,-4.190984,-0.52913,-2.923619,-1.586825,-0.077354,1.111007,-0.520708,2.0,...,365.307692,68.089744,0.615551,0.115385,0.012821,78,1543143378,83,1536942000.0,1532449289
1,00035a21d9,-0.951191,-1.185871,-0.748385,-1.05668,-0.285379,-0.739092,0.52647,-0.502414,0.0,...,674.827692,59.307692,0.419346,0.038462,0.0,26,1549712782,69,1551540000.0,1549723582
2,00038f9200,-11.146963,8.943216,-4.433087,-5.541098,-6.097161,6.062577,0.266438,-2.438299,11.0,...,238.506329,44.438819,0.546804,0.088608,0.0,158,1544180630,79,1547301000.0,1544191430
3,0004315e57,-5.125633,-6.295177,-3.827248,-5.276777,-1.101357,-2.668629,-1.622628,-1.921029,1.0,...,667.672131,51.953552,0.366172,0.180328,0.0,122,1543152056,38,1549120000.0,1543162856
4,0006fca4bf,-1.698258,-2.092598,-1.284988,-1.786417,-0.794371,-0.005176,-0.604105,0.5595,1.0,...,1799.678571,66.211905,0.445714,0.142857,0.035714,28,1550648850,57,,1550659650


In [6]:
test.head()

Unnamed: 0,client_id
0,a9a604ed6e
1,ebd7360016
2,908cd9b8e8
3,dceb8ce861
4,f4f0ac6b06


In [7]:
client_train = client_train.merge(train, how='left', on='client_id')
client_train['date_dif'] = client_train['transaction_datetime'] - client_train['first_issue_date']
client_train.drop(columns=['transaction_datetime', 'first_issue_date'], inplace=True)

In [8]:
client_train

Unnamed: 0,client_id,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,0_y,...,price,netto,is_own_trademark,is_alcohol,transaction_id,age,first_redeem_date,treatment_flg,purchased,date_dif
0,000012768d,0.355790,-1.312827,-1.805221,-1.324123,-1.482387,0.172043,1.046069,-0.853127,4.0,...,46.403846,0.540231,0.076923,0.000000,52,45,1.515094e+09,0,1,41700717
1,000036f903,10.003441,-0.463081,3.784677,-1.122427,-1.885861,-0.687858,-1.574172,-0.539083,2.0,...,50.374486,0.481877,0.086420,0.006173,162,72,1.492951e+09,1,1,51569653
2,0001f552b0,-3.779056,-3.192819,-3.454442,-2.215208,0.592765,-0.186645,-0.870753,-0.806661,1.0,...,60.972481,0.483849,0.116279,0.000000,86,33,1.535461e+09,1,1,44193245
3,00020e7b18,-10.359004,-22.527000,4.226019,-0.007912,-8.342791,0.459325,-5.084318,-9.420597,39.0,...,62.344056,0.482176,0.161765,0.000000,272,73,1.515607e+09,1,1,31178381
4,000220a0a7,-6.280748,-5.320904,-4.380471,-6.226598,-1.636538,-3.749386,-2.394807,-2.199908,0.0,...,66.485294,0.357918,0.188235,0.011765,85,45,1.538665e+09,0,1,31706876
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140022,fffd5cd0c6,-0.814589,-2.291821,-1.269909,-4.180317,-3.337012,5.269766,0.187225,-0.931089,8.0,...,52.655000,0.580350,0.200000,0.000000,100,47,1.537728e+09,1,1,45002142
140023,fffd63dfe3,-1.217327,-1.495454,-0.908427,-1.250921,-0.246740,-0.602915,-0.304342,-0.326000,1.0,...,49.346667,0.354400,0.400000,0.160000,25,31,1.518804e+09,1,1,51979387
140024,fffd8c9d7d,-1.079535,-1.335818,-0.829732,-1.162276,-0.342656,-0.818033,1.570479,0.003446,0.0,...,66.375000,0.433400,0.350000,0.000000,20,48,1.530389e+09,1,0,24608990
140025,fffe0abb97,-1.574708,-1.935621,-1.177377,-1.623138,-0.350101,-0.784079,-0.444146,-0.379824,0.0,...,58.478070,0.603632,0.078947,0.000000,38,35,1.518341e+09,0,0,31788539


In [9]:
train_, val_ = train_test_split(client_train, test_size=0.2, random_state=42)

x_train = train_.drop(['client_id', 'purchased', 'treatment_flg'], axis=1)
y_train = train_['purchased']
treat_train = train_['treatment_flg']


x_val = val_.drop(['client_id', 'purchased', 'treatment_flg'], axis=1)
y_val = val_['purchased']
treat_val = val_['treatment_flg']

In [10]:
params = {'iterations': 500,
          'learning_rate': 0.02,
          'verbose': False,
          'loss_function': 'Logloss',
          'eval_metric': 'AUC',
          'random_seed': 42,
          'depth': 8,
          }

### val two models

In [11]:
x_train = client_train.drop(columns=['purchased', 'treatment_flg', 'client_id'])
y_train = client_train['purchased']
treat_train = client_train['treatment_flg']

In [12]:
estimator_trmnt_val = CatBoostClassifier(**params)
estimator_ctrl_val = CatBoostClassifier(**params)

In [13]:
tm_ctrl_val = TwoModels(
    estimator_trmnt=estimator_trmnt_val,
    estimator_ctrl=estimator_ctrl_val,
    method='ddr_control'
)

In [14]:
cat_features = ['gender']

In [15]:
tm_ctrl_val = tm_ctrl_val.fit(x_train, y_train, treat_train,
                        estimator_trmnt_fit_params={'plot': True},
                        estimator_ctrl_fit_params={'plot': True}
                        )

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [16]:
uplift_tm_val = tm_ctrl_val.predict(x_val)

In [17]:
roc_val =roc_auc_score(y_val, uplift_tm_val)

In [18]:
gini = 2 * roc_val - 1
gini

0.17671359675967957

## TEST

In [19]:
client_test_ = test.merge(client_test, how='inner', on='client_id')
client_test_['date_dif'] = client_test_['transaction_datetime'] - client_test_['first_issue_date']
client_test_.drop(columns=['transaction_datetime', 'first_issue_date'], inplace=True)
client_test_

Unnamed: 0,client_id,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,0_y,...,express_points_spent,purchase_sum,price,netto,is_own_trademark,is_alcohol,transaction_id,age,first_redeem_date,date_dif
0,a9a604ed6e,-2.156000,-1.410567,-1.369530,-1.931263,-0.468651,-1.031899,-0.640093,-0.507615,0.0,...,0.00000,783.783721,62.802326,0.445000,0.279070,0.000000,43,36,,7575622
1,ebd7360016,-8.016330,-0.187184,-4.456487,0.369728,-4.350522,4.529934,-1.256905,-1.984229,8.0,...,0.00000,217.380087,38.411594,0.407461,0.278261,0.000000,115,63,1.504283e+09,43878967
2,908cd9b8e8,-4.529898,-4.395273,-3.301222,-4.712800,-3.498401,2.892091,-1.209462,-1.624379,6.0,...,0.00000,422.135795,55.846320,0.488284,0.125000,0.000000,88,49,1.531502e+09,20613925
3,dceb8ce861,3.450358,-0.499576,0.489552,-4.032146,-3.968508,3.554526,-1.464811,-2.295642,7.0,...,0.00000,1670.712991,51.667379,0.419205,0.128205,0.000000,117,46,1.534013e+09,39724993
4,f4f0ac6b06,-6.095244,-3.300595,-6.804456,-1.978318,-5.236014,4.457234,0.966524,1.646245,10.0,...,0.00000,872.106509,48.375740,0.369899,0.153846,0.035503,169,45,1.550262e+09,13467333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60007,4762fb04c5,-8.312063,-6.668958,-3.603223,-7.325538,-3.527663,-0.642486,-2.821883,0.717337,4.0,...,0.00000,654.625556,61.458388,0.476902,0.202614,0.032680,153,26,1.545833e+09,15784185
60008,533d1516e7,-0.801075,-1.003355,-0.639586,-0.904943,-0.180813,-0.594569,-0.456368,0.517885,0.0,...,0.00000,132.091875,54.843750,0.299812,0.187500,0.125000,16,37,1.515347e+09,36284563
60009,c93b21a707,-8.582654,-10.209056,-7.873446,-9.953300,-7.816201,11.760788,-1.823666,-2.206817,18.0,...,0.00000,371.402033,37.060166,0.315299,0.099585,0.012448,241,42,1.553968e+09,4740472
60010,3fe0668de5,-2.360622,-2.917157,-1.800938,-2.508120,-0.623390,-1.459896,0.055127,-0.924025,0.0,...,-5.09434,388.607925,44.345912,0.568453,0.301887,0.000000,53,33,1.549103e+09,-10800


## Two models

In [20]:
x = client_train.drop(columns=['purchased', 'treatment_flg', 'client_id'])
y = client_train['purchased']
treat_train = client_train['treatment_flg']

In [21]:
estimator_trmnt = CatBoostClassifier(**params)
estimator_ctrl = CatBoostClassifier(**params)

In [22]:
tm_ctrl = TwoModels(
    estimator_trmnt=estimator_trmnt,
    estimator_ctrl=estimator_ctrl,
    method='ddr_control'
)

In [23]:
tm_ctrl = tm_ctrl.fit(x, y, treat_train,
                        estimator_trmnt_fit_params={'plot': True},
                        estimator_ctrl_fit_params={'plot': True},
                        )

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [24]:
uplift_tm_ctrl = tm_ctrl.predict(client_test_.drop(columns='client_id'))

In [25]:
test_two_models = test.copy()

In [26]:
test_two_models['pred'] = uplift_tm_ctrl

In [27]:
test_two_models.head()

Unnamed: 0,client_id,pred
0,a9a604ed6e,-0.01776
1,ebd7360016,0.024547
2,908cd9b8e8,0.029508
3,dceb8ce861,-0.012184
4,f4f0ac6b06,0.007601


In [28]:
test_two_models.to_csv('predict/test_two_models.csv', index=False)

## ClassTransformation

In [29]:
from sklift.models import ClassTransformation

In [30]:
ct = ClassTransformation(CatBoostClassifier(**params))

ct = ct.fit(x, y, treat_train, estimator_fit_params={'plot': True})

uplift_ct = ct.predict(client_test_.drop(columns='client_id'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [None]:
ct_models = test.copy()
ct_models['pred'] = uplift_ct

In [None]:
ct_models.to_csv('predict/ct_models.csv', index=False)