In [263]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool, cv
from catboost.utils import get_roc_curve

from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

from sklift.models.models import TwoModels

from tqdm import tqdm

In [264]:
client_train = pd.read_csv('client_train.csv')
client_test = pd.read_csv('client_test.csv')

In [265]:
train = pd.read_csv('data/x5-uplift-valid/data/train.csv')
test = pd.read_csv('data/x5-uplift-valid/data/test.csv')

In [266]:
client_train.head()

Unnamed: 0,client_id,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,0_y,...,purchase_sum,price,netto,is_own_trademark,is_alcohol,transaction_id,transaction_datetime,age,first_redeem_date,first_issue_date
0,000012768d,0.355781,-1.31278,-1.804815,-1.32369,-1.469317,0.21051,1.044833,-0.833008,4.0,...,784.788462,46.403846,0.540231,0.076923,0.0,52,1543648365,45,1515094000.0,1501947648
1,000036f903,10.003428,-0.463047,3.784692,-1.123606,-1.83256,-0.611661,-1.616685,-0.550288,2.0,...,362.746914,50.374486,0.481877,0.08642,0.006173,162,1543402116,72,1492951000.0,1491832463
2,0001f552b0,-3.779058,-3.192809,-3.454585,-2.215659,0.629662,-0.158581,-0.882117,-0.799215,1.0,...,557.841163,60.972481,0.483849,0.116279,0.0,86,1543043683,33,1535461000.0,1498850438
3,00020e7b18,-10.35912,-22.526962,4.227034,-0.014116,-8.232325,0.918253,-5.457034,-9.747162,39.0,...,2055.038456,62.344056,0.482176,0.161765,0.0,272,1542961286,73,1515607000.0,1511782905
4,000220a0a7,-6.280739,-5.320772,-4.380316,-6.214154,-1.817169,-4.074261,-1.938217,-1.907696,0.0,...,817.500941,66.485294,0.357918,0.188235,0.011765,85,1544547256,45,1538665000.0,1512840380


In [267]:
client_test.head()

Unnamed: 0,client_id,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,0_y,...,purchase_sum,price,netto,is_own_trademark,is_alcohol,transaction_id,transaction_datetime,age,first_redeem_date,first_issue_date
0,00010925a5,-3.286651,-4.191003,-0.528918,-2.925622,-1.575245,-0.063147,1.081676,-0.415735,2.0,...,365.307692,68.089744,0.615551,0.115385,0.012821,78,1543143378,83,1536942000.0,1532449289
1,00035a21d9,-0.951189,-1.185884,-0.748372,-1.057212,-0.293547,-0.741212,0.509296,-0.439351,0.0,...,674.827692,59.307692,0.419346,0.038462,0.0,26,1549712782,69,1551540000.0,1549723582
2,00038f9200,-11.146938,8.943293,-4.434049,-5.536691,-6.127315,6.039925,0.222843,-2.613397,11.0,...,238.506329,44.438819,0.546804,0.088608,0.0,158,1544180630,79,1547301000.0,1544191430
3,0004315e57,-5.125624,-6.295154,-3.827619,-5.275782,-1.170564,-2.692296,-1.652692,-1.83087,1.0,...,667.672131,51.953552,0.366172,0.180328,0.0,122,1543152056,38,1549120000.0,1543162856
4,0006fca4bf,-1.698256,-2.092594,-1.285078,-1.785885,-0.798678,-0.008321,-0.604449,0.539743,1.0,...,1799.678571,66.211905,0.445714,0.142857,0.035714,28,1550648850,57,,1550659650


In [268]:
test.head()

Unnamed: 0,client_id
0,a9a604ed6e
1,ebd7360016
2,908cd9b8e8
3,dceb8ce861
4,f4f0ac6b06


In [269]:
client_train = client_train.merge(train, how='left', on='client_id')
client_train['date_dif'] = client_train['transaction_datetime'] - client_train['first_issue_date']
client_train.drop(columns=['transaction_datetime', 'first_issue_date'], inplace=True)

In [270]:
client_train

Unnamed: 0,client_id,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,0_y,...,price,netto,is_own_trademark,is_alcohol,transaction_id,age,first_redeem_date,treatment_flg,purchased,date_dif
0,000012768d,0.355781,-1.312780,-1.804815,-1.323690,-1.469317,0.210510,1.044833,-0.833008,4.0,...,46.403846,0.540231,0.076923,0.000000,52,45,1.515094e+09,0,1,41700717
1,000036f903,10.003428,-0.463047,3.784692,-1.123606,-1.832560,-0.611661,-1.616685,-0.550288,2.0,...,50.374486,0.481877,0.086420,0.006173,162,72,1.492951e+09,1,1,51569653
2,0001f552b0,-3.779058,-3.192809,-3.454585,-2.215659,0.629662,-0.158581,-0.882117,-0.799215,1.0,...,60.972481,0.483849,0.116279,0.000000,86,33,1.535461e+09,1,1,44193245
3,00020e7b18,-10.359120,-22.526962,4.227034,-0.014116,-8.232325,0.918253,-5.457034,-9.747162,39.0,...,62.344056,0.482176,0.161765,0.000000,272,73,1.515607e+09,1,1,31178381
4,000220a0a7,-6.280739,-5.320772,-4.380316,-6.214154,-1.817169,-4.074261,-1.938217,-1.907696,0.0,...,66.485294,0.357918,0.188235,0.011765,85,45,1.538665e+09,0,1,31706876
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140022,fffd5cd0c6,-0.814586,-2.291806,-1.269687,-4.180072,-3.346819,5.269581,0.206053,-0.843078,8.0,...,52.655000,0.580350,0.200000,0.000000,100,47,1.537728e+09,1,1,45002142
140023,fffd63dfe3,-1.217323,-1.495459,-0.908325,-1.251408,-0.259787,-0.586860,-0.351530,-0.310741,1.0,...,49.346667,0.354400,0.400000,0.160000,25,31,1.518804e+09,1,1,51979387
140024,fffd8c9d7d,-1.079537,-1.335815,-0.830052,-1.162133,-0.311179,-0.831840,1.615173,-0.016375,0.0,...,66.375000,0.433400,0.350000,0.000000,20,48,1.530389e+09,1,0,24608990
140025,fffe0abb97,-1.574709,-1.935621,-1.177521,-1.623152,-0.328769,-0.771939,-0.443243,-0.387981,0.0,...,58.478070,0.603632,0.078947,0.000000,38,35,1.518341e+09,0,0,31788539


In [271]:
train_, val_ = train_test_split(client_train, test_size=0.2, random_state=42)

x_train = train_.drop(['client_id', 'purchased', 'treatment_flg'], axis=1)
y_train = train_['purchased']
treat_train = train_['treatment_flg']


x_val = val_.drop(['client_id', 'purchased', 'treatment_flg'], axis=1)
y_val = val_['purchased']
treat_val = val_['treatment_flg']

In [272]:
params = {'iterations': 500,
          'learning_rate': 0.02,
          'verbose': False,
          'loss_function': 'Logloss',
          'eval_metric': 'AUC',
          'random_seed': 42,
          'depth': 8,
          }

### val two models

In [273]:
x_train = client_train.drop(columns=['purchased', 'treatment_flg', 'client_id'])
y_train = client_train['purchased']
treat_train = client_train['treatment_flg']

In [274]:
estimator_trmnt_val = CatBoostClassifier(**params)
estimator_ctrl_val = CatBoostClassifier(**params)

In [275]:
tm_ctrl_val = TwoModels(
    estimator_trmnt=estimator_trmnt_val,
    estimator_ctrl=estimator_ctrl_val,
    method='ddr_control'
)

In [276]:
cat_features = ['gender']

In [277]:
tm_ctrl_val = tm_ctrl_val.fit(x_train, y_train, treat_train,
                        estimator_trmnt_fit_params={'plot': True},
                        estimator_ctrl_fit_params={'plot': True}
                        )

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [278]:
uplift_tm_val = tm_ctrl_val.predict(x_val)

In [279]:
roc_val =roc_auc_score(y_val, uplift_tm_val)

In [280]:
gini = 2 * roc_val - 1
gini

0.1772942037839773

## TEST

In [281]:
client_test_ = test.merge(client_test, how='inner', on='client_id')
client_test_['date_dif'] = client_test_['transaction_datetime'] - client_test_['first_issue_date']
client_test_.drop(columns=['transaction_datetime', 'first_issue_date'], inplace=True)
client_test_

Unnamed: 0,client_id,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,0_y,...,express_points_spent,purchase_sum,price,netto,is_own_trademark,is_alcohol,transaction_id,age,first_redeem_date,date_dif
0,a9a604ed6e,-2.155999,-1.410518,-1.369626,-1.928059,-0.461227,-1.045206,-0.601918,-0.608859,0.0,...,0.00000,783.783721,62.802326,0.445000,0.279070,0.000000,43,36,,7575622
1,ebd7360016,-8.016330,-0.187129,-4.456820,0.370588,-4.425113,4.506902,-1.257900,-2.005194,8.0,...,0.00000,217.380087,38.411594,0.407461,0.278261,0.000000,115,63,1.504283e+09,43878967
2,908cd9b8e8,-4.529901,-4.395328,-3.301167,-4.715726,-3.487629,2.911752,-1.231799,-1.511304,6.0,...,0.00000,422.135795,55.846320,0.488284,0.125000,0.000000,88,49,1.531502e+09,20613925
3,dceb8ce861,3.450367,-0.499542,0.489259,-4.031961,-4.045298,3.539653,-1.450187,-2.370790,7.0,...,0.00000,1670.712991,51.667379,0.419205,0.128205,0.000000,117,46,1.534013e+09,39724993
4,f4f0ac6b06,-6.095299,-3.300697,-6.803961,-1.980993,-5.089669,4.566713,0.877313,1.609928,10.0,...,0.00000,872.106509,48.375740,0.369899,0.153846,0.035503,169,45,1.550262e+09,13467333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60007,4762fb04c5,-8.312068,-6.668911,-3.603374,-7.322215,-3.437715,-0.632689,-2.749270,0.474705,4.0,...,0.00000,654.625556,61.458388,0.476902,0.202614,0.032680,153,26,1.545833e+09,15784185
60008,533d1516e7,-0.801084,-1.003313,-0.639524,-0.906656,-0.276381,-0.612789,-0.512875,0.642188,0.0,...,0.00000,132.091875,54.843750,0.299812,0.187500,0.125000,16,37,1.515347e+09,36284563
60009,c93b21a707,-8.582658,-10.209088,-7.873896,-9.951057,-7.755145,11.762752,-1.770457,-2.198819,18.0,...,0.00000,371.402033,37.060166,0.315299,0.099585,0.012448,241,42,1.553968e+09,4740472
60010,3fe0668de5,-2.360615,-2.917149,-1.801001,-2.506089,-0.619838,-1.467109,0.080278,-0.926321,0.0,...,-5.09434,388.607925,44.345912,0.568453,0.301887,0.000000,53,33,1.549103e+09,-10800


## Two models

In [282]:
x = client_train.drop(columns=['purchased', 'treatment_flg', 'client_id'])
y = client_train['purchased']
treat_train = client_train['treatment_flg']

In [283]:
estimator_trmnt = CatBoostClassifier(**params)
estimator_ctrl = CatBoostClassifier(**params)

In [284]:
tm_ctrl = TwoModels(
    estimator_trmnt=estimator_trmnt,
    estimator_ctrl=estimator_ctrl,
    method='ddr_control'
)

In [285]:
tm_ctrl = tm_ctrl.fit(x, y, treat_train,
                        estimator_trmnt_fit_params={'plot': True},
                        estimator_ctrl_fit_params={'plot': True},
                        )

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [286]:
uplift_tm_ctrl = tm_ctrl.predict(client_test_.drop(columns='client_id'))

In [287]:
test_two_models = test.copy()

In [288]:
test_two_models['pred'] = uplift_tm_ctrl

In [289]:
test_two_models.head()

Unnamed: 0,client_id,pred
0,a9a604ed6e,-0.017876
1,ebd7360016,0.028965
2,908cd9b8e8,0.035141
3,dceb8ce861,-0.012442
4,f4f0ac6b06,-0.012344


In [290]:
test_two_models.to_csv('predict/test_two_models.csv', index=False)

## ClassTransformation

In [291]:
from sklift.models import ClassTransformation

In [292]:
ct = ClassTransformation(CatBoostClassifier(**params))

ct = ct.fit(x, y, treat_train, estimator_fit_params={'plot': True})

uplift_ct = ct.predict(client_test_.drop(columns='client_id'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [None]:
ct_models = test.copy()
ct_models['pred'] = uplift_ct

In [None]:
ct_models.to_csv('predict/ct_models.csv', index=False)