In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool, cv
from catboost.utils import get_roc_curve

from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

from sklift.models.models import TwoModels

from tqdm import tqdm

In [2]:
client_train = pd.read_csv('client_train.csv')
client_test = pd.read_csv('client_test.csv')

In [3]:
train = pd.read_csv('data/x5-uplift-valid/data/train.csv')
test = pd.read_csv('data/x5-uplift-valid/data/test.csv')

In [4]:
client_train.head()

Unnamed: 0,client_id,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,0_y,...,price,netto,is_own_trademark,is_alcohol,transaction_id,transaction_datetime,age,gender,first_redeem_date,first_issue_date
0,000012768d,0.35579,-1.312767,-1.805202,-1.324361,-1.433339,0.210571,0.997666,-1.19468,4.0,...,46.403846,0.540231,0.076923,0.0,52,1543648365,45,U,1515094000.0,1501947648
1,000036f903,10.003442,-0.46298,3.784321,-1.121093,-1.813001,-0.643167,-1.655461,-0.814132,2.0,...,50.374486,0.481877,0.08642,0.006173,162,1543402116,72,F,1492951000.0,1491832463
2,0001f552b0,-3.779053,-3.19279,-3.454876,-2.213815,0.624925,-0.176464,-0.921317,-0.848478,1.0,...,60.972481,0.483849,0.116279,0.0,86,1543043683,33,F,1535461000.0,1498850438
3,00020e7b18,-10.359025,-22.526642,4.226336,-0.010849,-8.143775,0.708611,-5.663005,-9.967441,39.0,...,62.344056,0.482176,0.161765,0.0,272,1542961286,73,U,1515607000.0,1511782905
4,000220a0a7,-6.280757,-5.320674,-4.379339,-6.228188,-1.550169,-3.660052,-2.316301,-2.587857,0.0,...,66.485294,0.357918,0.188235,0.011765,85,1544547256,45,M,1538665000.0,1512840380


In [5]:
client_test.head()

Unnamed: 0,client_id,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,0_y,...,price,netto,is_own_trademark,is_alcohol,transaction_id,transaction_datetime,age,gender,first_redeem_date,first_issue_date
0,00010925a5,-3.286649,-4.19101,-0.528681,-2.924018,-1.629946,-0.082534,1.020414,-0.486885,2.0,...,68.089744,0.615551,0.115385,0.012821,78,1543143378,83,U,1536942000.0,1532449289
1,00035a21d9,-0.951187,-1.18586,-0.748416,-1.056971,-0.287017,-0.740275,0.521372,-0.474041,0.0,...,59.307692,0.419346,0.038462,0.0,26,1549712782,69,U,1551540000.0,1549723582
2,00038f9200,-11.146945,8.943185,-4.434059,-5.539798,-6.143625,6.041638,0.145091,-2.319488,11.0,...,44.438819,0.546804,0.088608,0.0,158,1544180630,79,U,1547301000.0,1544191430
3,0004315e57,-5.125619,-6.295256,-3.827442,-5.276845,-1.218722,-2.704091,-1.753487,-1.75905,1.0,...,51.953552,0.366172,0.180328,0.0,122,1543152056,38,U,1549120000.0,1543162856
4,0006fca4bf,-1.698257,-2.09261,-1.285074,-1.786266,-0.805288,-0.010533,-0.618332,0.565975,1.0,...,66.211905,0.445714,0.142857,0.035714,28,1550648850,57,U,,1550659650


In [6]:
test.head()

Unnamed: 0,client_id
0,a9a604ed6e
1,ebd7360016
2,908cd9b8e8
3,dceb8ce861
4,f4f0ac6b06


In [7]:
client_train = client_train.merge(train, how='left', on='client_id')
client_train['date_dif'] = client_train['transaction_datetime'] - client_train['first_issue_date']
client_train.drop(columns=['transaction_datetime', 'first_issue_date'], inplace=True)

In [8]:
client_train

Unnamed: 0,client_id,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,0_y,...,netto,is_own_trademark,is_alcohol,transaction_id,age,gender,first_redeem_date,treatment_flg,purchased,date_dif
0,000012768d,0.355790,-1.312767,-1.805202,-1.324361,-1.433339,0.210571,0.997666,-1.194680,4.0,...,0.540231,0.076923,0.000000,52,45,U,1.515094e+09,0,1,41700717
1,000036f903,10.003442,-0.462980,3.784321,-1.121093,-1.813001,-0.643167,-1.655461,-0.814132,2.0,...,0.481877,0.086420,0.006173,162,72,F,1.492951e+09,1,1,51569653
2,0001f552b0,-3.779053,-3.192790,-3.454876,-2.213815,0.624925,-0.176464,-0.921317,-0.848478,1.0,...,0.483849,0.116279,0.000000,86,33,F,1.535461e+09,1,1,44193245
3,00020e7b18,-10.359025,-22.526642,4.226336,-0.010849,-8.143775,0.708611,-5.663005,-9.967441,39.0,...,0.482176,0.161765,0.000000,272,73,U,1.515607e+09,1,1,31178381
4,000220a0a7,-6.280757,-5.320674,-4.379339,-6.228188,-1.550169,-3.660052,-2.316301,-2.587857,0.0,...,0.357918,0.188235,0.011765,85,45,M,1.538665e+09,0,1,31706876
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140022,fffd5cd0c6,-0.814585,-2.291754,-1.270044,-4.179781,-3.292490,5.296040,0.142201,-1.087444,8.0,...,0.580350,0.200000,0.000000,100,47,M,1.537728e+09,1,1,45002142
140023,fffd63dfe3,-1.217319,-1.495435,-0.908676,-1.251205,-0.256814,-0.588767,-0.362914,-0.312609,1.0,...,0.354400,0.400000,0.160000,25,31,U,1.518804e+09,1,1,51979387
140024,fffd8c9d7d,-1.079535,-1.335840,-0.829749,-1.162285,-0.343432,-0.819184,1.597766,0.035321,0.0,...,0.433400,0.350000,0.000000,20,48,F,1.530389e+09,1,0,24608990
140025,fffe0abb97,-1.574706,-1.935598,-1.177582,-1.622334,-0.331746,-0.777673,-0.469852,-0.409714,0.0,...,0.603632,0.078947,0.000000,38,35,F,1.518341e+09,0,0,31788539


In [9]:
train_, val_ = train_test_split(client_train, test_size=0.2, random_state=42)

x_train = train_.drop(['client_id', 'purchased', 'treatment_flg'], axis=1)
y_train = train_['purchased']
treat_train = train_['treatment_flg']


x_val = val_.drop(['client_id', 'purchased', 'treatment_flg'], axis=1)
y_val = val_['purchased']
treat_val = val_['treatment_flg']

In [10]:
params = {'iterations': 200,
          'learning_rate': 0.05,
          'verbose': False,
          'loss_function': 'Logloss',
          'eval_metric': 'AUC',
          'random_seed': 42,
          'depth': 8,
          }

### val two models

In [11]:
x_train = client_train.drop(columns=['purchased', 'treatment_flg', 'client_id'])
y_train = client_train['purchased']
treat_train = client_train['treatment_flg']

In [12]:
estimator_trmnt_val = CatBoostClassifier(**params)
estimator_ctrl_val = CatBoostClassifier(**params)

In [13]:
tm_ctrl_val = TwoModels(
    estimator_trmnt=estimator_trmnt_val,
    estimator_ctrl=estimator_ctrl_val,
    method='ddr_control'
)

In [14]:
cat_features = ['gender']

In [15]:
tm_ctrl_val = tm_ctrl_val.fit(x_train, y_train, treat_train,
                        estimator_trmnt_fit_params={'plot': True,
                        'cat_features':cat_features},
                        estimator_ctrl_fit_params={'plot': True,
                        'cat_features':cat_features}
                        )

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [16]:
uplift_tm_val = tm_ctrl_val.predict(x_val)

In [17]:
roc_val =roc_auc_score(y_val, uplift_tm_val)

In [18]:
gini = 2 * roc_val - 1
gini

0.18223727240029364

## TEST

In [19]:
client_test_ = test.merge(client_test, how='inner', on='client_id')
client_test_['date_dif'] = client_test_['transaction_datetime'] - client_test_['first_issue_date']
client_test_.drop(columns=['transaction_datetime', 'first_issue_date'], inplace=True)
client_test_

Unnamed: 0,client_id,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,0_y,...,purchase_sum,price,netto,is_own_trademark,is_alcohol,transaction_id,age,gender,first_redeem_date,date_dif
0,a9a604ed6e,-2.156004,-1.410594,-1.369566,-1.929635,-0.468196,-1.047256,-0.636993,-0.504247,0.0,...,783.783721,62.802326,0.445000,0.279070,0.000000,43,36,F,,7575622
1,ebd7360016,-8.016316,-0.187270,-4.456468,0.369322,-4.490660,4.494249,-1.395802,-1.759495,8.0,...,217.380087,38.411594,0.407461,0.278261,0.000000,115,63,F,1.504283e+09,43878967
2,908cd9b8e8,-4.529898,-4.395282,-3.301201,-4.713464,-3.523308,2.885222,-1.248660,-1.563979,6.0,...,422.135795,55.846320,0.488284,0.125000,0.000000,88,49,F,1.531502e+09,20613925
3,dceb8ce861,3.450373,-0.499648,0.489392,-4.032752,-4.055727,3.529723,-1.508489,-2.245210,7.0,...,1670.712991,51.667379,0.419205,0.128205,0.000000,117,46,U,1.534013e+09,39724993
4,f4f0ac6b06,-6.095282,-3.300421,-6.804593,-1.975886,-5.048235,4.511504,1.009965,1.470616,10.0,...,872.106509,48.375740,0.369899,0.153846,0.035503,169,45,U,1.550262e+09,13467333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60007,4762fb04c5,-8.312078,-6.668895,-3.603318,-7.322845,-3.411982,-0.644720,-2.789912,0.682131,4.0,...,654.625556,61.458388,0.476902,0.202614,0.032680,153,26,M,1.545833e+09,15784185
60008,533d1516e7,-0.801068,-1.003413,-0.639249,-0.906338,-0.293981,-0.616396,-0.593893,0.739090,0.0,...,132.091875,54.843750,0.299812,0.187500,0.125000,16,37,M,1.515347e+09,36284563
60009,c93b21a707,-8.582656,-10.209091,-7.874050,-9.950804,-7.812747,11.742416,-1.818426,-2.263788,18.0,...,371.402033,37.060166,0.315299,0.099585,0.012448,241,42,U,1.553968e+09,4740472
60010,3fe0668de5,-2.360616,-2.917133,-1.801031,-2.507403,-0.598602,-1.467294,0.084404,-0.893502,0.0,...,388.607925,44.345912,0.568453,0.301887,0.000000,53,33,U,1.549103e+09,-10800


## Two models

In [20]:
x = client_train.drop(columns=['purchased', 'treatment_flg', 'client_id'])
y = client_train['purchased']
treat_train = client_train['treatment_flg']

In [21]:
estimator_trmnt = CatBoostClassifier(**params)
estimator_ctrl = CatBoostClassifier(**params)

In [22]:
tm_ctrl = TwoModels(
    estimator_trmnt=estimator_trmnt,
    estimator_ctrl=estimator_ctrl,
    method='ddr_control'
)

In [23]:
tm_ctrl = tm_ctrl.fit(x, y, treat_train,
                        estimator_trmnt_fit_params={'plot': True,
                        'cat_features':cat_features},
                        estimator_ctrl_fit_params={'plot': True,
                        'cat_features':cat_features},
                        )

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [24]:
uplift_tm_ctrl = tm_ctrl.predict(client_test_.drop(columns='client_id'))

In [25]:
test_two_models = test.copy()

In [26]:
test_two_models['pred'] = uplift_tm_ctrl

In [27]:
test_two_models.head()

Unnamed: 0,client_id,pred
0,a9a604ed6e,-0.010526
1,ebd7360016,0.025892
2,908cd9b8e8,0.019099
3,dceb8ce861,-0.028366
4,f4f0ac6b06,-0.016431


In [28]:
test_two_models.to_csv('predict/test_two_models.csv', index=False)

## ClassTransformation

In [29]:
#from sklift.models import ClassTransformation

In [30]:
"""ct = ClassTransformation(CatBoostClassifier(iterations=300, learning_rate=0.05, verbose=False))

ct = ct.fit(x, y, treat_train, estimator_fit_params={'plot': True})

uplift_ct = ct.predict(client_test_.drop(columns='client_id'))"""

"ct = ClassTransformation(CatBoostClassifier(iterations=300, learning_rate=0.05, verbose=False))\n\nct = ct.fit(x, y, treat_train, estimator_fit_params={'plot': True})\n\nuplift_ct = ct.predict(client_test_.drop(columns='client_id'))"

In [31]:
"""ct_models = test.copy()
ct_models['pred'] = uplift_ct"""

"ct_models = test.copy()\nct_models['pred'] = uplift_ct"

In [32]:
#ct_models.to_csv('predict/ct_models.csv', index=False)