In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from catboost import CatBoostClassifier, Pool, cv
from catboost.utils import get_roc_curve
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
# Per-client feature tables. The previews below show a client_id column, numeric
# columns named 0-99 and six aggregated points/purchase columns.
client_train = pd.read_csv('client_train.csv')
client_test = pd.read_csv('client_test.csv')

In [3]:
# train.csv: client_id with the treatment_flg and purchased labels;
# test.csv: client_id only (see the previews below).
train = pd.read_csv('data/x5-uplift-valid/data/train.csv')
test = pd.read_csv('data/x5-uplift-valid/data/test.csv')

In [4]:
client_train.head()

Unnamed: 0,client_id,0,1,2,3,4,5,6,7,8,...,96,97,98,99,regular_points_received,express_points_received,regular_points_spent,express_points_spent,purchase_sum,price
0,000012768d,0.298621,0.432653,0.83146,0.846764,0.626652,0.607852,0.328089,0.585355,0.644966,...,0.684222,0.279474,0.596315,0.6747,0.038887,0.0,1.0,1.0,0.097023,0.044936
1,000036f903,0.294312,0.437322,0.832806,0.856812,0.627715,0.618506,0.344688,0.58454,0.647482,...,0.678477,0.288772,0.574425,0.680844,0.011071,0.01713,1.0,1.0,0.044777,0.048781
2,0001f552b0,0.310389,0.426473,0.834515,0.845426,0.621119,0.616448,0.324878,0.582864,0.643575,...,0.685792,0.27952,0.590199,0.675235,0.033063,0.0,1.0,1.0,0.068928,0.059044
3,00020e7b18,0.265627,0.440302,0.801341,0.842562,0.605562,0.569167,0.301997,0.586933,0.640177,...,0.696298,0.293335,0.612342,0.667429,0.115254,0.0,0.985305,0.975735,0.254272,0.060372
4,000220a0a7,0.31093,0.423089,0.823149,0.84463,0.616097,0.605226,0.336365,0.574943,0.662766,...,0.692866,0.289195,0.594978,0.677433,0.054316,0.0,1.0,1.0,0.101073,0.064382


In [5]:
client_test.head()

Unnamed: 0,client_id,0,1,2,3,4,5,6,7,8,...,96,97,98,99,regular_points_received,express_points_received,regular_points_spent,express_points_spent,purchase_sum,price
0,00010925a5,0.602796,0.653504,0.56176,0.812171,0.947037,0.095243,0.17809,0.907668,0.264658,...,0.769402,0.49348,0.460429,0.406862,0.010441,0.0,0.997957,1.0,0.046507,0.073531
1,00035a21d9,0.609385,0.652256,0.566319,0.813157,0.949011,0.091118,0.17185,0.907309,0.266069,...,0.782112,0.49512,0.464713,0.411372,0.04981,0.0,0.992501,0.727778,0.086052,0.064047
2,00038f9200,0.600764,0.6573,0.55344,0.803758,0.943214,0.090692,0.176712,0.905878,0.261282,...,0.772466,0.492488,0.473524,0.410519,0.011122,0.0,0.998968,1.0,0.030306,0.04799
3,0004315e57,0.612397,0.643062,0.558015,0.811745,0.937913,0.09278,0.174199,0.904878,0.262935,...,0.778072,0.504654,0.469453,0.408398,0.031285,0.0,0.994268,0.915209,0.085138,0.056105
4,0006fca4bf,0.610441,0.650651,0.565246,0.813154,0.946626,0.091251,0.17116,0.907946,0.266773,...,0.772702,0.496632,0.465522,0.412395,0.159777,0.0,1.0,1.0,0.229769,0.071503


In [6]:
test.head()

Unnamed: 0,client_id
0,a9a604ed6e
1,ebd7360016
2,908cd9b8e8
3,dceb8ce861
4,f4f0ac6b06


In [7]:
# Attach treatment_flg / purchased to each feature row (left join keeps every feature row).
# NOTE(review): this rebinds client_train, so the cell is not idempotent - re-running it
# without reloading the CSVs merges the label columns a second time.
client_train = client_train.merge(train, how='left', on='client_id')

In [8]:
client_train

Unnamed: 0,client_id,0,1,2,3,4,5,6,7,8,...,98,99,regular_points_received,express_points_received,regular_points_spent,express_points_spent,purchase_sum,price,treatment_flg,purchased
0,000012768d,0.298621,0.432653,0.831460,0.846764,0.626652,0.607852,0.328089,0.585355,0.644966,...,0.596315,0.674700,0.038887,0.00000,1.000000,1.000000,0.097023,0.044936,0,1
1,000036f903,0.294312,0.437322,0.832806,0.856812,0.627715,0.618506,0.344688,0.584540,0.647482,...,0.574425,0.680844,0.011071,0.01713,1.000000,1.000000,0.044777,0.048781,1,1
2,0001f552b0,0.310389,0.426473,0.834515,0.845426,0.621119,0.616448,0.324878,0.582864,0.643575,...,0.590199,0.675235,0.033063,0.00000,1.000000,1.000000,0.068928,0.059044,1,1
3,00020e7b18,0.265627,0.440302,0.801341,0.842562,0.605562,0.569167,0.301997,0.586933,0.640177,...,0.612342,0.667429,0.115254,0.00000,0.985305,0.975735,0.254272,0.060372,1,1
4,000220a0a7,0.310930,0.423089,0.823149,0.844630,0.616097,0.605226,0.336365,0.574943,0.662766,...,0.594978,0.677433,0.054316,0.00000,1.000000,1.000000,0.101073,0.064382,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140022,fffd5cd0c6,0.309236,0.426382,0.828646,0.848185,0.618146,0.612783,0.329462,0.582921,0.642794,...,0.589868,0.676234,0.014099,0.00000,1.000000,1.000000,0.051155,0.050989,1,1
140023,fffd63dfe3,0.302328,0.431006,0.836751,0.845925,0.626143,0.612396,0.334681,0.594253,0.647682,...,0.593783,0.674165,0.009001,0.00000,1.000000,1.000000,0.044529,0.047786,1,1
140024,fffd8c9d7d,0.302051,0.431063,0.837479,0.845890,0.626899,0.611498,0.328404,0.590052,0.648187,...,0.594442,0.675752,0.015705,0.00000,1.000000,1.000000,0.050345,0.064275,1,0
140025,fffe0abb97,0.302607,0.429642,0.838735,0.845806,0.626568,0.612021,0.327640,0.583591,0.646621,...,0.590904,0.674139,0.024812,0.00000,0.998505,0.873684,0.063427,0.056628,0,0


In [9]:
train

Unnamed: 0,client_id,treatment_flg,purchased
0,ad6561e2d8,1,1
1,7c1ccbf93f,1,1
2,b58fadcab6,1,1
3,e99e6fabb9,0,0
4,27fb6f8520,1,1
...,...,...,...
140022,999d284453,1,1
140023,f634deea4e,0,1
140024,16cb4f99b0,0,1
140025,23c2b72b2e,1,1


Split the training set into two groups by `treatment_flg`: control (`treatment_flg == 0`) and treatment (`treatment_flg == 1`).

In [10]:
# Rows whose treatment flag is 0 and rows whose treatment flag is 1, respectively.
df_train_flg_0 = client_train.loc[client_train['treatment_flg'].eq(0)]
df_train_flg_1 = client_train.loc[client_train['treatment_flg'].eq(1)]

In [11]:
df_train_flg_0.head()

Unnamed: 0,client_id,0,1,2,3,4,5,6,7,8,...,98,99,regular_points_received,express_points_received,regular_points_spent,express_points_spent,purchase_sum,price,treatment_flg,purchased
0,000012768d,0.298621,0.432653,0.83146,0.846764,0.626652,0.607852,0.328089,0.585355,0.644966,...,0.596315,0.6747,0.038887,0.0,1.0,1.0,0.097023,0.044936,0,1
4,000220a0a7,0.31093,0.423089,0.823149,0.84463,0.616097,0.605226,0.336365,0.574943,0.662766,...,0.594978,0.677433,0.054316,0.0,1.0,1.0,0.101073,0.064382,0,1
6,0002ce2217,0.307802,0.427314,0.829262,0.846416,0.622351,0.606969,0.319636,0.582998,0.641861,...,0.605826,0.672436,0.015691,0.0,1.0,1.0,0.049669,0.034721,0,1
7,00031cbbe6,0.29145,0.428192,0.825177,0.845177,0.621608,0.611815,0.330538,0.572937,0.638708,...,0.594982,0.677855,0.022686,0.0,0.998655,1.0,0.065168,0.054488,0,1
9,0004254599,0.314118,0.420251,0.830149,0.845321,0.609576,0.606276,0.327736,0.583325,0.644815,...,0.580642,0.674331,0.019019,0.0,0.990473,0.97757,0.061313,0.060139,0,0


In [12]:
df_train_flg_0['purchased'].mean()

0.6028470157786416

In [13]:
# 80/20 split of the treatment_flg == 0 rows with a fixed seed.
train_0, test_0 = train_test_split(df_train_flg_0, random_state=42, test_size=0.2)

# Only the id and the target are removed, so treatment_flg stays among the features.
x_0, y_0 = train_0.drop(columns=['client_id', 'purchased']), train_0['purchased']
x_0_test, y_0_test = test_0.drop(columns=['client_id', 'purchased']), test_0['purchased']

In [14]:
df_train_flg_1['purchased'].mean()

0.6363921837308554

In [15]:
# CatBoost settings reused for both the treatment_flg == 0 and treatment_flg == 1 models.
params = dict(
    iterations=300,
    learning_rate=0.05,
    verbose=False,
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,
    depth=8,
)

In [16]:
# Purchase classifier for the treatment_flg == 0 group, configured from the shared params.
model_flg_0 = CatBoostClassifier(**params)

In [17]:
# Fit on the treatment_flg == 0 training split; the 20% hold-out is passed as eval_set
# and plot=True requests CatBoost's interactive training chart.
# NOTE(review): the same hold-out is reused later to pick the decision threshold.
model_flg_0.fit(x_0, y=y_0, plot=True, eval_set=(x_0_test, y_0_test))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7f86019f70a0>

In [18]:
# Purchase classifier for the treatment_flg == 1 group, configured from the shared params.
model_flg_1 = CatBoostClassifier(**params)

In [19]:
# 80/20 split of the treatment_flg == 1 rows with a fixed seed.
train_1, test_1 = train_test_split(df_train_flg_1, random_state=42, test_size=0.2)

# Only the id and the target are removed, so treatment_flg stays among the features.
x_1, y_1 = train_1.drop(columns=['client_id', 'purchased']), train_1['purchased']
x_1_test, y_1_test = test_1.drop(columns=['client_id', 'purchased']), test_1['purchased']

In [20]:
# Fit on the treatment_flg == 1 training split; the 20% hold-out is passed as eval_set
# and plot=True requests CatBoost's interactive training chart.
# NOTE(review): the same hold-out is reused later to pick the decision threshold.
model_flg_1.fit(x_1, y=y_1, plot=True, eval_set=(x_1_test, y_1_test))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7f8648101490>

In [21]:
def get_top_thr(x, y, model):
    """Pick the probability threshold with the best TPR/FPR trade-off on (x, y).

    The ROC curve of ``model`` on the given data is obtained from CatBoost's
    ``get_roc_curve`` and every candidate threshold is scored with
    ``(1 + tpr - fpr) / 2``, i.e. the mean of the true-positive and
    true-negative rates.

    Parameters
    ----------
    x : features accepted by ``catboost.Pool``.
    y : labels matching ``x``.
    model : fitted CatBoost model to evaluate.

    Returns
    -------
    thr_top : float
        Threshold with the highest score. When several thresholds tie, the
        first one in the order returned by ``get_roc_curve`` is used.
    roc : pandas.DataFrame
        One row per candidate threshold with columns ``fpr``, ``tpr``,
        ``thrs`` and ``roc_auc`` (the per-threshold score).
    """
    pool = Pool(x, y)
    fpr, tpr, thr = get_roc_curve(model, pool)
    roc = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'thrs': thr})

    # Despite its name this column is a per-threshold score, not the area under
    # the curve; the name is kept because the frame is returned to the caller.
    roc['roc_auc'] = (1 + roc['tpr'] - roc['fpr']) / 2

    # idxmax yields exactly one row label even when several thresholds share the
    # maximum; converting a multi-row selection with float() would raise.
    thr_top = float(roc.loc[roc['roc_auc'].idxmax(), 'thrs'])
    return thr_top, roc

In [22]:
# Choose a decision threshold for each model on its own hold-out split
# (the same split that was passed as eval_set during fitting).
thr_0, roc_0 = get_top_thr(x_0_test, y_0_test, model_flg_0)
thr_1, roc_1 = get_top_thr(x_1_test, y_1_test, model_flg_1)

In [23]:
# Store the chosen thresholds on the models.
# NOTE(review): the test-set cells below call predict_proba rather than predict, so these
# thresholds presumably do not influence the exported scores - confirm.
model_flg_0.set_probability_threshold(thr_0)
model_flg_1.set_probability_threshold(thr_1)

## TEST

In [24]:
# Look up the feature row of every test client.
client_test_ = test.merge(client_test, how='inner', on='client_id')

# Later cells assign predictions to copies of `test` by position, so the join must
# neither drop, duplicate nor reorder clients; fail here with a clear message otherwise.
assert client_test_['client_id'].tolist() == test['client_id'].tolist(), (
    'client_test_ is not row-aligned with test: '
    f'{len(test)} test rows vs {len(client_test_)} joined rows'
)
client_test_

Unnamed: 0,client_id,0,1,2,3,4,5,6,7,8,...,96,97,98,99,regular_points_received,express_points_received,regular_points_spent,express_points_spent,purchase_sum,price
0,a9a604ed6e,0.610620,0.651864,0.563794,0.811560,0.946989,0.091724,0.171941,0.908255,0.267350,...,0.777601,0.499476,0.463689,0.409173,0.054240,0.000000,1.000000,1.000000,0.099973,0.067821
1,ebd7360016,0.616861,0.645869,0.554907,0.806783,0.930909,0.089829,0.172204,0.907222,0.261426,...,0.781474,0.498580,0.463728,0.397422,0.007529,0.000000,0.999266,1.000000,0.027607,0.041481
2,908cd9b8e8,0.605300,0.650185,0.559119,0.812871,0.939452,0.091489,0.177089,0.904951,0.264585,...,0.780156,0.493335,0.472151,0.411823,0.012440,0.000000,0.996187,1.000000,0.053767,0.060309
3,dceb8ce861,0.610396,0.651229,0.557863,0.818684,0.944235,0.089914,0.171303,0.907439,0.265250,...,0.769286,0.497953,0.458712,0.404191,0.071487,0.070731,0.726425,1.000000,0.213292,0.055796
4,f4f0ac6b06,0.602160,0.653975,0.544853,0.809940,0.938537,0.085438,0.165473,0.909826,0.269766,...,0.779184,0.513909,0.465776,0.404459,0.055277,0.000000,0.981640,1.000000,0.111258,0.052242
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60007,4762fb04c5,0.614052,0.640260,0.557256,0.810351,0.928442,0.089336,0.174580,0.901925,0.273741,...,0.766335,0.518051,0.472530,0.410530,0.030256,0.000000,0.998481,1.000000,0.083471,0.066370
60008,533d1516e7,0.610549,0.651952,0.564499,0.813183,0.947367,0.090442,0.169872,0.907916,0.267298,...,0.778187,0.496737,0.466519,0.413863,0.002561,0.000000,1.000000,1.000000,0.016710,0.059227
60009,c93b21a707,0.614970,0.639715,0.545829,0.813080,0.925596,0.087958,0.171572,0.903597,0.267172,...,0.780413,0.507010,0.459237,0.416323,0.014011,0.000000,1.000000,1.000000,0.047285,0.040022
60010,3fe0668de5,0.610106,0.649520,0.560070,0.812476,0.944550,0.090171,0.173009,0.908775,0.269001,...,0.772197,0.515616,0.472095,0.404930,0.028432,0.000000,0.997206,0.907547,0.049483,0.047890


In [25]:
# Two scoring frames with identical features: one with treatment_flg fixed to 0, one with
# it fixed to 1. client_id is removed and treatment_flg is appended as the last column.
client_test_0 = client_test_.drop(columns='client_id').assign(treatment_flg=0)
client_test_1 = client_test_.drop(columns='client_id').assign(treatment_flg=1)

In [26]:
# Column 0 of the treatment_flg == 0 model's probabilities and column 1 of the
# treatment_flg == 1 model's probabilities.
# NOTE(review): assuming the columns follow class order (0, 1), this pairs
# P(no purchase | no treatment) with P(purchase | treatment); a two-model uplift
# difference would take column 1 from both models - confirm this is intended.
predict_flag_0 = model_flg_0.predict_proba(client_test_0)[:, 0]
predict_flag_1 = model_flg_1.predict_proba(client_test_1)[:, 1]

In [27]:
# Work on a copy so the raw test frame stays unchanged.
test_cat = test.copy()

In [28]:
# Score = product of the two probabilities. The values are assigned by position, which
# relies on client_test_ having the same row order and length as test.
test_cat['pred'] = predict_flag_0 * predict_flag_1

In [29]:
# to_csv does not create missing directories, so make sure the output folder
# exists when the notebook runs on a fresh checkout.
os.makedirs('predict', exist_ok=True)
test_cat.to_csv('predict/test.csv', index=False)

In [30]:
test_cat

Unnamed: 0,client_id,pred
0,a9a604ed6e,0.103293
1,ebd7360016,0.020123
2,908cd9b8e8,0.040129
3,dceb8ce861,0.200600
4,f4f0ac6b06,0.115057
...,...,...
60007,4762fb04c5,0.052224
60008,533d1516e7,0.017237
60009,c93b21a707,0.039839
60010,3fe0668de5,0.050366


## Two models

In [31]:
from sklift.models.models import TwoModels

In [32]:
# Inputs for the scikit-uplift models: features (target, treatment flag and client_id
# removed), the purchase target, and the treatment indicator.
x = client_train.drop(columns=['purchased', 'treatment_flg', 'client_id'])
y = client_train['purchased']
treat_train = client_train['treatment_flg']

In [33]:
# Separate base learners for the treatment and control groups.
# Unlike `params` above, these do not set depth, loss_function, eval_metric or random_seed.
estimator_trmnt = CatBoostClassifier(iterations=300, learning_rate=0.05, verbose=False)
estimator_ctrl = CatBoostClassifier(iterations=300, learning_rate=0.05, verbose=False)

In [34]:
# Two-model uplift approach from scikit-uplift, one estimator per group.
# NOTE(review): 'ddr_control' is presumably the dependent-data-representation variant in
# which the control model's prediction is fed to the treatment model - confirm in the
# scikit-uplift docs for the installed version.
tm_ctrl = TwoModels(
    estimator_trmnt=estimator_trmnt,
    estimator_ctrl=estimator_ctrl,
    method='ddr_control'
)

In [35]:
# Fit on the full labelled data; the *_fit_params dicts pass plot=True to each
# estimator's fit call.
tm_ctrl = tm_ctrl.fit(x, y, treat_train,
                        estimator_trmnt_fit_params={'plot': True},
                        estimator_ctrl_fit_params={'plot': True}
                        )

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [36]:
# Score every test client; client_id is removed so only feature columns are passed.
uplift_tm_ctrl = tm_ctrl.predict(client_test_.drop(columns='client_id'))

In [37]:
# Work on a copy so the raw test frame stays unchanged.
test_two_models = test.copy()

In [38]:
# Assigned by position: relies on client_test_ having the same row order and length as test.
test_two_models['pred'] = uplift_tm_ctrl

In [39]:
test_two_models.head()

Unnamed: 0,client_id,pred
0,a9a604ed6e,0.044528
1,ebd7360016,0.008482
2,908cd9b8e8,0.012816
3,dceb8ce861,0.059949
4,f4f0ac6b06,0.048293


In [40]:
# to_csv does not create missing directories, so make sure the output folder
# exists when the notebook runs on a fresh checkout.
os.makedirs('predict', exist_ok=True)
test_two_models.to_csv('predict/test_two_models.csv', index=False)

## ClassTransformation

In [41]:
from sklift.models import ClassTransformation

In [42]:
# Class-transformation uplift model wrapping a single CatBoost classifier.
# NOTE(review): this approach is usually described as assuming a roughly balanced
# treatment/control split - confirm against the scikit-uplift docs and the data.
ct = ClassTransformation(CatBoostClassifier(iterations=300, learning_rate=0.05, verbose=False))

# Fit on the full labelled data; estimator_fit_params passes plot=True to the estimator's fit call.
ct = ct.fit(x, y, treat_train, estimator_fit_params={'plot': True})

# Score every test client; client_id is removed so only feature columns are passed.
uplift_ct = ct.predict(client_test_.drop(columns='client_id'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [None]:
# New frame: the test ids plus the class-transformation scores in a `pred` column
# (values are matched to rows by position).
ct_models = test.assign(pred=uplift_ct)

In [None]:
# to_csv does not create missing directories, so make sure the output folder
# exists when the notebook runs on a fresh checkout.
os.makedirs('predict', exist_ok=True)
ct_models.to_csv('predict/ct_models.csv', index=False)