In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

from sklift.metrics import uplift_at_k
from sklift.viz import plot_uplift_preds
from sklift.models import SoloModel, ClassTransformation, TwoModels

# Задание 1
1. скачать набор данных маркетинговых кампаний отсюда https://www.kaggle.com/davinwijaya/customer-retention
2. там поле conversion - это целевая переменная, а offer - коммуникация.  
   Переименовать поля (conversion -> target, offer -> treatment) и привести поле treatment к бинарному виду (1 или 0, т.е было какое-то предложение или нет) - значение No Offer означает отсутствие коммуникации, а все остальные - наличие.
3. сделать разбиение набора данных не тренировочную и тестовую выборки
4. сделать feature engineering на ваше усмотрение (допускается свобода выбора методов)
5. провести uplift-моделирование 3 способами: одна модель с признаком коммуникации (S learner), модель с трансформацией таргета (трансформация классов п. 2. 1) и вариант с двумя независимыми моделями
6. в конце вывести единую таблицу сравнения метрик uplift@10%, uplift@20% этих 3 моделей

In [3]:
mpc_df = pd.read_csv('./input/mpc_data.csv') 
mpc_df.head(5)

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,offer,conversion
0,10,142.44,1,0,Surburban,0,Phone,Buy One Get One,0
1,6,329.08,1,1,Rural,1,Web,No Offer,0
2,7,180.65,0,1,Surburban,1,Web,Buy One Get One,0
3,9,675.83,1,0,Rural,1,Web,Discount,0
4,2,45.34,1,0,Urban,0,Web,Buy One Get One,0


In [4]:
mpc_df.rename(columns = {'conversion' : 'target', 'offer' : 'treatment'}, inplace = True)
mpc_df['treatment'] = mpc_df['treatment'].apply(lambda x: 0 if x=='No Offer' else 1, 0)
mpc_df.head(5)

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,treatment,target
0,10,142.44,1,0,Surburban,0,Phone,1,0
1,6,329.08,1,1,Rural,1,Web,0,0
2,7,180.65,0,1,Surburban,1,Web,1,0
3,9,675.83,1,0,Rural,1,Web,1,0
4,2,45.34,1,0,Urban,0,Web,1,0


In [148]:
X_train, X_val, y_train, y_val = train_test_split(mpc_df.drop('target', 1), mpc_df['target'], random_state=6)
treat_train = X_train['treatment']
treat_val =  X_val['treatment']
X_train.shape

(48000, 8)

In [184]:
cat_features = ['zip_code', 'channel']

estim_model = CatBoostClassifier(iterations=20, thread_count=2, random_state=42, silent=True)
estim_model2 = CatBoostClassifier(iterations=20, thread_count=2, random_state=42, silent=True)

In [185]:
col = ['model_name', 'model', 'uplift@10%', 'uplift@30%']
models_names = ['SoloModel', 'ClassTransformation', 'TwoModels']
models = [SoloModel(estim_model), ClassTransformation(estim_model), TwoModels(estim_model, estim_model2, method='vanilla')]

results_df = pd.DataFrame(columns=col)
results_df['model_name'] = models_names
results_df['model'] = models
results_df

Unnamed: 0,model_name,model,uplift@10%,uplift@30%
0,SoloModel,SoloModel(estimator=<catboost.core.CatBoostCla...,,
1,ClassTransformation,ClassTransformation(estimator=<catboost.core.C...,,
2,TwoModels,TwoModels(estimator_ctrl=<catboost.core.CatBoo...,,


In [186]:
fit_params = (X_train, y_train, treat_train)
for index, row in results_df.iterrows():
    
    model = row['model']

    if row['model_name'] != 'TwoModels':
        m = model.fit(*fit_params, estimator_fit_params={'cat_features': cat_features})
    else:
        m = model.fit(*fit_params, estimator_trmnt_fit_params={'cat_features': cat_features},  estimator_ctrl_fit_params={'cat_features': cat_features})
    uplift_sm = m.predict(X_val)
    row['uplift@10%'] = uplift_at_k(y_true=y_val, uplift=uplift_sm, treatment=treat_val, strategy='by_group', k=0.1)
    row['uplift@30%'] = uplift_at_k(y_true=y_val, uplift=uplift_sm, treatment=treat_val, strategy='by_group', k=0.3)

SoloModel(estimator=<catboost.core.CatBoostClassifier object at 0x0000019C2E299AC0>)
ClassTransformation(estimator=<catboost.core.CatBoostClassifier object at 0x0000019C2E299AC0>)
TwoModels(estimator_ctrl=<catboost.core.CatBoostClassifier object at 0x0000019C2E2991F0>,
          estimator_trmnt=<catboost.core.CatBoostClassifier object at 0x0000019C2E299AC0>)


In [187]:
results_df.drop('model', 1)

Unnamed: 0,model_name,uplift@10%,uplift@30%
0,SoloModel,0.090823,0.067106
1,ClassTransformation,0.201103,0.154544
2,TwoModels,0.078181,0.069369
