In [1]:
%matplotlib inline

import pandas as pd; pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split

# Инструкция по установке пакета: https://github.com/maks-sh/scikit-uplift
# Ссылка на документацию: https://scikit-uplift.readthedocs.io/en/latest/
from sklift.metrics import uplift_at_k
from sklift.viz import plot_uplift_preds

from sklift.models import SoloModel
from sklift.models import ClassTransformation
from sklift.models import TwoModels

# sklift поддерживает любые модели, 
# которые удовлетворяют соглашениями scikit-learn
# Для примера воспользуемся catboost
from catboost import CatBoostClassifier

import numpy as np

* channel - channel the customer uses: Phone/Web/Multichannel
* is_referral - indicates if the customer was acquired from referral channel
* zip_code - class of the zip code as Suburban/Urban/Rural
* used_bogo - indicates if the customer used a buy one get one before
* used_discount - indicates if the customer used a discount before
* history - value of the historical purchases
* recency - months since last purchase

1. скачать набор данных маркетинговых кампаний отсюда https://www.kaggle.com/davinwijaya/customer-retention
2. там поле conversion - это целевая переменная, а offer - коммуникация. Переименовать поля (conversion -> target, offer -> treatment) и привести поле treatment к бинарному виду (1 или 0, т.е было какое-то предложение или нет) - значение No Offer означает отсутствие коммуникации, а все остальные - наличие.
3. сделать разбиение набора данных на тренировочную и тестовую выборки
4. сделать feature engineering на ваше усмотрение (допускается свобода выбора методов)
5. провести uplift-моделирование 3 способами: одна модель с признаком коммуникации (S learner), модель с трансформацией таргета (трансформация классов п. 2. 1) и вариант с двумя независимыми моделями
6. в конце вывести единую таблицу сравнения метрик uplift@10%, uplift@20% этих 3 моделей
7. построить модель UpliftTreeClassifier и попытаться описать словами полученное дерево
8. (опционально) для модели S learner (модель с дополнительным признаком коммуникации) построить зависимость таргета (конверсии - поле conversion) от значения uplift: 1) сделать прогноз и получить uplift для тестовой выборки 2) отсортировать тестовую выборку по uplift по убыванию 3) разбить на децили (pandas qcut вам в помощь) 4) для каждого дециля посчитать среднюю conversion
9. (опционально) построить модель UpliftRandomForestClassifier и попытаться описать словами полученное дерево

# 1-4

In [30]:
# Load the campaign data and bring it to the (features, treatment, target) layout.
df = pd.read_csv('data_for_task.csv')
df = df.rename(columns={'conversion': 'target', 'offer': 'treatment'})

# Binarise the communication flag: 'No Offer' means no communication (0),
# every other offer value counts as a communication (1).
# Assigning the result back (instead of a chained `replace(..., inplace=True)`
# on a column selection) keeps this working under pandas copy-on-write.
df['treatment'] = (df['treatment'] != 'No Offer').astype(int)

# Feature engineering: collapse the two "used a promotion before" flags into one.
df['used_dics_or_bogo'] = ((df['used_discount'] == 1) | (df['used_bogo'] == 1)).astype(int)
df = df.drop(columns=['used_discount', 'used_bogo'])
df.head(3)

Unnamed: 0,recency,history,zip_code,is_referral,channel,treatment,target,used_dics_or_bogo
0,10,142.44,Surburban,0,Phone,1,0,1
1,6,329.08,Rural,1,Web,0,0,1
2,7,180.65,Surburban,1,Web,1,0,1


In [31]:
# Names of the categorical columns (passed to CatBoost as `cat_features`).
cat_f = ['zip_code', 'channel']

# Empty accumulator for the comparison metrics, one list per reported column.
models_results = {name: [] for name in ('approach', 'uplift@10%', 'uplift@20%')}

# Percentage share of each class in the treatment flag and in the target.
tuple(df[column].value_counts() * 100 / df.shape[0] for column in ('treatment', 'target'))

(1    66.709375
 0    33.290625
 Name: treatment, dtype: float64,
 0    85.321875
 1    14.678125
 Name: target, dtype: float64)

In [32]:
# Separate features, treatment flag and target.
X = df.drop(columns=['treatment', 'target'])
W = df['treatment']
y = df['target']
indices = df.index

# 60/40 split of the row labels, stratified on the target.
indices_learn, indices_valid = train_test_split(
    indices, test_size=0.4, random_state=123, stratify=df['target']
)

# The split returns index *labels*, so select with label-based `.loc`
# (positional `.iloc` is only correct while the index is a default RangeIndex).
# training set
X_train = X.loc[indices_learn]
y_train = y.loc[indices_learn]
W_train = W.loc[indices_learn]
# validation set
X_test = X.loc[indices_valid]
y_test = y.loc[indices_valid]
W_test = W.loc[indices_valid]

# 5-6

In [33]:
def uplift_data(model, X_train=X_train, y_train=y_train, W_train=W_train,
                y_test=y_test, W_test=W_test, X_test=X_test):
    """Fit one sklift uplift approach on CatBoost and score it on the test set.

    Parameters
    ----------
    model : class
        One of the sklift model classes: ``TwoModels`` is built with separate
        treatment/control estimators; any other class is instantiated with a
        single estimator (as done for ``SoloModel`` / ``ClassTransformation``).
    X_train, y_train, W_train
        Training features, target and binary treatment flag.
    y_test, W_test
        Test target and treatment flag used by ``uplift_at_k``.
    X_test
        Test features to predict uplift for. Previously read from the global
        scope; now an explicit (trailing, defaulted) parameter.

    Returns
    -------
    dict
        Keys, in this order: ``'approach'`` (the model class that was passed
        in), ``'uplift@10%'`` and ``'uplift@20%'``.
    """
    def _new_estimator():
        # Fresh, identically configured base learner for every slot.
        return CatBoostClassifier(iterations=20, thread_count=2, random_state=42, silent=True)

    if model is TwoModels:
        # Two independent models need per-estimator fit parameters.
        fitted = TwoModels(
            estimator_trmnt=_new_estimator(),
            estimator_ctrl=_new_estimator(),
            method='vanilla',
        ).fit(
            X_train, y_train, W_train,
            estimator_trmnt_fit_params={'cat_features': cat_f},
            estimator_ctrl_fit_params={'cat_features': cat_f},
        )
    else:
        fitted = model(_new_estimator()).fit(
            X_train, y_train, W_train,
            estimator_fit_params={'cat_features': cat_f},
        )

    uplift = fitted.predict(X_test)

    def _score(k):
        # uplift@k computed separately inside treatment and control groups.
        return uplift_at_k(y_true=y_test, uplift=uplift, treatment=W_test,
                           strategy='by_group', k=k)

    return {
        'approach': model,
        'uplift@10%': _score(0.1),
        'uplift@20%': _score(0.2),
    }

In [34]:
# Fit and score the three approaches on the same split.
models_results_solo = uplift_data(SoloModel, X_train=X_train, y_train=y_train, W_train=W_train, y_test=y_test, W_test=W_test)
models_results_class = uplift_data(ClassTransformation, X_train=X_train, y_train=y_train, W_train=W_train, y_test=y_test, W_test=W_test)
models_results_two = uplift_data(TwoModels, X_train=X_train, y_train=y_train, W_train=W_train, y_test=y_test, W_test=W_test)

# Build the comparison table by metric *name* rather than by slicing
# `dict.values()` positionally, so it does not depend on key order.
metric_names = ['uplift@10%', 'uplift@20%']
results_by_approach = {
    'Solo': models_results_solo,
    'Class': models_results_class,
    'TwoModels': models_results_two,
}
models_df = pd.DataFrame(
    {label: [result[metric] for metric in metric_names]
     for label, result in results_by_approach.items()},
    index=metric_names,
)
models_df

Unnamed: 0,Solo,Class,TwoModels
uplift@10%,0.05832,0.058372,0.05362
uplift@20%,0.060895,0.065608,0.050619


### UpliftTreeClassifier

In [39]:
# One-hot encode the categorical columns for the uplift tree.
# `get_dummies(..., columns=...)` keeps the remaining columns first and appends
# `zip_code_*` then `channel_*`, the same layout as the previous drop/concat
# chain, without the positional `axis` arguments that newer pandas rejects.
X_train_tree = pd.get_dummies(X_train, columns=['zip_code', 'channel'])

# Column names in training order; used to label the plotted tree nodes.
features = list(X_train_tree.columns)

  X_train_tree = pd.concat([X_train.drop('zip_code', 1),
  X_train_tree = pd.concat([X_train.drop('zip_code', 1),
  X_train_tree = pd.concat([X_train_tree.drop('channel', 1),
  X_train_tree = pd.concat([X_train_tree.drop('channel', 1),


In [45]:
%%time
from IPython.display import Image
from causalml.inference.tree import UpliftTreeClassifier, UpliftRandomForestClassifier
from causalml.inference.tree import uplift_tree_string, uplift_tree_plot

uplift_model = UpliftTreeClassifier(max_depth=8, min_samples_leaf=200, min_samples_treatment=50,
                                    n_reg=100, evaluationFunction='KL', control_name='control')

uplift_model.fit(X_train_tree.values,
                 treatment=W_train.map({1: 'treatment1', 0: 'control'}).values,
                 y=y_train)

graph = uplift_tree_plot(uplift_model.fitted_uplift_tree, features)
Image(graph.create_png())

ModuleNotFoundError: No module named 'causalml'

In [44]:
pip install causalml.inference.tree

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement causalml.inference.tree (from versions: none)
ERROR: No matching distribution found for causalml.inference.tree


In [None]:
# Shell commands (not Python) for installing causalml outside of pip's index.
# Option 1: build and install from a clone of the source repository.
git clone https://github.com/uber/causalml.git
cd causalml
pip install .
# Compiles the package's extension modules in place inside the checkout.
python setup.py build_ext --inplace

# Option 2: install the prebuilt package from the conda-forge channel.
conda install -c conda-forge causalml