# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [57]:
from typing import List
from IPython.display import display, HTML

In [3]:
import causalml

In [6]:
import catboost as cb
import pylift
import causalml.metrics as cmetrics

from causalml.inference.tree import UpliftRandomForestClassifier
from sklearn.model_selection import train_test_split

In [7]:
import sklift.metrics
import sklift.viz

# Task 1

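`ATE` (_Average Treatment Effect_) — средний по всем наблюдениям (пользователям) прирост целевой переменной, вызванный воздействием; оценивается как разность средних значений переменной в целевой и контрольной группах.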

В нашем кейсе интересны показатели `visit`, `conversion`, `spend`.  
Для них и посмотрим различия между целевой группой (ЦГ) и контрольной группой (КГ).  
На обучающей выборке посчитайте для каждой переменной следующие метрики:
* Среднее значение переменной в контрольной группе и в целевой.
* Оценку ATE переменной.
* Относительное изменение переменной в целевой группе в сравнении с контрольной.

In [20]:
# Load the raw dataset from the working directory and preview its first rows.
data_path = 'data.csv'
data = pd.read_csv(data_path)

data.head()

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0
4,2,1) $0 - $100,45.34,1,0,Urban,0,Web,Womens E-Mail,0,0,0.0


In [21]:
# Count the rows in each `segment` value to see how the groups are balanced.
data['segment'].value_counts()

Womens E-Mail    21387
Mens E-Mail      21307
No E-Mail        21306
Name: segment, dtype: int64

In [22]:
# Keep only the "Womens E-Mail" rows (treated group) and the "No E-Mail" rows
# (control group); every other segment is dropped and the index is rebuilt.
data = data[data['segment'].isin(['Womens E-Mail', 'No E-Mail'])].reset_index(drop=True)

# Binary treatment indicator: 1 for "Womens E-Mail", 0 otherwise.
# A vectorized comparison replaces the per-row Python lambda; the explicit
# int64 cast keeps the same column dtype the lambda-based version produced.
data['treatment_flag'] = (data['segment'] == 'Womens E-Mail').astype('int64')

data.head()

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment_flag
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0,1
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0,0
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0,1
3,2,1) $0 - $100,45.34,1,0,Urban,0,Web,Womens E-Mail,0,0,0.0,1
4,6,2) $100 - $200,134.83,0,1,Surburban,0,Phone,Womens E-Mail,1,0,0.0,1


In [24]:
# Split the filtered data in half into a training and a validation part.
# The fixed random_state keeps the split identical between runs.
train_df, val_df = train_test_split(
    data,
    test_size=0.5,
    random_state=148,
)

In [13]:
# train_idx, test_idx = train_test_split(data.index, test_size=0.5, random_state=148)

In [44]:
targets = ['visit', 'conversion', 'spend']
stat_columns = ['mean_control', 'mean_treated', 'ate', 'relative_change']

# Per-target comparison of the treated rows (treatment_flag == 1) against the
# control rows (treatment_flag == 0) of the training split.
is_control = train_df['treatment_flag'] == 0
is_treated = train_df['treatment_flag'] == 1

stat_rows = []
for target in targets:
    mean_control = train_df.loc[is_control, target].mean()
    mean_treated = train_df.loc[is_treated, target].mean()

    stat_rows.append({
        'mean_control': mean_control,
        'mean_treated': mean_treated,
        # ATE estimate: difference between the two group means.
        'ate': mean_treated - mean_control,
        # NOTE: this is the treated/control ratio (1.0 means "no change"),
        # so the relative uplift is `relative_change - 1`.
        'relative_change': mean_treated / mean_control,
    })

# Build the frame once from the collected records so that every column is
# numeric (float64). Filling an empty DataFrame cell by cell through `.loc`
# leaves the columns with `object` dtype.
stats_df = pd.DataFrame(stat_rows, index=targets, columns=stat_columns)

In [45]:
# Display the per-target summary table (group means, ATE, relative change).
stats_df

Unnamed: 0,mean_control,mean_treated,ate,relative_change
visit,0.10172,0.148193,0.046473,1.456867
conversion,0.004983,0.008498,0.003515,1.705437
spend,0.583971,1.131093,0.547121,1.936898


# Task 2 Волатильность метрик

In [97]:
# Bootstrap estimate of how volatile the ATE and the treated/control ratio are.
# For every target, both groups are resampled with replacement `n_iter` times;
# the reported "confidence interval" is mean +/- 2 standard deviations of the
# bootstrap distribution, and its span is also shown as a percentage of the mean.
n_iter = 500
bootstrap_seed = 148  # fixed seed so the numbers below are reproducible on re-run
rng = np.random.default_rng(bootstrap_seed)

mask = train_df.treatment_flag == 1
n_treatment, n_control = train_df[mask].shape[0], train_df[~mask].shape[0]

for target in targets:
    # Slice each group once per target: the underlying values do not change
    # between bootstrap iterations, so re-filtering the frame inside the inner
    # loop would only repeat the same work.
    treatment_values = train_df.loc[mask, target].to_numpy()
    control_values = train_df.loc[~mask, target].to_numpy()

    ate_list = []
    relative_change_list = []
    for _ in range(n_iter):
        treatment_choice = rng.choice(treatment_values, size=n_treatment, replace=True)
        control_choice = rng.choice(control_values, size=n_control, replace=True)

        treatment_stat = treatment_choice.mean()
        control_stat = control_choice.mean()

        ate = treatment_stat - control_stat
        # Ratio of the group means (1.0 means "no change").
        relative_change = treatment_stat / control_stat

        ate_list.append(ate)
        relative_change_list.append(relative_change)

    mean_ate = np.mean(ate_list)
    std_ate = np.std(ate_list)
    ci_ate = mean_ate - 2 * std_ate, mean_ate + 2 * std_ate
    ci_span_ate = ci_ate[1] - ci_ate[0]

    mean_rc = np.mean(relative_change_list)
    std_rc = np.std(relative_change_list)
    ci_rc = mean_rc - 2 * std_rc, mean_rc + 2 * std_rc
    ci_span_rc = ci_rc[1] - ci_rc[0]

    # The <h4> elements are now closed with </h4>; previously a second opening
    # <h4> tag was emitted instead of a closing one.
    display(HTML(f'<h3>Action: {target}</h3>'))
    display(HTML(f'<h4>ATE: {mean_ate:.3f}; confidence interval span: {ci_span_ate:.3f}; {ci_span_ate * 100 / mean_ate:.3f}</h4>'))
    display(HTML(f'<h4>Relative change: {mean_rc:.3f}; confidence interval span: {ci_span_rc:.3f}; {ci_span_rc * 100 / mean_rc:.3f}</h4>'))
    print()








