In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

In [2]:
# Load the experiment data from a CSV file addressed by a relative path.
# NOTE(review): the rendered preview shows `Unnamed: 0.1` / `Unnamed: 0`
# columns, which look like index columns written out on earlier saves -
# confirm, and consider dropping them at the source.
DATA_PATH = "cuped_data.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Display the loaded frame; the rendered output elides the middle rows.
df

Unnamed: 0.1,Unnamed: 0,user,variant,revenue_before,revenue_after
0,0,1,1,5.318490,11.028245
1,1,2,1,7.522152,11.632472
2,2,3,1,5.078062,12.804922
3,3,4,0,5.805444,9.669108
4,4,5,1,5.631787,9.815636
...,...,...,...,...,...
995,995,996,0,5.017693,9.069170
996,996,997,0,2.348520,7.578279
997,997,998,0,6.136338,8.879685
998,998,999,1,5.254159,12.497577


Сначала рассчитаем $SE$ и разницу средних до применения $CUPED$

In [4]:
# Unadjusted effect: mean `revenue_after` of variant 1 minus that of variant 0.
in_variant_1 = df['variant'] == 1
in_variant_0 = df['variant'] == 0
df.loc[in_variant_1, 'revenue_after'].mean() - df.loc[in_variant_0, 'revenue_after'].mean()

1.9970698127040443

In [5]:
# Baseline: regress the in-experiment revenue on the variant flag alone.
baseline_fit = smf.ols(formula='revenue_after ~ variant', data=df).fit()
# tables[1] of the summary is the coefficient table (coef, std err, t, p, CI).
baseline_fit.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,7.9962,0.063,127.332,0.000,7.873,8.119
variant,1.9971,0.087,22.866,0.000,1.826,2.168


Полученный коэффициент для `variant` – это разница средних между группами

### Порядок действий расчета $CUPED$
1. Рассчитайте зависимость $Y$ от $X$ и получите $\theta$ с помощью линейной регрессии
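2. Посчитайте $Y^{cuped} = Y - \theta (X - \bar{X})$, где $\bar{X}$ – среднее $X$ по всем пользователям (центрирование сохраняет среднее метрики и не влияет на разницу между группами)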
3. Посчитайте разницу CUPED метрик

где $Y$ – значение целевой метрики во время проведения эксперимента, $X$ – та же самая метрика, но до проведения эксперимента

In [12]:
# theta is the OLS slope of the in-experiment revenue on the pre-experiment
# revenue. Select it by coefficient name rather than by position: `params` is a
# Series labelled by term ('Intercept', 'revenue_before'), and an integer key on
# a label-indexed Series relies on a positional fallback that pandas deprecates.
theta = smf.ols('revenue_after ~ revenue_before', data=df).fit().params['revenue_before']

In [13]:
# CUPED adjustment: remove the part of the in-experiment revenue that is
# explained by the pre-experiment revenue. The covariate is centred on its
# overall mean, so the adjustment term averages to zero over the whole sample.
revenue_before_centered = df['revenue_before'] - df['revenue_before'].mean()
df['revenue_cuped'] = df['revenue_after'] - theta * revenue_before_centered

In [14]:
# Same regression as the baseline, but on the CUPED-adjusted metric.
cuped_fit = smf.ols(formula='revenue_cuped ~ variant', data=df).fit()
# Show only the coefficient table (coef, std err, t, p, CI).
cuped_fit.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,7.9731,0.044,181.428,0.000,7.887,8.059
variant,2.0417,0.061,33.405,0.000,1.922,2.162


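Стандартная ошибка оценки эффекта уменьшилась примерно на треть (с 0.087 до 0.061), то есть дисперсия оценки – примерно вдвое; разница средних немного изменилась (с 1.9971 до 2.0417)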

Также давайте посчитаем `theta` альтернативным способом – через отношение ковариации к дисперсии: $\theta = \mathrm{cov}(X, Y) / \mathrm{var}(X)$

In [15]:
# Closed-form slope of a one-regressor linear regression: cov(X, Y) / var(X),
# with X = revenue_before and Y = revenue_after.
cov_before_after = df['revenue_before'].cov(df['revenue_after'])
var_before = df['revenue_before'].var()
theta = cov_before_after / var_before

In [16]:
# Recompute the CUPED metric with the current `theta`: subtract theta times the
# mean-centred pre-experiment revenue from the in-experiment revenue.
centered_before = df['revenue_before'].sub(df['revenue_before'].mean())
df['revenue_cuped'] = df['revenue_after'].sub(centered_before.mul(theta))

In [17]:
# Refit the variant regression on the recomputed CUPED metric.
cuped_model = smf.ols(formula='revenue_cuped ~ variant', data=df)
cuped_result = cuped_model.fit()
# Coefficient table only (coef, std err, t, p, CI).
cuped_result.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,7.9731,0.044,181.428,0.000,7.887,8.059
variant,2.0417,0.061,33.405,0.000,1.922,2.162
