In [1]:
import os
import dotenv

import pandas as pd
import pandahouse as ph
from scipy import stats
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Load environment variables via dotenv, then read the ClickHouse connection
# settings from the environment so no credentials are hard-coded here.
dotenv.load_dotenv()
connection = {
    key: os.getenv(key)
    for key in ('host', 'database', 'user', 'password')
}

# Experiment window shared by both A/B tests (BETWEEN is inclusive on both ends).
EXPERIMENT_START = '2023-03-02'
EXPERIMENT_END = '2023-03-08'


def build_ctr_query(groups, start=EXPERIMENT_START, end=EXPERIMENT_END):
    """Build the per-user likes / views / CTR query for the given groups.

    The two experiments previously used copy-pasted SQL that differed only in
    the ``exp_group`` list; this keeps a single source of truth.

    Parameters
    ----------
    groups : iterable of int
        ``exp_group`` ids to keep. Each value is passed through ``int`` so
        only integer literals can reach the SQL text.
    start, end : str
        Date bounds in ``YYYY-MM-DD`` form, inserted verbatim into the
        ``BETWEEN`` clause.

    Returns
    -------
    str
        SQL text aggregating one row per ``(exp_group, user_id)``. The literal
        ``{db}`` placeholder is preserved exactly as in the original queries;
        presumably the ClickHouse client fills it in with the database name
        (confirm against the pandahouse docs).
    """
    group_list = ', '.join(str(int(group)) for group in groups)
    # `{{db}}` renders as the literal `{db}` placeholder in the final string.
    return f'''
SELECT  exp_group,
        user_id,
        sum(action='like') AS likes,
        sum(action='view') AS views,
        likes/views AS ctr
FROM    {{db}}.feed_actions
WHERE   toDate(time) BETWEEN '{start}' AND '{end}'
        AND exp_group IN ({group_list})
GROUP BY exp_group, user_id
'''


# First experiment compares groups 1 and 2; the second compares groups 0 and 3.
query1 = build_ctr_query((1, 2))
query2 = build_ctr_query((0, 3))

# Run both queries against ClickHouse (query1 first, then query2); each
# result holds one row per (exp_group, user_id) as produced by the GROUP BY.
df1, df2 = (
    ph.read_clickhouse(sql, connection=connection)
    for sql in (query1, query2)
)


In [2]:
# Pooled CTR of each control group: total likes divided by total views
# (group 1 in the first experiment, group 0 in the second).
CTRcontrol1 = (
    df1.loc[df1.exp_group == 1, 'likes'].sum()
    / df1.loc[df1.exp_group == 1, 'views'].sum()
)
CTRcontrol2 = (
    df2.loc[df2.exp_group == 0, 'likes'].sum()
    / df2.loc[df2.exp_group == 0, 'views'].sum()
)

In [3]:
# Linearized likes: each user's likes minus the likes expected from their
# views at the control group's pooled CTR.
df1['linearized_likes'] = df1['likes'].sub(df1['views'].mul(CTRcontrol1))
df2['linearized_likes'] = df2['likes'].sub(df2['views'].mul(CTRcontrol2))

In [4]:
# Welch's t-test (unequal variances) for group 1 vs group 2:
# first on linearized likes, then on the per-user CTR.
for metric in ('linearized_likes', 'ctr'):
    print(stats.ttest_ind(df1.loc[df1.exp_group == 1, metric],
                          df1.loc[df1.exp_group == 2, metric],
                          equal_var=False))

Ttest_indResult(statistic=6.122579994775973, pvalue=9.439432187037644e-10)
Ttest_indResult(statistic=0.7094392041270486, pvalue=0.4780623130874935)


In [5]:
# Welch's t-test (unequal variances) for group 0 vs group 3:
# first on linearized likes, then on the per-user CTR.
for metric in ('linearized_likes', 'ctr'):
    print(stats.ttest_ind(df2.loc[df2.exp_group == 0, metric],
                          df2.loc[df2.exp_group == 3, metric],
                          equal_var=False))

Ttest_indResult(statistic=-15.21499546090383, pvalue=5.491424947968532e-52)
Ttest_indResult(statistic=-13.896870721904069, pvalue=1.055849414662529e-43)


При использовании t-теста на метрике линеаризованных лайков чувствительность теста оказывается выше: он прокрасился даже на тех данных, где до этого распределение было скошенным и t-тест использовать было нельзя (группы 1 и 2). p-value уменьшилось в обоих случаях: с 0.478 до 9.4e-10 для групп 1 и 2 и с 1.1e-43 до 5.5e-52 для групп 0 и 3.