In [6]:
# importing needed libs

import os

import numpy as np
import pandahouse
import pandas as pd
from scipy import stats
from tqdm import tqdm

In [7]:
# Load per-user feed activity for 2024-12-27 through 2025-01-02.

# ClickHouse connection settings. Host, user and password can be overridden
# through environment variables so that real credentials never have to be
# written into the notebook; the defaults are the values used originally.
connection = {
    'host': os.environ.get('CLICKHOUSE_HOST', 'https://clickhouse.lab.karpov.courses'),
    'password': os.environ.get('CLICKHOUSE_PASSWORD', 'dpo_python_2020'),
    'user': os.environ.get('CLICKHOUSE_USER', 'student'),
    # Kept literal because the queries in this notebook name the schema explicitly.
    'database': 'simulator_20250120'
}

# One row per user: number of views, number of likes and likes / views (CTR).
query = """
SELECT
  user_id,
  sum(action = 'view') AS views,
  sum(action = 'like') AS likes,
  likes / views AS ctr
FROM
  simulator_20250120.feed_actions
WHERE
  toDate(time) BETWEEN '2024-12-27'
  AND '2025-01-02'
GROUP BY
  user_id
"""

data = pandahouse.read_clickhouse(query=query, connection=connection)

data.head()

Unnamed: 0,user_id,views,likes,ctr
0,13289,32,2,0.0625
1,121096,57,12,0.210526
2,5090,14,2,0.142857
3,129283,80,23,0.2875
4,4394,54,25,0.462963


In [19]:
# Write the per-user frame to <name>.csv and read it back to preview the file.
name = 'data'
csv_path = f'{name}.csv'
data.to_csv(csv_path, index=False)
pd.read_csv(csv_path).head()

Unnamed: 0,user_id,views,likes,ctr
0,13289,32,2,0.0625
1,121096,57,12,0.210526
2,5090,14,2,0.142857
3,129283,80,23,0.2875
4,4394,54,25,0.462963


In [8]:
# Size of one experiment group: half of the users observed in `data`,
# assuming the A/B test sees the same number of users split into two groups.

sample_size = len(data) // 2

In [9]:
# Empirical distribution of views per user: for every distinct number of views,
# how many users had exactly that many over the period.

q = """
select views, count() as users
from (select
    user_id,
    sum(action = 'view') as views
from simulator_20250120.feed_actions
where toDate(time) between '2024-12-27' and '2025-01-02'
group by user_id
)
group by views
order by views
"""


# `p` is the share of users with a given number of views, i.e. the probability
# of drawing that number of views for a simulated user.
views_distribution = (
    pandahouse.read_clickhouse(q, connection=connection)
    .assign(p=lambda frame: frame.users / frame.users.sum())
)
views_distribution.head()

Unnamed: 0,views,users,p
0,1,4,9.5e-05
1,2,1,2.4e-05
2,3,4,9.5e-05
3,4,5,0.000119
4,5,18,0.000429


In [22]:
# Write the views distribution to <name>.csv and read it back to preview the file.
name = 'views_distribution'
csv_path = f'{name}.csv'
views_distribution.to_csv(csv_path, index=False)
pd.read_csv(csv_path).head()

Unnamed: 0,views,users,p
0,1,4,9.5e-05
1,2,1,2.4e-05
2,3,4,9.5e-05
3,4,5,0.000119
4,5,18,0.000429


In [10]:
# Empirical distribution of CTR: per (day, user) CTR floored to two decimals,
# with the number of rows that fall on each floored value.

q = """
select 
   floor(ctr, 2) as ctr, count() as users
from (select toDate(time) as dt,
    user_id,
    sum(action = 'like')/sum(action = 'view') as ctr
from simulator_20250120.feed_actions
where dt between '2024-12-27' and '2025-01-02'
group by dt, user_id
)
group by ctr
"""

# `p` is the share of rows with a given CTR, i.e. the probability of drawing
# that CTR for a simulated user.
ctr_distribution = (
    pandahouse.read_clickhouse(q, connection=connection)
    .assign(p=lambda frame: frame.users / frame.users.sum())
)
ctr_distribution.head()

Unnamed: 0,ctr,users,p
0,0.0,1443,0.016952
1,0.65,4,4.7e-05
2,0.71,5,5.9e-05
3,0.49,4,4.7e-05
4,0.54,72,0.000846


In [21]:
# Write the CTR distribution to <name>.csv and read it back to preview the file.
name = 'ctr_distribution'
csv_path = f'{name}.csv'
ctr_distribution.to_csv(csv_path, index=False)
pd.read_csv(csv_path).head()

Unnamed: 0,ctr,users,p
0,0.0,1443,0.016952
1,0.65,4,4.7e-05
2,0.71,5,5.9e-05
3,0.49,4,4.7e-05
4,0.54,72,0.000846


In [18]:
# Monte Carlo estimate of the power of Welch's t-test on per-user likes.
#
# Every iteration draws a synthetic control and test group of `sample_size`
# users from the empirical views and CTR distributions, adds the assumed effect
# to the test group, and counts the run if the test rejects at level ALPHA.
# After the loop, `result` holds the number of rejections out of `simulations`.

SEED = 42               # fixed seed so the estimate is reproducible on Restart & Run All
ALPHA = 0.05            # significance level the p-value is compared against
MIN_VIEWS = 30          # only users with at least this many views receive the effect
EFFECT_SHARE = 0.9      # probability that an eligible user actually gets extra views
EXTRA_VIEW_PROB = 0.5   # probability that the extra amount is 2 views instead of 1

rng = np.random.default_rng(SEED)
result = 0
simulations = 20000

# Loop-invariant inputs: convert the DataFrame columns to arrays once
# instead of on every iteration.
view_values = views_distribution.views.to_numpy()
view_probs = views_distribution.p.to_numpy()
ctr_values = ctr_distribution.ctr.to_numpy()
ctr_probs = ctr_distribution.p.to_numpy()

for _ in tqdm(range(simulations)):
    # generating the views for users in the control and test groups
    control_group_views = rng.choice(view_values, replace=True, p=view_probs, size=sample_size).astype("int64")
    test_group_views = rng.choice(view_values, replace=True, p=view_probs, size=sample_size).astype("int64")

    # Users below MIN_VIEWS are not affected by the change, so they are excluded
    # from the comparison. The effect only ever adds views to users already at or
    # above the threshold, so the masks are the same before and after applying it.
    control_mask = control_group_views >= MIN_VIEWS
    test_mask = test_group_views >= MIN_VIEWS

    # applying the assumed effect to the test group: an eligible user gets
    # +1 or +2 views with probability EFFECT_SHARE, otherwise nothing
    extra_views = (1 + rng.binomial(n=1, p=EXTRA_VIEW_PROB, size=sample_size)) * rng.binomial(n=1, p=EFFECT_SHARE, size=sample_size)
    test_group_views = test_group_views + extra_views * test_mask

    # generating the CTRs for users in the control and test groups
    control_group_ctrs = rng.choice(ctr_values, replace=True, p=ctr_probs, size=sample_size)
    test_group_ctrs = rng.choice(ctr_values, replace=True, p=ctr_probs, size=sample_size)

    # generating likes: one binomial draw per user with that user's views and CTR
    control_likes = rng.binomial(control_group_views, control_group_ctrs)
    test_likes = rng.binomial(test_group_views, test_group_ctrs)

    # equal_var=False selects Welch's t-test (no equal-variance assumption)
    p_value = stats.ttest_ind(control_likes[control_mask], test_likes[test_mask], equal_var=False).pvalue
    result += (p_value < ALPHA)

100%|██████████| 20000/20000 [05:08<00:00, 64.90it/s]


In [20]:
# Share of simulations in which the test rejected, as a percentage with one decimal.
power_percent = round(result * 100 / simulations, 1)
print(f'The statistical power of the T-test considering all input is {power_percent}')

The statistical power of the T-test considering all input is 48.8
