In [1]:
import vault_client
import getpass

from os.path import expanduser
import json
import numpy as np
from scipy.stats import mannwhitneyu

def get_config():
    """Load the user's configuration from ``config.json`` in the home directory.

    Returns:
        The parsed JSON document (whatever top-level JSON value the file holds).

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    config_path = expanduser("~") + '/config.json'
    with open(config_path) as config_file:
        return json.load(config_file)

# Vault client; the OAuth token is read from the user's config.json via get_config().
yav = vault_client.instances.Production(
    rsa_login='miptgirl',
    authorization='OAuth {}'.format(get_config()['vault_api_token']),)

# Fetch the secret version once and read all three tokens from the same payload,
# instead of issuing an identical get_version() request per token.
ver = 'ver-01dssnc1zdezgbfd48xhj9j9xn'
_token_values = yav.get_version(ver)['value']
YT_TOKEN = _token_values['yt_token']
YQL_TOKEN = _token_values['yql_token']
SOLOMON_TOKEN = _token_values['solomon_token']

# A second secret version holds the password.
ver = 'ver-01eex0jhkrhzt14dfn8a2tyy42'
STATLOG_PASS = yav.get_version(ver)['value']['password']

def df_to_wiki(df, show_index = False):
    """Print a DataFrame as a wiki table (``#| || cell | cell || |#`` markup).

    Args:
        df: the pandas DataFrame to render.
        show_index: when True, the index is emitted as the first column.

    Returns:
        None; the markup is written to stdout.
    """
    # Serialise with '|' as the separator, then pad every separator with spaces.
    pipe_csv = df.to_csv(index = show_index, sep = '|')
    padded = pipe_csv.replace('|', ' | ')
    # Every line break closes the current row and opens the next one.
    rows = padded.replace('\n', '|| \n ||')
    # The CSV ends with a newline, so the text ends with a dangling row opener
    # '||'; drop those two characters before closing the table.
    rows = rows[:-2]
    print('#|\n ||' + rows + '|#')
    

In [2]:
import io
import time
import pandas as pd
import datetime

# ClickHouse HTTP endpoint used as the default `host` of the helpers below.
# Exactly one assignment is active; the alternatives are kept commented out
# rather than being assigned and immediately overwritten.
#HOST = 'http://mtgray03k.yandex.ru:8123'
#HOST = 'http://mtlog01-01-1.yandex.ru:8123'
#HOST = 'http://clickhouse.metrika.yandex.net:8123'
#HOST = 'http://mtsmart001-1.yandex.ru:8123'
HOST = 'http://mtch01k.metrika.yandex.net:8123'

import requests
import http.client  # Python 3 module (named httplib in Python 2)
# Override http.client's private limit on the number of headers accepted in a
# response. NOTE(review): presumably needed because some responses carry a very
# large number of headers — confirm; this relies on a private attribute.
http.client._MAXHEADERS = 100000

def get_clickhouse_data(query, host = HOST, connection_timeout = 1500, user = 'miptgirl', password = '1QwBgINk'):
    '''Run a query over HTTP against ClickHouse and return the raw response text.

    A failed attempt (non-200 status, or an exception marker in the body) is
    retried up to 30 times; the pause before the next attempt grows linearly
    (10 s, 20 s, ...). After the last failed attempt ValueError is raised.

    Args:
        query: query text (str, sent as UTF-8) or already-encoded bytes.
        host: base URL of the HTTP endpoint.
        connection_timeout: `timeout` passed to requests.post, in seconds.
        user, password: HTTP basic-auth credentials.

    Returns:
        The response body as text.

    Raises:
        ValueError: with the last response body, when every attempt failed.
    '''
    # NOTE(review): the default credentials are hard-coded in the signature;
    # consider reading them from the vault instead.
    NUMBER_OF_TRIES = 30
    DELAY = 10
    # ClickHouse prefixes server-side errors with "DB::Exception" (two colons).
    # The previous single-colon marker could never match that text; it is kept
    # only for backward compatibility.
    ERROR_MARKERS = ('DB::Exception', 'DB:Exception')

    # Send the query as UTF-8 bytes (as upload() does): a str body is encoded as
    # latin-1 by http.client and fails on characters outside that range.
    payload = query.encode('utf-8') if isinstance(query, str) else query

    for i in range(NUMBER_OF_TRIES):
        r = requests.post(host,
                          params = {'timeout_before_checking_execution_speed': 120, 'max_bytes_in_join': 0},
                          timeout = connection_timeout, data = payload,
                          auth = (user, password), verify = False
                         )
        succeeded = (r.status_code == 200) and not any(marker in r.text for marker in ERROR_MARKERS)
        if succeeded:
            return r.text

        print('ATTENTION: try #%d failed' % i)
        if i == (NUMBER_OF_TRIES-1):
            # Out of attempts: surface the last server response to the caller.
            raise ValueError(r.text)
        print(r.text)
        time.sleep(DELAY*(i+1))
        
def get_clickhouse_df(query, host = HOST, connection_timeout = 1500, user = 'miptgirl', password = '1QwBgINk'):
    '''Run a query through get_clickhouse_data and parse the result into a DataFrame.

    The response is parsed as tab-separated text with the first line used as
    the header, so the query is expected to end with a format that includes
    column names (e.g. ``format TSVWithNames``).

    Args:
        query, host, connection_timeout, user, password: forwarded unchanged
            to get_clickhouse_data.

    Returns:
        pandas.DataFrame built from the response text.
    '''
    raw_tsv = get_clickhouse_data(query, host, connection_timeout, user, password)
    return pd.read_csv(io.StringIO(raw_tsv), sep = '\t')

def upload(table, content, host=HOST, user='miptgirl', password='1QwBgINk'):
    '''Uploads data to a table in ClickHouse.

    Args:
        table: name of the target table; it is concatenated into the INSERT
            statement as-is, so it must come from a trusted source.
        content: str in TabSeparatedWithNames format (first line holds the
            column names); it is sent UTF-8 encoded.
        host: base URL of the HTTP endpoint.
        user, password: HTTP basic-auth credentials (same defaults as the
            query helpers in this notebook).

    Returns:
        The response body as text when the server answers with status 200.

    Raises:
        ValueError: with the response body for any other status code.
    '''
    # NOTE(review): the default credentials are hard-coded; consider reading
    # them from the vault instead.
    content = content.encode('utf-8')
    query_dict = {
             'query': 'INSERT INTO ' + table + ' FORMAT TabSeparatedWithNames '
        }
    r = requests.post(host, data=content, params=query_dict, auth = (user, password))
    if r.status_code != 200:
        raise ValueError(r.text)
    return r.text

In [3]:
def get_counters_interface_raw_data(start_date_str, end_date_str):
    '''Fetch per-user activity aggregates for the given date range.

    The query takes hits of counter 24226447 from `hits_all` within
    [start_date, end_date] (non-zero PassportUserID, a fixed set of first path
    segments, a non-empty counter id in the URL), attaches a visit id from
    `visits_all` by watch id (visits are taken from 7 days before start_date),
    and groups by user_id.

    Args:
        start_date_str: first date of the range, substituted into the query as
            a quoted literal (e.g. '2021-02-01').
        end_date_str: last date of the range, inclusive.

    Returns:
        pandas.DataFrame with columns user_id, page_views (hit count),
        sessions (exact distinct visit ids) and uniq_reports (distinct
        two-segment URL paths, computed with `uniq`).
    '''
    query_template = '''
    select
        user_id,
        count() as page_views,
        uniqExact(visit_id) as sessions,
        uniq(url_path) as uniq_reports
    from
        (select 
            WatchID as watch_id,
            UserID as user_id,
            PassportUserID as passport_user_id,
            arrayFilter(x -> x != '', splitByChar('/', path(URL)))[1] as main_url_path,
            '/'||arrayStringConcat(arraySlice(arrayFilter(x -> x != '', splitByChar('/', path(URL))), 1, 2), '/') as url_path,
            if(extractURLParameter(URL, 'id') = '', extractURLParameter(URL, 'ids'), extractURLParameter(URL, 'id')) as counter_id
        from hits_all
        where EventDate >= '{start_date}'
            and EventDate <= '{end_date}'
            and CounterID = 24226447
            and not DontCountHits
            and PassportUserID != 0
            and url_path not in ('/inpage/visor-proto', '/inpage/visor-player', '/inpage')
            and main_url_path in ('stat', 'dashboard', 'inpage', 'publishers', 'markedphones', 'legacy')
            and counter_id != '')
        any left join
        (select VisitID as visit_id, arrayJoin(WatchIDs) as watch_id
        from visits_all
        where StartDate >= toDate('{start_date}') - 7
            and StartDate <= '{end_date}'
            and CounterID = 24226447)
        using watch_id
    group by user_id
    format TSVWithNames
    '''
    # Substitute the date bounds, then let the shared helper run and parse the query.
    query = query_template.format(start_date = start_date_str, end_date = end_date_str)
    return get_clickhouse_df(query)

In [4]:
# Inclusive date range of the analysed data.
start_date_str, end_date_str = '2021-02-01', '2021-02-28'

In [5]:
%%time 

# Load the per-user aggregates for the configured date range (one row per user_id).
df = get_counters_interface_raw_data(start_date_str, end_date_str)

CPU times: user 216 ms, sys: 58.3 ms, total: 275 ms
Wall time: 19.2 s


In [6]:
# Shuffle the rows: frac = 1 draws every row once, in random order.
# Note: re-running this cell reshuffles df, so the row order is not reproducible.
df = df.sample(frac = 1)

In [7]:
# Peek at the first rows of the shuffled per-user frame.
df.head()

Unnamed: 0,user_id,page_views,sessions,uniq_reports
67618,534762791565180971,36,6,3
226602,3346378141598910985,72,20,1
379358,9070109651613496920,12,1,1
412446,304167801604681883,8,2,2
128967,8715333681611777317,9,2,2


In [8]:
# (rows, columns) of the per-user frame.
df.shape

(501020, 4)

In [13]:
import tqdm

# A/A simulation: split the users at random into two halves 10000 times and
# record the two-sided Mann-Whitney p-value for every metric. Both halves come
# from the same population, so the share of p <= alpha should be close to alpha.

# Fixed seed so that a re-run reproduces the same splits.
rng = np.random.default_rng(13)
# Only the compared columns are needed inside the loop.
metrics = df[['page_views', 'sessions', 'uniq_reports']]

recs = []

for i in tqdm.tqdm(range(10000)):
    # Shuffle row positions rather than the frame itself: np.array_split on a
    # plain integer array avoids the deprecated DataFrame.swapaxes path that
    # np.array_split(DataFrame) goes through, and `df` is left untouched.
    idx1, idx2 = np.array_split(rng.permutation(len(metrics)), 2)
    df1, df2 = metrics.iloc[idx1], metrics.iloc[idx2]

    for param in ['page_views', 'sessions', 'uniq_reports']:
        stat, p = mannwhitneyu(df1[param], df2[param], alternative = 'two-sided')
        recs.append(
            {'i': i, 'p': p, 'param': param}
        )

100%|██████████| 10000/10000 [32:39<00:00,  5.10it/s]


In [14]:
# One row per (simulation i, metric) with the p-value of that comparison.
test_df = pd.DataFrame(recs)

In [15]:
# Per metric: number of simulations in total and number with p <= alpha.
cmp_df = pd.DataFrame()

cmp_df['total'] = test_df.groupby('param').i.count()

for alpha in [0.05, 0.01, 0.005, 0.001]:
    # Sum the boolean flag per metric instead of counting pre-filtered rows:
    # a metric with no p <= alpha then gets 0 rather than NaN.
    cmp_df['alpha_%.3f' % alpha] = (test_df.p <= alpha).groupby(test_df.param).sum()

In [16]:
# Share of simulations with p <= alpha per metric, rounded to 4 decimals.
# Vectorised div/round replace the per-element lambdas and the deprecated applymap.
df_to_wiki(
    cmp_df.drop('total', axis = 1).div(cmp_df.total, axis = 0).round(4), True
)

#|
 ||param | alpha_0.050 | alpha_0.010 | alpha_0.005 | alpha_0.001|| 
 ||page_views | 0.0481 | 0.0107 | 0.0058 | 0.0009|| 
 ||sessions | 0.0489 | 0.01 | 0.005 | 0.0015|| 
 ||uniq_reports | 0.0511 | 0.0103 | 0.0047 | 0.0013|| 
 |#


In [17]:
# A/A simulation on smaller groups: split the users at random into 10 parts
# 10000 times and compare the first two parts (each one tenth of the users).

# Fixed seed so that a re-run reproduces the same splits.
rng = np.random.default_rng(17)
# Only the compared columns are needed inside the loop.
metrics = df[['page_views', 'sessions', 'uniq_reports']]

recs = []

for i in tqdm.tqdm(range(10000)):
    # Split shuffled row positions (a plain integer array) instead of the
    # DataFrame: same chunk sizes, no deprecated DataFrame.swapaxes call, and
    # `df` is left untouched.
    res = np.array_split(rng.permutation(len(metrics)), 10)
    df1 = metrics.iloc[res[0]]
    df2 = metrics.iloc[res[1]]

    for param in ['page_views', 'sessions', 'uniq_reports']:
        stat, p = mannwhitneyu(df1[param], df2[param], alternative = 'two-sided')
        recs.append(
            {'i': i, 'p': p, 'param': param}
        )

100%|██████████| 10000/10000 [12:15<00:00, 13.59it/s]


In [18]:
# One row per (simulation i, metric) with the p-value of that comparison.
test_df = pd.DataFrame(recs)

In [19]:
# Per metric: number of simulations in total and number with p <= alpha.
cmp_df = pd.DataFrame()

cmp_df['total'] = test_df.groupby('param').i.count()

for alpha in [0.05, 0.01, 0.005, 0.001]:
    # Sum the boolean flag per metric instead of counting pre-filtered rows:
    # a metric with no p <= alpha then gets 0 rather than NaN.
    cmp_df['alpha_%.3f' % alpha] = (test_df.p <= alpha).groupby(test_df.param).sum()

In [20]:
# Share of simulations with p <= alpha per metric, rounded to 4 decimals.
# Vectorised div/round replace the per-element lambdas and the deprecated applymap.
df_to_wiki(
    cmp_df.drop('total', axis = 1).div(cmp_df.total, axis = 0).round(4), True
)

#|
 ||param | alpha_0.050 | alpha_0.010 | alpha_0.005 | alpha_0.001|| 
 ||page_views | 0.0519 | 0.0104 | 0.0056 | 0.0011|| 
 ||sessions | 0.0496 | 0.0092 | 0.0052 | 0.0011|| 
 ||uniq_reports | 0.051 | 0.0109 | 0.0056 | 0.0006|| 
 |#


In [21]:
# A/A simulation on even smaller groups: split the users at random into 20
# parts 10000 times and compare the first two parts (each one twentieth of the users).

# Fixed seed so that a re-run reproduces the same splits.
rng = np.random.default_rng(21)
# Only the compared columns are needed inside the loop.
metrics = df[['page_views', 'sessions', 'uniq_reports']]

recs = []

for i in tqdm.tqdm(range(10000)):
    # Split shuffled row positions (a plain integer array) instead of the
    # DataFrame: same chunk sizes, no deprecated DataFrame.swapaxes call, and
    # `df` is left untouched.
    res = np.array_split(rng.permutation(len(metrics)), 20)
    df1 = metrics.iloc[res[0]]
    df2 = metrics.iloc[res[1]]

    for param in ['page_views', 'sessions', 'uniq_reports']:
        stat, p = mannwhitneyu(df1[param], df2[param], alternative = 'two-sided')
        recs.append(
            {'i': i, 'p': p, 'param': param}
        )

100%|██████████| 10000/10000 [09:42<00:00, 17.16it/s]


In [22]:
# One row per (simulation i, metric) with the p-value of that comparison.
test_df = pd.DataFrame(recs)

In [23]:
# Per metric: number of simulations in total and number with p <= alpha.
cmp_df = pd.DataFrame()

cmp_df['total'] = test_df.groupby('param').i.count()

for alpha in [0.05, 0.01, 0.005, 0.001]:
    # Sum the boolean flag per metric instead of counting pre-filtered rows:
    # a metric with no p <= alpha then gets 0 rather than NaN.
    cmp_df['alpha_%.3f' % alpha] = (test_df.p <= alpha).groupby(test_df.param).sum()

In [24]:
# Share of simulations with p <= alpha per metric, rounded to 4 decimals.
# Vectorised div/round replace the per-element lambdas and the deprecated applymap.
df_to_wiki(
    cmp_df.drop('total', axis = 1).div(cmp_df.total, axis = 0).round(4), True
)

#|
 ||param | alpha_0.050 | alpha_0.010 | alpha_0.005 | alpha_0.001|| 
 ||page_views | 0.0524 | 0.0102 | 0.0051 | 0.0015|| 
 ||sessions | 0.0521 | 0.0106 | 0.0059 | 0.0008|| 
 ||uniq_reports | 0.0503 | 0.0101 | 0.0049 | 0.0011|| 
 |#
