In [144]:
import vault_client
import getpass
import tqdm

from os.path import expanduser
import json

def get_config():
    home = expanduser("~")
    
    with open(home + '/config.json') as f:
        config_data = f.read()
    config = json.loads(config_data)
    return config

yav = vault_client.instances.Production(
    rsa_login='miptgirl', 
    authorization='OAuth {}'.format(get_config()['vault_api_token']),)

ver = 'ver-01dssnc1zdezgbfd48xhj9j9xn'
YT_TOKEN = yav.get_version(ver)['value']['yt_token']
YQL_TOKEN = yav.get_version(ver)['value']['yql_token']
SOLOMON_TOKEN = yav.get_version(ver)['value']['solomon_token']
ver = 'ver-01eex0jhkrhzt14dfn8a2tyy42'
STATLOG_PASS = yav.get_version(ver)['value']['password']

def df_to_wiki(df, show_index = False):
    print('#|\n ||' + df.to_csv(index = show_index, sep = '|').replace('|', ' | ').replace('\n', '|| \n ||')[:-2] + '|#')

In [145]:
from yql.api.v1.client import YqlClient
DEFAULT_DB = 'hahn'
DEFAULT_HEADER = """
PRAGMA yt.PoolTrees = "physical";
PRAGMA SimpleColumns;
pragma yt.MinPublishedAvgChunkSize = "5G";"""

def run_yql_wait_and_return(query='', token=YQL_TOKEN, db=DEFAULT_DB, add_default_header=True, syntax_version=1):
    with YqlClient(db=db, token=token) as client:
        if add_default_header:
            query = DEFAULT_HEADER + query
        request = client.query(query, syntax_version=syntax_version)
        request.run()
        result = request.get_results()
        if not request.is_success:
            status = request.status
            errors = '\n'.join([str(x) for x in result.errors]) if result.errors else 'No messages'
            error_message = """There is an error in YQL api request. Status: {status}\nErrors: '{errors}'""".format(
                status=status,
                errors=errors)
            raise(BaseException(error_message))
    return request.full_dataframe

In [146]:
import io
import time
import pandas as pd
import datetime

HOST = 'http://mtgray03k.yandex.ru:8123'
HOST = 'http://mtlog01-01-1.yandex.ru:8123'
HOST = 'http://clickhouse.metrika.yandex.net:8123'
#HOST = 'http://mtsmart001-1.yandex.ru:8123'
HOST = 'http://mtch01k.metrika.yandex.net:8123'

import requests
import http.client  # or http.client if you're on Python 3
http.client._MAXHEADERS = 100000

def get_clickhouse_data(query, host = HOST, connection_timeout = 1500, user = 'miptgirl', password = '1QwBgINk'):
    NUMBER_OF_TRIES = 30
    DELAY = 10
    
    for i in range(NUMBER_OF_TRIES):
        r = requests.post(host, 
                          params = {'timeout_before_checking_execution_speed': 120, 'max_bytes_in_join': 0}, 
                          timeout = connection_timeout, data = query,
                          auth = (user, password), verify = False
                         )
        if (r.status_code == 200) and ('DB:Exception' not in r.text):
            return r.text
        else:
            print('ATTENTION: try #%d failed' % i)
            if i != (NUMBER_OF_TRIES-1):
                #print query
                print(r.text)
                time.sleep(DELAY*(i+1))
            else:
                raise ValueError(r.text)
        
def get_clickhouse_df(query, host = HOST, connection_timeout = 1500, user = 'miptgirl', password = '1QwBgINk'):
    data = get_clickhouse_data(query, host, connection_timeout, user, password) 
    df = pd.read_csv(io.StringIO(data), sep = '\t')
    return df

def upload(table, content, host=HOST):
    '''Uploads data to table in ClickHous'''
    content = content.encode('utf-8')
    query_dict = {
             'query': 'INSERT INTO ' + table + ' FORMAT TabSeparatedWithNames '
        }
    r = requests.post(host, data=content, params=query_dict, auth = ('miptgirl', '1QwBgINk'))
    result = r.text
    if r.status_code == 200:
        return result
    else:
        raise ValueError(r.text)

In [232]:
def get_data(date_str, host):

    q = '''
    select 
        counter_id,
        date,
        goal_id,
        total_visits,
        conversions,
        conversion_visits
    from
        (select 
            CounterID as counter_id,
            arrayJoin(Goals.ID) as goal_id,
            sum(Sign) as conversions,
            uniq(VisitID) as conversion_visits,
            StartDate as date
        from visits_all
        where StartDate = '{date}'
            and dictGet('counters', 'mtlog_layer_id', toUInt64(CounterID)) = 1
            and UTCStartTime >= '2021-10-15 13:00:00'
            and UTCStartTime <= '2021-10-16 22:00:00'
        group by goal_id, date, counter_id)
        inner join
        (select 
            CounterID as counter_id,
            sum(Sign) as total_visits,
            StartDate as date
        from visits_all
        where StartDate = '{date}'
            and dictGet('counters', 'mtlog_layer_id', toUInt64(CounterID)) = 1
            and UTCStartTime >= '2021-10-15 13:00:00'
            and UTCStartTime <= '2021-10-16 22:00:00'
        group by date, counter_id)
        using (date, counter_id)
    format TSVWithNames
    '''.format(date = date_str)
    
    df = get_clickhouse_df(q, host)
    return df

In [233]:
%%time 
prod_df = get_data('2021-10-16', 'http://mtgiga001-1.metrika.yandex.net:8123')

CPU times: user 84.3 ms, sys: 15.8 ms, total: 100 ms
Wall time: 2.78 s


In [236]:
%%time 
test_df = get_data('2021-10-16', 'http://mtgiga001-1t.metrika.yandex.net:8123')

CPU times: user 99 ms, sys: 0 ns, total: 99 ms
Wall time: 2.92 s


In [237]:
! cp ../metr-45766/testing_changed_goals.csv .

In [238]:
updated_goals_df = pd.read_csv('testing_changed_goals.csv', sep = '\t')

In [None]:
yql_query = '''
select *
from `//home/metrica-analytics/miptgirl/final_bad_goals_cube`
'''

bad_goals_cube_df = run_yql_wait_and_return(yql_query)

In [None]:
cmp_df = prod_df.merge(test_df, how = 'outer', on = ['counter_id', 'date', 'goal_id'],
             suffixes = ('_prod', '_test')).fillna(0)

In [202]:
cmp_df.sort_values('conversions_prod', ascending = False)

Unnamed: 0,counter_id,date,goal_id,total_visits_prod,conversions_prod,conversion_visits_prod,total_visits_test,conversions_test,conversion_visits_test
3118,62226256,2021-10-16,99139144,4696.0,27551.0,4697.0,278814.0,1562536.0,280722.0
7114,62226256,2021-10-16,99355534,4696.0,27550.0,4697.0,278814.0,1562527.0,280722.0
7132,54005761,2021-10-16,51806647,3007.0,25481.0,2176.0,183187.0,1630561.0,131703.0
8539,115080,2021-10-16,13207210,15793.0,24062.0,15561.0,953968.0,1483902.0,939835.0
9259,55027,2021-10-16,35983435,1581.0,22984.0,1580.0,97798.0,1188655.0,98500.0
...,...,...,...,...,...,...,...,...,...
33503,59998987,2021-10-16,84010588,0.0,0.0,0.0,7.0,2.0,2.0
33504,57037618,2021-10-16,177573934,0.0,0.0,0.0,972.0,1.0,1.0
33505,76575265,2021-10-16,196407121,0.0,0.0,0.0,19.0,1.0,1.0
33506,63980734,2021-10-16,111991330,0.0,0.0,0.0,18.0,1.0,1.0


In [203]:
for p in ['total_visits_prod', 'conversions_prod', 'total_visits_test', 'conversions_test']:
    cmp_df[p] = cmp_df[p].map(int)
    
    
cmp_df['visits_diff'] = list(map(
    lambda x, y: round(100.*(y-x)/x, 2) if x != 0 else None,
    cmp_df.total_visits_prod,
    cmp_df.total_visits_test
))

cmp_df['conversions_diff'] = list(map(
    lambda x, y: round(100.*(y-x)/x, 2) if x != 0 else None,
    cmp_df.conversions_prod,
    cmp_df.conversions_test
))

cmp_df['conversion_visits_diff'] = list(map(
    lambda x, y: round(100.*(y-x)/x, 2) if x != 0 else None,
    cmp_df.conversion_visits_prod,
    cmp_df.conversion_visits_test
))

In [204]:
updated_goals = updated_goals_df.goal_id.values

In [205]:
 def get_coefs(tmp_df):
    filt_tmp_df = tmp_df[(tmp_df.total_visits_test != 0) & (tmp_df.total_visits_prod != 0)]
    print('goals:', tmp_df.shape[0], ', matched goals:', filt_tmp_df.shape[0])
    
    filt_total_visits_prod = filt_tmp_df.total_visits_prod.sum()
    filt_total_visits_test = filt_tmp_df.total_visits_test.sum()
    
    print('total visits: %d vs %d (%.2f%%)' % (filt_total_visits_prod, filt_total_visits_test, 
            100.*(filt_total_visits_test - filt_total_visits_prod)/filt_total_visits_prod))
    
    
    conversions_prod = tmp_df.conversions_prod.sum()
    conversion_test = tmp_df.conversions_test.sum()
    
    print('conversion: %d vs %d (%.2f%%)' % (conversions_prod, conversion_test, 
            100.*(conversion_test - conversions_prod)/conversions_prod))
    
    conversions_prod = tmp_df.conversion_visits_prod.sum()
    conversion_test = tmp_df.conversion_visits_test.sum()
    
    print('conversion visits: %d vs %d (%.2f%%)' % (conversions_prod, conversion_test, 
            100.*(conversion_test - conversions_prod)/conversions_prod))
    
    tmp_df['goal_id'] = tmp_df['goal_id'].map(str)
    return tmp_df.merge(bad_goals_cube_df[['goal_id', 'goal_types', 'pattern', 'bad_goals_share']],
                     how = 'left', on = 'goal_id').sort_values('conversions_prod', ascending = False)

In [206]:
changed_tmp_df = cmp_df[cmp_df.goal_id.isin(updated_goals)]
changed_tmp_df = get_coefs(changed_tmp_df)

goals: 430 , matched goals: 40
total visits: 28933 vs 1702189 (5783.21%)
conversion: 14560 vs 859999 (5806.59%)
conversion visits: 5102 vs 299022 (5760.88%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [207]:
df_to_wiki(
    changed_tmp_df.head(20))

#|
 ||counter_id | date | goal_id | total_visits_prod | conversions_prod | conversion_visits_prod | total_visits_test | conversions_test | conversion_visits_test | visits_diff | conversions_diff | conversion_visits_diff | goal_types | pattern | bad_goals_share|| 
 ||152220 | 2021-10-16 | 103280617 | 5411 | 9799 | 2655.0 | 317397 | 580048 | 154264.0 | 5765.77 | 5819.46 | 5710.32 | url | contain: _step1_ | 100.0|| 
 ||152220 | 2021-10-16 | 6236880 | 5411 | 3889 | 1949.0 | 317397 | 230458 | 115549.0 | 5765.77 | 5825.89 | 5828.63 | url | contain: /ac_result | 100.0|| 
 ||152220 | 2021-10-16 | 6236874 | 5411 | 495 | 261.0 | 317397 | 27536 | 15082.0 | 5765.77 | 5462.83 | 5678.54 | url | contain: /pc_result | 100.0|| 
 ||152220 | 2021-10-16 | 6236877 | 5411 | 98 | 38.0 | 317397 | 2798 | 1450.0 | 5765.77 | 2755.1 | 3715.79 | url | contain: /cc_result | 100.0|| 
 ||76989841 | 2021-10-16 | 188992906 | 151 | 62 | 37.0 | 9261 | 4860 | 2642.0 | 6033.11 | 7738.71 | 7040.54 | url | contain: goal__ecf

In [208]:
good_tmp_df = cmp_df[~cmp_df.goal_id.isin(bad_goals_cube_df.goal_id.map(int).values)]
good_tmp_df = good_tmp_df[(good_tmp_df.total_visits_test != 0) & (good_tmp_df.total_visits_prod != 0)]
good_tmp_df = get_coefs(good_tmp_df)

goals: 10001 , matched goals: 10001
total visits: 5089862 vs 302366316 (5840.56%)
conversion: 542892 vs 31442189 (5691.61%)
conversion visits: 204158 vs 11624872 (5594.06%)


In [209]:
good_tmp_df['conversions_abs_diff'] = good_tmp_df.conversions_test - good_tmp_df.conversions_prod

In [210]:
df_to_wiki(good_tmp_df.sort_values('conversions_abs_diff').head(20))

#|
 ||counter_id | date | goal_id | total_visits_prod | conversions_prod | conversion_visits_prod | total_visits_test | conversions_test | conversion_visits_test | visits_diff | conversions_diff | conversion_visits_diff | goal_types | pattern | bad_goals_share | conversions_abs_diff|| 
 ||152220 | 2021-10-16 | 9904910 | 5411 | 171 | 98.0 | 317397 | 15 | 6.0 | 5765.77 | -91.23 | -93.88 |  |  |  | -156|| 
 ||62990695 | 2021-10-16 | 105008053 | 1 | 1 | 1.0 | 33 | 1 | 1.0 | 3200.0 | 0.0 | 0.0 |  |  |  | 0|| 
 ||71397556 | 2021-10-16 | 154856395 | 2 | 1 | 1.0 | 4 | 1 | 1.0 | 100.0 | 0.0 | 0.0 |  |  |  | 0|| 
 ||52133725 | 2021-10-16 | 222052012 | 2 | 2 | 2.0 | 3 | 2 | 2.0 | 50.0 | 0.0 | 0.0 |  |  |  | 0|| 
 ||60593689 | 2021-10-16 | 104280205 | 1 | 1 | 1.0 | 5 | 1 | 1.0 | 400.0 | 0.0 | 0.0 |  |  |  | 0|| 
 ||56688235 | 2021-10-16 | 219300850 | 1 | 3 | 1.0 | 110 | 3 | 1.0 | 10900.0 | 0.0 | 0.0 |  |  |  | 0|| 
 ||202289 | 2021-10-16 | 126771985 | 8 | 1 | 1.0 | 446 | 1 | 1.0 | 5475.0 | 0.0 | 0

In [211]:
yql_query = '''
select goal_id
from `//home/metrika/data-transfer/prod/mtacs-metrikamain/conv_main_ad_goals`
where goal_type = 'step';
'''

step_goals_df = run_yql_wait_and_return(yql_query)

In [212]:
filt_good_tmp_df = cmp_df[~cmp_df.goal_id.isin(bad_goals_cube_df.goal_id.map(int).values)]
filt_good_tmp_df = filt_good_tmp_df[(filt_good_tmp_df.total_visits_test != 0) & (filt_good_tmp_df.total_visits_prod != 0)]
filt_good_tmp_df = filt_good_tmp_df[~filt_good_tmp_df.goal_id.isin(step_goals_df.goal_id.map(str).values)]
filt_good_tmp_df = get_coefs(filt_good_tmp_df)

goals: 9789 , matched goals: 9789
total visits: 4728326 vs 281168015 (5846.46%)
conversion: 534610 vs 31241832 (5743.85%)
conversion visits: 199334 vs 11492794 (5665.60%)


In [213]:
filt_good_tmp_df['conversions_abs_diff'] = filt_good_tmp_df.conversions_test - filt_good_tmp_df.conversions_prod

In [214]:
df_to_wiki(filt_good_tmp_df.sort_values('conversions_abs_diff').head(20))

#|
 ||counter_id | date | goal_id | total_visits_prod | conversions_prod | conversion_visits_prod | total_visits_test | conversions_test | conversion_visits_test | visits_diff | conversions_diff | conversion_visits_diff | goal_types | pattern | bad_goals_share | conversions_abs_diff|| 
 ||57035416 | 2021-10-16 | 214310758 | 2 | 1 | 1.0 | 26 | 1 | 1.0 | 1200.0 | 0.0 | 0.0 |  |  |  | 0|| 
 ||65673955 | 2021-10-16 | 126234868 | 1 | 1 | 1.0 | 15 | 1 | 1.0 | 1400.0 | 0.0 | 0.0 |  |  |  | 0|| 
 ||60269626 | 2021-10-16 | 85904065 | 1 | 1 | 1.0 | 6 | 1 | 1.0 | 500.0 | 0.0 | 0.0 |  |  |  | 0|| 
 ||76324525 | 2021-10-16 | 206672749 | 3 | 1 | 1.0 | 71 | 1 | 1.0 | 2266.67 | 0.0 | 0.0 |  |  |  | 0|| 
 ||62387938 | 2021-10-16 | 173467843 | 1 | 3 | 1.0 | 5 | 3 | 1.0 | 400.0 | 0.0 | 0.0 |  |  |  | 0|| 
 ||41807069 | 2021-10-16 | 176522065 | 3 | 1 | 1.0 | 204 | 1 | 1.0 | 6700.0 | 0.0 | 0.0 |  |  |  | 0|| 
 ||45455397 | 2021-10-16 | 57308563 | 3 | 1 | 1.0 | 209 | 1 | 1.0 | 6866.67 | 0.0 | 0.0 |  |  |  |

In [215]:
# tmp_df = cmp_df[cmp_df.goal_id.isin(bad_goals_cube_df.goal_id.map(int).values)]
# get_coefs(tmp_df)

In [216]:
other_bad_tmp_df = cmp_df[cmp_df.goal_id.isin(bad_goals_cube_df.goal_id.map(int).values) & (~cmp_df.goal_id.isin(updated_goals))]
other_bad_tmp_df = get_coefs(other_bad_tmp_df)

goals: 702 , matched goals: 272
total visits: 217709 vs 12837078 (5796.44%)
conversion: 115879 vs 4978687 (4196.45%)
conversion visits: 40713 vs 1911671 (4595.48%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [217]:
other_bad_tmp_df['pred_conversions_test'] = ((100. - other_bad_tmp_df.bad_goals_share)/100 * other_bad_tmp_df.conversions_prod) \
    .map(lambda x: int(round(x)))

In [218]:
other_bad_tmp_df['pred_conversions_diff'] = (other_bad_tmp_df.pred_conversions_test - other_bad_tmp_df.conversions_test)\
    .map(lambda x: abs(x))

In [219]:
df_to_wiki(other_bad_tmp_df.sort_values('pred_conversions_diff', ascending = False).head(20))

#|
 ||counter_id | date | goal_id | total_visits_prod | conversions_prod | conversion_visits_prod | total_visits_test | conversions_test | conversion_visits_test | visits_diff | conversions_diff | conversion_visits_diff | goal_types | pattern | bad_goals_share | pred_conversions_test | pred_conversions_diff|| 
 ||45727869 | 2021-10-16 | 42528902 | 12588 | 22830 | 12591.0 | 750430 | 1366686 | 747566.0 | 5861.47 | 5886.36 | 5837.3 | url | contain: / | 0.0 | 22830 | 1343856|| 
 ||55027 | 2021-10-16 | 35983435 | 1581 | 22984 | 1580.0 | 97798 | 1188655 | 98500.0 | 6085.83 | 5071.66 | 6134.18 | url | contain: sima-land.ru | 23.18 | 17656 | 1170999|| 
 ||153166 | 2021-10-16 | 23887485 | 8083 | 18285 | 8022.0 | 478126 | 1015567 | 472490.0 | 5815.2 | 5454.1 | 5789.93 | url | contain: kommersant.ru | 5.2 | 17334 | 998233|| 
 ||34 | 2021-10-16 | 2918974 | 890 | 6317 | 857.0 | 51642 | 279911 | 50381.0 | 5702.47 | 4331.07 | 5778.76 | url | contain: direct.yandex.ru | 37.51 | 3947 | 275964|| 
 ||115

In [220]:
df_to_wiki(other_bad_tmp_df.head(20))

#|
 ||counter_id | date | goal_id | total_visits_prod | conversions_prod | conversion_visits_prod | total_visits_test | conversions_test | conversion_visits_test | visits_diff | conversions_diff | conversion_visits_diff | goal_types | pattern | bad_goals_share | pred_conversions_test | pred_conversions_diff|| 
 ||55027 | 2021-10-16 | 35983435 | 1581 | 22984 | 1580.0 | 97798 | 1188655 | 98500.0 | 6085.83 | 5071.66 | 6134.18 | url | contain: sima-land.ru | 23.18 | 17656 | 1170999|| 
 ||45727869 | 2021-10-16 | 42528902 | 12588 | 22830 | 12591.0 | 750430 | 1366686 | 747566.0 | 5861.47 | 5886.36 | 5837.3 | url | contain: / | 0.0 | 22830 | 1343856|| 
 ||153166 | 2021-10-16 | 23887485 | 8083 | 18285 | 8022.0 | 478126 | 1015567 | 472490.0 | 5815.2 | 5454.1 | 5789.93 | url | contain: kommersant.ru | 5.2 | 17334 | 998233|| 
 ||115080 | 2021-10-16 | 26350179 | 15793 | 8472 | 1969.0 | 953968 | 144433 | 121293.0 | 5940.45 | 1604.83 | 6060.13 | url | contain: nowcast | 84.37 | 1324 | 143109|| 
 ||34

In [221]:
pd.set_option('display.max_colwidth', 5000)

In [222]:
bad_goals_cube_df[bad_goals_cube_df.goal_id == '198184135']

Unnamed: 0,bad_goals,bad_goals_share,counter_id,goal_id,goal_types,is_yandex_counter,layer_id,num_conditions,pattern,total_goals
128112,16,88.89,82709953,198184135,url,0,1,4,"contain: /contacts, contain: company_contacts_hovered, contain: company_contacts_viewed, contain: contacts_view_on_company_site",18


In [223]:
tiu_bad_goals = bad_goals_cube_df[list(map(
    lambda x: x == 'contain: /contacts, contain: company_contacts_hovered, contain: company_contacts_viewed, contain: contacts_view_on_company_site',
    bad_goals_cube_df.pattern
))].goal_id.values

In [224]:
tiu_tmp_df = cmp_df[cmp_df.goal_id.isin(tiu_bad_goals)]
tiu_tmp_df = get_coefs(tiu_tmp_df)

goals: 187 , matched goals: 8
total visits: 25 vs 1248 (4892.00%)
conversion: 10 vs 722 (7120.00%)
conversion visits: 9 vs 486 (5300.00%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
