In [37]:
import yt.wrapper as yt

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import mannwhitneyu, ttest_ind
import os
import scipy.stats as st

from tqdm.auto import tqdm
from collections import Counter

%matplotlib inline

In [38]:
yt.config["proxy"]["url"] = 'hahn.yt.yandex.net'
client = yt.YtClient(proxy='hahn', token=os.environ.get('YT_TOKEN'))

In [39]:
def ci_bounds(pos, n, confidence = 0.95):
    if n == 0:
        return [0, 1]
    z = st.norm.ppf(1 - (1 - confidence) / 2)
    phat = 1.0 * pos / n
    return [(phat + z * z / (2 * n) - z * math.sqrt((phat * (1 - phat) + z * z / (4 * n)) / n)) / (1 + z * z / n),
    (phat + z * z / (2 * n) + z * math.sqrt((phat * (1 - phat) + z * z / (4 * n)) / n)) / (1 + z * z / n)
    ]

def norm_test(pos1, n1, pos2, n2):
    p1 = pos1 / n1
    p2 = pos2 / n2
    
    p = (pos1 + pos2) / (n1 + n2)
    
    s = np.sqrt(p * (1 - p) * (1 / n1 + 1 / n2) )
    
    z_stat = (p1 - p2)/s
    
    return 2*(1 - st.norm.cdf(abs(z_stat)))

def calc_metric_good(data, part_coef=0.8, test_name="test", 
                    buckets=50):
    
    flag = (data["test_name"] == test_name)
    
    nes_part = data[flag]
    
    raw_resulrs = 1*(nes_part["is_good"] == "yes")
        
    bucket = nes_part["uuid"].apply(lambda x: hash(x) % buckets)
    
    pd_results = pd.DataFrame({
        "metric": raw_resulrs.values,
        "bucket": bucket.values
    })
    
    return pd_results

def calc_metric_bad(data, part_coef=0.8, test_name="test", 
                    buckets=50):
    
    flag = (data["test_name"] == test_name)
    
    nes_part = data[flag]
    
    raw_resulrs = 1*(nes_part["is_bad"] == "yes")
        
    bucket = nes_part["uuid"].apply(lambda x: hash(x) % buckets)
    
    pd_results = pd.DataFrame({
        "metric": raw_resulrs.values,
        "bucket": bucket.values
    })
    
    return pd_results

def calc_metric_all(data, good_coef=1.0, bad_coef=-1.0, test_name="test", 
                    buckets=50):
    
    flag = (data["test_name"] == test_name)
    
    nes_part = data[flag]
    
    coefs = {
        "good": good_coef,
        "zero": 0,
        "bad": bad_coef
    }
    
    raw_resulrs = (nes_part["is_good"] == "yes")* good_coef + (nes_part["is_bad"] == "yes")* bad_coef
        
    bucket = nes_part["uuid"].apply(lambda x: hash(x) % buckets)
    
    pd_results = pd.DataFrame({
        "metric": raw_resulrs.values,
        "bucket": bucket.values
    })
    
    return pd_results

def process_exp(table_name, 
                calc_metric=calc_metric_good, 
                test_name="test",
                cntrl_name="cntrl",
                **kwargs
            ):
    read_result = pd.DataFrame(client.read_table(table_name, raw=False))

    pd_results = calc_metric(read_result, test_name=test_name, **kwargs)
    test_raw = pd_results.groupby(by=["bucket"]).mean()["metric"].values
    
    pd_results = calc_metric(read_result, test_name=cntrl_name, **kwargs)
    ctrl_raw = pd_results.groupby(by=["bucket"]).mean()["metric"].values


    pvalue=mannwhitneyu(ctrl_raw, test_raw).pvalue
    effect = (test_raw.mean() - ctrl_raw.mean())/abs(ctrl_raw.mean())
    message = f'pvalue={pvalue:.3f}, effect={effect:.3f}, ctrl metric = {ctrl_raw.mean():.3f}, test_metric = {test_raw.mean():.3f}, cnt = {len(ctrl_raw) + len(test_raw)}'

    res_dict={"p_value": pvalue, "cntr_value": ctrl_raw.mean(), "test_value": test_raw.mean()}
    
    return res_dict

def process_exp_by_part(table_name, 
                calc_metric=calc_metric_good, 
                test_name="test",
                cntrl_name="cntrl",
                **kwargs
            ):
    read_result = pd.DataFrame(client.read_table(table_name, raw=False))

    pd_results = calc_metric(read_result, test_name=test_name, **kwargs)
    test_pos = pd_results["metric"].sum()
    test_cnt = len(pd_results["metric"])
    
    pd_results = calc_metric(read_result, test_name=cntrl_name, **kwargs)
    cntrl_pos = pd_results["metric"].sum()
    cntrl_cnt = len(pd_results["metric"])


    pvalue=norm_test(cntrl_pos, cntrl_cnt, test_pos, test_cnt)

    res_dict={"p_value": pvalue, "cntr_value": cntrl_pos/cntrl_cnt, "test_value": test_pos/test_cnt}
    
    return res_dict





def make_report_table_for_exp_without_sampl(table, test_name="test",
                cntrl_name="ctrl"):
    
    read_result = pd.DataFrame(client.read_table(table, raw=False))
    
    agreement_good = read_result["probability_good"].values.mean()
    agreement_bad = read_result["probability_bad"].values.mean()
    
    part_of_good = process_exp(table,
                                calc_metric=calc_metric_good,
                                test_name=test_name,
                                cntrl_name=cntrl_name 
                                 )
    part_of_good_by_part = process_exp_by_part(table,
                                calc_metric=calc_metric_good,
                                test_name=test_name,
                                cntrl_name=cntrl_name 
                                 )
        
    part_of_bad = process_exp(table,
                            calc_metric=calc_metric_bad,
                            test_name=test_name,
                            cntrl_name=cntrl_name 
                             )
    part_of_bad_by_part = process_exp_by_part(table,
                            calc_metric=calc_metric_bad,
                            test_name=test_name,
                            cntrl_name=cntrl_name 
                             )

    
    metrica_all = process_exp(table,
                            calc_metric=calc_metric_all,
                            test_name=test_name,
                            cntrl_name=cntrl_name 
                             )
    
    
    results = pd.DataFrame([
        {
            "metric": "Доля good",
            "cntrl_value": part_of_good["cntr_value"],
            "test_value": part_of_good["test_value"],
            "diff": part_of_good["test_value"] - part_of_good["cntr_value"],
            "p_value": part_of_good["p_value"],
        
        },
        
        {
            "metric": "Доля good с критерием долей",
            "cntrl_value": part_of_good_by_part["cntr_value"],
            "test_value": part_of_good_by_part["test_value"],
            "diff": part_of_good_by_part["test_value"] - part_of_good_by_part["cntr_value"],
            "p_value": part_of_good_by_part["p_value"],
        
        },
        
        {
            "metric": "Доля bad",
            "cntrl_value": part_of_bad["cntr_value"],
            "test_value": part_of_bad["test_value"],
            "diff": part_of_bad["test_value"] - part_of_bad["cntr_value"],
            "p_value": part_of_bad["p_value"],
        
        },
        
        {
            "metric": "Доля bad с критерием долей",
            "cntrl_value": part_of_bad_by_part["cntr_value"],
            "test_value": part_of_bad_by_part["test_value"],
            "diff": part_of_bad_by_part["test_value"] - part_of_bad_by_part["cntr_value"],
            "p_value": part_of_bad_by_part["p_value"],
        
        },
        
        {
            "metric": "Метрика со всеми",
            "cntrl_value": metrica_all["cntr_value"],
            "test_value": metrica_all["test_value"],
            "diff": metrica_all["test_value"] - metrica_all["cntr_value"],
            "p_value": metrica_all["p_value"],
        
        },
        {
            "metric": "Количество сессий",
            "cntrl_value": (read_result["test_name"] == cntrl_name).sum(),
            "test_value": (read_result["test_name"] == test_name).sum(),
            "diff": None,
            "p_value": None,
        
        },
        {
            "metric": "Согласованность good",
            "cntrl_value": agreement_good,
            "test_value": None,
            "diff": None,
            "p_value": None,
        
        },
        {
            "metric": "Согласованность bad",
            "cntrl_value": agreement_bad,
            "test_value": None,
            "diff": None,
            "p_value": None,
        
        },
    ])
    
    return results

# Отключение BERT

In [54]:
make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/good_and_bad/EXPERIMENTS-73207/2022-07-29/joined_results_by_hand",
    cntrl_name = "378111",
    test_name = "378112"
)



Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.18683,0.175415,-0.011415,0.677973
1,Доля good с критерием долей,0.191489,0.164241,-0.027248,0.271762
2,Доля bad,0.788322,0.785183,-0.00314,0.96418
3,Доля bad с критерием долей,0.791489,0.783784,-0.007706,0.771457
4,Метрика со всеми,-0.601493,-0.609768,-0.008275,0.983485
5,Количество сессий,470.0,481.0,,
6,Согласованность good,0.831406,,,
7,Согласованность bad,0.8279,,,


In [55]:
make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/good_and_bad/EXPERIMENTS-73207/2022-07-29/joined_results_dawid_skene",
    cntrl_name = "378111",
    test_name = "378112"
)

Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.19553,0.177788,-0.017742,0.474082
1,Доля good с критерием долей,0.208511,0.170478,-0.038032,0.13441
2,Доля bad,0.788201,0.778803,-0.009398,0.611719
3,Доля bad с критерием долей,0.791489,0.779626,-0.011864,0.655881
4,Метрика со всеми,-0.59267,-0.601015,-0.008344,0.673689
5,Количество сессий,470.0,481.0,,
6,Согласованность good,0.879569,,,
7,Согласованность bad,0.885413,,,


# Гениальная болталка

In [56]:
make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/good_and_bad/EXPERIMENTS-81406/2022-07-29/joined_results_by_hand",
    cntrl_name = "430421",
    test_name = "430422"
)

Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.151005,0.150961,-4.4e-05,0.751528
1,Доля good с критерием долей,0.148454,0.150106,0.001652,0.942807
2,Доля bad,0.594334,0.626937,0.032604,0.125612
3,Доля bad с критерием долей,0.587629,0.627907,0.040278,0.201803
4,Метрика со всеми,-0.443328,-0.475976,-0.032648,0.401893
5,Количество сессий,485.0,473.0,,
6,Согласованность good,0.853166,,,
7,Согласованность bad,0.796103,,,


In [58]:
make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/good_and_bad/EXPERIMENTS-81406/2022-07-29/joined_results_dawid_skene",
    cntrl_name = "430421",
    test_name = "430422"
)

Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.165019,0.184997,0.019979,0.625693
1,Доля good с критерием долей,0.158763,0.179704,0.020941,0.387308
2,Доля bad,0.624832,0.68333,0.058498,0.020173
3,Доля bad с критерием долей,0.624742,0.678647,0.053905,0.080044
4,Метрика со всеми,-0.459813,-0.498333,-0.03852,0.281817
5,Количество сессий,485.0,473.0,,
6,Согласованность good,0.888212,,,
7,Согласованность bad,0.86322,,,


# Дроп глубины контекста

## general_conversation

In [59]:
# глубина 1
make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/good_and_bad/EXPERIMENTS-99465/2022-07-29/joined_results_by_hand",
    cntrl_name = "583841",
    test_name = "583842"
)



Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.217838,0.043267,-0.17457,0.000276
1,Доля good с критерием долей,0.17284,0.066176,-0.106663,0.000166
2,Доля bad,0.584866,0.743288,0.158421,0.029569
3,Доля bad с критерием долей,0.670782,0.665441,-0.005341,0.897774
4,Метрика со всеми,-0.367029,-0.700021,-0.332992,0.00054
5,Количество сессий,243.0,272.0,,
6,Согласованность good,0.849571,,,
7,Согласованность bad,0.813363,,,


In [60]:
# глубина 3
make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/good_and_bad/EXPERIMENTS-99465/2022-07-29/joined_results_by_hand",
    cntrl_name = "583841",
    test_name = "583846"
)

Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.217838,0.105007,-0.112831,0.062665
1,Доля good с критерием долей,0.17284,0.128079,-0.044761,0.190355
2,Доля bad,0.584866,0.645496,0.060629,0.395976
3,Доля bad с критерием долей,0.670782,0.674877,0.004095,0.926876
4,Метрика со всеми,-0.367029,-0.540488,-0.17346,0.128197
5,Количество сессий,243.0,203.0,,
6,Согласованность good,0.849571,,,
7,Согласованность bad,0.813363,,,


In [61]:
# глубина 5
make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/good_and_bad/EXPERIMENTS-99465/2022-07-29/joined_results_by_hand",
    cntrl_name = "583841",
    test_name = "583847"
)

Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.217838,0.110474,-0.107364,0.019679
1,Доля good с критерием долей,0.17284,0.137143,-0.035697,0.323445
2,Доля bad,0.584866,0.629425,0.044559,0.523636
3,Доля bad с критерием долей,0.670782,0.657143,-0.013639,0.770685
4,Метрика со всеми,-0.367029,-0.518951,-0.151923,0.123302
5,Количество сессий,243.0,175.0,,
6,Согласованность good,0.849571,,,
7,Согласованность bad,0.813363,,,


In [62]:
# глубина 1
make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/good_and_bad/EXPERIMENTS-99465/2022-07-29/joined_results_dawid_skene",
    cntrl_name = "583841",
    test_name = "583842"
)

Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.18432,0.048144,-0.136176,0.002177
1,Доля good с критерием долей,0.164609,0.073529,-0.09108,0.001299
2,Доля bad,0.57331,0.765985,0.192676,0.004019
3,Доля bad с критерием долей,0.703704,0.694853,-0.008851,0.82696
4,Метрика со всеми,-0.38899,-0.717841,-0.328852,0.000355
5,Количество сессий,243.0,272.0,,
6,Согласованность good,0.908673,,,
7,Согласованность bad,0.899523,,,


In [63]:
# глубина 3
make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/good_and_bad/EXPERIMENTS-99465/2022-07-29/joined_results_dawid_skene",
    cntrl_name = "583841",
    test_name = "583846"
)

Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.18432,0.104172,-0.080148,0.085381
1,Доля good с критерием долей,0.164609,0.137931,-0.026678,0.435098
2,Доля bad,0.57331,0.622574,0.049265,0.381782
3,Доля bad с критерием долей,0.703704,0.694581,-0.009122,0.83424
4,Метрика со всеми,-0.38899,-0.518402,-0.129413,0.250013
5,Количество сессий,243.0,203.0,,
6,Согласованность good,0.908673,,,
7,Согласованность bad,0.899523,,,


In [64]:
# глубина 5
make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/good_and_bad/EXPERIMENTS-99465/2022-07-29/joined_results_dawid_skene",
    cntrl_name = "583841",
    test_name = "583847"
)

Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.18432,0.156971,-0.027349,0.49347
1,Доля good с критерием долей,0.164609,0.177143,0.012534,0.736369
2,Доля bad,0.57331,0.652933,0.079624,0.227764
3,Доля bad с критерием долей,0.703704,0.702857,-0.000847,0.985086
4,Метрика со всеми,-0.38899,-0.495963,-0.106973,0.40356
5,Количество сессий,243.0,175.0,,
6,Согласованность good,0.908673,,,
7,Согласованность bad,0.899523,,,


## external_skill_gc

# Постклассификатор

In [34]:
make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/good_and_bad/EXPERIMENTS-75877/2022-07-29/joined_results_by_hand",
    cntrl_name = "396029",
    test_name = "396030"
)

Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.139102,0.13174,-0.007361,0.615632
1,Доля good с критерием долей,0.133056,0.125261,-0.007795,0.718794
2,Доля bad,0.69984,0.658995,-0.040846,0.369652
3,Доля bad с критерием долей,0.694387,0.661795,-0.032591,0.279828
4,Метрика со всеми,-0.560739,-0.527254,0.033484,0.415339
5,Количество сессий,481.0,479.0,,
6,Согласованность good,0.859722,,,
7,Согласованность bad,0.819792,,,


In [36]:
make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/good_and_bad/EXPERIMENTS-75877/2022-07-29/joined_results_dawid_skene",
    cntrl_name = "396029",
    test_name = "396030"
)



Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.188729,0.165753,-0.022976,0.475397
1,Доля good с критерием долей,0.185031,0.160752,-0.02428,0.319925
2,Доля bad,0.671819,0.623269,-0.048549,0.184042
3,Доля bad с критерием долей,0.665281,0.628392,-0.036888,0.231819
4,Метрика со всеми,-0.48309,-0.457517,0.025573,0.798487
5,Количество сессий,481.0,479.0,,
6,Согласованность good,0.876832,,,
7,Согласованность bad,0.872259,,,


# Снижение интересности

In [51]:
#interest 0

make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/good_and_bad/EXPERIMENTS-97773/2022-07-23/joined_results_by_hand",
    cntrl_name = "571519",
    test_name = "571520"
)

Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.155566,0.173681,0.018115,0.65907
1,Доля good с критерием долей,0.164634,0.185185,0.020551,0.397554
2,Доля bad,0.602397,0.598846,-0.003551,0.665341
3,Доля bad с критерием долей,0.609756,0.600823,-0.008933,0.775055
4,Метрика со всеми,-0.446831,-0.425164,0.021666,0.663285
5,Количество сессий,492.0,486.0,,
6,Согласованность good,0.841907,,,
7,Согласованность bad,0.80607,,,


In [52]:
#interest -0.8

make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/good_and_bad/EXPERIMENTS-97773/2022-07-23/joined_results_by_hand",
    cntrl_name = "571519",
    test_name = "571521"
)

Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.155566,0.17453,0.018964,0.594295
1,Доля good с критерием долей,0.164634,0.18125,0.016616,0.493358
2,Доля bad,0.602397,0.630651,0.028255,0.321458
3,Доля bad с критерием долей,0.609756,0.6375,0.027744,0.372104
4,Метрика со всеми,-0.446831,-0.456121,-0.00929,0.852009
5,Количество сессий,492.0,480.0,,
6,Согласованность good,0.841907,,,
7,Согласованность bad,0.80607,,,


In [70]:
#interest -0.4

make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/good_and_bad/EXPERIMENTS-97773/2022-07-23/joined_results_by_hand",
    cntrl_name = "571519",
    test_name = "571522"
)

Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.155566,0.110111,-0.045455,0.05502
1,Доля good с критерием долей,0.164634,0.117284,-0.04735,0.033442
2,Доля bad,0.602397,0.643773,0.041377,0.129111
3,Доля bad с критерием долей,0.609756,0.648148,0.038392,0.214027
4,Метрика со всеми,-0.446831,-0.533662,-0.086831,0.033369
5,Количество сессий,492.0,486.0,,
6,Согласованность good,0.841907,,,
7,Согласованность bad,0.80607,,,


# Отключение болталки

In [68]:
make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/good_and_bad/EXPERIMENTS-100449/2022-07-29/joined_results_by_hand",
    cntrl_name = "590414",
    test_name = "590415"
)

Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.159036,0.14002,-0.019017,0.511041
1,Доля good с критерием долей,0.157233,0.139583,-0.017649,0.442507
2,Доля bad,0.664014,0.658321,-0.005693,0.745464
3,Доля bad с критерием долей,0.66457,0.65625,-0.00832,0.785814
4,Метрика со всеми,-0.504978,-0.518302,-0.013324,0.779541
5,Количество сессий,477.0,480.0,,
6,Согласованность good,0.855799,,,
7,Согласованность bad,0.819575,,,


In [69]:
make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/good_and_bad/EXPERIMENTS-100449/2022-07-29/joined_results_dawid_skene",
    cntrl_name = "590414",
    test_name = "590415"
)

Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.187683,0.156392,-0.03129,0.156455
1,Доля good с критерием долей,0.194969,0.158333,-0.036635,0.137271
2,Доля bad,0.692538,0.696624,0.004086,0.997245
3,Доля bad с критерием долей,0.696017,0.695833,-0.000183,0.995079
4,Метрика со всеми,-0.504855,-0.540232,-0.035376,0.580546
5,Количество сессий,477.0,480.0,,
6,Согласованность good,0.877406,,,
7,Согласованность bad,0.887233,,,
