In [91]:
import yt.wrapper as yt

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import mannwhitneyu, ttest_ind
import os
import scipy.stats as st

from tqdm.auto import tqdm
from collections import Counter

%matplotlib inline

In [92]:
yt.config["proxy"]["url"] = 'hahn.yt.yandex.net'
client = yt.YtClient(proxy='hahn', token=os.environ.get('YT_TOKEN'))

In [93]:
def ci_bounds(pos, n, confidence = 0.95):
    if n == 0:
        return [0, 1]
    z = st.norm.ppf(1 - (1 - confidence) / 2)
    phat = 1.0 * pos / n
    return [(phat + z * z / (2 * n) - z * math.sqrt((phat * (1 - phat) + z * z / (4 * n)) / n)) / (1 + z * z / n),
    (phat + z * z / (2 * n) + z * math.sqrt((phat * (1 - phat) + z * z / (4 * n)) / n)) / (1 + z * z / n)
    ]

def norm_test(pos1, n1, pos2, n2):
    p1 = pos1 / n1
    p2 = pos2 / n2
    
    p = (pos1 + pos2) / (n1 + n2)
    
    s = np.sqrt(p * (1 - p) * (1 / n1 + 1 / n2) )
    
    z_stat = (p1 - p2)/s
    
    return 2*(1 - st.norm.cdf(abs(z_stat)))

def calc_metric_good(data, part_coef=0.8, test_name="test", 
                    buckets=50):
    
    flag = (data["test_name"] == test_name)
    
    nes_part = data[flag]
    
    raw_resulrs = 1*(nes_part["result"] == "good")
        
    bucket = nes_part["uuid"].apply(lambda x: hash(x) % buckets)
    
    pd_results = pd.DataFrame({
        "metric": raw_resulrs.values,
        "bucket": bucket.values
    })
    
    return pd_results

def calc_metric_bad(data, part_coef=0.8, test_name="test", 
                    buckets=50):
    
    flag = (data["test_name"] == test_name)
    
    nes_part = data[flag]
    
    raw_resulrs = 1*(nes_part["result"] == "bad")
        
    bucket = nes_part["uuid"].apply(lambda x: hash(x) % buckets)
    
    pd_results = pd.DataFrame({
        "metric": raw_resulrs.values,
        "bucket": bucket.values
    })
    
    return pd_results

def calc_metric_zero(data, part_coef=0.8, test_name="test", 
                    buckets=50):
    
    flag = (data["test_name"] == test_name)
    
    nes_part = data[flag]
    
    raw_resulrs = 1*(nes_part["result"] == "zero")
        
    bucket = nes_part["uuid"].apply(lambda x: hash(x) % buckets)
    
    pd_results = pd.DataFrame({
        "metric": raw_resulrs.values,
        "bucket": bucket.values
    })
    
    return pd_results

def calc_metric_all(data, good_coef=1.0, bad_coef=-1.0, test_name="test", 
                    buckets=50):
    
    flag = (data["test_name"] == test_name)
    
    nes_part = data[flag]
    
    coefs = {
        "good": good_coef,
        "zero": 0,
        "bad": bad_coef
    }
    
    raw_resulrs = nes_part["result"].apply(lambda x: coefs[x])
        
    bucket = nes_part["uuid"].apply(lambda x: hash(x) % buckets)
    
    pd_results = pd.DataFrame({
        "metric": raw_resulrs.values,
        "bucket": bucket.values
    })
    
    return pd_results

def process_exp(table_name, 
                calc_metric=calc_metric_good, 
                test_name="test",
                cntrl_name="cntrl",
                **kwargs
            ):
    read_result = pd.DataFrame(client.read_table(table_name, raw=False))

    pd_results = calc_metric(read_result, test_name=test_name, **kwargs)
    test_raw = pd_results.groupby(by=["bucket"]).mean()["metric"].values
    
    pd_results = calc_metric(read_result, test_name=cntrl_name, **kwargs)
    ctrl_raw = pd_results.groupby(by=["bucket"]).mean()["metric"].values


    pvalue=mannwhitneyu(ctrl_raw, test_raw).pvalue
    effect = (test_raw.mean() - ctrl_raw.mean())/abs(ctrl_raw.mean())
    message = f'pvalue={pvalue:.3f}, effect={effect:.3f}, ctrl metric = {ctrl_raw.mean():.3f}, test_metric = {test_raw.mean():.3f}, cnt = {len(ctrl_raw) + len(test_raw)}'

    res_dict={"p_value": pvalue, "cntr_value": ctrl_raw.mean(), "test_value": test_raw.mean()}
    
    return res_dict


def process_exp_by_part(table_name, 
                calc_metric=calc_metric_good, 
                test_name="test",
                cntrl_name="cntrl",
                **kwargs
            ):
    read_result = pd.DataFrame(client.read_table(table_name, raw=False))

    pd_results = calc_metric(read_result, test_name=test_name, **kwargs)
    test_pos = pd_results["metric"].sum()
    test_cnt = len(pd_results["metric"])
    
    pd_results = calc_metric(read_result, test_name=cntrl_name, **kwargs)
    cntrl_pos = pd_results["metric"].sum()
    cntrl_cnt = len(pd_results["metric"])


    pvalue=norm_test(cntrl_pos, cntrl_cnt, test_pos, test_cnt)

    res_dict={"p_value": pvalue, "cntr_value": cntrl_pos/cntrl_cnt, "test_value": test_pos/test_cnt}
    
    return res_dict



def make_report_table_for_exp_without_sampl(table, test_name="test",
                cntrl_name="ctrl"):
    
    read_result = pd.DataFrame(client.read_table(table, raw=False))
    
    agreement = read_result["probability"].values.mean()
    
    part_of_good = process_exp(table,
                                calc_metric=calc_metric_good,
                                test_name=test_name,
                                cntrl_name=cntrl_name 
                                 )
    part_of_good_by_part = process_exp_by_part(table,
                                calc_metric=calc_metric_good,
                                test_name=test_name,
                                cntrl_name=cntrl_name 
                                 )
        
    part_of_bad = process_exp(table,
                            calc_metric=calc_metric_bad,
                            test_name=test_name,
                            cntrl_name=cntrl_name 
                             )
    part_of_bad_by_part = process_exp_by_part(table,
                            calc_metric=calc_metric_bad,
                            test_name=test_name,
                            cntrl_name=cntrl_name 
                             )


    part_of_zero = process_exp(table,
                            calc_metric=calc_metric_zero,
                            test_name=test_name,
                            cntrl_name=cntrl_name 
                             )
    
    metrica_all = process_exp(table,
                            calc_metric=calc_metric_all,
                            test_name=test_name,
                            cntrl_name=cntrl_name 
                             )
    
    
    results = pd.DataFrame([
        {
            "metric": "Доля good",
            "cntrl_value": part_of_good["cntr_value"],
            "test_value": part_of_good["test_value"],
            "diff": part_of_good["test_value"] - part_of_good["cntr_value"],
            "p_value": part_of_good["p_value"],
        
        },
        
        {
            "metric": "Доля good с критерием долей",
            "cntrl_value": part_of_good_by_part["cntr_value"],
            "test_value": part_of_good_by_part["test_value"],
            "diff": part_of_good_by_part["test_value"] - part_of_good_by_part["cntr_value"],
            "p_value": part_of_good_by_part["p_value"],
        
        },
        
        {
            "metric": "Доля bad",
            "cntrl_value": part_of_bad["cntr_value"],
            "test_value": part_of_bad["test_value"],
            "diff": part_of_bad["test_value"] - part_of_bad["cntr_value"],
            "p_value": part_of_bad["p_value"],
        
        },
        
        {
            "metric": "Доля bad с критерием долей",
            "cntrl_value": part_of_bad_by_part["cntr_value"],
            "test_value": part_of_bad_by_part["test_value"],
            "diff": part_of_bad_by_part["test_value"] - part_of_bad_by_part["cntr_value"],
            "p_value": part_of_bad_by_part["p_value"],
        
        },
        
        {
            "metric": "Доля zero",
            "cntrl_value": part_of_zero["cntr_value"],
            "test_value": part_of_zero["test_value"],
            "diff": part_of_zero["test_value"] - part_of_zero["cntr_value"],
            "p_value": part_of_zero["p_value"],
        
        },
        {
            "metric": "Метрика со всеми",
            "cntrl_value": metrica_all["cntr_value"],
            "test_value": metrica_all["test_value"],
            "diff": metrica_all["test_value"] - metrica_all["cntr_value"],
            "p_value": metrica_all["p_value"],
        
        },
        {
            "metric": "Количество сессий",
            "cntrl_value": (read_result["test_name"] == cntrl_name).sum(),
            "test_value": (read_result["test_name"] == test_name).sum(),
            "diff": None,
            "p_value": None,
        
        },
        {
            "metric": "Согласованность",
            "cntrl_value": agreement,
            "test_value": None,
            "diff": None,
            "p_value": None,
        
        },
    ])
    
    return results

# Отключение BERT

In [94]:
make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/tasks/context/EXPERIMENTS-73207_external_skill_gc_3_1_portion_for_marking",
    cntrl_name = "cntrl_378111",
    test_name = "test_378112"
)

Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.223872,0.169743,-0.054129,4.4e-05
1,Доля good с критерием долей,0.223043,0.167967,-0.055077,4e-06
2,Доля bad,0.299964,0.28506,-0.014903,0.258142
3,Доля bad с критерием долей,0.297826,0.286829,-0.010997,0.418705
4,Доля zero,0.476164,0.545197,0.069033,0.000109
5,Метрика со всеми,-0.076092,-0.115318,-0.039226,0.065135
6,Количество сессий,2300.0,2179.0,,
7,Согласованность,0.720972,,,


In [95]:
make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/tasks/context/EXPERIMENTS-73207_external_skill_gc_5_1_portion_for_marking",
    cntrl_name = "cntrl_378111",
    test_name = "test_378112"
)

Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.200252,0.159848,-0.040404,0.000283
1,Доля good с критерием долей,0.199593,0.157759,-0.041834,0.000165
2,Доля bad,0.286531,0.255255,-0.031277,0.034594
3,Доля bad с критерием долей,0.288798,0.259052,-0.029747,0.021305
4,Доля zero,0.513217,0.584897,0.071681,4.1e-05
5,Метрика со всеми,-0.086279,-0.095407,-0.009127,0.614686
6,Количество сессий,2455.0,2320.0,,
7,Согласованность,0.719595,,,


In [96]:
make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/context/EXPERIMENTS-73207/2022-07-23/joined_results",
    cntrl_name = "378111",
    test_name = "378112"
)

Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.129888,0.086357,-0.043531,0.132659
1,Доля good с критерием долей,0.136842,0.086134,-0.050708,0.012975
2,Доля bad,0.298618,0.275871,-0.022747,0.319666
3,Доля bad с критерием долей,0.307368,0.27521,-0.032158,0.275119
4,Доля zero,0.571494,0.637772,0.066278,0.023539
5,Метрика со всеми,-0.16873,-0.189513,-0.020784,0.89369
6,Количество сессий,475.0,476.0,,
7,Согласованность,0.709078,,,


# Гениальная болталка

In [97]:
make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/context/EXPERIMENTS-81406/2022-07-22/joined_results",
    cntrl_name = "430421",
    test_name = "430422"
)

Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.177026,0.208554,0.031527,0.279344
1,Доля good с критерием долей,0.171548,0.195833,0.024285,0.331793
2,Доля bad,0.340793,0.314097,-0.026696,0.648661
3,Доля bad с критерием долей,0.334728,0.33125,-0.003478,0.909071
4,Доля zero,0.48218,0.47735,-0.004831,0.66029
5,Метрика со всеми,-0.163767,-0.105543,0.058224,0.42931
6,Количество сессий,478.0,480.0,,
7,Согласованность,0.720251,,,


# Дроп глубины контекста

## general_conversation

In [98]:
# контекст 1
make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/context/EXPERIMENTS-99465/2022-07-21/joined_results",
    cntrl_name = "583841",
    test_name = "583842"
)

Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.200623,0.033564,-0.167059,2.353348e-06
1,Доля good с критерием долей,0.176955,0.040441,-0.136514,4.464512e-07
2,Доля bad,0.305673,0.474836,0.169163,0.001760691
3,Доля bad с критерием долей,0.378601,0.474265,0.095664,0.02855157
4,Доля zero,0.493703,0.4916,-0.002104,0.8531346
5,Метрика со всеми,-0.10505,-0.441272,-0.336222,1.984499e-05
6,Количество сессий,243.0,272.0,,
7,Согласованность,0.754013,,,


In [99]:
# контекст 3
make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/context/EXPERIMENTS-99465/2022-07-21/joined_results",
    cntrl_name = "583841",
    test_name = "583846"
)

Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.200623,0.216872,0.016248,0.98337
1,Доля good с критерием долей,0.176955,0.197044,0.02009,0.587218
2,Доля bad,0.305673,0.337589,0.031916,0.379706
3,Доля bad с критерием долей,0.378601,0.349754,-0.028847,0.528701
4,Доля zero,0.493703,0.445539,-0.048164,0.515755
5,Метрика со всеми,-0.10505,-0.120718,-0.015667,0.638773
6,Количество сессий,243.0,203.0,,
7,Согласованность,0.754013,,,


In [100]:
# контекст 5
make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/context/EXPERIMENTS-99465/2022-07-21/joined_results",
    cntrl_name = "583841",
    test_name = "583847"
)

Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.200623,0.200634,1.1e-05,0.217399
1,Доля good с критерием долей,0.176955,0.165714,-0.01124,0.763983
2,Доля bad,0.305673,0.398659,0.092985,0.240158
3,Доля bad с критерием долей,0.378601,0.337143,-0.041458,0.384021
4,Доля zero,0.493703,0.400708,-0.092996,0.38739
5,Метрика со всеми,-0.10505,-0.198025,-0.092975,0.1551
6,Количество сессий,243.0,175.0,,
7,Согласованность,0.754013,,,


## external_skill_gc

In [101]:
# контекст 1
make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/context/EXPERIMENTS-99465/2022-07-23/joined_results",
    cntrl_name = "583841",
    test_name = "583842"
)

  effect = (test_raw.mean() - ctrl_raw.mean())/abs(ctrl_raw.mean())
  z_stat = (p1 - p2)/s


Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.0,0.0,0.0,1.0
1,Доля good с критерием долей,0.0,0.0,0.0,
2,Доля bad,0.3,0.272222,-0.027778,0.888834
3,Доля bad с критерием долей,0.4,0.388889,-0.011111,0.940968
4,Доля zero,0.7,0.727778,0.027778,0.888834
5,Метрика со всеми,-0.3,-0.272222,0.027778,0.888834
6,Количество сессий,15.0,36.0,,
7,Согласованность,0.658192,,,


In [102]:
# контекст 3
make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/context/EXPERIMENTS-99465/2022-07-23/joined_results",
    cntrl_name = "583841",
    test_name = "583846"
)

  effect = (test_raw.mean() - ctrl_raw.mean())/abs(ctrl_raw.mean())


Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.0,0.146465,0.146465,0.157395
1,Доля good с критерием долей,0.0,0.111111,0.111111,0.17753
2,Доля bad,0.3,0.134848,-0.165152,0.185949
3,Доля bad с критерием долей,0.4,0.155556,-0.244444,0.046574
4,Доля zero,0.7,0.718687,0.018687,0.765662
5,Метрика со всеми,-0.3,0.011616,0.311616,0.099456
6,Количество сессий,15.0,45.0,,
7,Согласованность,0.658192,,,


In [103]:
# контекст 5
make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/context/EXPERIMENTS-99465/2022-07-23/joined_results",
    cntrl_name = "583841",
    test_name = "583847"
)

  effect = (test_raw.mean() - ctrl_raw.mean())/abs(ctrl_raw.mean())


Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.0,0.125,0.125,0.527089
1,Доля good с критерием долей,0.0,0.045455,0.045455,0.402529
2,Доля bad,0.3,0.083333,-0.216667,0.176349
3,Доля bad с критерием долей,0.4,0.090909,-0.309091,0.02494
4,Доля zero,0.7,0.791667,0.091667,0.556275
5,Метрика со всеми,-0.3,0.041667,0.341667,0.153024
6,Количество сессий,15.0,22.0,,
7,Согласованность,0.658192,,,


# Постклассификатор

In [104]:
make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/context/EXPERIMENTS-75877/2022-07-23/joined_results",
    cntrl_name = "396029",
    test_name = "396030"
)

Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.095548,0.079193,-0.016355,0.355185
1,Доля good с критерием долей,0.08805,0.081761,-0.006289,0.727499
2,Доля bad,0.304884,0.308116,0.003232,0.860249
3,Доля bad с критерием долей,0.30608,0.29979,-0.006289,0.832601
4,Доля zero,0.599568,0.61269,0.013123,0.628813
5,Метрика со всеми,-0.209336,-0.228923,-0.019587,0.740075
6,Количество сессий,477.0,477.0,,
7,Согласованность,0.734556,,,


# Снижение интересности

In [105]:
#interest 0

make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/context/EXPERIMENTS-97773/2022-07-23/joined_results",
    cntrl_name = "571519",
    test_name = "571520"
)

Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.084073,0.085496,0.001423,0.78316
1,Доля good с критерием долей,0.091667,0.07377,-0.017896,0.311988
2,Доля bad,0.267088,0.309839,0.042751,0.236548
3,Доля bad с критерием долей,0.277083,0.317623,0.04054,0.167765
4,Доля zero,0.648839,0.604665,-0.044173,0.33919
5,Метрика со всеми,-0.183016,-0.224343,-0.041328,0.257455
6,Количество сессий,480.0,488.0,,
7,Согласованность,0.729637,,,


In [106]:
#interest -0.8

make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/context/EXPERIMENTS-97773/2022-07-23/joined_results",
    cntrl_name = "571519",
    test_name = "571521"
)

Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.084073,0.111213,0.02714,0.268376
1,Доля good с критерием долей,0.091667,0.107438,0.015771,0.41357
2,Доля bad,0.267088,0.288173,0.021085,0.54589
3,Доля bad с критерием долей,0.277083,0.299587,0.022503,0.44061
4,Доля zero,0.648839,0.600614,-0.048224,0.237945
5,Метрика со всеми,-0.183016,-0.176961,0.006055,0.983458
6,Количество сессий,480.0,484.0,,
7,Согласованность,0.729637,,,


In [107]:
#interest -0.4

make_report_table_for_exp_without_sampl(
    "//home/voice/eliseevmax/session_quality/context/EXPERIMENTS-97773/2022-07-23/joined_results",
    cntrl_name = "571519",
    test_name = "571522"
)

Unnamed: 0,metric,cntrl_value,test_value,diff,p_value
0,Доля good,0.084073,0.098431,0.014358,0.346298
1,Доля good с критерием долей,0.091667,0.094142,0.002476,0.895004
2,Доля bad,0.267088,0.28583,0.018742,0.494179
3,Доля bad с критерием долей,0.277083,0.301255,0.024172,0.409306
4,Доля zero,0.648839,0.615739,-0.033099,0.342484
5,Метрика со всеми,-0.183016,-0.187399,-0.004384,0.7902
6,Количество сессий,480.0,478.0,,
7,Согласованность,0.729637,,,
