In [1]:
import os
from functools import reduce
from spyt import spark_session
import logging.config
import spyt
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.functions import col, lit
from pyspark.sql.window import Window
from sklearn.metrics import confusion_matrix
from itertools import combinations
from scipy.stats import fisher_exact
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
from clan_tools.utils.spark import SPARK_CONF_MEDIUM
from clan_tools.logging.logger import default_log_config
from clan_tools.data_adapters.YTAdapter import YTAdapter
from clan_tools.data_adapters.crm.CRMModelAdapter import upsale_to_update_leads
from clan_tools.data_adapters.crm.CRMHistoricalDataAdapter import CRMHistoricalDataAdapter

# Silence every warning category for the whole notebook session.
# NOTE(review): this blanket filter also hides pandas/Spark warnings raised by later cells.
warnings.simplefilter("ignore")

# Widen the pandas display limits so wide/long summary frames render in full.
pd.set_option('display.max_rows', 200, 'display.max_columns', 250)

In [2]:
# NOTE(review): this import sits outside the notebook's main import cell.
from clan_tools.secrets.Vault import Vault
# Presumably loads the credentials the YT and Spark clients need — confirm against Vault.
Vault().get_secrets()
yt_adapter = YTAdapter()
# Module-level YT client; load_all_tables and get_last_table read it as a global.
yt = yt_adapter.yt

# Spark session used by every spark.read.yt call in the notebook.
# NOTE(review): the session is never stopped in the visible cells.
spark = spyt.connect(spark_conf_args=SPARK_CONF_MEDIUM)
spyt.info(spark)

2021-11-25 12:55:01,894 - INFO - spyt.client - SPYT Cluster version: 3.0.1-1.20.1+yandex
2021-11-25 12:55:01,896 - INFO - spyt.client - SPYT library version: 1.3.5


In [3]:
def max_by(x, y):
    """Build a Spark SQL ``max_by(x, y)`` aggregate column.

    Args:
        x: Name (or SQL expression text) of the column whose value is returned.
        y: Name (or SQL expression text) of the column used for ordering.

    Returns:
        The column produced by ``F.expr`` for the ``max_by`` call.

    Both arguments are pasted into the SQL text verbatim, so they must
    already be valid Spark SQL identifiers/expressions.
    """
    sql_text = 'max_by({}, {})'.format(x, y)
    return F.expr(sql_text)


def make_date(string_datetime):
    """Convert a datetime-like string to a ``YYYY-MM-DD`` date string.

    Args:
        string_datetime: Value parseable by ``pd.to_datetime``.

    Returns:
        The date part of the parsed value, formatted as ``%Y-%m-%d``.
    """
    parsed = pd.to_datetime(string_datetime)
    return parsed.strftime('%Y-%m-%d')


def load_all_tables(spark, path):
    """Read every table under a YT folder into a single Spark DataFrame.

    Each child of ``path`` is read with ``spark.read.yt`` and gets a ``date``
    column holding the table name normalised by ``make_date``. The folder is
    listed through the module-level ``yt`` client.

    Args:
        spark: Spark session exposing ``read.yt``.
        path: YT folder whose children are the tables to load.

    Returns:
        The union of all tables, each tagged with its ``date``.

    Raises:
        ValueError: If the folder contains no tables.
    """
    tables = yt.list(path)
    if not tables:
        # reduce() on an empty list would fail with an unrelated-looking TypeError.
        raise ValueError(f'No tables found in {path}')

    spdfs = [
        spark.read.yt(f'{path}/{table}').withColumn("date", lit(make_date(table)))
        for table in tables
    ]
    # unionByName aligns columns by name; plain union() matches them by
    # position and silently mixes columns when tables differ in column order.
    return reduce(lambda merged, nxt: merged.unionByName(nxt), spdfs)


def get_last_table(yt_folder):
    """Return the full YT path of the greatest-named table in a folder.

    The folder is listed through the module-level ``yt`` client and the
    child with the maximum name (plain ``max`` over the listing) is chosen.
    Presumably table names are ISO dates, so this is the most recent
    snapshot — confirm against the folder layout.

    Args:
        yt_folder: YT folder to look in.

    Returns:
        ``yt_folder`` joined with the selected table name using ``/``.

    Raises:
        ValueError: If the folder contains no tables.
    """
    # YT paths always use '/', so join with posixpath rather than the
    # platform-dependent os.path (identical result on POSIX systems).
    import posixpath

    tables = yt.list(yt_folder)
    if not tables:
        raise ValueError(f'No tables found in {yt_folder}')
    return posixpath.join(yt_folder, max(tables))


# YT locations used to build the dataset.
# Folder of A/B experiment tables; load_all_tables reads every table in it
# and derives the `date` column from the table name.
ab_test_path = "//home/cloud_analytics/ml/scoring/consumption_predictor_v2/experiment/csm"
# Raw CRM folders; get_last_table picks the greatest-named table from each.
raw_leads = '//home/cloud_analytics/dwh/raw/crm/leads'
raw_billingaccounts = '//home/cloud_analytics/dwh/raw/crm/billingaccounts'
raw_leads_billing_accounts = '//home/cloud_analytics/dwh/raw/crm/leads_billing_accounts'
raw_tag_bean_rel = '//home/cloud_analytics/dwh/raw/crm/tag_bean_rel'
raw_tags = '//home/cloud_analytics/dwh/raw/crm/tags'

In [4]:
# A/B assignment rows from every table in the experiment folder: the
# experiment group, the table-derived date and the billing account id.
leads_ab = (
    load_all_tables(spark, ab_test_path)
    .select('group', 'date', 'billing_account_id')
    .cache()  # NOTE(review): never unpersisted in the visible cells
)

# Latest lead attributes per billing account, assembled from the raw CRM
# snapshots: leads -> tag relations -> tags, and leads -> lead/billing-account
# links -> billing accounts. All joins are inner, so a lead without a tag or
# without a linked billing account is dropped.
# NOTE(review): spdf_leads is not referenced anywhere else in the visible
# notebook (the join further down uses spdf_leads2) — confirm it is still needed.
spdf_leads = (
    spark.read.yt(get_last_table(raw_leads)).alias('leads')
    .join(
        spark.read.yt(get_last_table(raw_tag_bean_rel)).alias('tag_bean_rel'),
        on=col('leads.id')==col('tag_bean_rel.bean_id'), how='inner'
    )
    .join(
        spark.read.yt(get_last_table(raw_tags)).alias('tags'),
        on=col('tag_bean_rel.tag_id')==col('tags.id'), how='inner'
    )
    .join(
        spark.read.yt(get_last_table(raw_leads_billing_accounts)).alias('leads_billing_accounts'),
        on=col('leads.id')==col('leads_billing_accounts.leads_id'), how='inner'
    )
    .join(
        spark.read.yt(get_last_table(raw_billingaccounts)).alias('billingaccounts'),
        on=col('billingaccounts.id')==col('leads_billing_accounts.billingaccounts_id'), how='inner'
    )
    .groupby(col('billingaccounts.ba_id').alias('billing_account_id'))
    .agg(
        # Lead fields are taken from the row with the greatest leads.date_modified.
        max_by('leads.status', 'leads.date_modified').alias('status'),
        max_by('leads.lead_source_description', 'leads.date_modified').alias('lead_source_description'),
        max_by('leads.assigned_user_id', 'leads.date_modified').alias('assigned_user_id'),
        # The tag name is ordered by the tag's own date_modified, not the lead's.
        max_by('tags.name', 'tags.date_modified').alias('tag_name'),
        max_by('leads.org_type', 'leads.date_modified').alias('org_type'),
    )
)

# Per billing account, keep the status, lead source and description from the
# row with the greatest date_modified in the adapter's historical_preds() output.
# NOTE(review): the schema of historical_preds() is not visible here — the
# columns used below are assumed to exist.
spdf_leads2 = (
    CRMHistoricalDataAdapter(yt_adapter, spark)
    .historical_preds()
    .groupby('billing_account_id')
    .agg(
        max_by('status', 'date_modified').alias('status'),
        max_by('lead_source_crm', 'date_modified').alias('lead_source_crm'),
        max_by('description', 'date_modified').alias('description'),
    )
    
)

# Attach the latest CRM status to each experiment row. The inner join drops
# experiment rows whose billing account is missing from spdf_leads2.
spdf = leads_ab.join(spdf_leads2, on='billing_account_id', how='inner')
# Collect the joined result to the driver as a pandas DataFrame.
dft = spdf.toPandas()
# dft.head()

In [5]:
def test(test_col, df):
    table = df[['total', test_col]]
    table['total'] = table['total'] - table[test_col]
    table = table.T.values
    pval = fisher_exact(table, alternative='two-sided')[1]
    return pval

### Test

In [6]:
# Work on a copy of the collected frame with a unit counter per row.
dff = dft.assign(total=1)

# One 0/1 indicator column per CRM status value present in the data.
dm_sts = pd.get_dummies(dff['status']).astype(int)

# Status counts per experiment group and date.
dff_comp_det = (
    pd.concat([dff[['group', 'date', 'total']], dm_sts], axis=1)
    .groupby(['group', 'date'])
    .sum()
)

# Status counts per experiment group over the whole period.
dff_comp_agg = (
    pd.concat([dff[['group', 'total']], dm_sts], axis=1)
    .groupby(['group'])
    .sum()
)

In [7]:
# Display the per-group, per-date status counts.
dff_comp_det.astype(int)

Unnamed: 0_level_0,Unnamed: 1_level_0,total,Assigned,Converted,In Process,Pending,Recycled
group,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
New model,2021-09-13,3,0,0,0,0,3
New model,2021-09-14,3,0,0,0,0,3
New model,2021-09-15,3,0,0,0,0,3
New model,2021-09-16,3,0,0,0,0,3
New model,2021-09-17,3,0,1,0,0,2
New model,2021-09-19,3,0,0,1,0,2
New model,2021-09-21,3,0,0,0,0,3
New model,2021-09-24,3,0,0,0,0,3
New model,2021-09-27,3,0,1,0,0,2
New model,2021-09-28,1,0,0,0,0,1


In [8]:
# Display the per-group status counts over the whole period.
dff_comp_agg.astype(int)

Unnamed: 0_level_0,total,Assigned,Converted,In Process,Pending,Recycled
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
New model,116,22,8,30,2,54
Old model,81,13,2,13,7,46


In [9]:
# get_dummies only creates a column for statuses that occur in the data, so
# reindex to the full set needed here and fill absent statuses with 0 instead
# of failing with a KeyError.
res_test = dff_comp_agg.reindex(
    columns=['total', 'In Process', 'Pending', 'Assigned', 'Converted', 'Recycled'],
    fill_value=0,
)
# Leads still in 'In Process' or 'Pending' are excluded from the denominator.
res_test['total'] = res_test['total'] - res_test['In Process'] - res_test['Pending']
res_test = res_test[['total', 'Assigned', 'Converted', 'Recycled']]

res_test

2021-11-25 12:56:45,961 - INFO - numexpr.utils - Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2021-11-25 12:56:45,962 - INFO - numexpr.utils - NumExpr defaulting to 8 threads.


Unnamed: 0_level_0,total,Assigned,Converted,Recycled
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
New model,84,22,8,54
Old model,61,13,2,46


In [10]:
# Two-sided Fisher exact p-value for the share of 'Recycled' leads between groups.
test('Recycled', res_test)

0.2030628046696725

In [11]:
# Two-sided Fisher exact p-value for the share of 'Converted' leads between groups.
test('Converted', res_test)

0.19213404547000762

#### Тест Фишера
Для конверсий в статусы Converted и Recycled проводился [точный тест Фишера](https://ru.wikipedia.org/wiki/%D0%A2%D0%BE%D1%87%D0%BD%D1%8B%D0%B9_%D1%82%D0%B5%D1%81%D1%82_%D0%A4%D0%B8%D1%88%D0%B5%D1%80%D0%B0).

В ячейках 10 и 11 выведены значения [p-value](https://ru.wikipedia.org/wiki/P-%D0%B7%D0%BD%D0%B0%D1%87%D0%B5%D0%BD%D0%B8%D0%B5). 

```p-value > 0.05``` означает, что на уровне значимости 5% статистически значимая разница между конверсиями не обнаружена (это не доказывает, что разницы нет).