In [1]:
import os
from functools import reduce
from spyt import spark_session
import logging.config
import spyt
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.functions import col, lit
from pyspark.sql.window import Window
from sklearn.metrics import confusion_matrix
from itertools import combinations
from scipy.stats import fisher_exact
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
from clan_tools.utils.spark import SPARK_CONF_MEDIUM
from clan_tools.logging.logger import default_log_config
from clan_tools.data_adapters.YTAdapter import YTAdapter
from clan_tools.data_adapters.crm.CRMModelAdapter import upsale_to_update_leads
from clan_tools.data_adapters.crm.CRMHistoricalDataAdapter import CRMHistoricalDataAdapter

# Notebook-wide settings.
# NOTE: this silences every warning category for the rest of the kernel session.
warnings.filterwarnings("ignore")
# Raise the pandas truncation limits so wide/long summary tables render in full.
for option_name, option_value in (('display.max_rows', 200), ('display.max_columns', 250)):
    pd.set_option(option_name, option_value)

In [2]:
# NOTE(review): this import sits outside the notebook's import cell; consider moving it there.
from clan_tools.secrets.Vault import Vault
# Presumably loads the credentials the YT/Spark clients below rely on — confirm against clan_tools.
Vault().get_secrets()
# `yt` is the module-level client that the helper functions below use through `yt.list`.
yt_adapter = YTAdapter()
yt = yt_adapter.yt

# Open a SPYT Spark session with the shared medium-size configuration.
# NOTE(review): no visible cell stops this session; consider `spark.stop()` once `toPandas()` is done.
spark = spyt.connect(spark_conf_args=SPARK_CONF_MEDIUM)
spyt.info(spark)

2021-11-29 16:05:34,348 - INFO - spyt.client - SPYT Cluster version: 3.0.1-1.20.1+yandex
2021-11-29 16:05:34,350 - INFO - spyt.client - SPYT library version: 1.3.5


In [3]:
def max_by(x, y):
    """Build a Spark SQL ``max_by(x, y)`` column expression.

    ``x`` and ``y`` are column names (or SQL fragments) inserted verbatim into
    the expression text; the result is the value of ``x`` taken from the row
    where ``y`` is largest within the aggregation group.
    """
    sql_text = 'max_by({}, {})'.format(x, y)
    return F.expr(sql_text)


def make_date(string_datetime):
    """Parse a datetime-like string and return it formatted as ``YYYY-MM-DD``."""
    parsed = pd.to_datetime(string_datetime)
    return parsed.strftime('%Y-%m-%d')


def load_all_tables(spark, path):
    """Read every table under a YT folder and stack them into one Spark DataFrame.

    Each table gets a ``date`` column derived from its own name via ``make_date``.

    Parameters
    ----------
    spark : Spark session exposing ``spark.read.yt``.
    path : str
        YT folder whose children are the tables to load.

    Returns
    -------
    Spark DataFrame holding the rows of all tables plus the ``date`` column.

    Raises
    ------
    ValueError
        If the folder contains no tables.
    """
    tables = yt.list(path)
    if not tables:
        # Without this guard `reduce` fails with an opaque "empty sequence" TypeError.
        raise ValueError(f'No tables found under {path}')
    spdfs = [
        spark.read.yt(f'{path}/{table}').withColumn("date", lit(make_date(table)))
        for table in tables
    ]
    # Match columns by name: a positional `union` would silently mix up columns
    # if the tables do not all share the same column order.
    return reduce(lambda stacked, nxt: stacked.unionByName(nxt), spdfs)


def get_last_table(yt_folder):
    """Return the full path of the table in ``yt_folder`` whose name sorts last.

    Presumably the tables are named by date, making this the newest snapshot —
    TODO confirm the naming convention.
    """
    table_names = yt.list(yt_folder)
    return os.path.join(yt_folder, max(table_names))


# YT locations used to build the dataset: the experiment folder read table-by-table,
# and the raw CRM folders from which the last table of each is taken.
ab_test_path = '//home/cloud_analytics/ml/scoring/consumption_predictor_v2/experiment/onboarding'
raw_leads = '//home/cloud_analytics/dwh/raw/crm/leads'
raw_billingaccounts = '//home/cloud_analytics/dwh/raw/crm/billingaccounts'
raw_leads_billing_accounts = '//home/cloud_analytics/dwh/raw/crm/leads_billing_accounts'
raw_tag_bean_rel = '//home/cloud_analytics/dwh/raw/crm/tag_bean_rel'
raw_tags = '//home/cloud_analytics/dwh/raw/crm/tags'

In [4]:
# Experiment assignment: group, table date and billing account for every row of the A/B folder.
ab_assignments = load_all_tables(spark, ab_test_path)
leads_ab = ab_assignments.select('group', 'date', col('ba_id').alias('billing_account_id')).cache()

# Latest table of each raw CRM folder; the aliases are what the qualified column names refer to.
crm_leads = spark.read.yt(get_last_table(raw_leads)).alias('leads')
crm_tag_links = spark.read.yt(get_last_table(raw_tag_bean_rel)).alias('tag_bean_rel')
crm_tags = spark.read.yt(get_last_table(raw_tags)).alias('tags')
crm_lead_ba_links = spark.read.yt(get_last_table(raw_leads_billing_accounts)).alias('leads_billing_accounts')
crm_billing_accounts = spark.read.yt(get_last_table(raw_billingaccounts)).alias('billingaccounts')

# Trial leads with their tags and billing accounts, soft-deleted leads/tags/tag links removed.
# NOTE(review): no `deleted` filter is applied to leads_billing_accounts / billingaccounts —
# confirm whether those tables carry such a flag.
trial_leads = (
    crm_leads
    .join(crm_tag_links, on=col('leads.id') == col('tag_bean_rel.bean_id'), how='inner')
    .join(crm_tags, on=col('tag_bean_rel.tag_id') == col('tags.id'), how='inner')
    .join(crm_lead_ba_links, on=col('leads.id') == col('leads_billing_accounts.leads_id'), how='inner')
    .join(
        crm_billing_accounts,
        on=col('billingaccounts.id') == col('leads_billing_accounts.billingaccounts_id'),
        how='inner',
    )
    .filter(~col('leads.deleted'))
    .filter(~col('tag_bean_rel.deleted'))
    .filter(~col('tags.deleted'))
    .filter(col('leads.lead_source') == 'trial')
)

# One row per billing account: each field comes from the most recently modified lead
# (or, for the tag name, the most recently modified tag).
latest_values = [
    max_by('leads.status', 'leads.date_modified').alias('status'),
    max_by('leads.lead_source_description', 'leads.date_modified').alias('lead_source_description'),
    max_by('leads.assigned_user_id', 'leads.date_modified').alias('assigned_user_id'),
    max_by('tags.name', 'tags.date_modified').alias('tag_name'),
    max_by('leads.org_type', 'leads.date_modified').alias('org_type'),
]
spdf_leads = (
    trial_leads
    .groupby(col('billingaccounts.ba_id').alias('billing_account_id'))
    .agg(*latest_values)
)

# NOTE(review): the same billing account can appear under both groups on different dates
# (visible in the preview below); such rows are counted once per group downstream.
spdf = leads_ab.join(spdf_leads, on='billing_account_id', how='inner')
dft = spdf.toPandas()
dft.head()

Unnamed: 0,billing_account_id,group,date,status,lead_source_description,assigned_user_id,tag_name,org_type
0,dn202nk7gs5smf1q71do,Old model,2021-09-19,Recycled,Client is Individual,1,annulated,company
1,dn202nk7gs5smf1q71do,New model,2021-09-20,Recycled,Client is Individual,1,annulated,company
2,dn206l8akjhv1pf6kli7,Old model,2021-10-16,Recycled,Client is Individual,1,annulated,company
3,dn20afltf05sdmms7vka,Old model,2021-10-24,Recycled,Client is Individual,1,annulated,company
4,dn20gj4ebqldldqofir5,New model,2021-09-28,Recycled,Client is Company,3f2e98a0-1903-11eb-98b5-53f18afd3eb9,potential fraud,company


In [5]:
def test(test_col, df):
    table = df[['total', test_col]]
    table['total'] = table['total'] - table[test_col]
    table = table.T.values
    pval = fisher_exact(table, alternative='two-sided')[1]
    return pval

### Companies

In [6]:
# Company leads only. Tags other than the two tracked ones are pooled into 'other'.
mapping = {
    'already_paid': 'already paid',
    'potential fraud': 'potential fraud',
}
# Filter first, then derive columns with `.assign`: this returns a new frame instead of
# assigning into a filtered slice (the pattern that raises pandas' SettingWithCopyWarning).
dff = (
    dft[dft['lead_source_description'] == 'Client is Company']
    .assign(
        total=1,  # one counting unit per row, summed by the group-bys below
        tag_name=lambda frame: frame['tag_name'].map(mapping).fillna('other'),
    )
)

# 0/1 indicator columns for every status and every (pooled) tag.
dm_sts = pd.get_dummies(dff['status']).astype(int)
dm_tgs = pd.get_dummies(dff['tag_name']).astype(int)

# Counts per (group, date) and per group.
dff_comp_det = pd.concat([dff[['group', 'date', 'total']],
                          dm_sts, dm_tgs], axis=1).groupby(['group', 'date']).sum()
dff_comp_agg = pd.concat([dff[['group', 'total']], dm_sts, dm_tgs], axis=1).groupby(['group']).sum()

In [7]:
# Company leads: status and tag counts per experiment group and date.
dff_comp_det.astype(int)

Unnamed: 0_level_0,Unnamed: 1_level_0,total,Converted,Recycled,already paid,other,potential fraud
group,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
New model,2021-09-17,1,0,1,1,0,0
New model,2021-09-20,1,0,1,0,0,1
New model,2021-09-23,4,1,3,3,1,0
New model,2021-09-24,1,0,1,1,0,0
New model,2021-09-25,2,0,2,1,0,1
New model,2021-09-26,7,0,7,0,0,7
New model,2021-09-28,8,1,7,0,1,7
New model,2021-09-29,4,2,2,0,2,2
New model,2021-09-30,2,1,1,0,1,1
New model,2021-10-02,1,0,1,1,0,0


In [8]:
# Company leads: status and tag counts per experiment group over all dates.
dff_comp_agg.astype(int)

Unnamed: 0_level_0,total,Converted,Recycled,already paid,other,potential fraud
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
New model,94,10,84,48,13,33
Old model,9,4,5,5,4,0


In [9]:
# Two-sided Fisher exact p-value: does the share of 'Converted' company leads differ between groups?
test('Converted', dff_comp_agg)

2021-11-29 16:06:52,296 - INFO - numexpr.utils - Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2021-11-29 16:06:52,297 - INFO - numexpr.utils - NumExpr defaulting to 8 threads.


0.018644394113862813

### Individuals

In [10]:
# Individual leads only, with a unit `total` column so that sums become row counts.
dff = (
    dft
    .assign(total=1)
    .loc[lambda frame: frame['lead_source_description'] == 'Client is Individual']
)

# 0/1 indicator columns for every status and every tag name.
dm_sts = pd.get_dummies(dff['status']).astype(int)
dm_tgs = pd.get_dummies(dff['tag_name']).astype(int)
indicator_frames = [dm_sts, dm_tgs]

# Counts per (group, date) and per group.
dff_ind_det = (
    pd.concat([dff[['group', 'date', 'total']], *indicator_frames], axis=1)
    .groupby(['group', 'date'])
    .sum()
)
dff_ind_agg = (
    pd.concat([dff[['group', 'total']], *indicator_frames], axis=1)
    .groupby(['group'])
    .sum()
)

In [11]:
# Individual leads: status and tag counts per experiment group and date.
dff_ind_det

Unnamed: 0_level_0,Unnamed: 1_level_0,total,Assigned,Awareness,Converted,In Process,Pending,Recycled,already_paid,annulated,new_BA,validated,прозвон_кц
group,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
New model,2021-09-17,26,0,0,1,0,0,25,1,24,0,1,0
New model,2021-09-18,4,0,0,0,0,0,4,1,3,0,0,0
New model,2021-09-19,47,0,0,0,0,0,47,0,47,0,0,0
New model,2021-09-20,35,0,0,0,0,0,35,0,35,0,0,0
New model,2021-09-21,40,0,0,1,0,0,39,0,38,0,2,0
New model,2021-09-22,16,0,0,0,0,0,16,0,16,0,0,0
New model,2021-09-23,14,0,0,0,0,0,14,0,14,0,0,0
New model,2021-09-24,13,0,0,0,0,0,13,0,12,0,1,0
New model,2021-09-25,10,0,0,1,0,0,9,0,9,0,1,0
New model,2021-09-26,34,0,0,0,0,0,34,0,34,0,0,0


In [12]:
# Individual leads: status and tag counts per experiment group over all dates.
dff_ind_agg

Unnamed: 0_level_0,total,Assigned,Awareness,Converted,In Process,Pending,Recycled,already_paid,annulated,new_BA,validated,прозвон_кц
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
New model,1071,50,1,14,2,1,1003,7,971,1,42,50
Old model,523,0,2,7,0,0,514,6,495,0,21,1


In [13]:
# Two-sided Fisher exact p-value: does the share of 'Assigned' individual leads differ between groups?
test('Assigned', dff_ind_agg)

3.1959138401945662e-09