In [1]:
import os, sys, pandas as pd, datetime, numpy as np
# Directory appended to sys.path so that the project-local imports that
# follow can be resolved. The original developer checkout stays the default;
# set CLOUD_ANALYTICS_MODULE_PATH to run the notebook from another machine
# without editing this cell.
module_path = os.path.abspath(
    os.environ.get(
        'CLOUD_ANALYTICS_MODULE_PATH',
        '/Users/ktereshin/Desktop/yandex/arcadia/cloud/analytics/python/work'
    )
)
# Avoid piling up duplicate entries when the cell is re-run.
if module_path not in sys.path:
    sys.path.append(module_path)
from data_loader import clickhouse
from global_variables import (
    metrika_clickhouse_param_dict,
    cloud_clickhouse_param_dict
)
from vault_client import instances

from nile.api.v1 import (
    clusters,
    aggregators as na,
    extractors as ne,
    Record
)

In [2]:
def works_with_emails(mail_):
    """Normalize an e-mail address so that equivalent spellings compare equal.

    The value is converted with ``str()`` and lower-cased. If the part after
    the first ``@`` is a Yandex domain (it has a ``yandex`` or ``ya`` label
    followed by a dot, e.g. ``yandex.ru``, ``ya.ru``, ``yandex.com.tr``), the
    address is rewritten to ``<login>@yandex.ru`` with every ``.`` in the
    login replaced by ``-``.

    Args:
        mail_: the raw address; any object is accepted and stringified.

    Returns:
        The normalized address as a string. Values without ``@`` and
        addresses on other domains are only lower-cased.
    """
    mail_parts = str(mail_).split('@')
    if len(mail_parts) > 1:
        # Prefix a dot so the check only fires on a label boundary; a plain
        # substring test also matched unrelated domains such as
        # "himalaya.com" or "notyandex.ru".
        dotted_domain = '.' + mail_parts[1].lower()
        if '.yandex.' in dotted_domain or '.ya.' in dotted_domain:
            login = mail_parts[0].lower().replace('.', '-')
            return login + '@' + 'yandex.ru'
    return str(mail_).lower()

In [3]:
# Fetch the three secret versions (YT, Metrika ClickHouse, Cloud ClickHouse)
# from the production vault instance, in that order.
client = instances.Production()
yt_creds, metrika_creds, yc_ch_creds = (
    client.get_version(version_id)
    for version_id in (
        'ver-01d33pgv8pzc7t99s3egm24x47',
        'ver-01d2z36msatt9mp9pcfptezksp',
        'ver-01d2z39xj02xw7gqvv9wq757ne',
    )
)

# YT cluster handle used by the cluster.read / cluster.write cells.
cluster = clusters.yt.Hahn(token=yt_creds['value']['token'], pool=yt_creds['value']['pool'])

# Inject login/password into the shared ClickHouse parameter dicts.
metrika_clickhouse_param_dict['user'] = metrika_creds['value']['login']
metrika_clickhouse_param_dict['password'] = metrika_creds['value']['pass']
cloud_clickhouse_param_dict['user'] = yc_ch_creds['value']['login']
cloud_clickhouse_param_dict['password'] = yc_ch_creds['value']['pass']

In [4]:
# Pageview counts per (puid, page_type) from `hits_all`, keeping only users
# that do NOT appear in t1 (users who opened a managed-database cluster page):
# the LEFT JOIN plus the "t1.PassportUserID = 0 OR IS NULL" filter acts as an
# anti-join.
# NOTE(review): the next cell assigns `query` again before anything executes
# this string, so as the notebook stands this variant is never sent to
# ClickHouse.
query = '''
SELECT
    PassportUserID as puid,
    --product,
    page_type,
    COUNT(*) as pageviews
FROM(
    SELECT  
        PassportUserID,
        multiIf(
            match(URL, 'mongodb') , 'mongodb',
            match(URL, 'postgresql'), 'postgresql',
            match(URL, 'clickhouse'), 'clickhouse',
            match(URL, 'redis'), 'redis',
            match(URL, 'mysql'),'mysql',
            'other'
        ) as product,
        multiIf(
            match(URL, 'services/managed-|docs/managed-|folders/.+/managed-') AND match(URL, 'price|pricing'), 'pricing',
            match(URL, 'services/managed-'), 'service_info',
            match(URL, 'docs/managed-'), 'service_doc',
            match(URL, 'folders/.+/managed-.*/cluster/.+'), 'service_cluster_page',
            match(URL, 'folders/.+/managed-.*/create-cluster'), 'service_create_cluster_page',
            match(URL, 'folders/.+/managed-'), 'service_console',
            'other'
        ) as page_type
    FROM
        hits_all
    WHERE
        EventTime > toDate('2019-01-01')
        AND CounterID = 51465824
        AND PassportUserID > 0
        AND (
            (match(URL, 'services/managed-|docs/managed-|folders/.+/managed-') AND match(URL, 'price|pricing'))
            OR match(URL, 'docs/managed-')
            OR match(URL, 'folders/.+/managed-.*/create-cluster')
            )
) as t0
ALL LEFT JOIN(
    SELECT
        DISTINCT PassportUserID
    FROM
        hits_all
    WHERE
        EventTime > toDate('2019-01-01')
        AND CounterID = 51465824
        AND PassportUserID > 0
        --AND match(URL, 'folders/.+/managed-')
        AND match(URL, 'folders/.+/managed-.*/cluster/.+')
) as t1 
ON t0.PassportUserID = t1.PassportUserID
WHERE   
    t1.PassportUserID = 0
    OR t1.PassportUserID IS NULL
GROUP BY
    puid,
    --product,
    page_type
FORMAT TabSeparatedWithNames
'''

In [5]:
# Pageview counts per (puid, product, page_type) from `hits_all` for counter
# 51465824 with EventTime after 2019-01-01 and PassportUserID > 0. Only
# managed-database pricing, docs and create-cluster URLs pass the WHERE
# filter; `product` is derived from the database name found in the URL.
query = '''
SELECT
    PassportUserID as puid,
    product,
    page_type,
    COUNT(*) as pageviews
FROM(
    SELECT  
        PassportUserID,
        multiIf(
            match(URL, 'mongodb') , 'mongodb',
            match(URL, 'postgresql'), 'postgresql',
            match(URL, 'clickhouse'), 'clickhouse',
            match(URL, 'redis'), 'redis',
            match(URL, 'mysql'),'mysql',
            'other'
        ) as product,
        multiIf(
            match(URL, 'services/managed-|docs/managed-|folders/.+/managed-') AND match(URL, 'price|pricing'), 'pricing',
            match(URL, 'services/managed-'), 'service_info',
            match(URL, 'docs/managed-'), 'service_doc',
            match(URL, 'folders/.+/managed-.*/cluster/.+'), 'service_cluster_page',
            match(URL, 'folders/.+/managed-.*/create-cluster'), 'service_create_cluster_page',
            match(URL, 'folders/.+/managed-'), 'service_console',
            'other'
        ) as page_type
    FROM
        hits_all
    WHERE
        EventTime > toDate('2019-01-01')
        --AND EventTime <= toDate('2019-02-28')
        AND CounterID = 51465824
        AND PassportUserID > 0
        AND (
            (match(URL, 'services/managed-|docs/managed-|folders/.+/managed-') AND match(URL, 'price|pricing'))
            OR match(URL, 'docs/managed-')
            OR match(URL, 'folders/.+/managed-.*/create-cluster')
            )
) as t0
GROUP BY
    puid,
    product,
    page_type
FORMAT TabSeparatedWithNames
'''

In [6]:
# The query text travels as one more key of the shared Metrika parameter
# dict, which is then expanded into keyword arguments of the loader.
metrika_clickhouse_param_dict['query'] = query
data = clickhouse.get_clickhouse_data(**metrika_clickhouse_param_dict)

In [7]:
# Cast the counts to int before aggregating.
# NOTE(review): presumably the loader returns TSV fields as strings — confirm.
data['pageviews'] = data['pageviews'].astype(int)

In [8]:
# One row per (puid, product) with a pageview column per page_type.
# aggfunc is set explicitly: pivot_table defaults to 'mean', which would
# average the counts instead of adding them if a key ever repeated. With the
# query's GROUP BY each key occurs once, so the result is unchanged.
pv = pd.pivot_table(
    data,
    index = ['puid', 'product'],
    columns = ['page_type'],
    values = 'pageviews',
    aggfunc = 'sum',
    fill_value = 0
).reset_index()

In [9]:
pv

page_type,puid,product,pricing,service_create_cluster_page,service_doc
0,100003556,mysql,0,0,1
1,100102193,postgresql,1,0,0
2,100456022,postgresql,0,0,10
3,10065226,postgresql,0,1,0
4,100670388,clickhouse,1,0,0
5,100818,clickhouse,2,0,3
6,101001344,mongodb,1,1,1
7,101442810,postgresql,2,0,1
8,101448625,clickhouse,1,0,0
9,101534172,clickhouse,1,0,0


In [10]:
# Total pageviews per (puid, product) across the three page-type columns
# produced by the pivot above.
pv['total_pv'] = pv['pricing'] + pv['service_create_cluster_page'] + pv['service_doc']

In [11]:
# Reshape to one row per puid with a total-pageview column per product.
# aggfunc is set explicitly: the pivot_table default is 'mean', which is the
# wrong aggregate for counts if a (puid, product) pair ever repeated.
pv = pd.pivot_table(
    pv,
    index = ['puid'],
    columns = ['product'],
    values = 'total_pv',
    aggfunc = 'sum',
    fill_value = 0
).reset_index()

In [12]:
# Total (real + trial) consumption per user and service from the acquisition
# cube, restricted to rows where at least one of the mail_* flags equals 1.
# Every service whose name contains "mdb" is collapsed into one `mdb` bucket.
query = '''
SELECT
    puid,
    email,
    user_settings_email,
    ba_state,
    multiIf( service_name LIKE '%mdb%', 'mdb', service_name) as service,
    SUM(real_consumption) + SUM(trial_consumption) as total_consumption

FROM
    cloud_analytics_testing.acquisition_cube_test
WHERE
    mail_promo == 1 OR mail_event == 1 OR mail_feature == 1 OR mail_info == 1 OR mail_testing == 1
GROUP BY
    puid,
    service,
    user_settings_email,
    ba_state,
    email
FORMAT TabSeparatedWithNames
'''

# Same loader as for Metrika, but with the Cloud ClickHouse parameter dict.
cloud_clickhouse_param_dict['query'] = query
cons = clickhouse.get_clickhouse_data(**cloud_clickhouse_param_dict)

In [13]:
# Make the consumption numeric so it can be aggregated and compared.
cons['total_consumption'] = cons['total_consumption'].astype(float)

In [14]:
# One row per user (puid, email, user_settings_email, ba_state) with a
# consumption column per service. aggfunc is set explicitly because the
# pivot_table default ('mean') would average amounts rather than add them if
# a key ever repeated.
cons_by_product = pd.pivot_table(
    cons,
    index = ['puid', 'email', 'user_settings_email', 'ba_state'],
    columns = 'service',
    values = 'total_consumption',
    aggfunc = 'sum',
    fill_value = 0
).reset_index()

In [15]:
# Consumption outside managed databases, computed as the sum of three services.
# NOTE(review): `storage` and `nlb` also show up as service columns in the
# frames displayed further down but are not added here — confirm that
# leaving them out is intended.
cons_by_product['not_mdb'] = cons_by_product['cloud_ai'] + cons_by_product['cloud_network'] + cons_by_product['compute']

In [16]:
# Attach consumption columns to every pageview row; users without a match in
# cons_by_product are kept (left join) and get NaN there.
res = pv.merge(cons_by_product, on='puid', how='left')

In [17]:
# Keep rows that have a real e-mail: not missing after the left join and not
# the literal two-character marker backslash-N. The backslash is escaped
# because a bare '\N' is a malformed named-character escape on Python 3.
res = res[(res['email'] != '\\N') & (res['email'].notnull())]

In [18]:
res['email'].nunique()

1677

In [19]:
# Target audience: mdb consumption of at most 10 and strictly positive
# consumption in the services summed into not_mdb.
res = res.loc[res['mdb'].le(10) & res['not_mdb'].gt(0)]

In [21]:
# Replace `email` with the normalized user_settings_email. `assign` builds a
# new frame, so no column is set on a filtered slice of the earlier `res`
# (which would trigger pandas' SettingWithCopyWarning).
res = res.assign(email=res['user_settings_email'].apply(works_with_emails))

In [22]:
# Load a previously written sample table from YT; the path is pinned to the
# table dated 2019-02-28.
sample = cluster.read('//home/cloud_analytics/emailing/samples/2019-02-28_mdb_visits_not_consumed_all_product').as_dataframe()

In [23]:
sample[sample['sample'] == 'test']

Unnamed: 0,ba_state,cloud_ai,cloud_network,compute,email,mdb,nlb,not_mdb,pricing,puid,sample,service_create_cluster_page,service_doc,storage,target,total_pv
0,active,0.000000,0.050842,0.169593,a-arepo@yandex.ru,0.000000,0,0.220435,0,100003556,test,0,1,0.000000,start_consume_mdb,1
1,active,0.000000,125.366087,444.942908,dennisvoloshko@yandex.ru,0.000000,0,570.308995,1,101448625,test,0,0,0.000612,start_consume_mdb,1
2,suspended,0.000000,79.855024,6118.524804,bubushic2010@yandex.ru,0.000000,0,6198.379828,1,101534172,test,0,0,0.000000,start_consume_mdb,1
4,active,1.828800,0.000196,0.090633,domackii@yandex.ru,0.000000,0,1.919629,3,104753941,test,0,7,0.000611,start_consume_mdb,10
5,active,0.000000,1534.310348,5652.670600,crapulya@yandex.ru,0.000000,0,7186.980948,0,104972765,test,1,2,0.000000,start_consume_mdb,3
6,active,0.000000,130.846762,127.615300,kareem2005@yandex.ru,0.000000,0,258.462061,2,10641749,test,0,4,0.000000,start_consume_mdb,6
7,suspended,0.000000,18.304374,1160.322149,ndrewk@yandex.ru,0.000000,0,1178.626523,0,106563898,test,2,0,0.000000,start_consume_mdb,2
8,active,7.674588,14.398822,38.540795,cartier1@yandex.ru,0.000000,0,60.614204,0,10684335,test,1,0,0.000696,start_consume_mdb,1
9,suspended,0.000000,2.581429,1373.795990,rem-58@yandex.ru,0.000000,0,1376.377419,0,107429777,test,0,1,0.527503,start_consume_mdb,1
11,active,0.000000,110.259516,1235.546564,dmitryaswork@yandex.ru,0.000000,0,1345.806080,0,107761755,test,1,0,0.000000,start_consume_mdb,1


In [24]:
# Left-join the 'test' half of the saved sample with the current `res`.
# NOTE(review): the following cell assigns `res_` again (control half, inner
# join), so in top-to-bottom order this result is discarded.
res_ = pd.merge(
    sample[sample['sample'] == 'test'][['puid', 'sample']],
    res,
    on = 'puid',
    how = 'left'
)

In [37]:
# Rows of `res` whose puid belongs to the 'control' half of the saved sample;
# the inner join drops every other user.
res_ = res.merge(
    sample.loc[sample['sample'] == 'control', ['puid', 'sample']],
    on='puid',
    how='inner'
)

In [38]:
res_

Unnamed: 0,puid,clickhouse,mongodb,mysql,postgresql,redis,email,user_settings_email,ba_state,\N,cloud_ai,cloud_network,compute,mdb,nlb,storage,not_mdb,sample
0,102251436,0,0,0,6,0,megacall1@yandex.ru,megacall1@yandex.ru,active,0.0,2.459472,220.295563,202.390566,0.000000,0.0,0.000000,425.145601,control
1,107540248,0,0,0,1,0,rocco66max@yandex.ru,rocco66max@yandex.ru,suspended,0.0,0.000000,148.251199,544.633839,0.000000,0.0,0.000000,692.885038,control
2,110704472,2,0,0,3,0,serg2011-a@yandex.ru,serg2011-a@yandex.ru,active,0.0,0.000000,64.669531,198.734027,0.000000,0.0,0.000000,263.403558,control
3,11256450,2,0,0,0,0,pomah3927@yandex.ru,POMAH3927@yandex.ru,suspended,0.0,0.000000,416.140520,982.064272,0.000000,0.0,64.813604,1398.204792,control
4,112643060,0,0,2,0,0,dihippi@yandex.ru,dihippi@yandex.ru,active,0.0,0.000000,452.796300,1784.668208,0.133050,0.0,0.000000,2237.464507,control
5,1130000000132271,2,0,0,2,0,ivan@iptt.ru,ivan@iptt.ru,suspended,0.0,0.000000,320.439337,606.755987,0.000000,0.0,0.001839,927.195324,control
6,1130000001291773,0,0,0,3,0,mail@kerby.ru,mail@kerby.ru,active,0.0,0.000000,96.208010,255.308791,0.000000,0.0,0.000000,351.516802,control
7,1130000010413641,3,0,0,0,0,alexey@donin.biz,alexey@donin.biz,active,0.0,0.000000,0.172926,0.027160,0.000000,0.0,0.000000,0.200086,control
8,1130000019133224,0,3,0,0,0,i@trainin.ru,i@trainin.ru,active,0.0,0.000000,0.658233,0.000000,0.000000,0.0,0.478515,0.658233,control
9,1130000019437483,0,0,0,3,0,kia@telestore.ru,kia@telestore.ru,active,0.0,0.000000,123.781017,584.529635,0.000000,0.0,0.000000,708.310652,control


In [39]:
# `random` is imported here rather than in the first cell; get_service,
# defined in a later cell, calls random.shuffle and relies on this import.
import random
# Scratch check of random.shuffle on the product list (shuffles in place).
list_ = ['clickhouse','mongodb', 'mysql', 'postgresql', 'redis']
random.shuffle(list_)
list_

['clickhouse', 'mysql', 'postgresql', 'mongodb', 'redis']

In [40]:
list_

['clickhouse', 'mysql', 'postgresql', 'mongodb', 'redis']

In [41]:
def get_service(row):
    """Return the name of the product column holding the largest value in ``row``.

    The five product names are visited in a freshly shuffled order and a
    later candidate replaces the current best when its value is greater than
    or equal to it, so ties are resolved at random.

    Args:
        row: mapping-like object (e.g. a DataFrame row) indexable by the
            product names 'clickhouse', 'mongodb', 'mysql', 'postgresql'
            and 'redis'.

    Returns:
        The winning product name, or '' when no value is >= -1.
    """
    candidates = ['clickhouse', 'mongodb', 'mysql', 'postgresql', 'redis']
    random.shuffle(candidates)
    best_name, best_value = '', -1
    for name in candidates:
        value = row[name]
        if not value >= best_value:
            continue
        best_name, best_value = name, value
    return best_name

In [42]:
# Per row, the product with the most pageviews; get_service breaks ties at
# random, so this column can differ between runs.
res_['service'] = res_.apply(get_service, axis=1)

In [43]:
res_

Unnamed: 0,puid,clickhouse,mongodb,mysql,postgresql,redis,email,user_settings_email,ba_state,\N,cloud_ai,cloud_network,compute,mdb,nlb,storage,not_mdb,sample,service
0,102251436,0,0,0,6,0,megacall1@yandex.ru,megacall1@yandex.ru,active,0.0,2.459472,220.295563,202.390566,0.000000,0.0,0.000000,425.145601,control,postgresql
1,107540248,0,0,0,1,0,rocco66max@yandex.ru,rocco66max@yandex.ru,suspended,0.0,0.000000,148.251199,544.633839,0.000000,0.0,0.000000,692.885038,control,postgresql
2,110704472,2,0,0,3,0,serg2011-a@yandex.ru,serg2011-a@yandex.ru,active,0.0,0.000000,64.669531,198.734027,0.000000,0.0,0.000000,263.403558,control,postgresql
3,11256450,2,0,0,0,0,pomah3927@yandex.ru,POMAH3927@yandex.ru,suspended,0.0,0.000000,416.140520,982.064272,0.000000,0.0,64.813604,1398.204792,control,clickhouse
4,112643060,0,0,2,0,0,dihippi@yandex.ru,dihippi@yandex.ru,active,0.0,0.000000,452.796300,1784.668208,0.133050,0.0,0.000000,2237.464507,control,mysql
5,1130000000132271,2,0,0,2,0,ivan@iptt.ru,ivan@iptt.ru,suspended,0.0,0.000000,320.439337,606.755987,0.000000,0.0,0.001839,927.195324,control,clickhouse
6,1130000001291773,0,0,0,3,0,mail@kerby.ru,mail@kerby.ru,active,0.0,0.000000,96.208010,255.308791,0.000000,0.0,0.000000,351.516802,control,postgresql
7,1130000010413641,3,0,0,0,0,alexey@donin.biz,alexey@donin.biz,active,0.0,0.000000,0.172926,0.027160,0.000000,0.0,0.000000,0.200086,control,clickhouse
8,1130000019133224,0,3,0,0,0,i@trainin.ru,i@trainin.ru,active,0.0,0.000000,0.658233,0.000000,0.000000,0.0,0.478515,0.658233,control,mongodb
9,1130000019437483,0,0,0,3,0,kia@telestore.ru,kia@telestore.ru,active,0.0,0.000000,123.781017,584.529635,0.000000,0.0,0.000000,708.310652,control,postgresql


In [30]:
res_[res_['sample'] != 'test']

Unnamed: 0,puid,sample,clickhouse,mongodb,mysql,postgresql,redis,email,user_settings_email,ba_state,\N,cloud_ai,cloud_network,compute,mdb,nlb,storage,not_mdb,service


In [None]:
res_[res_['ba_state'].notna()]

In [32]:
# Read back the by-product sample table dated 2019-03-04.
# NOTE(review): `test_` is not referenced by any other cell visible in this
# notebook.
test_ = cluster.read('//home/cloud_analytics/emailing/samples/2019-03-04_mdb_visits_not_consumed_by_product').as_dataframe()

In [45]:
res_

Unnamed: 0,puid,clickhouse,mongodb,mysql,postgresql,redis,email,user_settings_email,ba_state,\N,cloud_ai,cloud_network,compute,mdb,nlb,storage,not_mdb,sample,service
0,102251436,0,0,0,6,0,megacall1@yandex.ru,megacall1@yandex.ru,active,0.0,2.459472,220.295563,202.390566,0.000000,0.0,0.000000,425.145601,control,postgresql
1,107540248,0,0,0,1,0,rocco66max@yandex.ru,rocco66max@yandex.ru,suspended,0.0,0.000000,148.251199,544.633839,0.000000,0.0,0.000000,692.885038,control,postgresql
2,110704472,2,0,0,3,0,serg2011-a@yandex.ru,serg2011-a@yandex.ru,active,0.0,0.000000,64.669531,198.734027,0.000000,0.0,0.000000,263.403558,control,postgresql
3,11256450,2,0,0,0,0,pomah3927@yandex.ru,POMAH3927@yandex.ru,suspended,0.0,0.000000,416.140520,982.064272,0.000000,0.0,64.813604,1398.204792,control,clickhouse
4,112643060,0,0,2,0,0,dihippi@yandex.ru,dihippi@yandex.ru,active,0.0,0.000000,452.796300,1784.668208,0.133050,0.0,0.000000,2237.464507,control,mysql
5,1130000000132271,2,0,0,2,0,ivan@iptt.ru,ivan@iptt.ru,suspended,0.0,0.000000,320.439337,606.755987,0.000000,0.0,0.001839,927.195324,control,clickhouse
6,1130000001291773,0,0,0,3,0,mail@kerby.ru,mail@kerby.ru,active,0.0,0.000000,96.208010,255.308791,0.000000,0.0,0.000000,351.516802,control,postgresql
7,1130000010413641,3,0,0,0,0,alexey@donin.biz,alexey@donin.biz,active,0.0,0.000000,0.172926,0.027160,0.000000,0.0,0.000000,0.200086,control,clickhouse
8,1130000019133224,0,3,0,0,0,i@trainin.ru,i@trainin.ru,active,0.0,0.000000,0.658233,0.000000,0.000000,0.0,0.478515,0.658233,control,mongodb
9,1130000019437483,0,0,0,3,0,kia@telestore.ru,kia@telestore.ru,active,0.0,0.000000,123.781017,584.529635,0.000000,0.0,0.000000,708.310652,control,postgresql


In [44]:
# Write the rows that have a ba_state to a YT table whose name starts with
# today's date, after dropping the service column literally named
# backslash-N.
cluster.write('//home/cloud_analytics/emailing/samples/{0}_mdb_visits_not_consumed_by_product'.format(datetime.date.today()), res_[res_['ba_state'].notna()].drop(["\\N"], axis=1))

In [46]:
res_[res_['ba_state'].notna()].drop(["\\N"], axis=1)

Unnamed: 0,puid,clickhouse,mongodb,mysql,postgresql,redis,email,user_settings_email,ba_state,cloud_ai,cloud_network,compute,mdb,nlb,storage,not_mdb,sample,service
0,102251436,0,0,0,6,0,megacall1@yandex.ru,megacall1@yandex.ru,active,2.459472,220.295563,202.390566,0.000000,0.0,0.000000,425.145601,control,postgresql
1,107540248,0,0,0,1,0,rocco66max@yandex.ru,rocco66max@yandex.ru,suspended,0.000000,148.251199,544.633839,0.000000,0.0,0.000000,692.885038,control,postgresql
2,110704472,2,0,0,3,0,serg2011-a@yandex.ru,serg2011-a@yandex.ru,active,0.000000,64.669531,198.734027,0.000000,0.0,0.000000,263.403558,control,postgresql
3,11256450,2,0,0,0,0,pomah3927@yandex.ru,POMAH3927@yandex.ru,suspended,0.000000,416.140520,982.064272,0.000000,0.0,64.813604,1398.204792,control,clickhouse
4,112643060,0,0,2,0,0,dihippi@yandex.ru,dihippi@yandex.ru,active,0.000000,452.796300,1784.668208,0.133050,0.0,0.000000,2237.464507,control,mysql
5,1130000000132271,2,0,0,2,0,ivan@iptt.ru,ivan@iptt.ru,suspended,0.000000,320.439337,606.755987,0.000000,0.0,0.001839,927.195324,control,clickhouse
6,1130000001291773,0,0,0,3,0,mail@kerby.ru,mail@kerby.ru,active,0.000000,96.208010,255.308791,0.000000,0.0,0.000000,351.516802,control,postgresql
7,1130000010413641,3,0,0,0,0,alexey@donin.biz,alexey@donin.biz,active,0.000000,0.172926,0.027160,0.000000,0.0,0.000000,0.200086,control,clickhouse
8,1130000019133224,0,3,0,0,0,i@trainin.ru,i@trainin.ru,active,0.000000,0.658233,0.000000,0.000000,0.0,0.478515,0.658233,control,mongodb
9,1130000019437483,0,0,0,3,0,kia@telestore.ru,kia@telestore.ru,active,0.000000,123.781017,584.529635,0.000000,0.0,0.000000,708.310652,control,postgresql


In [None]:
# Assign every row to 'control' or 'test' with equal probability in a single
# vectorized draw (the per-row lambda ignored its argument anyway). `assign`
# returns a new frame, so nothing is set on a filtered slice.
# The draw is unseeded: the split differs between runs.
res = res.assign(sample=np.random.choice(['control', 'test'], size=len(res)))

In [None]:
# Constant label stored with every row of the sample.
res['target'] = 'start_consume_mdb'

In [None]:
# Preview `res` without the service column literally named backslash-N.
# The backslash is escaped: a bare '\N' is a malformed escape on Python 3.
res.drop('\\N', axis=1)

In [None]:
# Persist the sample to a YT table whose name starts with today's date,
# without the backslash-N service column and the raw user_settings_email.
# The backslash is escaped: a bare '\N' is a malformed escape on Python 3.
cluster.write(
    '//home/cloud_analytics/emailing/samples/{0}_mdb_visits_not_consumed_all_product'.format(datetime.date.today()),
    res.drop(['\\N', 'user_settings_email'], axis=1)
)

In [None]:
# Reload the sample table dated 2019-02-28 (same path as the earlier
# cluster.read cell) as the basis for the conversion analysis below.
sample = cluster.read('//home/cloud_analytics/emailing/samples/2019-02-28_mdb_visits_not_consumed_all_product').as_dataframe()

In [None]:
# Total (real + trial) consumption per puid over all services whose name
# contains "mdb", taken from the acquisition cube.
query = '''
SELECT
    puid,
    SUM(real_consumption) + SUM(trial_consumption) as mdb_consumption

FROM
    cloud_analytics_testing.acquisition_cube_test
WHERE
    service_name LIKE '%mdb%'
GROUP BY
    puid
FORMAT TabSeparatedWithNames
'''

# Executed through the Cloud ClickHouse parameter dict.
cloud_clickhouse_param_dict['query'] = query
mdb_cons = clickhouse.get_clickhouse_data(**cloud_clickhouse_param_dict)

In [None]:
# Make the consumption numeric so the threshold comparisons below work.
mdb_cons['mdb_consumption'] = mdb_cons['mdb_consumption'].astype(float)

In [None]:
# Every sampled user with their mdb consumption; users with no row in
# mdb_cons are kept (left join) and get NaN.
res = sample.merge(mdb_cons, on='puid', how='left')

In [None]:
res

In [None]:
# Users without any mdb row count as zero consumption.
res['mdb_consumption'] = res['mdb_consumption'].fillna(0)
# 0/1 flags computed with vectorized comparisons instead of per-row lambdas.
# `convertion` is averaged and `users` is summed in the group-by cells below;
# both hold the same flag, so the second is copied from the first.
# (Column names, including the "convertion" spelling, are kept because later
# cells reference them.)
res['convertion'] = (res['mdb_consumption'] > 0).astype('int64')
res['users'] = res['convertion']
# Same pair of flags for consumption strictly above 10.
res['convertion_10_rub'] = (res['mdb_consumption'] > 10).astype('int64')
res['users_10_rub'] = res['convertion_10_rub']

In [None]:
res[res['mdb_consumption']>10]

In [None]:
res.groupby('sample').agg({'email':'count', 'users':'sum', 'convertion':'mean'}).reset_index()

In [None]:
res.groupby('sample').agg({'email':'count', 'users_10_rub':'sum', 'convertion_10_rub':'mean'}).reset_index()

In [None]:
res[res['mdb_consumption'] > 10][['email', 'sample']]