In [2]:
import pandas as pd, datetime, ast, os,sys, pymysql, logging, requests
module_path = os.path.abspath(os.path.join('/home/ktereshin/yandex/arcadia/cloud/analytics/python/work'))
if module_path not in sys.path:
    sys.path.append(module_path)
from data_loader import clickhouse
from global_variables import (
    metrika_clickhouse_param_dict,
    cloud_clickhouse_param_dict
)
from nile.api.v1 import (
    clusters,
    aggregators as na,
    extractors as ne,
    filters as nf,
    Record
)
from vault_client import instances

In [3]:
def execute_query(query, cluster, alias, token, timeout=600):
    """Run a query through the YT HTTP proxy and return the result rows.

    Args:
        query: Query text; sent as the POST body.
        cluster: YT cluster name, substituted into ``http://<cluster>.yt.yandex.net``.
        alias: Value of the ``database`` URL parameter.
        token: Value of the ``password`` URL parameter.
        timeout: Timeout in seconds passed to ``requests``.

    Returns:
        A list with one string per result line (columns are still tab-separated).
        An empty response yields an empty list.

    Raises:
        requests.HTTPError: if the proxy answers with an error status
            (status, headers and body are logged first).
    """
    logger.info("Executing query: %s", query)
    proxy = "http://{}.yt.yandex.net".format(cluster)
    # NOTE(review): the token travels in the URL, so it can show up in HTTP error
    # messages/tracebacks — confirm whether the proxy accepts it in a header instead.
    url = "{proxy}/query?database={alias}&password={token}".format(proxy=proxy, alias=alias, token=token)
    # The context manager releases the session's pooled connections; the response
    # body is already fully read at this point because streaming is not requested.
    with requests.Session() as s:
        resp = s.post(url, data=query, timeout=timeout)
    if resp.status_code != 200:
        logger.error("Response status: %s", resp.status_code)
        logger.error("Response headers: %s", resp.headers)
        logger.error("Response content: %s", resp.content)
    resp.raise_for_status()

    payload = resp.content
    # On Python 3 the body is bytes and must be decoded before splitting on a str;
    # on Python 2 bytes is str, so the value is used unchanged as before.
    if not isinstance(payload, str):
        payload = payload.decode('utf-8')
    # Strip only newlines: a bare strip() would also eat leading/trailing tabs and
    # silently drop an empty first/last column from the result.
    payload = payload.strip('\n')
    # ''.split('\n') is [''], which callers would treat as one malformed row.
    rows = payload.split('\n') if payload else []
    logger.info("Time spent: %s seconds, rows returned: %s", resp.elapsed.total_seconds(), len(rows))
    return rows

In [4]:
# Logger used by execute_query.
logger = logging.getLogger(__name__)

# Read the YT credentials (token and pool) from the secret version below
# through the vault_client production instance.
client = instances.Production()
yt_creds = client.get_version('ver-01d33pgv8pzc7t99s3egm24x47')

# nile handle for the Hahn cluster, used by the write cells further down.
cluster_yt = clusters.yt.Hahn(
    token=yt_creds['value']['token'],
    pool=yt_creds['value']['pool'],
).env(templates={'dates': '{2019-03-28..2019-03-31}'})

In [5]:
# Connection settings handed to execute_query by every query cell.
cluster, alias = 'hahn', "*ch_public"
token = '%s' % yt_creds['value']['token']

In [6]:
# Cohort and conversion label.
# Inner SELECT: acquisition-cube rows with event = 'first_trial_consumption' whose event date
# is after 2018-12-10 and at least 70 days before NOW(). last_event_datetime is the first trial
# consumption time plus 7 days; a NULL/empty first paid consumption time is replaced by the
# sentinel '2020-01-01 00:00:00'.
# Outer SELECT: keeps users whose first paid consumption (or the sentinel) is later than that
# 7-day window; start_paid_consumption is 1 when the paid time is earlier than the sentinel,
# i.e. a real value was present, else 0.
# NOTE(review): NOW() makes the cohort depend on the day the notebook is run.
query = '''
SELECT
    DISTINCT
    puid,
    multiIf(first_first_paid_consumption_datetime < '2020-01-01 00:00:00', 1,0) as start_paid_consumption
FROM(
    SELECT
        puid,
        first_first_trial_consumption_datetime,
        addDays(toDateTime(first_first_trial_consumption_datetime), 7) AS last_event_datetime,
        multiIf(first_first_paid_consumption_datetime IS NULL OR first_first_paid_consumption_datetime = '','2020-01-01 00:00:00',first_first_paid_consumption_datetime) as first_first_paid_consumption_datetime
    FROM
        "//home/cloud_analytics_test/cubes/acquisition_cube/cube"
    WHERE
        event = 'first_trial_consumption'
        AND toDate(event_time) > toDate('2018-12-10')
        AND toDate(event_time) <= toDate(addDays(NOW(), -70))
)
WHERE
    toDateTime(first_first_paid_consumption_datetime) > toDateTime(last_event_datetime)
'''

# Each returned line is one tab-separated row.
result = execute_query(query=query, cluster=cluster, alias=alias, token=token)

# All values are still strings at this point; the label is cast to int in the next cell.
users = pd.DataFrame([row.split('\t') for row in result], columns = ['puid', 'start_paid_consumption'])

In [7]:
# The query result arrives as text; turn the label column into integers.
int_cols = ['start_paid_consumption']
users = users.astype(dict.fromkeys(int_cols, int))

In [8]:
# Per-service "started paid usage" flags for the cohort users.
# Innermost SELECT: first 'day_use' time with real_consumption > 0 per (puid, service), where
#   every service name containing 'mdb' is collapsed into 'mdb'.
# Middle SELECT: pivots those rows into one row per puid. For each service the rows of the
#   *other* services contribute the sentinel '2020-01-01 00:00:00', so the aggregate has to be
#   MIN: with MAX the sentinel wins as soon as a user has any second service, and every
#   multi-service user ended up with all flags equal to 0.
# Outer SELECT: a flag is 1 when the pivoted time is earlier than the sentinel.
# t1 is the cohort subquery (first trial consumption after 2018-12-10 and at least 70 days
#   ago, no paid consumption within 7 days of it); the join restricts the flags to the cohort.
# NOTE(review): times are compared as strings against a 2020-01-01 sentinel, so usage that
#   starts on or after that date is reported as 0 — confirm this is acceptable.
query = '''
SELECT
    DISTINCT
    t0.puid,
    multiIf(start_compute < '2020-01-01 00:00:00', 1,0) as start_compute,
    multiIf(start_mdb < '2020-01-01 00:00:00', 1,0) as start_mdb,
    multiIf(start_storage < '2020-01-01 00:00:00', 1,0) as start_storage,
    multiIf(start_ai < '2020-01-01 00:00:00', 1,0) as start_ai
FROM(
    SELECT
        puid,
        MIN(multiIf(service_name = 'compute', time, '2020-01-01 00:00:00')) as start_compute,
        MIN(multiIf(service_name = 'mdb', time, '2020-01-01 00:00:00')) as start_mdb,
        MIN(multiIf(service_name = 'storage', time, '2020-01-01 00:00:00')) as start_storage,
        MIN(multiIf(service_name = 'cloud_ai', time, '2020-01-01 00:00:00')) as start_ai
    FROM(
        SELECT
            puid,
            multiIf(service_name LIKE '%mdb%', 'mdb', service_name) as service_name,
            MIN(event_time) as time
        FROM
            "//home/cloud_analytics_test/cubes/acquisition_cube/cube"
        WHERE
            event = 'day_use'
            AND multiIf(service_name LIKE '%mdb%', 'mdb', service_name) IN ('compute', 'mdb', 'cloud_ai', 'storage')
            AND real_consumption > 0
        GROUP BY
            puid,
            service_name
    )
    GROUP BY
        puid
    ) as t0
ANY INNER JOIN(
    SELECT
        puid
    FROM(
        SELECT
            puid,
            first_first_trial_consumption_datetime,
            addDays(toDateTime(first_first_trial_consumption_datetime), 7) AS last_event_datetime,
            multiIf(first_first_paid_consumption_datetime IS NULL OR first_first_paid_consumption_datetime = '','2020-01-01 00:00:00',first_first_paid_consumption_datetime) as first_first_paid_consumption_datetime
        FROM
            "//home/cloud_analytics_test/cubes/acquisition_cube/cube"
        WHERE
            event = 'first_trial_consumption'
            AND toDate(event_time) > toDate('2018-12-10')
            AND toDate(event_time) <= toDate(addDays(NOW(), -70))
    )
    WHERE
        toDateTime(first_first_paid_consumption_datetime) > toDateTime(last_event_datetime)
) as t1
ON t0.puid = t1.puid
'''

# Each returned line is one tab-separated row; values stay strings until the cast cell below.
result = execute_query(query=query, cluster=cluster, alias=alias, token=token)
users_services = pd.DataFrame([row.split('\t') for row in result], columns = ['puid', 'start_compute', 'start_mdb', 'start_storage', 'start_ai'])

In [9]:
# The per-service flags come back as text; store them as integers.
int_cols = ['start_compute', 'start_mdb', 'start_storage', 'start_ai']
users_services = users_services.astype(dict.fromkeys(int_cols, int))

In [14]:
# Header of the consumption-statistics frame: the user id, the total paid consumption,
# then eight aggregate columns for each of the ten consumption categories.
_consumption_groups = (
    'all', 'mdb', 'ai', 'storage', 'network',
    'nlb', 'marketplace', 'nbs', 'snapshot', 'image',
)
_aggregate_suffixes = ('count', 'sum', 'avg', 'max', 'min', 'median', 'std', 'count_more_avg')

columns = ['puid', 'all_paid_consumption']
for _group in _consumption_groups:
    for _suffix in _aggregate_suffixes:
        columns.append('{}_trial_consumption_{}'.format(_group, _suffix))

In [15]:
# Per-user trial-consumption statistics for the days before last_event_datetime
# (first trial consumption + 7 days).
# Inner left side (t0): 'day_use' rows summed per (puid, date) into total trial and paid
#   consumption plus per-category trial consumption; the category is picked with LIKE patterns
#   on service_name (compute, mdb, ai, storage, network, nlb) or name (marketplace, nbs,
#   snapshot, image).
# Inner right side (t1): the cohort subquery (first trial consumption after 2018-12-10 and at
#   least 70 days ago, no paid consumption within 7 days of it).
# Outer SELECT, per puid and category: number of days with consumption > 0, sum, mean, max,
#   min, median, population standard deviation, and the number of days above the user's mean.
# NOTE(review): compute_trial_consumption is calculated in the inner SELECT but never
#   aggregated in the outer one — confirm that leaving compute out is intentional.
# NOTE(review): in LIKE '%_ai%' the underscore is itself a single-character wildcard — confirm
#   that matching any character before 'ai' is acceptable.
query = '''
SELECT
    puid,
    SUM(all_paid_consumption) as all_paid_consumption,
    SUM(multiIf(all_trial_consumption > 0, 1,0)) as all_trial_consumption_count,
    SUM(all_trial_consumption) as all_trial_consumption_sum,
    AVG(all_trial_consumption) as all_trial_consumption_avg,
    MAX(all_trial_consumption) as all_trial_consumption_max,
    MIN(all_trial_consumption) as all_trial_consumption_min,
    median(all_trial_consumption) as all_trial_consumption_median,
    stddevPop(all_trial_consumption) as all_trial_consumption_std,
    arraySum(arrayMap(x -> x > all_trial_consumption_avg, groupArray(all_trial_consumption))) as all_trial_consumption_count_more_avg,
    SUM(multiIf(mdb_trial_consumption > 0, 1,0)) as mdb_trial_consumption_count,
    SUM(mdb_trial_consumption) as mdb_trial_consumption_sum,
    AVG(mdb_trial_consumption) as mdb_trial_consumption_avg,
    MAX(mdb_trial_consumption) as mdb_trial_consumption_max,
    MIN(mdb_trial_consumption) as mdb_trial_consumption_min,
    median(mdb_trial_consumption) as mdb_trial_consumption_median,
    stddevPop(mdb_trial_consumption) as mdb_trial_consumption_std,
    arraySum(arrayMap(x -> x > mdb_trial_consumption_avg, groupArray(mdb_trial_consumption))) as mdb_trial_consumption_count_more_avg,
    SUM(multiIf(ai_trial_consumption > 0, 1,0)) as ai_trial_consumption_count,
    SUM(ai_trial_consumption) as ai_trial_consumption_sum,
    AVG(ai_trial_consumption) as ai_trial_consumption_avg,
    MAX(ai_trial_consumption) as ai_trial_consumption_max,
    MIN(ai_trial_consumption) as ai_trial_consumption_min,
    median(ai_trial_consumption) as ai_trial_consumption_median,
    stddevPop(ai_trial_consumption) as ai_trial_consumption_std,
    arraySum(arrayMap(x -> x > ai_trial_consumption_avg, groupArray(ai_trial_consumption))) as ai_trial_consumption_count_more_avg,
    SUM(multiIf(storage_trial_consumption > 0, 1,0)) as storage_trial_consumption_count,
    SUM(storage_trial_consumption) as storage_trial_consumption_sum,
    AVG(storage_trial_consumption) as storage_trial_consumption_avg,
    MAX(storage_trial_consumption) as storage_trial_consumption_max,
    MIN(storage_trial_consumption) as storage_trial_consumption_min,
    median(storage_trial_consumption) as storage_trial_consumption_median,
    stddevPop(storage_trial_consumption) as storage_trial_consumption_std,
    arraySum(arrayMap(x -> x > storage_trial_consumption_avg, groupArray(storage_trial_consumption))) as storage_trial_consumption_count_more_avg,
    SUM(multiIf(network_trial_consumption > 0, 1,0)) as network_trial_consumption_count,
    SUM(network_trial_consumption) as network_trial_consumption_sum,
    AVG(network_trial_consumption) as network_trial_consumption_avg,
    MAX(network_trial_consumption) as network_trial_consumption_max,
    MIN(network_trial_consumption) as network_trial_consumption_min,
    median(network_trial_consumption) as network_trial_consumption_median,
    stddevPop(network_trial_consumption) as network_trial_consumption_std,
    arraySum(arrayMap(x -> x > network_trial_consumption_avg, groupArray(network_trial_consumption))) as network_trial_consumption_count_more_avg,
    SUM(multiIf(nlb_trial_consumption > 0, 1,0)) as nlb_trial_consumption_count,
    SUM(nlb_trial_consumption) as nlb_trial_consumption_sum,
    AVG(nlb_trial_consumption) as nlb_trial_consumption_avg,
    MAX(nlb_trial_consumption) as nlb_trial_consumption_max,
    MIN(nlb_trial_consumption) as nlb_trial_consumption_min,
    median(nlb_trial_consumption) as nlb_trial_consumption_median,
    stddevPop(nlb_trial_consumption) as nlb_trial_consumption_std,
    arraySum(arrayMap(x -> x > nlb_trial_consumption_avg, groupArray(nlb_trial_consumption))) as nlb_trial_consumption_count_more_avg,
    SUM(multiIf(marketplace_trial_consumption > 0, 1,0)) as marketplace_trial_consumption_count,
    SUM(marketplace_trial_consumption) as marketplace_trial_consumption_sum,
    AVG(marketplace_trial_consumption) as marketplace_trial_consumption_avg,
    MAX(marketplace_trial_consumption) as marketplace_trial_consumption_max,
    MIN(marketplace_trial_consumption) as marketplace_trial_consumption_min,
    median(marketplace_trial_consumption) as marketplace_trial_consumption_median,
    stddevPop(marketplace_trial_consumption) as marketplace_trial_consumption_std,
    arraySum(arrayMap(x -> x > marketplace_trial_consumption_avg, groupArray(marketplace_trial_consumption))) as marketplace_trial_consumption_count_more_avg,
    SUM(multiIf(nbs_trial_consumption > 0, 1,0)) as nbs_trial_consumption_count,
    SUM(nbs_trial_consumption) as nbs_trial_consumption_sum,
    AVG(nbs_trial_consumption) as nbs_trial_consumption_avg,
    MAX(nbs_trial_consumption) as nbs_trial_consumption_max,
    MIN(nbs_trial_consumption) as nbs_trial_consumption_min,
    median(nbs_trial_consumption) as nbs_trial_consumption_median,
    stddevPop(nbs_trial_consumption) as nbs_trial_consumption_std,
    arraySum(arrayMap(x -> x > nbs_trial_consumption_avg, groupArray(nbs_trial_consumption))) as nbs_trial_consumption_count_more_avg,
    SUM(multiIf(snapshot_trial_consumption > 0, 1,0)) as snapshot_trial_consumption_count,
    SUM(snapshot_trial_consumption) as snapshot_trial_consumption_sum,
    AVG(snapshot_trial_consumption) as snapshot_trial_consumption_avg,
    MAX(snapshot_trial_consumption) as snapshot_trial_consumption_max,
    MIN(snapshot_trial_consumption) as snapshot_trial_consumption_min,
    median(snapshot_trial_consumption) as snapshot_trial_consumption_median,
    stddevPop(snapshot_trial_consumption) as snapshot_trial_consumption_std,
    arraySum(arrayMap(x -> x > snapshot_trial_consumption_avg, groupArray(snapshot_trial_consumption))) as snapshot_trial_consumption_count_more_avg,
    SUM(multiIf(image_trial_consumption > 0, 1,0)) as image_trial_consumption_count,
    SUM(image_trial_consumption) as image_trial_consumption_sum,
    AVG(image_trial_consumption) as image_trial_consumption_avg,
    MAX(image_trial_consumption) as image_trial_consumption_max,
    MIN(image_trial_consumption) as image_trial_consumption_min,
    median(image_trial_consumption) as image_trial_consumption_median,
    stddevPop(image_trial_consumption) as image_trial_consumption_std,
    arraySum(arrayMap(x -> x > image_trial_consumption_avg, groupArray(image_trial_consumption))) as image_trial_consumption_count_more_avg
FROM(
    SELECT
        *
    FROM(
        SELECT
            t0.puid,
            toDate(event_time) as date,
            SUM(trial_consumption) as all_trial_consumption,
            SUM(real_consumption) as all_paid_consumption,
            SUM(multiIf(service_name LIKE '%compute%', trial_consumption, 0)) as compute_trial_consumption,
            SUM(multiIf(service_name LIKE '%mdb%', trial_consumption, 0)) as mdb_trial_consumption,
            SUM(multiIf(service_name LIKE '%_ai%', trial_consumption, 0)) as ai_trial_consumption,
            SUM(multiIf(service_name LIKE '%storage%', trial_consumption, 0)) as storage_trial_consumption,
            SUM(multiIf(service_name LIKE '%network%', trial_consumption, 0)) as network_trial_consumption,
            SUM(multiIf(service_name LIKE '%nlb%', trial_consumption, 0)) as nlb_trial_consumption,
            SUM(multiIf(name LIKE '%marketplace%', trial_consumption, 0)) as marketplace_trial_consumption,
            SUM(multiIf(name LIKE '%nbs.%', trial_consumption, 0)) as nbs_trial_consumption,
            SUM(multiIf(name LIKE '%snapshot%', trial_consumption, 0)) as snapshot_trial_consumption,
            SUM(multiIf(name LIKE '%image%', trial_consumption, 0)) as image_trial_consumption
        FROM
            "//home/cloud_analytics_test/cubes/acquisition_cube/cube" as t0
        WHERE
            event = 'day_use'
        GROUP BY
            puid,
            date
    ) as t0
    ANY INNER JOIN (
        SELECT
            *
        FROM(
            SELECT
                puid,
                first_first_trial_consumption_datetime,
                addDays(toDateTime(first_first_trial_consumption_datetime), 7) AS last_event_datetime,
                multiIf(first_first_paid_consumption_datetime IS NULL OR first_first_paid_consumption_datetime = '','2020-01-01 00:00:00',first_first_paid_consumption_datetime) as first_first_paid_consumption_datetime
            FROM
                "//home/cloud_analytics_test/cubes/acquisition_cube/cube"
            WHERE
                event = 'first_trial_consumption'
                AND toDate(event_time) > toDate('2018-12-10')
                AND toDate(event_time) <= toDate(addDays(NOW(), -70))
        )
        WHERE
            toDateTime(first_first_paid_consumption_datetime) > toDateTime(last_event_datetime)
    ) as t1 
    ON t0.puid = t1.puid
    WHERE  
         t0.date < toDate(t1.last_event_datetime)
    ORDER BY
        puid,
        date
)
GROUP BY
    puid
'''

# One tab-separated line per user; `columns` must list the SELECT aliases in the same order.
result = execute_query(query=query, cluster=cluster, alias=alias, token=token)
cunsumption_stat = pd.DataFrame([row.split('\t') for row in result], columns = columns)

In [20]:
# Every statistic column (all columns of the frame except puid) arrives as text and
# is stored as float: the paid total first, then eight aggregates per category.
_stat_groups = (
    'all', 'mdb', 'ai', 'storage', 'network',
    'nlb', 'marketplace', 'nbs', 'snapshot', 'image',
)
_stat_suffixes = ('count', 'sum', 'avg', 'max', 'min', 'median', 'std', 'count_more_avg')

float_cols = ['all_paid_consumption']
for _group in _stat_groups:
    for _suffix in _stat_suffixes:
        float_cols.append('{}_trial_consumption_{}'.format(_group, _suffix))

cunsumption_stat = cunsumption_stat.astype(dict.fromkeys(float_cols, float))

In [21]:
# Reachability label derived from CRM call events.
# t0: 'call' rows of the crm_leads cube (puid neither '' nor '0'), ordered by puid and
#   event_time and grouped per puid. unreachible_count counts call statuses matching
#   '%unreachible%', calls counts non-NULL statuses, first_call_dt is the first collected
#   event time.
# t1: the cohort subquery (first trial consumption after 2018-12-10 and at least 70 days ago,
#   no paid consumption within 7 days of it).
# Only users whose first call date is after last_event_datetime are returned; is_reachible is
# 0 when every counted call was 'unreachible' and 1 otherwise.
# NOTE(review): first_call_dt assumes groupArray sees rows in the inner ORDER BY order — confirm.
query = '''
SELECT
    DISTINCT
    t0.puid,
    multiIf(unreachible_count = calls, 0, 1) as is_reachible
FROM(
    SELECT
        puid,
        groupArray(event_time) as event_times,
        groupArray(call_status) as call_statuses,
        arraySum(arrayMap(x -> x LIKE '%unreachible%', call_statuses)) as unreachible_count,
        arrayCount(arrayMap(x -> x IS NOT NULL, call_statuses)) as calls,
        event_times[1] as first_call_dt
    FROM(
        SELECT
            *
        FROM
            "//home/cloud_analytics_test/cubes/crm_leads/cube"
        WHERE
            event = 'call'
            AND puid != ''
            AND puid != '0'
        ORDER BY
            puid,
            event_time
    )
    GROUP BY 
        puid
) as t0
ANY INNER JOIN (
    SELECT
        *
    FROM(
        SELECT
            puid,
            first_first_trial_consumption_datetime,
            addDays(toDateTime(first_first_trial_consumption_datetime), 7) AS last_event_datetime,
            multiIf(first_first_paid_consumption_datetime IS NULL OR first_first_paid_consumption_datetime = '','2020-01-01 00:00:00',first_first_paid_consumption_datetime) as first_first_paid_consumption_datetime
        FROM
            "//home/cloud_analytics_test/cubes/acquisition_cube/cube"
        WHERE
            event = 'first_trial_consumption'
            AND toDate(event_time) > toDate('2018-12-10')
            AND toDate(event_time) <= toDate(addDays(NOW(), -70))
    )
    WHERE
        toDateTime(first_first_paid_consumption_datetime) > toDateTime(last_event_datetime)
) as t1 
ON t0.puid = t1.puid
WHERE  
     toDate(t0.first_call_dt) > toDate(t1.last_event_datetime)
'''

# One tab-separated line per user; the flag is cast to int in the next cell.
result = execute_query(query=query, cluster=cluster, alias=alias, token=token)
calls = pd.DataFrame([row.split('\t') for row in result], columns = ['puid', 'is_reachible'])

In [22]:
# The reachability flag comes back as text; keep it as an integer.
calls = calls.astype({'is_reachible': int})

In [23]:
# Header of the user meta-info frame, in the order the meta-info query selects its fields.
columns = '''
    puid segment
    is_yandex_email is_corporate_email
    mobile_phone_vendor device_type
    days_between_first_visit_cloud days_between_cloud_ba
    hits os is_robot total_visits interests
    sex age session_start_time ad_block
    country search_phrase visit_version income
    channel promocode_source
    resolution_width resolution_height size_cat
'''.split()

In [24]:
# Profile / first-visit attributes of the cohort users, taken from their 'cloud_created' row.
# t0 normalises the raw fields: e-mail domain flags, 'unknown' / -1 substitutes for empty
#   strings, day differences between first visit, cloud creation and billing-account creation
#   (empty datetimes replaced by '2030-01-01 00:00:00'), and size_cat = width / height of the
#   screen resolution (0 when the height is not positive).
# t1 is the cohort subquery (first trial consumption after 2018-12-10 and at least 70 days
#   ago, no paid consumption within 7 days of it); the join restricts the rows to the cohort.
# The regex dots are written as '\\.' so that Python emits a literal backslash-dot without
# relying on an invalid escape sequence; the query text sent to the server is unchanged.
query = '''
SELECT
    t0.*
FROM(
    SELECT
        puid,
        segment,
        multiIf(email LIKE '%@yandex.%' OR email LIKE '%@ya.%', 1, 0) AS is_yandex_email,
        multiIf(match(email,'.*@yandex\\..*|.*@ya\\..*|.*@gmail\\..*|.*@mail\\..*|.*@tut\\..*|.*@linqcorp\\..*'), 0, 1) AS is_corporate_email,
        mobile_phone_vendor as mobile_phone_vendor,
        multiIf(device_type = '', 'unknown', device_type) as device_type,
        toDate(first_cloud_created_datetime) - toDate(multiIf(first_visit_datetime = '','2030-01-01 00:00:00',first_visit_datetime)) as days_between_first_visit_cloud,
        toDate(multiIf(first_ba_created_datetime = '','2030-01-01 00:00:00',first_ba_created_datetime)) - toDate(multiIf(first_cloud_created_datetime = '','2030-01-01 00:00:00',first_cloud_created_datetime)) as days_between_cloud_ba,
        hits,
        lowerUTF8(multiIf(os = '', 'unknown', os)) as os,
        CAST(multiIf(is_robot = '', '-1', is_robot) as Int32) as is_robot,
        total_visits,
        CAST(multiIf(interests = '', '-1', interests) as Int32) as interests,
        multiIf(sex = '', 'unknown', sex) as sex,
        multiIf(age = '', 'unknown', age) as age,
        session_start_time,
        ad_block,
        lowerUTF8(multiIf(country = '', 'unknown', country)) as country,
        lowerUTF8(multiIf(search_phrase = '', 'unknown', search_phrase)) as search_phrase,
        CAST(multiIf(visit_version = '', '-1', visit_version) as Int32) as visit_version,
        income,
        channel,
        promocode_source,
        resolution_width,
        resolution_height,
        multiIf( resolution_height > 0, resolution_width/resolution_height, 0) as size_cat
    FROM
        "//home/cloud_analytics_test/cubes/acquisition_cube/cube"
    WHERE
        event = 'cloud_created'
) as t0
ANY INNER JOIN (
    SELECT
        *
    FROM(
        SELECT
            puid,
            first_first_trial_consumption_datetime,
            addDays(toDateTime(first_first_trial_consumption_datetime), 7) AS last_event_datetime,
            multiIf(first_first_paid_consumption_datetime IS NULL OR first_first_paid_consumption_datetime = '','2020-01-01 00:00:00',first_first_paid_consumption_datetime) as first_first_paid_consumption_datetime
        FROM
            "//home/cloud_analytics_test/cubes/acquisition_cube/cube"
        WHERE
            event = 'first_trial_consumption'
            AND toDate(event_time) > toDate('2018-12-10')
            AND toDate(event_time) <= toDate(addDays(NOW(), -70))
    )
    WHERE
        toDateTime(first_first_paid_consumption_datetime) > toDateTime(last_event_datetime)
) as t1 
ON t0.puid = t1.puid
'''

# One tab-separated line per user; `columns` must list the selected fields in the same order.
result = execute_query(query=query, cluster=cluster, alias=alias, token=token)
user_meta_info = pd.DataFrame([row.split('\t') for row in result], columns = columns)

In [25]:
# Numeric meta-info fields arrive as text: cast the integer-valued ones to int and the
# resolution ratio to float. The remaining columns stay as strings.
int_cols = [
    'is_yandex_email', 'mobile_phone_vendor', 'hits', 'is_robot',
    'total_visits', 'interests', 'ad_block', 'visit_version',
    'income', 'resolution_width', 'resolution_height', 'is_corporate_email',
    'days_between_first_visit_cloud', 'days_between_cloud_ba',
]
float_cols = ['size_cat']

_target_dtypes = dict.fromkeys(int_cols, int)
_target_dtypes.update(dict.fromkeys(float_cols, float))
user_meta_info = user_meta_info.astype(_target_dtypes)

In [26]:
# Preview only the first rows: the frame holds per-user attributes, so the whole
# table should not be rendered into the saved notebook output.
user_meta_info.head(10)

Unnamed: 0,puid,segment,is_yandex_email,is_corporate_email,mobile_phone_vendor,device_type,days_between_first_visit_cloud,days_between_cloud_ba,hits,os,...,ad_block,country,search_phrase,visit_version,income,channel,promocode_source,resolution_width,resolution_height,size_cat
0,24443253,mass,1,0,0,desktop,0,50,2,mac os x sierra,...,2,россия,unknown,2,3,Direct,waitlist,1440,900,1.600000
1,767367729,mass,1,0,207,mobile,0,1,5,google android 8.1 oreo,...,2,россия,unknown,7,2,Direct,unknown,424,895,0.473743
2,1130000033408795,mass,0,1,0,desktop,0,0,1,windows 7 или 2008 server,...,1,россия,"zyltrc j,kfrj",2,3,Yandex Portal,unknown,1366,768,1.778646
3,68135435,mass,1,0,7,mobile,0,0,1,google android 6.0 marshmallow,...,2,россия,unknown,2,0,Referrals,unknown,360,640,0.562500
4,84354432,mass,1,0,0,desktop,0,0,1,windows 7 или 2008 server,...,2,россия,unknown,1,0,Referrals,unknown,1366,768,1.778646
5,799895205,mass,1,0,0,desktop,0,0,9,windows 8.1,...,2,россия,unknown,17,3,Referrals,unknown,1366,768,1.778646
6,586822771,mass,1,0,207,mobile,0,3,7,google android 6.0 marshmallow,...,2,россия,unknown,8,3,Perfomance,unknown,360,640,0.562500
7,216735641,mass,1,0,0,desktop,0,0,63,windows 10,...,2,украина,unknown,63,3,Organic Search,unknown,1920,1080,1.777778
8,12026445,mass,1,0,0,desktop,0,0,53,linux (другие или не определено),...,2,россия,unknown,72,3,Organic Search,unknown,1366,768,1.778646
9,94332003,mass,1,0,0,unknown,0,0,0,unknown,...,0,unknown,unknown,-1,0,Unknown,unknown,0,0,0.000000


In [27]:
# Billing-account attributes of the cohort users, restricted to history rows dated before
# last_event_datetime (first trial consumption + 7 days).
# t0: one hard-coded hourly export of the billing-accounts history (2019-04-02T14:00:00) with
#   updated_at parsed into `datetime`.
# t1: the cohort subquery, here also carrying billing_account_id and requiring puid != ''.
# The two are joined on billing_account_id, sorted by puid and datetime DESC, and the first
# element of each groupArray is taken per puid.
# NOTE(review): presumably that first element is the latest state before the cut-off; this
#   assumes groupArray preserves the inner ORDER BY — confirm.
query = '''
SELECT
    puid,
    groupArray(payment_cycle_type)[1] as ba_payment_cycle_type,
    groupArray(state)[1] as ba_state,
    groupArray(person_type)[1] as ba_person_type,
    groupArray(payment_type)[1] as ba_payment_type,
    groupArray(usage_status)[1] as ba_usage_status,
    groupArray(type)[1] as ba_type
FROM (
    SELECT
        t0.*,
        t1.puid as puid,
        t1.last_event_datetime as last_event_datetime
    FROM(
        SELECT
            toDateTime(updated_at) as datetime,
            *
        FROM
            "//home/logfeller/logs/yc-billing-export-billing-accounts-history/1h/2019-04-02T14:00:00"
    ) as t0
    ANY INNER JOIN (
        SELECT
            *
        FROM(
            SELECT
                puid,
                billing_account_id,
                first_first_trial_consumption_datetime,
                addDays(toDateTime(first_first_trial_consumption_datetime), 7) AS last_event_datetime,
                multiIf(first_first_paid_consumption_datetime IS NULL OR first_first_paid_consumption_datetime = '','2020-01-01 00:00:00',first_first_paid_consumption_datetime) as first_first_paid_consumption_datetime
            FROM
                "//home/cloud_analytics_test/cubes/acquisition_cube/cube"
            WHERE
                event = 'first_trial_consumption'
                AND puid != ''
                AND toDate(event_time) > toDate('2018-12-10')
                AND toDate(event_time) <= toDate(addDays(NOW(), -70))
        )
        WHERE
            toDateTime(first_first_paid_consumption_datetime) > toDateTime(last_event_datetime)
    ) as t1 
    ON t0.billing_account_id = t1.billing_account_id
    WHERE
        toDate(t0.datetime) < toDate(t1.last_event_datetime)
    ORDER BY
        puid,
        datetime DESC
)
GROUP BY
    puid
'''

# One tab-separated line per user; every column stays a string.
result = execute_query(query=query, cluster=cluster, alias=alias, token=token)
ba_meta_info = pd.DataFrame([row.split('\t') for row in result], columns = ['puid','ba_payment_cycle_type','ba_state','ba_person_type','ba_payment_type','ba_usage_status','ba_type'])

In [28]:
# Flag cohort users that have at least one 'visit' event dated before last_event_datetime
# (first trial consumption + 7 days).
# t0: (puid, date) of every 'visit' row with a non-empty puid; the date is the part of
#   event_time before the first space.
# t1: the cohort subquery (first trial consumption after 2018-12-10 and at least 70 days ago,
#   no paid consumption within 7 days of it).
# is_see_in_metriks is the constant 1, so users without such a visit are absent from the result.
query = '''
SELECT
    DISTINCT
    puid,
    is_see_in_metriks
FROM(
    SELECT
        puid,
        1 as is_see_in_metriks
    FROM(
        SELECT
            puid,
            splitByString(' ', event_time)[1] as date
        FROM
            "//home/cloud_analytics_test/cubes/acquisition_cube/cube"
        WHERE
            puid != ''
            AND event = 'visit'
    ) as t0
    ANY INNER JOIN (
        SELECT
            *
        FROM(
            SELECT
                DISTINCT puid,
                first_first_trial_consumption_datetime,
                addDays(toDateTime(first_first_trial_consumption_datetime), 7) AS last_event_datetime,
                multiIf(first_first_paid_consumption_datetime IS NULL OR first_first_paid_consumption_datetime = '','2020-01-01 00:00:00',first_first_paid_consumption_datetime) as first_first_paid_consumption_datetime
            FROM
                "//home/cloud_analytics_test/cubes/acquisition_cube/cube"
            WHERE
                event = 'first_trial_consumption'
                AND toDate(event_time) > toDate('2018-12-10')
                AND toDate(event_time) <= toDate(addDays(NOW(), -70))
        )
        WHERE
            toDateTime(first_first_paid_consumption_datetime) > toDateTime(last_event_datetime)
    ) as t1 
    ON t0.puid = t1.puid
    WHERE 
        toDate(t0.date) < toDate(t1.last_event_datetime)
)
'''

# One tab-separated line per user; the flag is cast to int in the next cell.
result = execute_query(query=query, cluster=cluster, alias=alias, token=token)
metrika_site_events = pd.DataFrame([row.split('\t') for row in result], columns = ['puid', 'is_see_in_metriks'])

In [29]:
# The visit flag comes back as text; keep it as an integer.
metrika_site_events = metrika_site_events.astype({'is_see_in_metriks': int})

In [30]:
# Assemble the label table: one row per cohort user.
# Users without a service row get 0 for every start_* flag; users without a qualifying
# call get -1 for is_reachible.
targets = (
    users[['puid', 'start_paid_consumption']]
    .merge(users_services, how='left', on='puid')
    .fillna(0)
    .merge(calls, how='left', on='puid')
    .fillna(-1)
)
targets

Unnamed: 0,puid,start_paid_consumption,start_compute,start_mdb,start_storage,start_ai,is_reachible
0,324165636,0,0.0,0.0,0.0,0.0,-1.0
1,747780349,0,0.0,0.0,0.0,0.0,-1.0
2,7241497,0,0.0,0.0,0.0,0.0,1.0
3,813752483,1,1.0,0.0,0.0,0.0,-1.0
4,35988888,1,1.0,0.0,0.0,0.0,-1.0
5,794730817,0,0.0,0.0,0.0,0.0,-1.0
6,813896260,0,0.0,0.0,0.0,0.0,-1.0
7,813826853,0,0.0,0.0,0.0,0.0,0.0
8,287637894,0,0.0,0.0,0.0,0.0,-1.0
9,813531567,1,1.0,0.0,0.0,0.0,-1.0


In [31]:
# Store the label frame at the given YT path through the nile cluster handle.
# NOTE(review): whether write() replaces or appends to an existing table is not visible
# here — confirm before re-running.
cluster_yt.write('//home/cloud_analytics/scoring/targets', targets)

In [32]:
# Assemble the feature table: one row per cohort user, left-joining each feature frame on
# puid. Gaps are filled right after each join: 0 for consumption statistics, meta info and
# the visit flag, 'unknown' for billing-account attributes. The paid-consumption total is
# dropped from the statistics before joining.
# NOTE(review): the 0 fill after the meta-info join also lands in its text columns — confirm
# that is intended.
data = (
    users[['puid']]
    .merge(cunsumption_stat.drop('all_paid_consumption', axis=1), how='left', on='puid')
    .fillna(0)
    .merge(user_meta_info, how='left', on='puid')
    .fillna(0)
    .merge(ba_meta_info, how='left', on='puid')
    .fillna('unknown')
    .merge(metrika_site_events, how='left', on='puid')
    .fillna(0)
)

In [33]:
# Store the feature frame at the given YT path through the nile cluster handle.
# NOTE(review): whether write() replaces or appends to an existing table is not visible
# here — confirm before re-running.
cluster_yt.write('//home/cloud_analytics/scoring/meta_info', data)

In [34]:
# Raw console-log events of the cohort users dated before last_event_datetime
# (first trial consumption + 7 days), plus delta = runningDifference(ts).
# t0: console events with a non-empty puid, ordered by puid and timestamp; `date` is the
#   part of the timestamp before 'T'.
# t1: the cohort subquery (first trial consumption after 2018-12-10 and at least 70 days ago,
#   no paid consumption within 7 days of it).
# NOTE(review): runningDifference is applied to the joined row stream as a whole, so it looks
#   like delta is not reset at puid boundaries and depends on the row order surviving the
#   join — confirm before using delta as a per-user feature.
query = '''
SELECT
    t0.*,
    runningDifference(t0.ts) as delta
FROM(
    SELECT
        puid,
        event_type,
        event,
        timestamp,
        ts,
        splitByString('T', timestamp)[1] as date
    FROM
        "//home/cloud_analytics/import/console_logs/events"
    WHERE
        puid != ''
    ORDER BY
        puid,
        timestamp
) as t0
ANY INNER JOIN (
    SELECT
        *
    FROM(
        SELECT
            puid,
            first_first_trial_consumption_datetime,
            addDays(toDateTime(first_first_trial_consumption_datetime), 7) AS last_event_datetime,
            multiIf(first_first_paid_consumption_datetime IS NULL OR first_first_paid_consumption_datetime = '','2020-01-01 00:00:00',first_first_paid_consumption_datetime) as first_first_paid_consumption_datetime
        FROM
            "//home/cloud_analytics_test/cubes/acquisition_cube/cube"
        WHERE
            event = 'first_trial_consumption'
            AND toDate(event_time) > toDate('2018-12-10')
            AND toDate(event_time) <= toDate(addDays(NOW(), -70))
    )
    WHERE
        toDateTime(first_first_paid_consumption_datetime) > toDateTime(last_event_datetime)
) as t1 
ON t0.puid = t1.puid
WHERE 
    toDate(t0.date) < toDate(t1.last_event_datetime)
'''

# One tab-separated line per event; every column stays a string.
result = execute_query(query=query, cluster=cluster, alias=alias, token=token)
site_events = pd.DataFrame([row.split('\t') for row in result], columns=['puid','event_type','event','timestamp','ts','date', 'delta'])

In [35]:
# Store the event frame at the given YT path through the nile cluster handle.
# NOTE(review): whether write() replaces or appends to an existing table is not visible
# here — confirm before re-running.
cluster_yt.write('//home/cloud_analytics/scoring/events', site_events)