In [1]:
import pandas as pd, datetime, ast
from nile.api.v1 import (
    clusters,
    aggregators as na,
    extractors as ne,
    filters as nf,
    Record
)
from vault_client import instances

In [2]:
def get_datetime_from_epoch(epoch):
    """Convert a Unix timestamp into a local-time datetime string.

    Args:
        epoch: Seconds since the Unix epoch, given as an int or as anything
            ``int()`` accepts (for example a numeric string).

    Returns:
        ``str(datetime.datetime.fromtimestamp(int(epoch)))``, or ``None`` when
        the value is missing, not numeric, or outside the range the platform
        can represent.
    """
    try:
        return str(datetime.datetime.fromtimestamp(int(epoch)))
    except (TypeError, ValueError, OverflowError, OSError):
        # Only conversion failures are mapped to None. A bare ``except`` would
        # also hide KeyboardInterrupt/SystemExit and genuine programming errors.
        return None

def apply_types_in_project(schema_):
    """Build the keyword arguments for a ``project`` call that casts columns.

    Args:
        schema_: Mapping of column name -> target type. Only ``str``, ``int``
            and ``float`` are handled; columns with any other type are skipped.

    Returns:
        Dict of column name -> ``ne.custom`` extractor. Each extractor maps
        ``''`` and ``None`` to ``None`` and casts every other value to the
        requested type. String values additionally lose double quotes, single
        quotes and backslashes.
    """
    def _as_clean_str(value):
        if value in ['', None]:
            return None
        return str(value).replace('"', '').replace("'", '').replace('\\', '')

    def _as_int(value):
        return None if value in ['', None] else int(value)

    def _as_float(value):
        return None if value in ['', None] else float(value)

    # Checked in order with ``==``, mirroring an if/elif chain on the type.
    casters = ((str, _as_clean_str), (int, _as_int), (float, _as_float))

    extractors = {}
    for column, wanted_type in schema_.items():
        for known_type, caster in casters:
            if wanted_type == known_type:
                extractors[column] = ne.custom(caster, column)
                break
    return extractors

def works_with_emails(mail_):
    """Normalise an e-mail address so the same mailbox compares equal.

    Addresses whose domain contains a ``yandex`` or ``ya`` label (for example
    ``yandex.ru``, ``ya.ru``, ``mail.yandex.com``) are rewritten to
    ``<login>@yandex.ru`` with the login lower-cased and dots replaced by
    dashes. Every other address is simply lower-cased.

    Args:
        mail_: The raw address; anything without an ``@`` (including ``None``)
            is treated as "no address".

    Returns:
        The normalised address, or ``None`` when there is no ``@`` in the input.
    """
    mail_parts = str(mail_).split('@')
    if len(mail_parts) < 2:
        return None

    # Compare whole domain labels rather than substrings: a substring test for
    # 'ya.' / 'yandex.' also fires on unrelated domains such as 'papaya.com'
    # or 'notyandex.ru'. The last label (the TLD) is excluded so the label
    # must be followed by a dot, as the substring patterns required.
    domain_labels = mail_parts[1].lower().split('.')
    if any(label in ('yandex', 'ya') for label in domain_labels[:-1]):
        login = mail_parts[0].lower().replace('.', '-')
        return login + '@' + 'yandex.ru'
    return mail_.lower()

In [3]:
def pivot_events(groups):
    """Reducer that pivots a group's event rows into one wide record.

    For each ``(key, records)`` pair it emits a single ``Record`` holding:

    * ``<event>_time`` and ``is_<event>`` (= 1) for the first occurrence of
      each tracked event within the group. Records are consumed in the order
      given, so "first" means earliest only if the caller sorted them.
    * ``billing_account_id``, ``puid``, ``ba_state`` and ``block_reason`` set
      to the last truthy value seen in the group (``None`` if there was none).

    Events outside the tracked list contribute only to the carried fields.
    """
    tracked_events = (
        'cloud_created',
        'ba_created',
        'first_trial_consumption',
        'first_paid_consumption',
        'ba_became_paid',
    )
    # Fields for which the last non-empty value in the group wins.
    carried_fields = ('billing_account_id', 'puid', 'ba_state', 'block_reason')

    for key, records in groups:
        latest = dict.fromkeys(carried_fields)
        pivoted = {}
        for rec in records:
            for field in carried_fields:
                value = rec[field]
                if value:
                    latest[field] = value

            name = rec['event']
            # A tracked event is recorded once; later repeats are ignored.
            if name in tracked_events and ('is_' + name) not in pivoted:
                pivoted[name + '_time'] = rec['event_time']
                pivoted['is_' + name] = 1

        pivoted.update(latest)
        yield Record(key, **pivoted)

def get_mail_id(mail_desc):
    """Derive a canonical mailing id (slug) from a free-form mailing name.

    The name is lower-cased and then resolved by the first matching rule:

    1. Names containing ``|``: if the part before the first ``|`` contains
       ``2019`` the second part is used, otherwise the first part; whitespace
       runs are collapsed and replaced with ``-``.
    2. Keyword rules, checked in order (``test`` -> ``testing``,
       ``activation`` -> ``ba-activation``, ...).
    3. Names containing ``[``: the text before the first ``[`` is slugified,
       dropping one trailing ``-``.
    4. Otherwise the whole name with spaces replaced by ``-``.

    Args:
        mail_desc: The mailing name, or ``None``.

    Returns:
        The slug string, or ``None`` when ``mail_desc`` is ``None``.
    """
    if mail_desc is None:
        # A missing name has no id; previously this raised AttributeError.
        return None
    mail_desc = mail_desc.lower()

    if '|' in mail_desc:
        parts = mail_desc.split('|')
        chosen = parts[1] if '2019' in parts[0] else parts[0]
        return ' '.join(chosen.split()).replace(' ', '-')

    is_promo = 'promo' in mail_desc

    # Order matters: earlier rules win when several keywords are present.
    if 'test' in mail_desc or 'тест' in mail_desc:
        return 'testing'
    if 'terms-update' in mail_desc or 'terms_update' in mail_desc:
        return 'terms-update'
    if 'activation' in mail_desc or 'act-' in mail_desc:
        return 'ba-activation'
    if 'scenario' in mail_desc:
        return '3-scenarios-to-paid'
    if 'go-to-paid' in mail_desc and not is_promo:
        return 'go-to-paid'
    if 'beginners' in mail_desc and is_promo:
        return 'promo-beginners'
    if 'active-user' in mail_desc and is_promo:
        return 'promo-active-users'
    if 'we-are-public' in mail_desc:
        return 'we-are-public'
    if 'start-usage' in mail_desc and is_promo:
        return 'promo-start-usage'
    if 'go-to-paid' in mail_desc and is_promo:
        return 'promo-go-to-paid'
    if 'reminder' in mail_desc and is_promo:
        return 'promo-reminder'
    if 'trial' in mail_desc and 'extended' in mail_desc:
        return 'trial-extended'
    if 'typical' in mail_desc:
        return 'typical-task'
    if 'grant' in mail_desc and 'use' in mail_desc:
        return 'use-grant'
    if 'open' in mail_desc:
        return 'we-are-open'
    if 'follow' in mail_desc:
        return 'webinar-follow-up'
    if 'error' in mail_desc:
        return 'error-payment-method'
    if 'cloud-functionality' in mail_desc:
        return 'cloud-functionality'

    if '[' in mail_desc:
        prefix = ' '.join(mail_desc.split('[')[0].split())
        # A name that starts with '[' has an empty prefix. Indexing prefix[-1]
        # would raise IndexError, so fall through to the whole-name slug.
        if prefix:
            if prefix.endswith('-'):
                prefix = prefix[:-1]
            return prefix.replace(' ', '-')
    return mail_desc.replace(' ', '-')

def works_with_emails(mail_):
    """Normalise an e-mail address so the same mailbox compares equal.

    NOTE(review): this re-defines ``works_with_emails`` from the earlier
    helper cell. When cells run top to bottom this later definition is the one
    in effect, so consider keeping a single copy.

    Addresses whose domain contains a ``yandex`` or ``ya`` label (for example
    ``yandex.ru``, ``ya.ru``, ``mail.yandex.com``) are rewritten to
    ``<login>@yandex.ru`` with the login lower-cased and dots replaced by
    dashes. Every other address is simply lower-cased.

    Args:
        mail_: The raw address; anything without an ``@`` (including ``None``)
            is treated as "no address".

    Returns:
        The normalised address, or ``None`` when there is no ``@`` in the input.
    """
    mail_parts = str(mail_).split('@')
    if len(mail_parts) < 2:
        return None

    # Compare whole domain labels rather than substrings: a substring test for
    # 'ya.' / 'yandex.' also fires on unrelated domains such as 'papaya.com'
    # or 'notyandex.ru'. The last label (the TLD) is excluded so the label
    # must be followed by a dot, as the substring patterns required.
    domain_labels = mail_parts[1].lower().split('.')
    if any(label in ('yandex', 'ya') for label in domain_labels[:-1]):
        login = mail_parts[0].lower().replace('.', '-')
        return login + '@' + 'yandex.ru'
    return mail_.lower()

In [4]:
# --- YT cluster connection ---------------------------------------------------
# Token and pool are read from the secret version fetched through vault_client,
# so no credential is hardcoded in the notebook.
client = instances.Production()
yt_creds = client.get_version('ver-01d33pgv8pzc7t99s3egm24x47')
cluster = clusters.yt.Hahn(
    token = yt_creds['value']['token'],
    pool = yt_creds['value']['pool']
)

# --- Table paths -------------------------------------------------------------
# Each Marketo event table is read from the live import directory plus three
# backup snapshots. The key suffix identifies the snapshot.
MARKETO_EVENT_TABLES = ('send_email', 'email_delivered', 'open_email', 'click_email')
MARKETO_SNAPSHOTS = (
    ('', '//home/cloud_analytics/import/marketo'),
    ('_1', '//home/cloud_analytics/import/marketo_bak_04.03.2019'),
    ('_2', '//home/cloud_analytics/import/marketo_bak_06.03.2019'),
    ('_3', '//home/cloud_analytics/import/marketo_bak_22.02.2019'),
)


def _build_paths(cubes_root):
    """Return the path dict for one environment; only the cubes root differs."""
    paths = {}
    for table in MARKETO_EVENT_TABLES:
        for suffix, directory in MARKETO_SNAPSHOTS:
            paths[table + suffix] = directory + '/' + table
    paths['acquisition_cube'] = cubes_root + '/acquisition_cube/cube'
    paths['emailing_cube'] = cubes_root + '/emailing/cube'
    return paths


paths_dict_test = _build_paths('//home/cloud_analytics_test/cubes')
paths_dict_prod = _build_paths('//home/cloud_analytics/cubes')

# `mode` selects the environment. Everything below reads from `paths_dict`, so
# switching to 'prod' actually redirects the cube tables (previously every
# lookup hard-coded `paths_dict_test`, which made the switch a no-op).
mode = 'test'
if mode == 'test':
    paths_dict = paths_dict_test
elif mode == 'prod':
    paths_dict = paths_dict_prod
else:
    raise ValueError("mode must be 'test' or 'prod', got %r" % (mode,))

# Mail flag columns carried through from the acquisition cube.
MAIL_FLAG_COLUMNS = (
    'mail_tech',
    'mail_testing',
    'mail_info',
    'mail_feature',
    'mail_event',
    'mail_promo',
    'mail_billing',
)

# Columns identifying one row of the emailing cube before `puid` is added.
EMAILING_CUBE_KEYS = [
    "ba_became_paid_time",
    "ba_created_time",
    "billing_account_id",
    "ba_state",
    "block_reason",
    "click_time",
    "cloud_created_time",
    "delivery_time",
    "email",
    "event",
    "event_time",
    "first_paid_consumption_time",
    "first_trial_consumption_time",
    "is_ba_became_paid",
    "is_ba_created",
    "is_cloud_created",
    "is_first_paid_consumption",
    "is_first_trial_consumption",
    "mail_id",
    "open_time",
]


def _marketo_events(job, table):
    """Concatenate a Marketo table with its snapshots, unique by marketo_id."""
    snapshots = [
        job.table(paths_dict[table + suffix])
        for suffix, _directory in MARKETO_SNAPSHOTS
    ]
    return job.concat(*snapshots).unique('marketo_id')


def _first_event_time(job, table, time_column):
    """Earliest `table` event per (email, mail_id), stored as `time_column`."""
    return _marketo_events(job, table) \
        .project(
            email = ne.custom(works_with_emails, 'email'),
            mail_id = ne.custom(get_mail_id, 'mailing_name'),
            **{time_column: ne.custom(get_datetime_from_epoch, 'created')}
        ) \
        .groupby(
            'email',
            'mail_id'
        ) \
        .aggregate(
            **{time_column: na.min(time_column)}
        )


# --- Job 1: sent mails joined with delivery/open/click and funnel events -----
job = cluster.job()

# NOTE(review): put() is called without a destination path here — confirm this
# is intended (it is kept exactly as in the original pipeline).
send_email = _marketo_events(job, 'send_email') \
    .project(
        event = ne.const('email_sended'),
        event_time = ne.custom(get_datetime_from_epoch, 'created'),
        email = ne.custom(works_with_emails, 'email'),
        mail_id = ne.custom(get_mail_id, 'mailing_name')
    ) \
    .put()

email_delivered = _first_event_time(job, 'email_delivered', 'delivery_time')
open_email = _first_event_time(job, 'open_email', 'open_time')
click_email = _first_event_time(job, 'click_email', 'click_time')

# NOTE(review): nf.custom below is given no field name, unlike every other
# nf.custom/ne.custom call in this notebook. The event names in the list
# suggest 'event' was intended — confirm against the nile API before changing.
funnel_events = job.table(paths_dict['acquisition_cube']) \
    .filter(
        nf.custom(lambda x: x not in ['visit', 'call', 'day_use'])
    ) \
    .project(
        *MAIL_FLAG_COLUMNS,
        event = 'event',
        event_time = 'event_time',
        email = ne.custom(works_with_emails, 'user_settings_email'),
        billing_account_id = 'billing_account_id',
        ba_state = 'ba_state',
        block_reason = 'block_reason',
        puid = 'puid'
    ) \
    .groupby(
        'email',
        *MAIL_FLAG_COLUMNS
    ) \
    .sort(
        'event_time'
    ) \
    .reduce(
        pivot_events
    )

res = send_email \
    .join(
        email_delivered,
        by = ['email', 'mail_id'],
        type = 'left'
    ) \
    .join(
        open_email,
        by = ['email', 'mail_id'],
        type = 'left'
    ) \
    .join(
        click_email,
        by = ['email', 'mail_id'],
        type = 'left'
    ) \
    .join(
        funnel_events,
        by = ['email'],
        type = 'left'
    ) \
    .unique(
        *EMAILING_CUBE_KEYS
    ) \
    .put(paths_dict['emailing_cube'] + '_temp')
job.run()

# --- Job 2: add consumption totals and write the typed cube ------------------
# Best effort: drop the previous cube before rewriting it. Presumably this
# fails when the table does not exist yet, which is fine on a first run.
try:
    cluster.driver.remove(paths_dict['emailing_cube'])
except Exception:
    # Keep the pipeline going, but do not swallow KeyboardInterrupt/SystemExit
    # the way a bare `except:` would.
    pass

schema = {
    "ba_became_paid_time": str,
    "ba_created_time": str,
    "billing_account_id": str,
    "ba_state": str,
    "block_reason": str,
    "click_time": str,
    "cloud_created_time": str,
    "delivery_time": str,
    "email": str,
    "event": str,
    "event_time": str,
    "first_paid_consumption_time": str,
    "first_trial_consumption_time": str,
    "is_ba_became_paid": int,
    "is_ba_created": int,
    "is_cloud_created": int,
    "is_first_paid_consumption": int,
    "is_first_trial_consumption": int,
    "mail_id": str,
    "open_time": str,
    "puid": str,
    "paid_consumption": float,
    "is_paid_more_then_10_rur": int,
    "is_trial_more_then_10_rur": int,
    "trial_consumption": float
}

job = cluster.job()
source = job.table(paths_dict['emailing_cube'] + '_temp')

# Acquisition-cube rows with positive real or trial consumption. The variable
# name keeps its original spelling so the notebook namespace is unchanged.
cunsumption = job.table(paths_dict['acquisition_cube']) \
    .filter(
        nf.or_(
            nf.custom(lambda x: x > 0, 'real_consumption'),
            nf.custom(lambda x: x > 0, 'trial_consumption')
        )
    ) \
    .project(
        'real_consumption',
        'trial_consumption',
        con_time = 'event_time',
        billing_account_id = 'billing_account_id',
        is_paid_more_then_10_rur = ne.custom(lambda x: 1 if x > 10 else 0, 'real_consumption_cum'),
        is_trial_more_then_10_rur = ne.custom(lambda x: 1 if x > 10 else 0, 'trial_consumption_cum')
    )

source = source \
    .join(
        cunsumption,
        by = 'billing_account_id',
        type = 'left'
    ) \
    .groupby(
        *(EMAILING_CUBE_KEYS + ["puid"])
    ) \
    .aggregate(
        paid_consumption = na.sum('real_consumption', missing = 0),
        trial_consumption = na.sum('trial_consumption', missing = 0),
        is_paid_more_then_10_rur = na.max('is_paid_more_then_10_rur'),
        is_trial_more_then_10_rur = na.max('is_trial_more_then_10_rur')
    ) \
    .project(
        **apply_types_in_project(schema)
    ) \
    .put(paths_dict['emailing_cube'], schema = schema)
job.run()

In [None]:
# Attach the number of calls per puid to the funnel/consumption table and
# store the result next to it.
job = cluster.job()
source = job.table('//home/cloud_analytics/ktereshin/tasks/email_stat/funnel_consumption')

calls = (
    job.table('//home/cloud_analytics_test/cooking_cubes/acquisition_cube/sources/calls')
    # Keep only call rows with a non-empty puid.
    .filter(nf.custom(lambda x: bool(x), 'puid'))
    # One row per call, flagged with a constant 1 so calls can be summed.
    .project(call = ne.const(1), puid = 'puid')
)

# Every column of the source table that must survive the aggregation.
_calls_group_keys = [
    "ba_became_paid_time",
    "ba_created_time",
    "billing_account_id",
    "click_time",
    "cloud_created_time",
    "delivery_time",
    "email",
    "event",
    "event_time",
    "first_paid_consumption_time",
    "first_trial_consumption_time",
    "is_ba_became_paid",
    "is_ba_created",
    "is_cloud_created",
    "is_first_paid_consumption",
    "is_first_trial_consumption",
    "mail_id",
    "open_time",
    "puid",
    "paid_consumption",
]

source = (
    source
    .join(calls, by = 'puid', type = 'left')
    .groupby(*_calls_group_keys)
    # Rows without a matching call count as 0.
    .aggregate(calls = na.sum('call', missing=0))
    .put('//home/cloud_analytics/ktereshin/tasks/email_stat/funnel_consumption_calls')
)
job.run()

In [None]:
# Pull the funnel + consumption + calls table from YT into a pandas DataFrame.
data = (
    cluster
    .read('//home/cloud_analytics/ktereshin/tasks/email_stat/funnel_consumption_calls')
    .as_dataframe()
)

In [None]:
# Inspect which columns were loaded into `data`.
data.columns

In [None]:
# NOTE(review): repeats the column listing of the previous cell; one of the
# two cells can probably be dropped.
data.columns

In [None]:
# One-hot encode the mailing id and append the indicator columns to the right
# of the loaded data (one 0/1 column per distinct mail_id).
mail_id_dummies = pd.get_dummies(data['mail_id'])
report = pd.concat([data, mail_id_dummies], axis = 1, sort=False)

In [None]:
# Derived reporting columns.
#
# event_date: the text before the first space of event_time (presumably the
# date part of a 'YYYY-MM-DD HH:MM:SS' timestamp — confirm against the table).
# The .str accessor propagates missing values instead of raising.
report['event_date'] = report['event_time'].str.split(' ').str[0]

# is_open_mail / is_click_mail: 1 when a timestamp is present, else 0.
# Mails that were never opened or clicked have a missing value here, and
# comparing None/NaN with a string raises TypeError on Python 3, so missing
# values are checked first.
report['is_open_mail'] = report['open_time'].apply(lambda x: 1 if pd.notna(x) and x > '0' else 0)
report['is_click_mail'] = report['click_time'].apply(lambda x: 1 if pd.notna(x) and x > '0' else 0)

In [None]:
# Sanity check: how many rows have the click flag set versus unset.
report['is_click_mail'].value_counts()

In [None]:
# Columns of the widened report frame.
report.columns

In [None]:
# Aggregates per mailing and send date: distinct recipients plus the sum of
# each funnel flag. The result is written to a tab-separated file.
funnel_flags = [
    'is_open_mail',
    'is_click_mail',
    'is_cloud_created',
    'is_ba_created',
    'is_first_trial_consumption',
    'is_first_paid_consumption',
]

aggregations = {'email': 'nunique'}
aggregations.update((flag, 'sum') for flag in funnel_flags)

per_mailing = report.groupby(['mail_id', 'event_date']).agg(aggregations)
# Fix the output column order explicitly before flattening the index.
temp = per_mailing[['email'] + funnel_flags].reset_index()
temp.to_csv('emailing.csv', sep='\t', index = False)

In [None]:
# Display the aggregates ordered by send date. The sorted frame is only shown
# here; it is not assigned back to `temp`.
temp.sort_values(by='event_date')