In [13]:
# Notebook setup: third-party imports, the local helper library, and the working directory.
import scipy.stats as sps
from sklearn.model_selection import train_test_split
import gc
# NOTE(review): star import; the only dateutil name the visible cells use is `parse`.
from dateutil.parser import *
import pandas as pd
import numpy as np
import os
import yt.wrapper as yt 
# Change into the library directory before importing robot_lib -- presumably so the
# module is found on the import path (confirm).
# NOTE(review): absolute, user-specific path; the notebook will not run elsewhere as-is.
os.chdir('/Users/lunin-dv/Desktop/Library/')
import importlib
import robot_lib as lib
import operator
import re
# Re-import robot_lib so that edits made since the kernel started take effect.
importlib.reload(lib)
# From here on the process working directory is the data directory.
os.chdir('/Users/lunin-dv/Desktop/data/')

In [14]:
def copy_before(table_name):
    """Restore a sender table from its backup copy when the live table has no rows.

    Args:
        table_name: Table name, appended both to the live ``sender/`` directory
            and to the ``sender_copy/`` backup directory.

    Side effects:
        When the live table reports a row count of 0, overwrites it (``force=True``)
        with the table of the same name from ``sender_copy/`` and prints ``saved``.
        Otherwise does nothing.
    """
    live_path = "//home/cloud_analytics/emailing/sender/" + table_name
    if yt.row_count(live_path) != 0:
        # The live table already holds data; leave it untouched.
        return
    backup_path = "//home/cloud_analytics/emailing/sender_copy/" + table_name
    yt.copy(backup_path, live_path, force=True)
    print('saved')
# Before anything reads the stream tables, refill either one from its backup copy
# if it is currently empty.
copy_before("Go-to-paid-stream-OLD")
copy_before("Go-to-paid-stream")

# 0. Общая информация

## Гранты

In [15]:
# Per-billing-account grant summary, restricted to grants whose id appears in
# monetary_grants_prod with source == 'default':
#   consumed_amount          - sum of consumed amounts, negative values counted as 0
#   grant_amount             - sum of the initial grant amounts
#   grant_end_date           - latest grant end date
#   remained_grant_amount    - grant_amount - consumed_amount
#   grant_ended_last_7_days  - 1 when (grant_amount - consumed_amount <= 0 and the end
#                              date is today or later) OR (the end date lies within the
#                              previous 7 days, today excluded); otherwise 0
# NOTE(review): despite its name, the flag is also set for a grant that was used up at
# any time before a still-future end date -- confirm this is intended.
grant_req = """
SELECT
    billing_account_id,
    SUM(if(consumed_amount < 0, 0, consumed_amount)) as consumed_amount,
    SUM(initial_amount) as grant_amount,
    MAX(toDate(end_time)) as grant_end_date,
    IF (
            (grant_amount - consumed_amount <= 0 
             AND
             toDate(grant_end_date) >= toDate(NOW())) 
        OR
         
            (toDate(grant_end_date) >= addDays(toDate(NOW()), -7)
             AND 
             toDate(grant_end_date) < toDate(NOW())), 1, 0) 
        as grant_ended_last_7_days,
    grant_amount - consumed_amount as remained_grant_amount
FROM "//home/cloud_analytics/tmp/artkaz/grants_spending"
WHERE id in (SELECT DISTINCT id 
             FROM "//home/cloud/billing/exported-billing-tables/monetary_grants_prod"
             WHERE id != ''
             AND source == 'default')
GROUP BY billing_account_id
FORMAT TabSeparatedWithNames
"""
# One row per billing account; joined on billing_account_id in later cells.
grant_df = lib.execute_query(grant_req)

## Язык

In [16]:
# One row per passport uid: puid (cast to String) and max(user_settings_language)
# over that user's cloud_owners_history rows.
# NOTE(review): the f-prefix is unnecessary (the string has no placeholders), and
# DISTINCT adds nothing on top of GROUP BY puid.
req = f"""
    SELECT
        DISTINCT
        max(user_settings_language) as user_settings_language,
        CAST(passport_uid as String) as puid
    FROM "//home/cloud_analytics/import/iam/cloud_owners_history"
    GROUP BY puid
    FORMAT TabSeparatedWithNames
"""
language_df = lib.execute_query(req)

## cube

In [17]:
# Billing-account attributes from the acquisition cube, taken from 'ba_created'
# events that have a non-empty billing_account_id and puid.
#   email        - user_settings_email, lower-cased
#   usage_status - ba_usage_status
#   first_first_paid_consumption_datetime - the zero timestamp
#                  '0000-00-00 00:00:00' is mapped to '' (empty string)
#   group_index  - puid modulo 100
# NOTE(review): `req` is reused for several different queries across cells.
req = """
SELECT 
    billing_account_id,
    lower(user_settings_email) as email,
    ba_state,
    puid,
    ba_usage_status as usage_status,
    block_reason,
    mail_feature,
    mail_info,
    mail_promo,
    is_isv,
    if (first_first_paid_consumption_datetime == '0000-00-00 00:00:00', '',
        first_first_paid_consumption_datetime) as first_first_paid_consumption_datetime,
    modulo(toInt64(puid), 100) as group_index
FROM "//home/cloud_analytics/cubes/acquisition_cube/cube"
WHERE event == 'ba_created'
AND billing_account_id != ''
AND puid != ''
FORMAT TabSeparatedWithNames
"""
main_df = lib.execute_query(req)

In [21]:
# Inspect the column dtypes of the cube extract (puid is kept as object, not int).
main_df.dtypes

billing_account_id                       object
email                                    object
ba_state                                 object
puid                                     object
usage_status                             object
block_reason                             object
mail_feature                              int64
mail_info                                 int64
mail_promo                                int64
is_isv                                    int64
first_first_paid_consumption_datetime    object
group_index                               int64
dtype: object

## Drop_list

In [18]:
# Load the full drop_list table; only its `email` column is used below.
drop_req = """
SELECT
    *
FROM "//home/cloud_analytics/emailing/sender/drop_list"
FORMAT TabSeparatedWithNames
"""
dropped_emails = lib.execute_query(drop_req)
# Rebind the name from the query result to a plain set of e-mail addresses so the
# later `.isin(dropped_emails)` checks test membership against a set.
dropped_emails = set(dropped_emails['email'])

# 1. Обновление

In [19]:
def _coalesce_new_columns(df, suffix="_new"):
    """Fold every ``<col><suffix>`` column into ``<col>`` and drop the suffixed column.

    For each pair the suffixed (fresh) value wins unless it is null, in which case
    the existing value is kept. Suffixed columns without a matching base column are
    left untouched. Works on empty frames.

    Args:
        df: Frame that may contain ``<col>`` / ``<col><suffix>`` pairs.
        suffix: Suffix marking the fresh columns.

    Returns:
        A new DataFrame with the base columns updated (original column order kept)
        and the suffixed columns removed.
    """
    fresh_cols = [
        col for col in df.columns
        if isinstance(col, str)
        and col.endswith(suffix)
        and col[:-len(suffix)] in df.columns
    ]
    # combine_first keeps the fresh value and falls back to the stored one only
    # where the fresh value is null -- a vectorised form of the per-row rule.
    updates = {
        col[:-len(suffix)]: df[col].combine_first(df[col[:-len(suffix)]])
        for col in fresh_cols
    }
    return df.assign(**updates).drop(columns=fresh_cols)


def update_current_table(table_name, path, tables_to_update):
    """Refresh a stored mailing table with the latest attribute values and save it.

    Steps:
      1. Read the whole table ``path/table_name``.
      2. Remove backslashes from ``sended_mails`` when that column exists
         (non-string cells such as NaN are left as they are).
      3. Recompute ``is_dropped`` from the notebook-level ``dropped_emails`` set;
         for tables whose name contains ``"OLD"`` the dropped rows are removed.
      4. Left-join every ``(table, key)`` from ``tables_to_update`` and overwrite
         stored values with the joined ones wherever the joined value is not null.
      5. Save the result back under the same name, unless it is empty.

    Args:
        table_name: Name of the table inside ``path``.
        path: Directory that holds the table.
        tables_to_update: Iterable of ``(DataFrame, key_column)`` pairs.

    Returns:
        None. Nothing is written when the resulting frame has no rows.

    Note:
        Relies on the notebook globals ``lib`` and ``dropped_emails``.
        NOTE(review): a right-hand table with duplicate keys would multiply rows in
        the left join -- confirm the update tables are unique on their key.
    """
    full_path = path + "/" + table_name
    req = f"""
    SELECT
        *
    FROM "{full_path}"
    FORMAT TabSeparatedWithNames
    """
    old_df = lib.execute_query(req)
    if 'sended_mails' in old_df.columns:
        # Guard against NaN / non-string cells, which have no .replace method.
        old_df['sended_mails'] = old_df['sended_mails'].apply(
            lambda x: x.replace("\\", "") if isinstance(x, str) else x)
    old_df['is_dropped'] = old_df['email'].isin(dropped_emails).astype(int)
    if "OLD" in table_name:
        old_df = old_df[old_df['is_dropped'] == 0]
    for table, key in tables_to_update:
        # Columns present on both sides get a "_new" suffix on the fresh side.
        old_df = pd.merge(old_df,
                          table, on=key, suffixes=('', '_new'), how='left')

    old_df = _coalesce_new_columns(old_df, suffix='_new')
    if len(old_df) > 0:
        lib.save_table(table_name, path, old_df)

In [20]:
# (fresh DataFrame, join key) pairs passed to update_current_table, in join order:
# cube attributes, user language, grant summary.
tables_to_update = [(main_df, "billing_account_id"), (language_df, 'puid'),
                    (grant_df, "billing_account_id")]

In [22]:
# Refresh the attributes of the accounts already in the current stream table.
update_current_table("Go-to-paid-stream", "//home/cloud_analytics/emailing/sender", 
                     tables_to_update)

In [23]:
# Same refresh for the "-OLD" table; because its name contains "OLD", rows whose
# e-mail is on the drop list are removed before saving.
update_current_table("Go-to-paid-stream-OLD", "//home/cloud_analytics/emailing/sender", 
                     tables_to_update)

# 2. Добавление

## old Marketo

In [24]:
# Distinct (billing_account_id, email) pairs that have an 'add_to_nurture_stream'
# event for a program whose lower-cased name contains 'go-to-paid' but not
# 'go-to-paid-trial-usage'. Used later to exclude these accounts from the new batch.
marketo_previous_req = """
SELECT
    DISTINCT
    billing_account_id,
    email
FROM "//home/cloud_analytics/cubes/emailing_events/emailing_events"
WHERE event == 'add_to_nurture_stream'
AND lower(program_name) like '%go-to-paid%'
AND lower(program_name) not like '%go-to-paid-trial-usage%'
FORMAT TabSeparatedWithNames
"""
marketo_prev_df = lib.execute_query(marketo_previous_req)

In [25]:
# Quick size check of the previous-Marketo exclusion list.
marketo_prev_df.shape

(82088, 2)

## old in program

In [26]:
# Distinct (email, billing_account_id) pairs currently stored in Go-to-paid-stream;
# used below so that accounts already in the stream are not appended again.
# NOTE(review): this snapshot is not refreshed by the append cell, so re-running the
# append without re-running this cell would compare against stale data.
req = """
SELECT
    DISTINCT
        email,
        billing_account_id
FROM "//home/cloud_analytics/emailing/sender/Go-to-paid-stream"
FORMAT TabSeparatedWithNames
"""
old_df = lib.execute_query(req)

## Подготовка новой порции

In [27]:
from datetime import timedelta

In [28]:
# Cut-off for grant end dates: the current date (as reported by lib) minus 30 days.
min_date_of_grant_ended = parse(lib.get_current_date_as_str()) - timedelta(days=30)

In [29]:
# Candidate pool: cube billing accounts whose owner has a language record.
new_table = pd.merge(main_df, language_df, on='puid', how='inner')

# Exclusions, matched on either the e-mail or the billing account id.
seen_in_marketo = (
    new_table['email'].isin(marketo_prev_df['email'])
    | new_table['billing_account_id'].isin(marketo_prev_df['billing_account_id'])
)
on_drop_list = new_table['email'].isin(dropped_emails)
already_in_stream = (
    new_table['email'].isin(old_df['email'])
    | new_table['billing_account_id'].isin(old_df['billing_account_id'])
)
new_table = new_table[~(seen_in_marketo | on_drop_list | already_in_stream)]

# Only accounts that have a grant summary row survive the inner join.
new_table = pd.merge(new_table, grant_df, on='billing_account_id', how='inner')

# Non-ISV accounts that have never had paid consumption.
never_paid_non_isv = (
    (new_table['is_isv'] == 0)
    & (new_table['first_first_paid_consumption_datetime'] == '')
)
# Trigger condition 1: suspended with the grant flag set.
suspended_after_grant = (
    (new_table['ba_state'] == 'suspended')
    & (new_table['grant_ended_last_7_days'] == 1)
)
# Trigger condition 2: blocked because the trial expired.
trial_expired = (
    (new_table['block_reason'] == 'trial_expired')
    & new_table['ba_state'].isin(['suspended', 'inactive', 'payment_required'])
)
# Subscribed to at least one mailing category.
accepts_mail = (
    (new_table['mail_feature'] == 1)
    | (new_table['mail_info'] == 1)
    | (new_table['mail_promo'] == 1)
)
new_table = new_table[
    never_paid_non_isv & (suspended_after_grant | trial_expired) & accepts_mail
]

# Finally keep only grants that ended on or after the cut-off date.
grant_recent_enough = (
    pd.to_datetime(new_table['grant_end_date']) >= min_date_of_grant_ended
)
new_table = new_table[grant_recent_enough]

In [32]:
def make_trigger(row):
    """Name the mailing trigger for one candidate row.

    Args:
        row: Mapping-like object (a DataFrame row or a dict) with the keys
            ``ba_state`` and ``grant_ended_last_7_days``.

    Returns:
        ``'Suspended'`` when ``ba_state`` is ``'suspended'`` and
        ``grant_ended_last_7_days`` equals 1, otherwise ``'Trial_Expired'``.

    Raises:
        KeyError: If a required key is missing. The error is deliberately not
            swallowed: returning ``None`` here would store an empty trigger in
            the mailing table without anyone noticing.
    """
    if row['ba_state'] == 'suspended' and row['grant_ended_last_7_days'] == 1:
        return 'Suspended'
    return 'Trial_Expired'

In [34]:
# Enrich the new batch with experiment metadata and append it to the stream table.
if len(new_table) > 0:
    new_table['trigger'] = new_table.apply(make_trigger, axis=1)
    new_table['stream'] = np.where(new_table['usage_status'] == 'paid',
                                   'BA paid - General', 'Go-to-paid-general')
    # Users whose puid modulo 100 falls in 35..44 (inclusive) form the control group.
    puid_mod_100 = new_table['puid'].astype(int) % 100
    in_control = (puid_mod_100 >= 35) & (puid_mod_100 <= 44)
    new_table['Group'] = np.where(in_control, 'control', 'test')
    new_table['experiment_date'] = lib.get_current_date_as_str()
    new_table['is_dropped'] = new_table['email'].isin(dropped_emails).astype(int)
    lib.save_table("Go-to-paid-stream", "//home/cloud_analytics/emailing/sender",
                   new_table, append=True)

In [35]:
def copy_after(table_name):
    """Back up a sender table into ``sender_copy/`` unless the live table is empty.

    Args:
        table_name: Table name, appended both to the live ``sender/`` directory
            and to the ``sender_copy/`` backup directory.

    Side effects:
        When the live table has a non-zero row count, overwrites the backup of
        the same name (``force=True``). An empty live table is never copied.
    """
    live_path = "//home/cloud_analytics/emailing/sender/" + table_name
    if yt.row_count(live_path) == 0:
        # Nothing to back up; keep whatever backup already exists.
        return
    backup_path = "//home/cloud_analytics/emailing/sender_copy/" + table_name
    yt.copy(live_path, backup_path, force=True)
# Refresh the backup copies of both stream tables now that the run has finished.
copy_after("Go-to-paid-stream")
copy_after("Go-to-paid-stream-OLD")

---

In [31]:
# marketo_previous_req = """
# SELECT
#     DISTINCT
#     billing_account_id,
#     any(email) as email,
#     groupUniqArray(mailing_name) as sended_mails,
#     'test' as Group
# FROM "//home/cloud_analytics/cubes/emailing_events/emailing_events"
# WHERE lower(mailing_name) like '%q1-2020-go-to-paid-trial-expired%'
# AND billing_account_id != ''
# GROUP BY billing_account_id
# FORMAT TabSeparatedWithNames
# """
# marketo_prev_df = lib.execute_query(marketo_previous_req)
# marketo_prev_df['sended_mails'] = marketo_prev_df['sended_mails'].astype(str)
# lib.save_table("Go-to-paid-stream-OLD", "//home/cloud_analytics/emailing/sender", 
#                marketo_prev_df)