In [1]:
import scipy.stats as sps
from sklearn.model_selection import train_test_split
import gc
from dateutil.parser import *
import pandas as pd
import numpy as np
import os
import yt.wrapper as yt 
os.chdir('/Users/lunin-dv/Desktop/Library/')
import importlib
import my_library as lib
import operator
import re
importlib.reload(lib)
os.chdir('/Users/lunin-dv/Desktop/data/')

In [2]:
def copy_before(table_name):
    """Restore a sender table from its backup copy when the live table is empty.

    Checks the row count of ``sender/<table_name>``; if it is zero, copies
    ``sender_copy/<table_name>`` over it (``force=True``) and prints ``saved``.
    Does nothing when the live table already has rows.

    Args:
        table_name: Name of the table inside the ``emailing/sender`` directory.
    """
    live_path = "//home/cloud_analytics/emailing/sender/" + table_name
    if yt.row_count(live_path) != 0:
        # Live table has data: nothing to restore.
        return
    backup_path = "//home/cloud_analytics/emailing/sender_copy/" + table_name
    yt.copy(backup_path, live_path, force=True)
    print('saved')
# Restore both campaign tables from their backups if they are currently empty.
copy_before("ISV-Cloud-Boost")
copy_before("ISV-Cloud-Boost-OLD")

# 0. Общая информация

## Drop_list

In [3]:
# Load the drop list; only its `email` column is used afterwards.
# NOTE(review): e-mails coming from the acquisition cube are lower-cased in SQL, while
# these are taken as stored — confirm the drop list is lower-cased, otherwise the
# isin() checks against it may miss addresses.
drop_req = """
SELECT
    *
FROM "//home/cloud_analytics/emailing/sender/drop_list"
FORMAT TabSeparatedWithNames
"""
dropped_emails = lib.execute_query(drop_req)
# Collapse the frame to a set of addresses for membership tests.
dropped_emails = set(dropped_emails['email'])

## Cube

In [4]:
# Build the base user frame from the acquisition cube.
# Row selection: `cloud_created` events, plus `ba_created` events for billing accounts
# that have no `cloud_created` event; rows with an empty billing_account_id or puid
# are excluded.
# Extra columns: `language` is joined from cloud_owners_history by puid,
# `group_index` is puid modulo 100, `Group` is the constant 'test'.
# NOTE(review): in the first branch the `billing_account_id in (...)` subquery selects
# exactly the accounts that have a `cloud_created` event, so it looks always-true for
# rows that already satisfy `event == 'cloud_created'` — confirm it can be dropped.
# NOTE(review): `console_regstration_date` (sic) is referenced by later cells under
# this exact spelling, so it must not be renamed here alone.
req = """
SELECT
    DISTINCT
    billing_account_id,
    lower(user_settings_email) as email,
    puid,
    event_time as console_regstration_date,
    multiIf(first_ba_created_datetime = '0000-00-00 00:00:00', '',
            first_ba_created_datetime) as ba_created,
    
    if (first_first_paid_consumption_datetime == '0000-00-00 00:00:00', '',
        first_first_paid_consumption_datetime) as first_first_paid_consumption_datetime,
    ba_usage_status as usage_status,
    is_isv,
    mail_feature,
    mail_info,
    mail_promo,
    language,
    modulo(toInt64(puid), 100) as group_index,
    toDate(NOW()) - toDate(console_regstration_date) as days_since_console_registration,
    'test' as Group
FROM "//home/cloud_analytics/cubes/acquisition_cube/cube" as a
ANY LEFT JOIN(
    SELECT
        passport_uid as puid,
        user_settings_language as language
    FROM "//home/cloud_analytics/import/iam/cloud_owners_history"
) as b
ON a.puid == b.puid
WHERE
    (
        (event == 'cloud_created' and  billing_account_id in
         (SELECT DISTINCT billing_account_id 
          FROM "//home/cloud_analytics/cubes/acquisition_cube/cube"
          WHERE event == 'cloud_created')
         )
      OR
        (event == 'ba_created' and  billing_account_id not in
         (SELECT DISTINCT billing_account_id 
          FROM "//home/cloud_analytics/cubes/acquisition_cube/cube"
          WHERE event == 'cloud_created')
        )
    )
    AND billing_account_id != ''
    AND puid != ''
FORMAT TabSeparatedWithNames
"""
main_df = lib.execute_query(req)

## ISV

In [5]:
# Derive an ISV period per billing account from the billing accounts history.
# Innermost query: a history row counts as ISV when `feature_flags` contains 'isv'
# and does not contain 'false'.
# Per account: isv_start_date is the earliest ISV `updated_at`; isv_end_date is the
# first element of the non-ISV timestamps that are later than the latest ISV
# timestamp. Accounts that were never ISV are filtered out.
# NOTE(review): taking `[1]` assumes groupArray preserves the subquery's ORDER BY —
# presumably true in practice, verify against the engine's guarantees.
# NOTE(review): `not like '%false%'` rejects a row if the substring 'false' appears
# anywhere in feature_flags, not only for the isv flag — confirm this is intended.
req = """
SELECT
    billing_account_id,
    toDate(isv_start_date) as isv_start_date,
    toDate(isv_end_date) as isv_end_date
FROM (
    SELECT
        billing_account_id,
        isv_start_date,
        if (isNotNull(isv_start_date) and isv_end_date != 0, isv_end_date, null) as isv_end_date
    FROM (
        SELECT
            billing_account_id,
            min(if (is_isv == 1, 
                    updated_at, null)
                ) as isv_start_date,
            max(if (is_isv == 1, 
                    updated_at, 0)
                ) as isv_before_end_date,
            groupArray(if (is_isv == 0, updated_at, null)) as all_not_isv_times,
            arrayFilter(x -> assumeNotNull(x) > assumeNotNull(isv_before_end_date), 
                        all_not_isv_times)[1] as isv_end_date
        FROM (
            SELECT
                billing_account_id,
                updated_at,
                if (feature_flags like '%isv%' 
                and feature_flags not like '%false%', 1, 0) as is_isv
            FROM "//home/cloud/billing/exported-billing-tables/billing_accounts_history_prod"
            ORDER BY billing_account_id, updated_at
        )
        GROUP BY billing_account_id
    )
    WHERE isNotNull(isv_start_date)
)
FORMAT TabSeparatedWithNames
"""
isv_df = lib.execute_query(req)

In [6]:
# Collect, per e-mail, the distinct ISV lead statuses recorded in the CRM import.
# Groups without any status are dropped by the HAVING clause.
# NOTE(review): `email` is used as stored here, while the cube e-mails it is later
# merged with are lower-cased — confirm the CRM values are already lower-case.
req = """
SELECT
    groupUniqArray(isv_status) as isv_status,
    email
FROM "//home/cloud_analytics/import/crm/crm_leads_isv_var"
GROUP BY email
HAVING length(isv_status) > 0
FORMAT TabSeparatedWithNames
"""
isv_next = lib.execute_query(req)

In [7]:
# Lead statuses in lookup priority: the first one found wins.
order = ['Converted', 'Recycled', 'In Process', 'Assigned', 'New', 'Pending']

In [8]:
def get_type(x):
    """Return the highest-priority known status contained in ``x``.

    Statuses are tried in the order given by the module-level ``order`` list and
    the first one for which ``status in x`` holds is returned. ``x`` may be any
    object supporting ``in`` (a string gives a substring test, a list an element
    test).

    Raises:
        AssertionError: if none of the known statuses is found; ``x`` is attached
            as the exception argument. Raised explicitly instead of via an
            ``assert`` statement so the check is not stripped under ``python -O``.
    """
    for status in order:
        if status in x:
            return status
    raise AssertionError(x)

In [9]:
# Collapse each e-mail's collection of statuses to its single highest-priority status.
isv_next['isv_status'] = isv_next['isv_status'].apply(get_type)

In [10]:
# Size check of the CRM status frame (the query groups by e-mail).
isv_next.shape

(1281, 2)

In [11]:
# Attach ISV period (by billing account) and CRM ISV status (by e-mail) to the base frame.
# Left joins keep every row of main_df.
main_df = (
    main_df
    .merge(isv_df, on='billing_account_id', how='left')
    .merge(isv_next, on='email', how='left')
)

# 1. Обновление

In [14]:
def update_current_table(table_name, path, tables_to_update):
    """Refresh a stored mailing table with current attribute values and save it back.

    Steps:
      1. Read ``<path>/<table_name>`` in full via ``lib.execute_query``.
      2. Strip backslashes from ``sended_mails`` values (when the column exists).
      3. Recompute ``is_dropped`` from the module-level ``dropped_emails``; for
         tables whose name contains ``"OLD"``, dropped rows are removed entirely.
      4. Left-merge every ``(frame, key)`` pair from ``tables_to_update``. Columns
         present on both sides arrive with a ``_new`` suffix; a non-null fresh
         value replaces the stored one, otherwise the stored value is kept.
         Columns that exist only in the fresh frame are added as-is.
      5. Save the result with ``lib.save_table(table_name, path, frame)``.

    Args:
        table_name: Table name inside ``path``.
        path: Directory path of the table.
        tables_to_update: Iterable of ``(DataFrame, key_column)`` pairs to merge in.
    """
    full_path = path + "/" + table_name
    req = f"""
    SELECT
        *
    FROM "{full_path}"
    FORMAT TabSeparatedWithNames
    """
    old_df = lib.execute_query(req)
    if 'sended_mails' in old_df.columns:
        # Only strings carry escaped backslashes; missing values (NaN) pass through
        # untouched instead of raising AttributeError.
        old_df['sended_mails'] = old_df['sended_mails'].apply(
            lambda x: x.replace("\\", "") if isinstance(x, str) else x)
    old_df['is_dropped'] = old_df['email'].isin(dropped_emails).astype(int)
    if "OLD" in table_name:
        old_df = old_df[old_df['is_dropped'] == 0]
    # NOTE(review): a left merge repeats a stored row once per matching row in
    # `table`; if a key is not unique there, the saved table grows — confirm the
    # keys are unique in every frame passed in.
    for table, key in tables_to_update:
        old_df = pd.merge(old_df, table, on=key, suffixes=('', '_new'), how='left')

    new_cols = [column for column in old_df.columns if column.endswith("_new")]
    for column in new_cols:
        old_column = column[:-4]
        fresh = old_df[column]
        # Vectorized coalesce: prefer the fresh value, fall back to the stored one.
        old_df[old_column] = fresh.where(fresh.notna(), old_df[old_column])
    old_df = old_df.drop(columns=new_cols)
    lib.save_table(table_name, path, old_df)

In [15]:
# Frames to merge into the stored tables, each paired with its join key.
tables_to_update = [(main_df, "billing_account_id")]

In [16]:
# Refresh the "OLD" table; because of its name, dropped e-mails are removed from it.
update_current_table(
    "ISV-Cloud-Boost-OLD",
    "//home/cloud_analytics/emailing/sender",
    tables_to_update,
)

In [17]:
# Refresh the active program table; dropped e-mails are only flagged via is_dropped.
update_current_table(
    "ISV-Cloud-Boost",
    "//home/cloud_analytics/emailing/sender",
    tables_to_update,
)

# 2. Добавление

## old Marketo

In [17]:
# Billing accounts / e-mails that were added to a nurture stream of a program whose
# name contains 'cloud-boost-program' (case-insensitive), per emailing_events.
marketo_previous_req = """
SELECT
    DISTINCT
    billing_account_id,
    email
FROM "//home/cloud_analytics/cubes/emailing_events/emailing_events"
WHERE event == 'add_to_nurture_stream'
AND lower(program_name) like '%cloud-boost-program%'
FORMAT TabSeparatedWithNames
"""
marketo_prev_df = lib.execute_query(marketo_previous_req)

In [18]:
# Size check of the previous Marketo audience.
marketo_prev_df.shape

(279, 2)

## old in program


In [19]:
# Distinct (email, billing_account_id) pairs already stored in the active program table.
req = """
SELECT
    DISTINCT
        email,
        billing_account_id
FROM "//home/cloud_analytics/emailing/sender/ISV-Cloud-Boost"
FORMAT TabSeparatedWithNames
"""
old_df = lib.execute_query(req)

In [20]:
# Candidate rows: an independent copy of the ISV-flagged accounts from the base frame.
new_table = main_df.loc[main_df['is_isv'] == 1].copy()

In [21]:
# Ad-hoc spot check of a single billing account.
# NOTE(review): the rendered output of this cell shows a real e-mail address —
# clear the output (or remove the cell) before sharing the notebook.
new_table[new_table['billing_account_id'] == 'dn27bl08253ht286iejj']

Unnamed: 0,billing_account_id,email,puid,console_regstration_date,ba_created,first_first_paid_consumption_datetime,usage_status,is_isv,mail_feature,mail_info,mail_promo,language,group_index,days_since_console_registration,Group,isv_start_date,isv_end_date,isv_status
16505,dn27bl08253ht286iejj,maxim@tetrika.school,1130000041641920,2019-12-09 13:51:32,2019-12-09 16:57:06,2020-01-05 22:59:59,paid,1,0,0,0,ru,20,242,test,2020-06-30,,


In [22]:
# Keep accounts with at least one of the three mail flags equal to 1.
new_table = new_table[
    new_table[['mail_feature', 'mail_info', 'mail_promo']].eq(1).any(axis=1)
]

In [23]:
# Mailing stream: 'Paid' for usage_status == 'paid', 'Not Paid' for everything else.
new_table['stream'] = new_table['usage_status'].eq('paid').map(
    {True: 'Paid', False: "Not Paid"})

In [24]:
# Remove addresses that are on the drop list.
new_table = new_table.loc[~new_table['email'].isin(dropped_emails)]

In [25]:
# Exclude anyone already in the previous Marketo audience, matched by either
# e-mail or billing account.
new_table = new_table[
    ~(new_table['email'].isin(marketo_prev_df['email']) |
      new_table['billing_account_id'].isin(marketo_prev_df['billing_account_id']))
]

In [26]:
# Keep accounts whose console registration is on or after 2020-03-24.
new_table = new_table.loc[
    pd.to_datetime(new_table['console_regstration_date']) >= pd.Timestamp('2020-03-24')
]

In [27]:
# Exclude anyone already stored in the program table, matched by either e-mail
# or billing account.
new_table = new_table[
    ~(new_table['email'].isin(old_df['email']) |
      new_table['billing_account_id'].isin(old_df['billing_account_id']))
]

In [28]:
# Stamp the enrolment date and the drop flag on the new rows.
new_table = new_table.assign(
    experiment_date=lib.get_current_date_as_str(),
    is_dropped=new_table['email'].isin(dropped_emails).astype(int),
)

In [29]:
# Number of accounts about to be appended to the program table.
new_table.shape

(0, 21)

In [30]:
# Append the newly selected accounts to the active program table.
lib.save_table(
    "ISV-Cloud-Boost",
    "//home/cloud_analytics/emailing/sender",
    new_table,
    append=True,
)

In [31]:
def copy_after(table_name):
    """Back up a sender table to ``sender_copy`` when the live table is non-empty.

    Checks the row count of ``sender/<table_name>``; if it is not zero, copies the
    table over ``sender_copy/<table_name>`` (``force=True``). An empty live table
    is never copied, so the existing backup is left as it is.

    Args:
        table_name: Name of the table inside the ``emailing/sender`` directory.
    """
    live_path = "//home/cloud_analytics/emailing/sender/" + table_name
    if yt.row_count(live_path) == 0:
        # Nothing to back up.
        return
    backup_path = "//home/cloud_analytics/emailing/sender_copy/" + table_name
    yt.copy(live_path, backup_path, force=True)
# Back up both campaign tables now that they have been updated.
copy_after("ISV-Cloud-Boost")
copy_after("ISV-Cloud-Boost-OLD")

In [3]:
# Disabled code, kept for reference: builds the "ISV-Cloud-Boost-OLD" table from
# emailing_events (one row per billing account with the list of mailings sent).
# NOTE(review): presumably a one-off bootstrap; running it would overwrite the
# OLD table — confirm before re-enabling, or move it out of the notebook.
# marketo_previous_req = """
# SELECT
#     DISTINCT
#     billing_account_id,
#     any(email) as email,
#     groupUniqArray(mailing_name) as sended_mails
# FROM "//home/cloud_analytics/cubes/emailing_events/emailing_events"
# WHERE lower(mailing_name) like '%cloud-boost-program%'
# GROUP BY billing_account_id
# FORMAT TabSeparatedWithNames
# """
# marketo_prev_df = lib.execute_query(marketo_previous_req)
# marketo_prev_df['sended_mails'] = marketo_prev_df['sended_mails'].astype(str)
# lib.save_table("ISV-Cloud-Boost-OLD", "//home/cloud_analytics/emailing/sender", 
#                marketo_prev_df)