In [1]:
%load_ext autoreload
%autoreload 2
import scipy.stats as sps
from sklearn.model_selection import train_test_split
import gc
from sklearn.linear_model import LinearRegression
from dateutil.parser import *
import pandas as pd
import numpy as np
import os
from collections import Counter
import json
import typing as tp
import my_library as lib
import ast
from datetime import datetime

  import pandas.util.testing as tm


In [2]:
# Number of days between the first trial consumption date and the scoring date;
# interpolated into the WHERE clause and into the scoring_date expression.
DAYS_TO_OBSERVE = 14
# Placeholder written instead of an empty value for categorical columns.
EMPTY_TYPE = 'undefined'
# Name used when saving the resulting table and when registering its column types.
TABLE_NAME = 'main_info_table'

In [3]:
# Registry of "column:type" strings. It is filled as a side effect of calling the
# *_req() builder functions and is later used to pick the output columns,
# validate the dataframe and save the type table.
column_types = set()

In [4]:
def get_main_condition_request():
    """Build the WHERE clause shared by the scoring queries.

    The clause keeps events dated strictly before ``scoring_date`` and only
    rows whose first trial consumption date is exactly ``DAYS_TO_OBSERVE``
    days before ``scoring_date``; rows with the zero datetime placeholder are
    excluded.

    Returns
    -------
    str
        SQL fragment starting with ``WHERE``.
    """
    return f"""
    WHERE
        toDate(event_time) < toDate(scoring_date)
    AND 
        toDate(first_first_trial_consumption_datetime) == 
        addDays(toDate(scoring_date), -{DAYS_TO_OBSERVE})
    AND first_first_trial_consumption_datetime != '0000-00-00 00:00:00'
    """

In [6]:
# Columns read as strings from any event type: get_core_req() picks the value
# with the greatest event_time among non-empty (and non-'[]') values.
string_columns_NO_limits = [
    "puid",
    "user_settings_email",
    "email",
    "age",
    "sex",
    "country",
    "city",
    'general_interests',
    'channel',
    'search_phrase'
]

# Columns read as strings only from rows where event == 'day_use';
# values from other events are treated as empty by get_core_req().
string_columns_ONLY_in_day_use = [
    "cloud_id",
    "ba_state",
    "ba_type",
    "ba_usage_status",
    "ba_person_type",
    "is_fraud",
    "is_robot",
    "ba_payment_cycle_type",
    "grant_sources",
    'segment',
    'master_account_id',
    'account_name',
    'first_first_paid_consumption_datetime',
    "os",
    "device_model",
    "device_type",
    "ad_block",
]

# Columns aggregated with max() over 'day_use' rows (other rows contribute 0).
# NOTE(review): "mobile_phone_vendor" and 'income' are aggregated as numbers here —
# confirm that they really are numeric in the source cube.
numeric_columns = [
    "hits",
    "total_visits",
    "mobile_phone_vendor",
    'income',
    'is_corporate_card'
]

In [7]:
def get_core_req(string_columns_NO_limits, string_columns_ONLY_in_day_use, numeric_columns):
    """
    Build the inner query with one row per (billing_account_id, scoring_date).

    Parameters
    ----------
    string_columns_NO_limits : List[str]
        Columns handled as strings regardless of the event type. For each one
        the value with the greatest event_time among non-empty (and
        non-'[]') values is selected.
    string_columns_ONLY_in_day_use : List[str]
        Columns handled as strings, but only values coming from rows with
        event == 'day_use' are considered; the same "latest non-empty value"
        rule is applied to them.
    numeric_columns : List[str]
        Columns for which the maximum over 'day_use' rows is returned
        (rows of other events contribute 0).

    Returns
    -------
    request: str
        ClickHouse query selecting all listed columns plus billing_account_id
        and scoring_date, where scoring_date is the first trial consumption
        date shifted by DAYS_TO_OBSERVE days. Rows are filtered by
        get_main_condition_request() and grouped by
        billing_account_id, scoring_date.
    """
    # Latest non-empty value over all events.
    string_no_limits_part_req = "".join(
        f"""    
        argMax({column}, 
        if(CAST({column} as String) != '' 
            AND 
            CAST({column} as String) != '[]', event_time, '')) as {column},\n"""
        for column in string_columns_NO_limits
    )

    # Latest non-empty value, looking at 'day_use' events only.
    string_day_use_part_req = "".join(
        f"""    
        argMax(if (event == 'day_use', CAST({column} as String), ''), 
        if(if (event == 'day_use', CAST({column} as String), '') != '' 
           AND 
           if (event == 'day_use', CAST({column} as String), '') != '[]', event_time, '')) as {column},\n"""
        for column in string_columns_ONLY_in_day_use
    )

    # Maximum over 'day_use' events.
    numeric_part_req = "".join(
        f"""    max(if (event == 'day_use', {column}, 0)) as {column},\n"""
        for column in numeric_columns
    )

    return f"""
SELECT
    {string_no_limits_part_req}
    {string_day_use_part_req}
    {numeric_part_req}
    billing_account_id,
    addDays(toDate(first_first_trial_consumption_datetime), {DAYS_TO_OBSERVE}) as scoring_date
FROM "//home/cloud_analytics/cubes/acquisition_cube/cube"
{get_main_condition_request()}
GROUP BY billing_account_id, scoring_date
    """

In [8]:
def is_company_by_name_req():
    """Return the SQL expression for the ``is_company_by_name`` flag.

    The flag is 1 when the lower-cased ``account_name`` contains any of the
    company-like substrings listed below. Registers the column as binary in
    ``column_types``.
    """
    column_types.add('is_company_by_name:binary')
    company_markers = (
        '.ru', '.com', 'коворкинг', 'компания', 'company',
        'ooo', 'oao', 'оао', 'ооо', 'ао', 'ao', 'веб', 'группа',
    )
    like_conditions = []
    for name in company_markers:
        like_conditions.append(f"lowerUTF8(account_name) like '%{name}%'")
    adding_text = "OR ".join(like_conditions)
    return f"""
    if({adding_text}, 1, 0) as is_company_by_name
    """

def is_company_req():
    """Return the SQL expression for the ``is_company`` flag.

    The flag is 1 when ``is_company_by_name`` or ``is_corporate_card`` equals 1,
    or when ``ba_person_type`` contains 'company'. Registers the column as
    binary in ``column_types``.
    """
    column_types.add('is_company:binary')
    return """
    if(is_company_by_name == 1 or is_corporate_card == 1 
    or ba_person_type like '%company%', 1, 0) as is_company
    """

def is_already_paid():
    """Return the SQL expression for the ``is_already_paid`` flag.

    The flag is 1 when the first paid consumption datetime is not the zero
    placeholder and its date is strictly before ``scoring_date``. Registers
    the column as binary in ``column_types``.
    """
    column_types.add('is_already_paid:binary')
    return """
    if(first_first_paid_consumption_datetime != '0000-00-00 00:00:00'
       and
       toDate(first_first_paid_consumption_datetime) < toDate(scoring_date), 1, 0) as is_already_paid
    """


In [9]:
def has_master_account_id_req():
    """Return the SQL expression flagging a non-empty ``master_account_id``.

    Registers ``has_master_account_id`` as binary in ``column_types``.
    """
    column_types.add('has_master_account_id:binary')
    return """
    if(master_account_id != '', 1, 0) as has_master_account_id
    """

def grant_sources_req():
    """Return SQL expressions for four binary ``grant_sources_*`` flags.

    Each flag is 1 when ``grant_sources`` contains the corresponding substring
    ('ST', 'default', 'offer', 'policy'). All four columns are registered as
    binary in ``column_types``.
    """
    column_types.update((
        'grant_sources_st:binary',
        'grant_sources_default:binary',
        'grant_sources_offer:binary',
        'grant_sources_policy:binary',
    ))
    return """
    if(grant_sources like '%ST%', 1, 0) as grant_sources_st,
    if(grant_sources like '%default%', 1, 0) as grant_sources_default,
    if(grant_sources like '%offer%', 1, 0) as grant_sources_offer,
    if(grant_sources like '%policy%', 1, 0) as grant_sources_policy
    """

def from_desktop_req():
    """Return the SQL expression for the ``from_desktop`` flag.

    The flag is 1 when ``device_type`` contains 'desktop'. Registers the
    column as binary in ``column_types``.
    """
    column_types.add('from_desktop:binary')
    return """
    if(device_type like '%desktop%', 1, 0) as from_desktop
    """

def os_req():
    """Return the SQL expression that shortens ``os`` to its first word.

    A non-empty ``os`` is split by spaces and the first element is kept; an
    empty value stays empty. Registers ``os`` as a category in
    ``column_types``.
    """
    column_types.add('os:category')
    return """
    if (os != '', splitByChar(' ', assumeNotNull(os))[1], '') as os
    """

def usage_status_req():
    """Return the SQL expression for ``usage_status``.

    An empty ``ba_usage_status`` is replaced with 'trial'; any other value is
    passed through. Registers the column as a category in ``column_types``.
    """
    column_types.add('usage_status:category')
    return """
    if(ba_usage_status == '', 'trial', ba_usage_status) as usage_status
    """

def person_type_req():
    """Return the SQL expression for ``person_type``.

    'switzerland_nonresident_company' is mapped to 'company', an empty
    ``ba_person_type`` to 'individual', and every other value is kept as is.
    Registers the column as a category in ``column_types``.
    """
    column_types.add('person_type:category')
    return """
    multiIf(ba_person_type == 'switzerland_nonresident_company', 
            'company',
            ba_person_type == '', 
            'individual',
            ba_person_type) as person_type
    """

In [10]:
def simple_req():
    """Return the list of columns that are passed through unchanged.

    Registers the types of the pass-through columns in ``column_types``.
    ``channel`` and ``account_name`` are selected here but are not registered,
    so they are not part of the final output built from ``column_types``.
    """
    column_types.update((
        'age:category',
        'sex:category',
        'ba_type:category',
        'is_fraud:binary',
        'device_type:category',
        'ad_block:category',
        'is_robot:category',
        'segment:category',
        'income:category',
        'hits:numeric',
        'mobile_phone_vendor:numeric',
        'total_visits:numeric',
        'general_interests:json__30',
    ))
    # 'account_name:category' is intentionally left unregistered
    # (the original registration call was commented out).
    return """
    age,
    sex,
    ba_type,
    is_fraud,
    device_type,
    ad_block,
    is_robot,
    channel,
    segment,
    general_interests,
    hits,
    total_visits,
    mobile_phone_vendor,
    income,
    account_name
    """

In [11]:
def state_req():
    """Return the SQL expression that groups ``ba_state`` into ``state``.

    'suspended', 'inactive' and 'deleted' become 'suspended_now';
    'payment_required' and 'payment_not_confirmed' become 'payment_problem';
    other values are kept. Registers the column as a category in
    ``column_types``.
    """
    column_types.add('state:category')
    return """
    multiIf(ba_state == 'suspended' or ba_state == 'inactive' or ba_state == 'deleted', 
            'suspended_now',
            ba_state == 'payment_required' or ba_state == 'payment_not_confirmed',
            'payment_problem',
            ba_state) as state
    """

In [12]:
def region_req():
    """Return the SQL expression that derives ``region`` from city/country.

    Moscow and Saint Petersburg are detected by city name, the rest of Russia
    by country name, any other non-null country maps to 'Other countries', and
    the fallback is 'undefined'. Registers the column as a category in
    ``column_types``.
    """
    column_types.add('region:category')
    # NOTE(review): if `country` is a non-nullable string, isNotNull(country) is
    # always true and an empty country ends up as 'Other countries' rather than
    # 'undefined' — confirm against the cube schema.
    return """
    multiIf(city == 'Москва', 'Moscow',
                city == 'Санкт-Петербург', 'Saint Petersburg',
                country == 'Россия', 'Russia', 
                isNotNull(country), 'Other countries',
                'undefined') as region
    """

In [13]:
def search_phrase_req():
    """Return the SQL expression that buckets ``search_phrase``.

    'yandex' / 'яндекс' map to 'yandex_cloud_direct_search', an empty phrase
    maps to ``EMPTY_TYPE``, and anything else to 'not_direct_search_type'.
    Registers the column as a category in ``column_types``.
    """
    column_types.add('search_phrase:category')
    return f"""
    multiIf(search_phrase == 'yandex' or search_phrase == 'яндекс', 
    'yandex_cloud_direct_search', 
    search_phrase == '', '{EMPTY_TYPE}',
    'not_direct_search_type') as search_phrase
    """

In [14]:
def email_features_req():
    """Return SQL expressions for three binary e-mail based features.

    * ``is_yandex_email`` - 1 when ``user_settings_email`` contains
      '@yandex.' or '@ya.'.
    * ``is_corporate_email`` - 0 when ``user_settings_email`` matches one of
      the listed public mail domains, 1 otherwise.
    * ``is_equal_user_settings_email_and_email`` - 1 when
      ``user_settings_email`` equals ``email``.

    All three columns are registered as binary in ``column_types``.
    """
    column_types.add('is_yandex_email:binary')
    column_types.add('is_corporate_email:binary')
    column_types.add('is_equal_user_settings_email_and_email:binary')
    # Raw string: the regular expression below contains `\.`, which is an
    # invalid escape sequence in a regular string literal (SyntaxWarning on
    # recent Python versions). The resulting text is unchanged.
    # NOTE(review): an empty user_settings_email does not match the pattern, so
    # it is reported as is_corporate_email = 1 — confirm this is intended.
    part_req = r"""
    multiIf(user_settings_email LIKE '%@yandex.%' OR user_settings_email LIKE '%@ya.%', 1, 0) 
    AS is_yandex_email,
    multiIf(match(user_settings_email, 
    '.*@yandex\..*|.*@ya\..*|.*@gmail\..*|.*@mail\..*|.*@tut\..*|.*@linqcorp\..*'), 
    0, 1) AS is_corporate_email,
    if (user_settings_email == email, 1, 0) as is_equal_user_settings_email_and_email
    """
    return part_req

In [15]:
def last_value_preprocess():
    """Build the outer SELECT list from everything registered in ``column_types``.

    Category columns are cast to string, get ``EMPTY_TYPE`` instead of an
    empty value, have spaces replaced with underscores and are lower-cased.
    Columns of any other type are selected unchanged.

    Returns
    -------
    str
        Comma-separated select items without a trailing comma.
    """
    select_items = []
    for column_type in column_types:
        column, curr_type = column_type.split(":")
        if curr_type != "category":
            select_items.append(f'{column},\n')
            continue
        select_items.append(f"""lowerUTF8(if(
            CAST({column} as String) == '', '{EMPTY_TYPE}', 
            replaceAll(CAST({column} as String), ' ', '_')
            )) as {column},\n""")
    # Cut the final ",\n" so the caller can append its own separator.
    return "".join(select_items)[:-2]

In [16]:
def json_changer(x):
    """Normalise a textual value so that it always looks like a list literal.

    Parameters
    ----------
    x : str
        Text holding a Python literal, or an empty string.

    Returns
    -------
    str
        '[]' for an empty string, ``x`` itself when it already evaluates to a
        list, otherwise ``x`` wrapped in square brackets.
    """
    if x == '':
        return '[]'
    parsed = ast.literal_eval(x)
    return x if isinstance(parsed, list) else f'[{x}]'

In [17]:
def make_common_information_scoring_table(request_texts_array):
    """Run the full scoring query and return the post-processed dataframe.

    Parameters
    ----------
    request_texts_array : List[str]
        SQL select fragments produced by the ``*_req()`` builders; they are
        joined with commas into the middle SELECT.

    Returns
    -------
    DataFrame
        Result of ``lib.execute_query`` with ``ad_block`` and ``income`` cast
        to ``str`` and ``general_interests`` normalised by ``json_changer``.
    """
    core_req = get_core_req(string_columns_NO_limits, string_columns_ONLY_in_day_use, numeric_columns)
    func_requests = ", ".join(request_texts_array)

    query_text = f"""
        SELECT
            {last_value_preprocess()},
            billing_account_id,
            scoring_date
        FROM (
            SELECT
                {func_requests},
                billing_account_id,
                scoring_date
            FROM ({core_req})
        )
        FORMAT TabSeparatedWithNames
        """
    full_req = query_text.encode('utf-8')
    # Debug hint: print(full_req.decode('utf-8')) shows the final query.
    df = lib.execute_query(full_req)

    for text_column in ('ad_block', 'income'):
        df[text_column] = df[text_column].astype(str)
    df['general_interests'] = df['general_interests'].apply(json_changer)
    return df

In [18]:
def save_types(column_types):
    """Append the registered column types of this table to ``type_table``.

    Parameters
    ----------
    column_types : Iterable[str]
        Strings of the form "column:type".

    Raises
    ------
    ValueError
        If an entry does not split into exactly two parts on ':'.
    """
    rows = []
    for column_type in column_types:
        column, current_type = column_type.split(':')
        rows.append([column, current_type, TABLE_NAME])
    # Build the frame straight from the list of rows: np.matrix is no longer
    # recommended by NumPy and produces a (1, 0) matrix for an empty input,
    # which cannot be combined with three column names.
    type_df = pd.DataFrame(rows, columns=['column_name', 'type',
                                          'table_name'])
    lib.save_table('type_table', "//home/cloud_analytics/scoring_v2/data_tables", 
                   type_df, append=True)


def add_table_to_model_to_observe():
    """Append ``TABLE_NAME`` to the ``table_names_for_scoring_model`` table."""
    registry_dir = "//home/cloud_analytics/scoring_v2/data_tables"
    tables_df = pd.DataFrame([TABLE_NAME], columns=['table_names'])
    lib.save_table('table_names_for_scoring_model', registry_dir, tables_df, append=True)
    

def check_types_correspondence(df, column_types):
    """Validate that ``df`` matches the declared ``column_types``.

    Checker functions are loaded as source code from the
    ``column_type_description`` table; for a type ``T`` (the part of the type
    string before ``__``) the function ``T_checker(df, column)`` is called.

    Parameters
    ----------
    df : DataFrame
        Table to validate; it is expected to hold every registered column plus
        two extra ones (``billing_account_id`` and ``scoring_date`` in this
        notebook).
    column_types : Iterable[str]
        Strings of the form "column:type".

    Raises
    ------
    AssertionError
        If the number of columns differs from the expectation, if no checker
        exists for a type, or if a checker returns a falsy value.
    """
    req = """
    SELECT
        type,
        checker_function
    FROM "//home/cloud_analytics/scoring_v2/data_tables/column_type_description"
    FORMAT TabSeparatedWithNames
    """
    type_df = lib.execute_query(req)

    # SECURITY NOTE: the checker source is executed verbatim with exec(); anyone
    # able to write to the column_type_description table can run arbitrary code
    # in this kernel. Kept as is because the checkers live only in that table.
    checker_functions = {}
    for func_str in type_df['checker_function']:
        exec(func_str.replace("\\n", '\n'), checker_functions)

    # Explicit raises (instead of `assert`) keep the checks active under
    # `python -O`, and the parenthesised message now really includes the
    # difference, which previously was a separate no-op statement.
    expected_columns = len(column_types) + 2
    if len(df.columns) != expected_columns:
        raise AssertionError(
            'difference in number of columns in dataframe and in column_types, '
            f'{expected_columns - len(df.columns)}'
        )

    for column_type in column_types:
        column, curr_type = column_type.split(":")
        # Types may carry a parameter after "__" (e.g. "json__30").
        curr_function_name = curr_type.split("__")[0]
        checker = checker_functions.get(curr_function_name + "_checker")
        if checker is None:
            raise AssertionError(f"no type {curr_function_name}")
        if not checker(df, column):
            raise AssertionError(
                f'{curr_function_name} check failed for column {column}'
            )
        

def save_all_results(df):
    """Validate ``df`` and persist it together with its metadata.

    Steps, in order: type validation, saving the table under ``TABLE_NAME``,
    appending its column types, and registering the table name for the
    scoring model.
    """
    output_dir = "//home/cloud_analytics/scoring_v2/data_tables"
    check_types_correspondence(df, column_types)
    lib.save_table(TABLE_NAME, output_dir, df)
    save_types(column_types)
    add_table_to_model_to_observe()

In [19]:
# Select fragments for the middle SELECT. Calling each builder also registers
# its output columns in `column_types`, so this cell must run before the query
# is assembled.
request_texts_array = [
    is_company_by_name_req(),
    has_master_account_id_req(),
    email_features_req(),
    search_phrase_req(),
    region_req(),
    state_req(),
    simple_req(),
    person_type_req(),
    usage_status_req(),
    grant_sources_req(),
    from_desktop_req(),
    os_req(),
    is_company_req(),
    is_already_paid(),
]

In [20]:
# Execute the assembled query and keep the post-processed result.
main_information_df = make_common_information_scoring_table(request_texts_array)

In [21]:
#lib.save_table(TABLE_NAME, "//home/cloud_analytics/scoring_v2/data_tables", main_information_df)

In [22]:
# Validate the dataframe, save it, and append its type/table-name metadata.
# The metadata tables are written with append=True inside the helpers.
save_all_results(main_information_df)