In [14]:
%load_ext autoreload
%autoreload 2
import scipy.stats as sps
from sklearn.model_selection import train_test_split
import gc
from sklearn.linear_model import LinearRegression
from dateutil.parser import *
import pandas as pd
import numpy as np
import os
from collections import Counter
import json
import typing as tp
from sklearn.preprocessing import normalize
import my_library as lib
from datetime import datetime

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
# Length, in days, of the observation window: used in the WHERE filter, as the
# upper bound of the per-day consumption columns, and to derive scoring_date.
DAYS_TO_OBSERVE = 14
# NOTE(review): not referenced in any of the visible cells — confirm it is still needed.
EMPTY_TYPE = 'undefined'
# Registry of "column_name:type" strings; filled as a side effect of
# services_req(), consumption_req() and make_consumption_scoring_table().
column_types = set()
# Name under which the feature table is saved and registered.
TABLE_NAME = 'consumption_table'

In [16]:
# Service tokens matched against sku_name with LIKE '%<token>%' in services_req();
# each token also becomes the name of a binary feature column.
# NOTE(review): short tokens such as 'ai' match any sku_name containing that
# substring — confirm this is intended.
services = [
    'nbs',
    'compute',
    'ai',
    'mdb',
    'gpu',
    'storage',
    'mk8s',
    'functions',
    'speech',
    'translate',
    'vision',
    'datalens',
    'windows',
    'marketplace',
    'snapshot',
    'image',
]

In [17]:
def get_main_condition_request():
    """Return the WHERE clause shared by the scoring queries.

    The clause keeps events dated strictly before ``scoring_date - 1 day`` for
    accounts whose first trial consumption date equals
    ``scoring_date - DAYS_TO_OBSERVE`` days, and excludes the zero-datetime
    placeholder value.
    """
    lookback_days = DAYS_TO_OBSERVE
    return f"""
    WHERE
        toDate(event_time) < addDays(toDate(scoring_date), -1)
    AND 
        toDate(first_first_trial_consumption_datetime) == 
        addDays(toDate(scoring_date), -{lookback_days})
    AND first_first_trial_consumption_datetime != '0000-00-00 00:00:00'
    """

In [18]:
def services_req(services):
    """Build the SELECT fragment with one binary flag per service token.

    For every token a ``max(if (sku_name like '%token%', 1, 0)) as token``
    expression is produced, and ``"token:binary"`` is registered in the
    module-level ``column_types`` set.

    Args:
        services: iterable of service tokens.

    Returns:
        The expressions joined with ``",\\n"`` (no trailing separator); an
        empty string when ``services`` is empty.
    """
    select_items = []
    for name in services:
        column_types.add(f"{name}:binary")
        select_items.append(f"max(if (sku_name like '%{name}%', 1, 0)) as {name}")
    return ",\n".join(select_items)

In [21]:
def consumption_req():
    """Build the SELECT fragment with per-day trial consumption sums.

    One ``consumption_per_day_minus_<d>`` column is emitted for every day
    offset ``d`` from 2 up to and including ``DAYS_TO_OBSERVE``; each column is
    registered as ``numeric`` in the module-level ``column_types`` set. The
    fragment ends with a ``consumption_array`` expression listing those columns
    from the largest offset down to the smallest.

    Returns:
        The SQL fragment as a string (no trailing comma).
    """
    select_items = []
    array_columns = []
    for day in range(2, DAYS_TO_OBSERVE + 1):
        column = f"consumption_per_day_minus_{day}"
        select_items.append(f"""SUM(if(toDate(event_time) == addDays(toDate(scoring_date), -{day}), 
                            trial_consumption, 0)) as {column},\n""")
        column_types.add(f"{column}:numeric")
        array_columns.append(column)
    # Largest offset first, i.e. the earliest day leads the array.
    array_body = ", ".join(reversed(array_columns))
    return "".join(select_items) + f"[{array_body}] as consumption_array"

In [7]:
def number_of_empty_days_in_the_end(array):
    """Count the trailing elements of ``array`` that are below 0.1.

    Scanning starts at the last element and stops at the first value that is
    not below 0.1.

    Args:
        array: reversible sequence of numbers.

    Returns:
        Length of the trailing run of values below 0.1 (0 for an empty input).
    """
    trailing = 0
    for value in reversed(array):
        if not value < 0.1:
            return trailing
        trailing += 1
    return trailing


def predict_tan(target: tp.List[float]) -> float:
    """Return the slope of a linear fit through the scaled ``target`` values.

    The values are reshaped into a single column, scaled with
    ``sklearn.preprocessing.normalize`` along axis 0, and regressed against
    their positions ``0..n-1``.

    Args:
        target: sequence of numeric values.

    Returns:
        The first (only) coefficient of the fitted ``LinearRegression``.
    """
    column = np.array(target)[:, np.newaxis]
    scaled = normalize(column, axis=0).ravel()
    positions = np.arange(0, scaled.shape[0]).reshape(-1, 1)
    regression = LinearRegression()
    regression.fit(positions, np.array(scaled))
    return regression.coef_[0]

In [8]:
def make_consumption_scoring_table():
    """Build the per-account consumption feature table.

    Runs the aggregation query through ``lib.execute_query`` (grouped by
    ``billing_account_id`` and ``scoring_date``), derives summary features from
    the ``consumption_array`` column, drops that column and registers every
    derived feature in the module-level ``column_types`` set.

    Assumes ``lib.execute_query`` returns a DataFrame whose
    ``consumption_array`` cells are numeric sequences — TODO confirm.

    Returns:
        The resulting DataFrame: one column per registered feature plus
        ``billing_account_id`` and ``scoring_date``.

    Raises:
        AssertionError: if the DataFrame columns differ from the registered
            columns plus the two key columns; the message lists both the
            unexpected and the missing columns.
    """
    core_req = f"""
SELECT
    {services_req(services)},
    {consumption_req()},
    length(groupUniqArray(if(active_grant_ids == '', null, 
    active_grant_ids))) as number_of_grants, 
    billing_account_id,
    addDays(toDate(first_first_trial_consumption_datetime), 
    {DAYS_TO_OBSERVE}) as scoring_date
FROM "//home/cloud_analytics/cubes/acquisition_cube/cube"
{get_main_condition_request()}
AND sku_lazy == 0
GROUP BY billing_account_id, scoring_date
FORMAT TabSeparatedWithNames
"""
    df = lib.execute_query(core_req)
    arrays = df['consumption_array']

    empty_day_threshold = 0.1   # a day below this consumption counts as empty
    plateau_window = 5          # number of trailing days inspected for a plateau
    plateau_max_variation = 0.1  # std / mean ratio below which the tail is a plateau
    epsilon = 1e-5              # keeps the ratio finite when the tail mean is zero

    df['number_of_empty_days'] = arrays.apply(
        lambda values: len([v for v in values if v < empty_day_threshold]))
    df['number_of_empty_days_in_the_end'] = arrays.apply(
        number_of_empty_days_in_the_end)
    df['mean_consumption'] = arrays.apply(lambda values: np.mean(values))
    df['median_consumption'] = arrays.apply(lambda values: np.median(values))
    df['std_consumption'] = arrays.apply(lambda values: np.std(values))

    tail_std = arrays.apply(lambda values: np.std(values[-plateau_window:]))
    tail_mean = arrays.apply(lambda values: np.mean(values[-plateau_window:]))
    df['consumer_plateau'] = (
        (tail_std / (tail_mean + epsilon)) < plateau_max_variation
    ).astype(int)

    df['min_consumption'] = arrays.apply(lambda values: np.min(values))
    df['max_consumption'] = arrays.apply(lambda values: np.max(values))
    df['consumption_tangens'] = arrays.apply(predict_tan)

    # The raw array is only an intermediate; it is not part of the saved table.
    df = df.drop(columns=['consumption_array'])

    column_types.update({
        "number_of_grants:numeric",
        "max_consumption:numeric",
        "min_consumption:numeric",
        "consumer_plateau:binary",
        "mean_consumption:numeric",
        "std_consumption:numeric",
        "consumption_tangens:numeric",
        "median_consumption:numeric",
        "number_of_empty_days:numeric",
        "number_of_empty_days_in_the_end:numeric",
    })

    expected_columns = {entry.split(':')[0] for entry in column_types}
    expected_columns |= {'scoring_date', 'billing_account_id'}
    actual_columns = set(df.columns)
    # Report both directions so a missing column is as visible as an extra one.
    assert actual_columns == expected_columns, (
        f"columns not matched: "
        f"unexpected={sorted(actual_columns - expected_columns)}, "
        f"missing={sorted(expected_columns - actual_columns)}"
    )

    return df

In [9]:
def save_types(column_types):
    """Append the column types of ``TABLE_NAME`` to the shared ``type_table``.

    Args:
        column_types: iterable of ``"column_name:type"`` strings.

    Raises:
        ValueError: if an entry does not split into exactly two parts on ``':'``.
    """
    rows = []
    for column_type in column_types:
        column, current_type = column_type.split(':')
        rows.append((column, current_type, TABLE_NAME))
    # Build the frame straight from the row list: unlike np.matrix this also
    # yields a well-formed (empty, three-column) frame when there are no rows.
    type_df = pd.DataFrame(rows, columns=['column_name', 'type', 'table_name'])
    lib.save_table('type_table', "//home/cloud_analytics/scoring_v2/data_tables",
                   type_df, append=True)


def add_table_to_model_to_observe():
    """Append ``TABLE_NAME`` to the ``table_names_for_scoring_model`` table."""
    registry_row = pd.DataFrame({'table_names': [TABLE_NAME]})
    lib.save_table(
        'table_names_for_scoring_model',
        "//home/cloud_analytics/scoring_v2/data_tables",
        registry_row,
        append=True,
    )
    

def check_types_correspondence(df, column_types):
    """Validate every registered column of ``df`` with the checker of its type.

    Checker sources are read from the ``column_type_description`` table and
    executed; for a type ``<name>`` (the part of the type string before
    ``"__"``) the function ``<name>_checker(df, column)`` must return a truthy
    value.

    Args:
        df: DataFrame to validate.
        column_types: iterable of ``"column_name:type"`` strings.

    Raises:
        AssertionError: if the number of columns does not equal
            ``len(column_types) + 2``, if a type has no checker, or if a
            checker rejects a column.
    """
    req = """
    SELECT
        type,
        checker_function
    FROM "//home/cloud_analytics/scoring_v2/data_tables/column_type_description"
    FORMAT TabSeparatedWithNames
    """
    type_df = lib.execute_query(req)

    # SECURITY NOTE(review): the checker sources are executed verbatim, so
    # anyone able to write to column_type_description can run arbitrary code
    # in this kernel — confirm that table's write access is restricted.
    checker_functions = {}
    for func_str in type_df['checker_function']:
        exec(func_str.replace("\\n", '\n'), checker_functions)

    # +2: the table carries two columns without a registered type
    # (make_consumption_scoring_table adds scoring_date and billing_account_id).
    expected_column_count = len(column_types) + 2
    # The message is parenthesised so both string parts belong to the assert.
    assert len(df.columns) == expected_column_count, (
        'difference in number of columns in dataframe and in column_types, '
        f'{expected_column_count - len(df.columns)}'
    )

    for column_type in column_types:
        column, curr_type = column_type.split(":")
        curr_function_name = curr_type.split("__")[0]
        checker = checker_functions.get(curr_function_name + "_checker")
        assert checker is not None, f"no type {curr_function_name}"
        assert checker(df, column), \
            f'{curr_function_name} check failed for column {column}'
        

def save_all_results(df):
    """Validate ``df`` and publish it together with its metadata.

    Steps, in order: type checks on ``df``, saving ``df`` as ``TABLE_NAME``,
    appending the column types, and registering the table name.

    Args:
        df: feature DataFrame to validate and save.
    """
    tables_dir = "//home/cloud_analytics/scoring_v2/data_tables"
    check_types_correspondence(df, column_types)
    lib.save_table(TABLE_NAME, tables_dir, df)
    save_types(column_types)
    add_table_to_model_to_observe()

In [10]:
# Run the query and build the feature table (also fills column_types).
consumption_df = make_consumption_scoring_table()

In [11]:
# NOTE(review): save_all_results() performs this same save after running the
# type checks; this standalone call writes the table before any validation —
# confirm it is still needed.
lib.save_table(TABLE_NAME, "//home/cloud_analytics/scoring_v2/data_tables", consumption_df)

In [12]:
# Validate the table, save it, and append its column types and table name.
save_all_results(consumption_df)