In [1]:
%load_ext autoreload
%autoreload 2
import scipy.stats as sps
from sklearn.model_selection import train_test_split
import gc
from sklearn.linear_model import LinearRegression
from dateutil.parser import *
import pandas as pd
import numpy as np
import os
from collections import Counter
import json
import typing as tp
from sklearn.preprocessing import normalize
import my_library as lib
from datetime import datetime

  import pandas.util.testing as tm


In [2]:
# Length of the observation window in days: the SQL query adds this many days
# to the earliest first-trial-consumption date to obtain the scoring date.
DAYS_TO_OBSERVE = 14
# NOTE(review): not referenced in any visible cell — presumably a placeholder
# label for missing categorical values; confirm before removing.
EMPTY_TYPE = 'undefined'
# Registry of "column_name:type" strings. Filled by make_support_scoring_table()
# and read by the checking/saving helpers below.
column_types = set()
# Name under which the resulting table is saved and registered.
TABLE_NAME = 'support_table'

In [3]:
def make_support_scoring_table():
    """Build the support-ticket feature table, one row per billing account.

    The query joins support tickets to billing accounts by ``cloud_id`` and
    keeps only tickets created strictly before the account's scoring date
    (earliest first-trial-consumption date plus ``DAYS_TO_OBSERVE`` days).
    Because of the inner ``SELECT DISTINCT``, the raw count is the number of
    distinct (cloud_id, ticket creation date) pairs, not the number of tickets.

    Side effects:
        Adds the two feature descriptions to the module-level ``column_types``
        set.

    Returns:
        DataFrame with columns ``billing_account_id``, ``scoring_date``,
        ``number_of_support_call`` (``'1'`` or ``'many'``) and
        ``one_ore_more_support_call`` (always 1: accounts without a matching
        ticket are dropped by the INNER JOIN and never appear here).

    Raises:
        AssertionError: if the resulting columns differ from the registered
            feature columns plus the two key columns. The message lists both
            unexpected and missing columns.
    """
    req = f"""
SELECT
    billing_account_id,
    count(*) as number_of_support_call,
    scoring_date
FROM (
    SELECT
        DISTINCT 
        cloud_id,
        billing_account_id,
        toDate(created_at) as created_time,
        scoring_date
    FROM "//home/cloud/billing/exported-support-tables/tickets_prod" as a
    INNER JOIN (
        SELECT
            cloud_id,
            billing_account_id,
            addDays(toDate(MIN(first_first_trial_consumption_datetime)),
            {DAYS_TO_OBSERVE}) as scoring_date
        FROM "//home/cloud_analytics/cubes/acquisition_cube/cube"
        WHERE event == 'ba_created'
        AND billing_account_id != ''
        AND cloud_id != ''
        AND first_first_trial_consumption_datetime != '0000-00-00 00:00:00'
        GROUP BY cloud_id, billing_account_id 
    ) as b
    ON a.cloud_id == b.cloud_id
    WHERE created_time < scoring_date
)
GROUP BY billing_account_id, scoring_date
FORMAT TabSeparatedWithNames
    """
    df = lib.execute_query(req)

    # Collapse the raw count into two categories: exactly one vs. more than one.
    df['number_of_support_call'] = df['number_of_support_call'].apply(
        lambda x: '1' if x == 1 else 'many')
    # A scalar broadcasts to every row; no need to build a list of ones.
    df['one_ore_more_support_call'] = 1

    column_types.add('one_ore_more_support_call:binary')
    column_types.add('number_of_support_call:category')

    # Every registered feature column plus the two key columns must be present,
    # and nothing else.
    expected_columns = {x.split(':')[0] for x in column_types}
    expected_columns.update(['scoring_date', 'billing_account_id'])
    actual_columns = set(df.columns)
    if actual_columns != expected_columns:
        unexpected = sorted(actual_columns - expected_columns, key=str)
        missing = sorted(expected_columns - actual_columns, key=str)
        raise AssertionError(
            f"columns not matched: unexpected={unexpected}, missing={missing}")
    return df

In [4]:
def save_types(column_types):
    """Append one (column_name, type, table_name) row per feature to 'type_table'.

    Args:
        column_types: iterable of ``"column_name:type"`` strings. Each entry
            must contain exactly one ``':'``.

    Raises:
        ValueError: if an entry does not split into exactly two parts on ':'.

    Entries are written in sorted order so that repeated runs append the rows
    in the same order regardless of set iteration order. An empty input yields
    an empty three-column frame, which is still passed to ``lib.save_table``.
    """
    rows = []
    for column_type in sorted(column_types):
        column, current_type = column_type.split(':')
        rows.append((column, current_type, TABLE_NAME))
    # Build the frame directly from the row tuples: np.matrix is no longer
    # recommended by NumPy and produces a (1, 0) shape for empty input, which
    # cannot be combined with three column names.
    type_df = pd.DataFrame(rows, columns=['column_name', 'type', 'table_name'])
    lib.save_table('type_table', "//home/cloud_analytics/scoring_v2/data_tables",
                   type_df, append=True)


def add_table_to_model_to_observe():
    """Append TABLE_NAME as a single row to 'table_names_for_scoring_model'."""
    destination_dir = "//home/cloud_analytics/scoring_v2/data_tables"
    # One-row frame with a single 'table_names' column.
    registry_row = pd.DataFrame({'table_names': [TABLE_NAME]})
    lib.save_table('table_names_for_scoring_model', destination_dir,
                   registry_row, append=True)
    

def check_types_correspondence(df, column_types):
    """Validate every registered feature column with its type checker.

    Checker source code is loaded from the ``column_type_description`` table;
    each snippet is expected to define a function named ``<type>_checker`` that
    is called as ``checker(df, column)`` and returns a truthy value on success.

    Args:
        df: feature table; must contain exactly ``len(column_types) + 2``
            columns (the features plus two key columns).
        column_types: iterable of ``"column_name:type"`` strings. Only the part
            of the type before the first ``"__"`` selects the checker.

    Raises:
        AssertionError: if the column count does not match, if no checker
            exists for a type, or if a checker rejects a column.
    """
    req = """
    SELECT
        type,
        checker_function
    FROM "//home/cloud_analytics/scoring_v2/data_tables/column_type_description"
    FORMAT TabSeparatedWithNames
    """
    type_df = lib.execute_query(req)

    # SECURITY NOTE(review): this executes Python source stored in a data
    # table. Anyone who can write to that table can run arbitrary code here;
    # make sure write access to it is restricted.
    checker_functions = {}
    for func_str in type_df['checker_function']:
        # The stored source carries literal "\n" sequences instead of newlines.
        exec(func_str.replace("\\n", '\n'), checker_functions)

    # Explicit raises (rather than `assert`) keep the validation active under
    # `python -O`, and the message now really includes the difference: before,
    # the f-string sat on its own line and was never part of the message.
    expected_column_count = len(column_types) + 2
    if len(df.columns) != expected_column_count:
        raise AssertionError(
            'difference in number of columns in dataframe and in column_types, '
            f'{expected_column_count - len(df.columns)}')

    for column_type in column_types:
        column, curr_type = column_type.split(":")
        curr_function_name = curr_type.split("__")[0]
        checker = checker_functions.get(curr_function_name + "_checker")
        if checker is None:
            raise AssertionError(f"no type {curr_function_name}")
        if not checker(df, column):
            raise AssertionError(
                f'{curr_function_name} check failed for column {column}')
        

def save_all_results(df):
    """Validate ``df`` and persist it together with its metadata.

    Runs, in order: the type check against the module-level ``column_types``,
    the save of ``df`` under ``TABLE_NAME``, the save of the type rows, and the
    registration of the table name. A failing check raises before anything is
    written.
    """
    output_dir = "//home/cloud_analytics/scoring_v2/data_tables"

    # Validation first, so nothing is saved for a table that fails the checks.
    check_types_correspondence(df, column_types)

    lib.save_table(TABLE_NAME, output_dir, df)
    save_types(column_types)
    add_table_to_model_to_observe()

In [5]:
# Run the query and build the support feature table. The call also registers
# the feature column types in the module-level `column_types` set.
support_df = make_support_scoring_table()

In [6]:
# Check the column types, then write the table, its type rows and the
# table-name registry entry.
save_all_results(support_df)