In [9]:
%load_ext autoreload
%autoreload 2
import scipy.stats as sps
from sklearn.model_selection import train_test_split
import gc
from sklearn.linear_model import LinearRegression
from dateutil.parser import *
import pandas as pd
import numpy as np
import os
from collections import Counter
import json
import typing as tp
import my_library as lib
import ast
from datetime import datetime
import time

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
# Name under which this notebook's feature table is saved and registered.
TABLE_NAME = 'grant_table'

# Number of days added to the first trial consumption date to obtain the
# scoring date; also used in the training-group cut-off of the aggregation.
DAYS_TO_OBSERVE = 14

# Placeholder type label; not referenced by the cells visible in this notebook.
EMPTY_TYPE = 'undefined'

# Accumulates "column:type" declarations for the feature columns.
column_types = set()

In [11]:
def grants_preprod_maker():
    """Query per-grant preparation data, save it as a table and return it.

    The query joins monetary grants (rows with a non-empty ``id``) with
    billing accounts taken from the acquisition cube (``ba_created`` events
    with non-empty ``billing_account_id`` and ``cloud_id`` and a set first
    trial consumption datetime).  ``scoring_date`` is the first trial
    consumption date plus ``DAYS_TO_OBSERVE`` days, and only grants that
    start before ``scoring_date`` are kept.

    Returns whatever ``lib.execute_query`` returns for the query
    (presumably a pandas DataFrame -- confirm against ``my_library``).
    """
    output_name = "grants_prepare_information"
    output_dir = "//home/cloud_analytics/scoring_v2/data_tables"

    query = f"""
    SELECT
        billing_account_id,
        end_time,
        start_time,
        id as grant_id,
        initial_amount,
        scoring_date,
        type
    FROM (
        SELECT
            billing_account_id,
            toDate(end_time) as end_time,
            toDate(start_time) as start_time,
            source as type,
            id,
            CAST(initial_amount as Double) as initial_amount
        FROM "//home/cloud/billing/exported-billing-tables/monetary_grants_prod"
        WHERE id != ''
    ) as a
    INNER JOIN (
        SELECT
            billing_account_id,
            addDays(toDate(first_first_trial_consumption_datetime),
            {DAYS_TO_OBSERVE}) as scoring_date,
            first_first_trial_consumption_datetime
        FROM "//home/cloud_analytics/cubes/acquisition_cube/cube"
        WHERE event == 'ba_created'
        AND billing_account_id != ''
        AND cloud_id != ''
        AND first_first_trial_consumption_datetime != '0000-00-00 00:00:00'
        GROUP BY billing_account_id, first_first_trial_consumption_datetime
    ) as b
    ON a.billing_account_id == b.billing_account_id
    WHERE toDate(start_time) < toDate(scoring_date)
    FORMAT TabSeparatedWithNames
    """

    grants = lib.execute_query(query)
    lib.save_table(output_name, output_dir, grants)

    # Fixed 20-second pause after saving; presumably it gives the storage time
    # to expose the new table to the next query -- TODO confirm.
    time.sleep(20)
    return grants

# Run the preparation query. As a side effect the result is saved as the
# "grants_prepare_information" table, which the aggregation query below reads.
grants_preprod = grants_preprod_maker()

In [12]:
# Aggregate the prepared grants per (billing_account_id, scoring_date):
#   grants_number        - number of distinct grant ids
#   default_grant_amount - sum of initial_amount over grants of type 'default'
#   all_grants_amount    - sum of initial_amount over all grants
#   is_training_group    - 1 when MAX(end_time of 'default' grants, with
#                          '2100-01-01' substituted for other grants) is earlier
#                          than today minus DAYS_TO_OBSERVE days AND scoring_date
#                          is earlier than today minus 60 days; otherwise 0
# The 60-day threshold is hard-coded in the query text.
# NOTE(review): the MAX runs over every grant in the group and non-default
# grants contribute '2100-01-01', so it looks like any group containing at
# least one non-default grant gets is_training_group = 0 -- confirm intended.
req = f"""
SELECT
    billing_account_id,
    scoring_date,
    count(DISTINCT grant_id) as grants_number,
    SUM(if(type == 'default', initial_amount, 0)) as default_grant_amount,
    SUM(initial_amount) as all_grants_amount,
    if (toDate(MAX(if(type == 'default', end_time, '2100-01-01'))) < addDays(toDate(NOW()), -{DAYS_TO_OBSERVE}) 
        AND toDate(scoring_date) < addDays(toDate(NOW()), -60), 
        1, 0) as is_training_group
FROM "//home/cloud_analytics/scoring_v2/data_tables/grants_prepare_information"
GROUP BY billing_account_id, scoring_date
FORMAT TabSeparatedWithNames
"""
grant_df = lib.execute_query(req)

In [13]:
#lib.save_table(TABLE_NAME, "//home/cloud_analytics/scoring_v2/data_tables", grant_df)

In [14]:
# Declare, in "column:type" form, the type of every feature column of the
# aggregated grant table.
column_types.update((
    "grants_number:numeric",
    "default_grant_amount:numeric",
    "all_grants_amount:numeric",
    "is_training_group:binary",
))

In [15]:
def save_types(column_types):
    """Append the declared column types of this notebook's table to ``type_table``.

    Parameters
    ----------
    column_types : iterable of str
        Declarations of the form ``"column:type"``.

    Each declaration becomes one row ``(column_name, type, table_name)``,
    where ``table_name`` is the module-level ``TABLE_NAME``.  The rows are
    appended to ``type_table`` through ``lib.save_table``.  When
    ``column_types`` is empty there is nothing to record and no write is
    issued.

    Raises
    ------
    ValueError
        If a declaration does not split into exactly two parts on ``':'``.
    """
    rows = []
    for column_type in column_types:
        # Exactly one ':' is expected; anything else fails the unpacking.
        column, current_type = column_type.split(':')
        rows.append([column, current_type, TABLE_NAME])

    if not rows:
        # Building the frame through np.matrix used to fail here with a shape
        # mismatch; appending zero rows is a no-op, so skip the write.
        return

    # A list of rows is accepted by DataFrame directly; np.matrix is not
    # needed (and is no longer recommended by NumPy).
    type_df = pd.DataFrame(rows, columns=['column_name', 'type', 'table_name'])
    lib.save_table('type_table', "//home/cloud_analytics/scoring_v2/data_tables",
                   type_df, append=True)

def add_table_to_model_to_observe():
    """Append ``TABLE_NAME`` to the ``table_names_for_scoring_model`` table.

    Builds a one-row frame with a single ``table_names`` column holding the
    module-level ``TABLE_NAME`` and appends it through ``lib.save_table``.
    """
    registry_row = pd.DataFrame({'table_names': [TABLE_NAME]})
    lib.save_table(
        'table_names_for_scoring_model',
        "//home/cloud_analytics/scoring_v2/data_tables",
        registry_row,
        append=True,
    )
    

def check_types_correspondence(df, column_types):
    """Validate that ``df`` matches the declared ``column_types``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to validate.
    column_types : collection of str
        Declarations of the form ``"column:type"``.  Only the part of
        ``type`` before the first ``"__"`` selects the checker.

    The checker functions are loaded from the ``column_type_description``
    table: every ``checker_function`` value is Python source that is
    executed to define a function named ``<type>_checker(df, column)``.

    Raises
    ------
    AssertionError
        If the number of columns in ``df`` is not ``len(column_types) + 2``,
        if no checker exists for a declared type, or if a checker returns a
        falsy value for its column.
    """
    req = """
    SELECT
        type,
        checker_function
    FROM "//home/cloud_analytics/scoring_v2/data_tables/column_type_description"
    FORMAT TabSeparatedWithNames
    """
    type_df = lib.execute_query(req)

    # SECURITY NOTE: this executes source code stored in a data table. Anyone
    # able to write to column_type_description can run arbitrary code in this
    # kernel; keep write access to that table restricted.
    checker_functions = {}
    for func_str in type_df['checker_function']:
        # The stored source carries literal "\n" sequences instead of newlines.
        exec(func_str.replace("\\n", '\n'), checker_functions)

    # The frame is expected to hold two columns that are not declared in
    # column_types (in this notebook: billing_account_id and scoring_date).
    undeclared_columns = 2
    expected_columns = len(column_types) + undeclared_columns
    # Explicit raises instead of `assert` statements: asserts are removed when
    # Python runs with -O, which would silently disable the validation.
    if len(df.columns) != expected_columns:
        raise AssertionError(
            'difference in number of columns in dataframe and in column_types, '
            f'{expected_columns - len(df.columns)}'
        )

    for column_type in column_types:
        column, curr_type = column_type.split(":")
        curr_function_name = curr_type.split("__")[0]
        checker = checker_functions.get(curr_function_name + "_checker")
        if checker is None:
            raise AssertionError(f"no type {curr_function_name}")
        if not checker(df, column):
            raise AssertionError(
                f'{curr_function_name} check failed for column {column}'
            )
        

def save_all_results(df):
    """Validate ``df``, save it as ``TABLE_NAME`` and register its metadata.

    Steps, in order:
      1. check ``df`` against the module-level ``column_types``;
      2. save ``df`` under ``TABLE_NAME``;
      3. append the column type declarations to the type table;
      4. append ``TABLE_NAME`` to the list of tables used by the scoring model.
    """
    # Validation runs first, so a failed check raises before anything is written.
    check_types_correspondence(df, column_types)

    destination = "//home/cloud_analytics/scoring_v2/data_tables"
    lib.save_table(TABLE_NAME, destination, df)

    # Metadata is registered only after the table itself has been saved.
    save_types(column_types)
    add_table_to_model_to_observe()

In [16]:
# Validate grant_df against the declared column types, then save it and
# register its column types and table name.
save_all_results(grant_df)