In [10]:
%load_ext autoreload
%autoreload 2
import scipy.stats as sps
from sklearn.model_selection import train_test_split
import gc
from sklearn.linear_model import LinearRegression
from dateutil.parser import *
import pandas as pd
import numpy as np
import os
from collections import Counter
import json
import typing as tp
import my_library as lib
import ast
from datetime import datetime

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# --- Notebook configuration -------------------------------------------------

# Name under which the resulting table is saved and registered.
TABLE_NAME = 'target_table'

# Number of days added to the first trial consumption date to get scoring_date.
DAYS_TO_OBSERVE = 14

# Placeholder type label (not referenced in the cells visible here).
EMPTY_TYPE = 'undefined'

# Collected "<column>:<type>" declarations for TABLE_NAME.
column_types = set()

In [12]:
# Build the target table query. For every 'ba_created' event with a non-empty
# billing_account_id and a non-zero first trial consumption datetime it selects:
#   * billing_account_id
#   * scoring_date  - date of first trial consumption plus DAYS_TO_OBSERVE days
#   * paid_target   - 1 if first_first_paid_consumption_datetime differs from the
#                     '0000-00-00 00:00:00' sentinel, otherwise 0
#   * call_target   - taken from the joined CRM sub-query, which emits 1 for
#                     billing accounts that have a call with status 'Held'
# NOTE(review): accounts without a matching CRM row presumably get the join's
# default value for call_target (0 vs NULL depends on server settings) - confirm.
req = f"""
SELECT
    billing_account_id,
    addDays(toDate(first_first_trial_consumption_datetime),
    {DAYS_TO_OBSERVE}) as scoring_date,
    if(
            first_first_paid_consumption_datetime != '0000-00-00 00:00:00',
            1,
            0
        ) as paid_target,
    call_target as call_target
FROM "//home/cloud_analytics/cubes/acquisition_cube/cube" as a
ANY LEFT JOIN (
    SELECT
        DISTINCT
        billing_account_id,
        1 as call_target
    FROM "//home/cloud_analytics/lunin-dv/crm/crm_call_infromation_for_billings"
    WHERE 
        status == 'Held'
) as b
ON a.billing_account_id == b.billing_account_id
WHERE event == 'ba_created'
AND billing_account_id != ''
AND first_first_trial_consumption_datetime != '0000-00-00 00:00:00'
FORMAT TabSeparatedWithNames
"""
# Run the query through the project helper; the result is kept as target_df.
target_df = lib.execute_query(req)

In [13]:
#lib.save_table(TABLE_NAME, "//home/cloud_analytics/scoring_v2/data_tables", target_df)

In [7]:
# Declare both target columns as binary targets ("<column>:<type>" format).
column_types.update({
    'paid_target:binary_target',
    'call_target:binary_target',
})

In [8]:
def save_types(column_types):
    """Append the column type declarations of TABLE_NAME to ``type_table``.

    Each entry of ``column_types`` is split on ``':'`` into a column name and
    a type; one row ``(column_name, type, TABLE_NAME)`` is written per entry.

    Parameters
    ----------
    column_types : iterable of str
        Declarations of the form ``'<column_name>:<type>'``.

    Raises
    ------
    ValueError
        If an entry does not contain exactly one ``':'`` (tuple unpacking of
        the split result fails).
    """
    rows = []
    for column_type in column_types:
        column, current_type = column_type.split(':')
        rows.append([column, current_type, TABLE_NAME])

    # Nothing declared -> nothing to append. (The previous np.matrix-based
    # construction crashed here with a shape mismatch.)
    if not rows:
        return

    # Build the frame straight from the row list; going through np.matrix is
    # unnecessary and the matrix class is discouraged by NumPy.
    type_df = pd.DataFrame(rows, columns=['column_name', 'type', 'table_name'])
    lib.save_table('type_table', "//home/cloud_analytics/scoring_v2/data_tables",
                   type_df, append=True)

def add_table_to_model_to_observe():
    """Append TABLE_NAME to the ``table_names_for_scoring_model`` table.

    A one-row frame with the single column ``table_names`` is built and handed
    to ``lib.save_table`` in append mode.
    """
    registry_row = pd.DataFrame({'table_names': [TABLE_NAME]})
    lib.save_table(
        'table_names_for_scoring_model',
        "//home/cloud_analytics/scoring_v2/data_tables",
        registry_row,
        append=True,
    )
    

def check_types_correspondence(df, column_types):
    """Verify that ``df`` matches the declared ``column_types``.

    Checker functions are read as source text from the
    ``column_type_description`` table and executed. For a declaration
    ``'<column>:<type>'`` the checker name is the part of ``<type>`` before
    the first ``'__'`` with ``'_checker'`` appended; it is called as
    ``checker(df, column)`` and must return a truthy value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to validate.
    column_types : collection of str
        Declarations of the form ``'<column_name>:<type>'``.

    Raises
    ------
    AssertionError
        If the number of columns does not equal ``len(column_types) + 2``,
        if a declared type has no checker, or if a checker rejects a column.
    """
    query = """
    SELECT
        type,
        checker_function
    FROM "//home/cloud_analytics/scoring_v2/data_tables/column_type_description"
    FORMAT TabSeparatedWithNames
    """
    type_df = lib.execute_query(query)

    # NOTE(review): this executes code stored in the description table; it is
    # only safe as long as that table can be written by trusted users only.
    checker_functions = {}
    for func_str in type_df['checker_function']:
        # Stored sources carry literal "\n" sequences instead of real newlines.
        exec(func_str.replace("\\n", '\n'), checker_functions)

    # The frame is expected to have two columns without a type declaration
    # (presumably the key and date columns - TODO confirm).
    expected_columns = len(column_types) + 2
    # Explicit raises (instead of `assert`) keep the checks active under
    # `python -O`; the whole message, including the difference, is reported.
    if len(df.columns) != expected_columns:
        raise AssertionError(
            'difference in number of columns in dataframe and in column_types, '
            f'{expected_columns - len(df.columns)}'
        )

    for column_type in column_types:
        column, curr_type = column_type.split(":")
        curr_function_name = curr_type.split("__")[0]
        checker = checker_functions.get(curr_function_name + "_checker")
        if checker is None:
            raise AssertionError(f"no type {curr_function_name}")
        if not checker(df, column):
            raise AssertionError(
                f'{curr_function_name} check failed for column {column}'
            )
        

def save_all_results(df):
    """Validate ``df`` and persist it together with its metadata.

    Steps, in order:
      1. check ``df`` against the module-level ``column_types``;
      2. save ``df`` under TABLE_NAME;
      3. append the column type declarations;
      4. append TABLE_NAME to the table-name list.

    A failure in any step aborts the remaining ones.
    """
    destination_dir = "//home/cloud_analytics/scoring_v2/data_tables"

    # Validation goes first so that nothing is written for an invalid frame.
    check_types_correspondence(df, column_types)

    lib.save_table(TABLE_NAME, destination_dir, df)
    save_types(column_types)
    add_table_to_model_to_observe()

In [9]:
# Validate target_df against the declared column types, then save the table,
# its type declarations and its entry in the table-name list.
save_all_results(target_df)