In [1]:
%load_ext autoreload
%autoreload 2
import scipy.stats as sps
from sklearn.model_selection import train_test_split
import gc
from sklearn.linear_model import LinearRegression
from dateutil.parser import *
import pandas as pd
import numpy as np
import os
from collections import Counter
import json
import typing as tp
import my_library as lib
from datetime import datetime

  import pandas.util.testing as tm


In [2]:
# Presumably the observation window in days. The scoring_date offset built in
# get_main_condition_request is 14 days as well; keep the two in agreement.
DAYS_TO_OBSERVE = 14
# NOTE(review): not referenced in the cells shown here; confirm whether it is
# still needed before removing it.
EMPTY_TYPE = 'undefined'
# Registry of "column_name:type" strings. Every *_req() builder adds its output
# columns here as a side effect; save_all_results() validates against it and
# persists it.
column_types = set()
# Name under which the feature table is saved and recorded in the metadata tables.
TABLE_NAME = 'vm_info_table'

In [15]:
def get_main_condition_request(days_to_observe=None):
    """Build the JOIN / WHERE / GROUP BY tail of the VM feature query.

    The fragment joins the queried table to a subquery over the acquisition
    cube ('ba_created' events with a non-empty billing account and a real
    first-trial-consumption timestamp), derives ``scoring_date`` as the date of
    first trial consumption plus ``days_to_observe`` days, keeps rows whose
    ``vm_finish`` date is before ``scoring_date - 1 day`` and whose ``vm_start``
    date is after the first trial consumption date, and groups by billing
    account and scoring date.

    Args:
        days_to_observe: Offset in days added to the first trial consumption
            date to obtain ``scoring_date``. Defaults to the module-level
            ``DAYS_TO_OBSERVE`` so the constant and the query cannot drift apart.

    Returns:
        str: SQL text to append after the FROM clause.

    Raises:
        ValueError / TypeError: if ``days_to_observe`` cannot be converted to int.
    """
    if days_to_observe is None:
        days_to_observe = DAYS_TO_OBSERVE
    # The value is interpolated into SQL text, so force it to a plain integer.
    days_to_observe = int(days_to_observe)

    request = f"""
    INNER JOIN (
        SELECT
            billing_account_id,
            addDays(toDate(first_first_trial_consumption_datetime), {days_to_observe}) as scoring_date,
            first_first_trial_consumption_datetime
        FROM "//home/cloud_analytics/cubes/acquisition_cube/cube"
        WHERE event == 'ba_created'
        AND first_first_trial_consumption_datetime != '0000-00-00 00:00:00'
        AND billing_account_id != ''
    ) as b
    ON billing_account_id == b.billing_account_id
    WHERE toDate(vm_finish) < addDays(toDate(scoring_date), -1)
    AND toDate(vm_start) > toDate(first_first_trial_consumption_datetime)
    GROUP BY billing_account_id, scoring_date
    """
    return request

In [4]:
def core_and_preemtible_req():
    """Return the SELECT fragment for core-count, core-fraction and preemptible features.

    Side effect: registers every output column of the fragment in the
    module-level ``column_types`` set as ``"<column>:<type>"``.

    Returns:
        str: comma-separated SELECT expressions (no trailing comma).
    """
    outputs_by_type = (
        ("numeric", (
            "min_cores_real", "avg_cores_real", "max_cores_real",
            "min_num_of_cores", "avg_num_of_cores", "max_num_of_cores",
            "diff_num_of_cores", "different_core_number",
            "num_of_preemtibles",
        )),
        ("binary", (
            "different_core_more_than_one",
            "is_5_pct_used", "is_100_pct_used", "is_middle_pct_used",
            "is_preemtible",
        )),
    )
    for type_name, output_names in outputs_by_type:
        for output_name in output_names:
            column_types.add(f"{output_name}:{type_name}")

    # Later expressions reuse aliases defined earlier in the same SELECT
    # (max_num_of_cores, min_num_of_cores, different_core_number).
    return """
    min(vm_cores_real) as min_cores_real,
    avg(vm_cores_real) as avg_cores_real,
    max(vm_cores_real) as max_cores_real,
    
    min(vm_cores) as min_num_of_cores,
    avg(vm_cores) as avg_num_of_cores,
    max(vm_cores) as max_num_of_cores,
    
    if(min(vm_core_fraction) == 5, 1, 0) as is_5_pct_used,
    if(max(vm_core_fraction) == 100, 1, 0) as is_100_pct_used,
    if(max(vm_core_fraction) != 100 and 
       max(vm_core_fraction) != 5, 1, 0) as is_middle_pct_used,
    
    max_num_of_cores - min_num_of_cores as diff_num_of_cores,
    length(groupUniqArray(vm_cores)) as different_core_number,
    if (different_core_number > 1, 1, 0) as different_core_more_than_one,
    
    max(vm_preemptible) as is_preemtible,
    length(groupUniqArray(vm_preemptible)) as num_of_preemtibles
    
    """

In [5]:
def vm_num_and_az_num_req():
    """Return the SELECT fragment counting distinct VMs and availability zones.

    Side effect: registers the four output columns in the module-level
    ``column_types`` set as ``"<column>:<type>"``.

    Returns:
        str: comma-separated SELECT expressions (no trailing comma).
    """
    column_types.add("num_of_vm:numeric")
    column_types.add("more_than_one_vm:binary")

    column_types.add("az_num:numeric")
    column_types.add("more_than_one_az:binary")
    # more_than_one_az must be derived from az_num; it previously repeated the
    # num_of_vm condition, which made it a duplicate of more_than_one_vm.
    part_req = """
    length(groupUniqArray(vm_id)) as num_of_vm,
    if (num_of_vm > 1, 1, 0) as more_than_one_vm,
    
    count(DISTINCT node_az) as az_num,
    if (az_num > 1, 1, 0) as more_than_one_az
    """
    return part_req

In [6]:
def cpu_req():
    """Return the SELECT fragment aggregating ``vm_cpu_load_avg_avg``.

    Produces max / min / avg / median / population standard deviation of the
    column. Side effect: registers each output column as numeric in the
    module-level ``column_types`` set.

    Returns:
        str: comma-separated SELECT expressions (no trailing comma).
    """
    source_column = "vm_cpu_load_avg_avg"
    # (aggregate function, prefix of the output alias)
    aggregates = (
        ("max", "max"),
        ("min", "min"),
        ("avg", "avg"),
        ("median", "median"),
        ("stddevPop", "std"),
    )

    select_items = []
    for func_name, alias_prefix in aggregates:
        alias = f"{alias_prefix}_cpu_load_avg_avg"
        column_types.add(f"{alias}:numeric")
        select_items.append(f"    {func_name}({source_column}) as {alias}")

    # Leading newline and trailing indented blank line keep the fragment laid
    # out like the other builders' triple-quoted blocks.
    return "\n" + ",\n".join(select_items) + "\n    "

In [7]:
def memory_req():
    """Return the SELECT fragment with memory-related aggregates.

    Covers the maximum of ``vm_memory_real``, min/max/avg of
    ``vm_memory_real / vm_memory`` scaled by 100, min/avg/max/stddevPop of
    ``vm_memory`` and the average of ``vm_memory_to_cores_ratio``.
    Side effect: registers each output column as numeric in the module-level
    ``column_types`` set.

    Returns:
        str: comma-separated SELECT expressions (no trailing comma).
    """
    numeric_outputs = (
        "max_vm_memory_real",
        "min_used_vm_memory_pct",
        "max_used_vm_memory_pct",
        "avg_used_vm_memory_pct",
        "min_vm_memory",
        "avg_vm_memory",
        "max_vm_memory",
        "std_vm_memory",
        "vm_memory_to_cores_ratio",
    )
    for output_name in numeric_outputs:
        column_types.add(output_name + ":numeric")

    return """
    max(vm_memory_real) as max_vm_memory_real,
    
    min(vm_memory_real / vm_memory) * 100 as min_used_vm_memory_pct,
    max(vm_memory_real / vm_memory) * 100 as max_used_vm_memory_pct,
    avg(vm_memory_real / vm_memory) * 100 as avg_used_vm_memory_pct,
    
    min(vm_memory) as min_vm_memory,
    avg(vm_memory) as avg_vm_memory,
    max(vm_memory) as max_vm_memory,
    stddevPop(vm_memory) as std_vm_memory,
    
    avg(vm_memory_to_cores_ratio) as vm_memory_to_cores_ratio
    """

In [8]:
def vm_product_req():
    """Return the SELECT fragment collecting the distinct VM product names.

    Side effect: registers ``vm_product_name`` with type ``json__10`` in the
    module-level ``column_types`` set.
    NOTE(review): the meaning of the ``__10`` suffix is not visible here —
    confirm against the type-description table.

    Returns:
        str: a single SELECT expression (no trailing comma).
    """
    product_column = "vm_product_name"
    column_types.add(product_column + ":json__10")

    return f"""
    groupUniqArray({product_column}) as {product_column}
    """

In [9]:
# Build every SELECT fragment in a fixed order. Calling a builder also registers
# its output columns in column_types, so this cell has to run before the
# results are validated and saved.
request_texts_array = [
    build_fragment()
    for build_fragment in (
        core_and_preemtible_req,
        vm_num_and_az_num_req,
        cpu_req,
        memory_req,
        vm_product_req,
    )
]

In [10]:
def make_vm_information_scoring_table(request_texts_array):
    """Assemble the VM feature query, run it and return the result.

    Args:
        request_texts_array: iterable of SELECT fragments (strings); they are
            joined with ", " and placed in front of the key columns.

    Returns:
        Whatever ``lib.execute_query`` returns for the query (indexed like a
        DataFrame here), with the ``vm_product_name`` column cast to ``str``.
    """
    select_list = ", ".join(request_texts_array)
    tail_clauses = get_main_condition_request()

    query_text = f"""
SELECT
    {select_list},
    ba_id as billing_account_id,
    scoring_date
FROM "//home/cloud_analytics/compute_logs/vm_cube/vm_cube" as a
{tail_clauses}
FORMAT TabSeparatedWithNames
        """

    # The query is handed over as UTF-8 bytes.
    vm_features = lib.execute_query(query_text.encode('utf-8'))
    # groupUniqArray yields an array-like value; store it as plain text.
    vm_features['vm_product_name'] = vm_features['vm_product_name'].astype(str)
    return vm_features

In [11]:
def save_types(column_types):
    """Append the declared column types of this table to 'type_table'.

    Each ``"<column>:<type>"`` entry becomes one row
    ``(column_name, type, table_name)`` with ``table_name`` set to
    ``TABLE_NAME``.

    Args:
        column_types: iterable of ``"<column>:<type>"`` strings.

    Raises:
        ValueError: if an entry does not contain exactly one ':'.
    """
    rows = []
    # Sort so the saved rows do not depend on set iteration order.
    for column_type in sorted(column_types):
        column, current_type = column_type.split(':')
        rows.append([column, current_type, TABLE_NAME])
    # Build the frame straight from the row list: np.matrix is a discouraged
    # class and yields a (1, 0) matrix for an empty list, which cannot be
    # mapped onto three columns.
    type_df = pd.DataFrame(rows, columns=['column_name', 'type', 'table_name'])
    lib.save_table('type_table', "//home/cloud_analytics/scoring_v2/data_tables",
                   type_df, append=True)


def add_table_to_model_to_observe():
    """Append a one-row frame holding TABLE_NAME to 'table_names_for_scoring_model'."""
    tables_dir = "//home/cloud_analytics/scoring_v2/data_tables"
    single_row = pd.DataFrame({'table_names': [TABLE_NAME]})
    lib.save_table('table_names_for_scoring_model', tables_dir, single_row, append=True)
    

def check_types_correspondence(df, column_types):
    """Check that ``df`` matches the declared column types.

    Loads checker functions from the ``column_type_description`` table, checks
    that ``df`` has exactly one column per declared type plus the two key
    columns selected alongside the features (``billing_account_id`` and
    ``scoring_date``), then runs the matching ``<type>_checker(df, column)``
    for every declared column.

    Args:
        df: frame to validate.
        column_types: iterable of ``"<column>:<type>"`` strings; a type may
            carry a ``__<suffix>``, only the part before ``__`` selects the
            checker.

    Raises:
        AssertionError: on a column-count mismatch, an unknown type, or a
            checker that returns a falsy value.
    """
    req = """
    SELECT
        type,
        checker_function
    FROM "//home/cloud_analytics/scoring_v2/data_tables/column_type_description"
    FORMAT TabSeparatedWithNames
    """
    type_df = lib.execute_query(req)

    # SECURITY NOTE(review): this executes Python source stored in a data
    # table. Anyone able to write to that table can run arbitrary code in this
    # kernel; confirm the table is access-controlled.
    checker_functions = {}
    for func_str in type_df['checker_function']:
        # Stored sources carry literal "\n" sequences instead of newlines.
        exec(func_str.replace("\\n", '\n'), checker_functions)

    # Two extra columns: the billing account id and the scoring date.
    expected_columns = len(column_types) + 2
    # Parentheses keep both literals in one message; with a backslash
    # continuation the f-string was a separate no-op statement and the
    # difference never reached the message.
    assert len(df.columns) == expected_columns, (
        'difference in number of columns in dataframe and in column_types, '
        f'{expected_columns - len(df.columns)}'
    )

    for column_type in column_types:
        column, curr_type = column_type.split(":")
        curr_function_name = curr_type.split("__")[0]
        checker = checker_functions.get(curr_function_name + "_checker")
        assert checker is not None, f"no type {curr_function_name}"
        assert checker(df, column), \
            f'{curr_function_name} check failed for column {column}'
        

def save_all_results(df):
    """Validate ``df`` against the registered column types, then persist everything.

    Steps, in order: type check, save the feature table under TABLE_NAME,
    append the column types, append the table name to the scoring-model list.

    Args:
        df: feature frame to validate and save.
    """
    tables_dir = "//home/cloud_analytics/scoring_v2/data_tables"

    # Validation runs first, so a failed check raises before anything is written.
    check_types_correspondence(df, column_types)

    lib.save_table(TABLE_NAME, tables_dir, df)
    save_types(column_types)
    add_table_to_model_to_observe()

In [12]:
# Run the assembled query and keep the resulting per-account VM feature frame.
vm_df = make_vm_information_scoring_table(request_texts_array)

In [13]:
# Validate the frame against the registered column types, then save the table
# and its metadata. The metadata tables are written with append=True.
save_all_results(vm_df)