In [1]:
%load_ext autoreload
%autoreload 2
import ast
import scipy.stats as sps
from sklearn.model_selection import train_test_split
import gc
from sklearn.linear_model import LinearRegression
from dateutil.parser import *
import pandas as pd
import numpy as np
import os
from collections import Counter
import json
import typing as tp
from sklearn.preprocessing import normalize
import my_library as lib
from datetime import datetime
from collections import Counter
from catboost import CatBoostClassifier
import time
import math

In [2]:
# Scoring date being validated by this notebook: the single value in the first
# row / first column of the model_scoring_date table.
CURRENT_DATE = lib.execute_query("""
SELECT
    current_predicting_scoring_date
FROM "//home/cloud_analytics/scoring_v2/helping_folder_for_model/model_scoring_date"
FORMAT TabSeparatedWithNames
""").iat[0, 0]

In [3]:
# Full history of previously recorded alert rows.
alert_table = lib.execute_query("""
SELECT
    *
FROM "//home/cloud_analytics/scoring_v2/alerts/alert_table"
FORMAT TabSeparatedWithNames
""")

# Rows whose recorded status is 'OK'; the baseline date is taken from the
# SECOND such row.
# NOTE(review): this depends on the row order returned by the query (the table
# is written back sorted by scoring_date descending later in this notebook) --
# confirm the order survives the round trip and that skipping the first OK row
# is intended.
no_alert_table = alert_table.loc[alert_table['problems'] == 'OK']
COMPARE_DATE = no_alert_table['scoring_date'].iloc[1]

In [4]:
# Undo the escaping of stored descriptions in one pass per value: first turn the
# two-character sequence backslash + "n" into a real newline, then drop every
# remaining backslash.
alert_table['description'] = alert_table['description'].apply(
    lambda text: text.replace("\\n", '\n').replace("\\", '')
)

In [5]:
def columns_feature_checker():
    """Compare the stored 'important_columns_500' lists of two scoring dates.

    Loads the important-columns table, parses every non-date column from its
    string form, and measures how many entries of the COMPARE_DATE list are
    absent from the CURRENT_DATE list, as a percentage of 500.

    Returns:
        tuple: (alert_description, has_problems) where ``alert_description`` is
        a one-line report followed by a separator line and ``has_problems`` is
        1 when the difference exceeds 30 percent, otherwise 0.

    Raises:
        IndexError: if the table has no row for COMPARE_DATE or CURRENT_DATE.
    """
    column_table = lib.execute_query("""
    SELECT
        *
    FROM "//home/cloud_analytics/scoring_v2/feature_importance_columns/important_columns"
    FORMAT TabSeparatedWithNames
    """)

    # Every column except the date is stored as a string; strip backslashes and
    # evaluate it as a Python literal.
    for name in column_table.columns:
        if name == 'scoring_date':
            continue
        column_table[name] = column_table[name].apply(
            lambda raw: ast.literal_eval(raw.replace("\\", ""))
        )

    baseline_row = column_table[column_table['scoring_date'] == COMPARE_DATE].iloc[0].to_dict()
    current_row = column_table[column_table['scoring_date'] == CURRENT_DATE].iloc[0].to_dict()

    # Columns that were important at the baseline date but are not any more.
    dropped_columns = set(baseline_row['important_columns_500']).difference(
        set(current_row['important_columns_500'])
    )
    diff_500_pct = len(dropped_columns) / 500 * 100

    has_problems = int(diff_500_pct > 30)
    status = "ALERT" if has_problems else 'OK'
    alert_description = (
        f"500 columns feature pct difference: {diff_500_pct}% - {status};"
        "\n=================\n"
    )
    return alert_description, has_problems

In [6]:
def array_len(string_array):
    """Count the elements of an array that has been serialized to a string.

    Elements may be separated by any mix of whitespace and commas, and the
    whole array may be wrapped in square brackets. Brackets and commas are
    treated as delimiters, so a padded closing bracket (``"[2.25 1.5 ]"``) or a
    comma-only separator (``"[0.1,0.2]"``) does not distort the count.

    Args:
        string_array: string representation of a flat array.

    Returns:
        int: number of elements; 0 for an empty string or an empty array.
    """
    # Turn the structural characters into spaces, then count what is left.
    without_delimiters = string_array.translate(str.maketrans("[],", "   "))
    return len(without_delimiters.split())

In [7]:
def check_last_users_metrics(metric, last_users_now_dict, min_dict, confidence_inerval_dict):
    """Run two lower-bound checks on one metric of the last-users validation.

    Check 1: the metric must not be below the fixed minimum ``min_dict[metric]``.
    Check 2: the metric must not be below ``mean - 3 * std``, where mean and std
    are read from ``confidence_inerval_dict`` under ``<metric>_mean`` and
    ``<metric>_std``.

    Args:
        metric: metric name used as a key (and key prefix) in the dicts.
        last_users_now_dict: mapping holding the current value under ``metric``.
        min_dict: mapping holding the fixed minimum under ``metric``.
        confidence_inerval_dict: mapping holding ``<metric>_mean`` / ``<metric>_std``.

    Returns:
        tuple: (two-line text report, 1 if either check failed else 0).
    """
    lower_bound = confidence_inerval_dict[metric + '_mean'] - 3 * confidence_inerval_dict[metric + '_std']
    observed = last_users_now_dict[metric]
    threshold = min_dict[metric]

    def _label(flag):
        return "ALERT" if flag else 'OK'

    # Check 1: fixed minimum.
    below_threshold = int(observed < threshold)
    threshold_line = f"last users {metric}: {observed} > {threshold} - {_label(below_threshold)};\n"

    # Check 2: three-sigma lower bound.
    below_interval = int(observed < lower_bound)
    interval_line = (
        f"last users {metric}: {observed}"
        f" in 99.7% conf interval (min: {round(lower_bound, 3)})"
        f" - {_label(below_interval)};\n"
    )

    return threshold_line + interval_line, max(below_interval, below_threshold)

In [8]:
def check_cross_val_mean_metrics(metric, confidence_inerval_dict_now, confidence_inerval_dict_before, min_dict):
    """Run two lower-bound checks on the current cross-validation mean of a metric.

    Check 1: the current ``<metric>_mean`` must not be below ``min_dict[metric]``.
    Check 2: it must not be below ``mean - 3 * std / sqrt(n)`` computed from the
    baseline dict, where ``n`` is ``array_len`` of the baseline ``<metric>_array``
    string.

    Args:
        metric: metric name used as a key prefix.
        confidence_inerval_dict_now: mapping with the current ``<metric>_mean``.
        confidence_inerval_dict_before: baseline mapping with ``<metric>_mean``,
            ``<metric>_std`` and ``<metric>_array``.
        min_dict: mapping holding the fixed minimum under ``metric``.

    Returns:
        tuple: (two-line text report, 1 if either check failed else 0).
    """
    mean_key = metric + "_mean"
    sample_count = array_len(confidence_inerval_dict_before[metric + "_array"])
    baseline_mean = confidence_inerval_dict_before[mean_key]
    margin = 3 * confidence_inerval_dict_before[metric + "_std"] / np.sqrt(sample_count)
    lower_bound = baseline_mean - margin

    current_mean = confidence_inerval_dict_now[mean_key]
    threshold = min_dict[metric]

    def _label(flag):
        return "ALERT" if flag else 'OK'

    # Check 1: fixed minimum.
    below_threshold = int(current_mean < threshold)
    threshold_line = f"cross val mean {metric}: {current_mean} > {threshold} - {_label(below_threshold)};\n"

    # Check 2: lower bound derived from the baseline run.
    below_interval = int(current_mean < lower_bound)
    interval_line = (
        f"cross val mean {metric}: {current_mean} in 99.7% conf interval"
        f" (min: {round(lower_bound, 3)})"
        f" - {_label(below_interval)};\n"
    )

    return threshold_line + interval_line, max(below_interval, below_threshold)

In [9]:
def check_metrics(target_type, last_users_min_dict, cross_val_min_dict):
    """Run all precision/recall checks for one target and build its report.

    Loads the ``<target_type>_cross_validation_results`` and
    ``<target_type>_last_users_validation_results`` tables from the
    ``<target_type>_metrics`` folder, picks the rows for CURRENT_DATE (and, for
    cross-validation, COMPARE_DATE), and runs the last-users and
    cross-validation-mean checks for 'precision' and 'recall'.

    Args:
        target_type: target name used to build the folder and table names.
        last_users_min_dict: fixed minimums for the last-users checks.
        cross_val_min_dict: fixed minimums for the cross-validation checks.

    Returns:
        tuple: (report text headed by ``target_type`` and closed by a separator
        line, 1 if any check failed else 0).

    Raises:
        IndexError: if a table has no row for the required scoring date.
    """
    folder = target_type + '_metrics'
    cross_val_table = target_type + '_cross_validation_results'
    last_users_table = target_type + '_last_users_validation_results'

    cross_val_metrics_df = lib.execute_query(f"""
    SELECT
        *
    FROM "//home/cloud_analytics/scoring_v2/{folder}/{cross_val_table}"
    FORMAT TabSeparatedWithNames
    """)

    last_users_metrics_df = lib.execute_query(f"""
    SELECT
        *
    FROM "//home/cloud_analytics/scoring_v2/{folder}/{last_users_table}"
    FORMAT TabSeparatedWithNames
    """)

    def _row_for(frame, scoring_date):
        # First row recorded for the given scoring date, as a plain dict.
        return frame[frame['scoring_date'] == scoring_date].iloc[0].to_dict()

    last_users_now = _row_for(last_users_metrics_df, CURRENT_DATE)
    cross_val_now = _row_for(cross_val_metrics_df, CURRENT_DATE)
    cross_val_before = _row_for(cross_val_metrics_df, COMPARE_DATE)

    # Each entry is a (description, problem_flag) pair; order defines report order.
    results = [
        check_last_users_metrics('precision', last_users_now, last_users_min_dict, cross_val_now),
        check_last_users_metrics('recall', last_users_now, last_users_min_dict, cross_val_now),
        check_cross_val_mean_metrics('precision', cross_val_now, cross_val_before, cross_val_min_dict),
        check_cross_val_mean_metrics('recall', cross_val_now, cross_val_before, cross_val_min_dict),
    ]
    descriptions, problem_flags = zip(*results)

    alert_description = target_type + ":\n" + "".join(descriptions) + "=================\n"
    return alert_description, max(problem_flags)

In [10]:
# Feature-list stability check: report text plus a 0/1 problem flag.
alert_description1, has_problem1 = columns_feature_checker()

In [11]:
# Fixed minimums for the 'paid' target: a metric below its minimum raises an alert.
paid_last_users_dict_min = dict(precision=0.33, recall=0.9)
paid_cross_val_mean_dict_min = dict(precision=0.35, recall=0.92)

alert_description2, has_problem2 = check_metrics(
    'paid',
    paid_last_users_dict_min,
    paid_cross_val_mean_dict_min,
)

In [12]:
# Fixed minimums for the 'call_answer' target: a metric below its minimum raises an alert.
call_last_users_dict_min = dict(precision=0.2, recall=0.92)
call_cross_val_mean_dict_min = dict(precision=0.45, recall=0.9)

alert_description3, has_problem3 = check_metrics(
    'call_answer',
    call_last_users_dict_min,
    call_cross_val_mean_dict_min,
)

In [13]:
# Full report: feature-list check first, then the 'paid' and 'call_answer' sections.
alert_description = "".join((alert_description1, alert_description2, alert_description3))

In [14]:
# Overall flag: 1 as soon as any individual check reported a problem.
has_problem = max(has_problem1, has_problem2, has_problem3)

In [15]:
# Show the combined report in the notebook output for manual inspection.
print(alert_description)

500 columns feature pct difference: 10.0% - OK;
paid:
last users precision: 0.3704453441 > 0.33 - OK;
last users precision: 0.3704453441 in 99.7% conf interval (min: 0.221) - OK;
last users recall: 0.9219143577 > 0.9 - OK;
last users recall: 0.9219143577 in 99.7% conf interval (min: 0.907) - OK;
cross val mean precision: 0.3982280405 > 0.35 - OK;
cross val mean precision: 0.3982280405 in 99.7% conf interval (min: 0.31) - OK;
cross val mean recall: 0.9615380348 > 0.92 - OK;
cross val mean recall: 0.9615380348 in 99.7% conf interval (min: 0.939) - OK;
call_answer:
last users precision: 0.2919369787 > 0.2 - OK;
last users precision: 0.2919369787 in 99.7% conf interval (min: 0.174) - OK;
last users recall: 0.9752321981 > 0.92 - OK;
last users recall: 0.9752321981 in 99.7% conf interval (min: 0.79) - OK;
cross val mean precision: 0.5002074437 > 0.45 - OK;
cross val mean precision: 0.5002074437 in 99.7% conf interval (min: 0.346) - OK;
cross val mean recall: 0.9104741713 > 0.9 - OK;
cross va

In [16]:
# Textual status stored in the 'problems' column of the alert table.
status = "ALERT" if has_problem != 0 else 'OK'

In [17]:
# One-row frame describing the outcome of the current run.
final_adding_row = pd.DataFrame([
    dict(
        scoring_date=CURRENT_DATE,
        problems=status,
        description=alert_description,
        added_in_crm=0,
    )
])

In [18]:
# Upsert the row for CURRENT_DATE: append it when the date is not recorded yet,
# otherwise overwrite the first existing row for that date.
if len(alert_table[alert_table['scoring_date'] == CURRENT_DATE]) == 0:
    alert_table.loc[len(alert_table)] = final_adding_row.iloc[0]
else:
    # `.index[0]` is an index LABEL, so the row must be addressed with `.loc`
    # (label-based), not `.iloc` (position-based). Using `.loc` also keeps this
    # branch consistent with the insert branch above, which assigns the new
    # values by column name rather than by column position.
    curr_ind = alert_table[alert_table['scoring_date'] == CURRENT_DATE].index[0]
    alert_table.loc[curr_ind] = final_adding_row.iloc[0]

In [19]:
# Newest scoring date first.
alert_table = alert_table.sort_values('scoring_date', ascending=False)

In [20]:
# Persist the updated alert table before the final failure check below, so the
# status of this run is stored even when the notebook ends with an error.
# NOTE(review): presumably lib.save_table replaces the existing table at this
# path -- confirm against my_library.
lib.save_table('alert_table', '//home/cloud_analytics/scoring_v2/alerts', alert_table)

In [21]:
# Fail the notebook run when any check raised an alert, with the full report as
# the error message. An explicit raise is used instead of `assert` because
# assert statements are removed when Python runs with -O / PYTHONOPTIMIZE,
# which would silently disable this gate.
if has_problem != 0:
    raise AssertionError(alert_description)