In [1]:
%load_ext autoreload
%autoreload 2
import ast
import scipy.stats as sps
from sklearn.model_selection import train_test_split
import gc
from sklearn.linear_model import LinearRegression
from dateutil.parser import *
import pandas as pd
import numpy as np
import os
from collections import Counter
import json
import typing as tp
from sklearn.preprocessing import normalize
import my_library as lib
from datetime import datetime
from collections import Counter
from catboost import CatBoostClassifier
import time
import warnings
from sklearn.model_selection import cross_validate

warnings.simplefilter("ignore")

In [2]:
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import time
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.base import ClassifierMixin
from sklearn.base import BaseEstimator
from sklearn.base import clone
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from catboost import CatBoostClassifier
from imblearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.base import BaseEstimator, TransformerMixin


In [50]:
# Placeholder category written into missing categorical values.
EMPTY_TYPE = 'undefined'
# Pseudo-value that represents an empty list in JSON-typed columns.
JSON_EMPTY_VALUE = 'zero_len'
# Scoring date of the current run, as produced by the project helper
# (exact string format is defined in my_library, not visible here).
CURRENT_DATE = lib.get_current_date_as_str()
# Number of most recent training rows held out as the validation test set.
VALIDATION_TEST_SIZE = 2000
# Maximum number of (most recent) rows kept in the validation training set.
TRAIN_SIZE = 20000
# Probability thresholds above which a row is predicted positive
# for the "paid" and "call" targets respectively.
PAID_THRESHOLD = 0.05
CALL_THRESHOLD = 0.125

In [4]:
# Folder containing the per-group feature tables; get_one_table appends "/<table name>" to it.
path = "//home/cloud_analytics/scoring_v2/data_tables"

In [5]:
# NOTE(review): unconditional 20-second pause; its purpose is not visible in
# this notebook — confirm it is still needed before keeping it.
time.sleep(20)

In [6]:
def get_get_all_table_names():
    """Query the table that lists the feature tables used by the scoring model.

    Returns:
        The result of ``lib.execute_query`` for the query below; callers read
        its ``table_names`` column.
    """
    query = """
    SELECT
        *
    FROM "//home/cloud_analytics/scoring_v2/data_tables/table_names_for_scoring_model"
    FORMAT TabSeparatedWithNames
    """
    return lib.execute_query(query)

In [7]:
def get_column_type_dict():
    """Build the mapping ``"<table_name>__<column_name>" -> type``.

    The type table is expected to expose ``table_name``, ``column_name`` and
    ``type`` columns. Keys use the same ``table__column`` naming that
    ``get_one_table`` applies to the feature tables.

    Returns:
        dict: prefixed column name -> type string. If the same prefixed name
        occurs more than once, the last row wins.
    """
    column_req = """
    SELECT
        *
    FROM "//home/cloud_analytics/scoring_v2/data_tables/type_table"
    FORMAT TabSeparatedWithNames
    """
    column_df = lib.execute_query(column_req)
    # Vectorized concatenation instead of a row-wise apply(axis=1).
    full_names = column_df['table_name'] + "__" + column_df['column_name']
    # Only the 'type' column is needed, so build the dict from it directly
    # rather than converting the whole frame to nested dicts.
    return dict(zip(full_names, column_df['type']))

In [8]:
def get_one_table(name):
    """Load one feature table and namespace its columns with the table name.

    Every column is renamed to ``"<name>__<column>"``. The join keys
    ``billing_account_id`` and ``scoring_date`` (and ``is_training_group``
    when the table has it) are then exposed under their plain names, appended
    after the prefixed feature columns, and their prefixed versions are
    dropped.

    Args:
        name: table name inside the module-level ``path`` folder.

    Returns:
        The loaded table with prefixed feature columns and unprefixed keys.
    """
    full_path = path + "/" + name
    req = f"""
    SELECT
        *
    FROM "{full_path}"
    FORMAT TabSeparatedWithNames
    """
    df = lib.execute_query(req)
    source_columns = df.columns
    prefix = name + "__"
    df.columns = [prefix + column for column in source_columns]

    # Keys are re-added in this exact order so the final column layout is
    # stable: features first, then the unprefixed keys.
    key_columns = ['billing_account_id', 'scoring_date']
    if 'is_training_group' in source_columns:
        key_columns.append('is_training_group')
    for key in key_columns:
        df[key] = df[prefix + key]
    df.drop(columns=[prefix + key for key in key_columns], inplace=True)
    return df

In [9]:
def get_full_table():
    """Load every listed feature table and left-join them onto the main table.

    ``main_info_table`` is the base; every other table is merged onto it on
    ``billing_account_id`` and ``scoring_date`` in the order the tables are
    listed.

    Returns:
        The merged DataFrame.
    """
    join_keys = ['billing_account_id', 'scoring_date']
    table_names = get_get_all_table_names()['table_names'].tolist()
    tables = {table_name: get_one_table(table_name) for table_name in table_names}

    merged = tables['main_info_table']
    for table_name, table in tables.items():
        if table_name == 'main_info_table':
            continue
        merged = pd.merge(merged, table, on=join_keys, how='left')
    return merged

In [10]:
def number_replace(df, column):
    """Fill missing values of a numeric column with 0 and cast it to float.

    Modifies ``df`` in place and returns ``None``.

    Args:
        df: DataFrame to modify.
        column: name of the column to clean.
    """
    # Assign the result back instead of `df[column].fillna(..., inplace=True)`:
    # the chained in-place form acts on a temporary object and does not update
    # `df` once pandas Copy-on-Write is active.
    df[column] = df[column].fillna(0).astype(float)
    
def category_replace(df, column):
    """Fill missing values of a categorical column with ``EMPTY_TYPE`` and cast it to str.

    Modifies ``df`` in place and returns ``None``.

    Args:
        df: DataFrame to modify.
        column: name of the column to clean.
    """
    # Assign back rather than using chained `fillna(..., inplace=True)`, which
    # does not update `df` under pandas Copy-on-Write.
    df[column] = df[column].fillna(EMPTY_TYPE).astype(str)
    
def json_replace(df, column):
    """Normalize a JSON-like column so that every cell holds a Python list.

    Missing values and empty strings become ``[]``; values that are already
    lists are kept; any other value is treated as a string, stripped of
    backslashes and parsed with ``ast.literal_eval``.

    Modifies ``df`` in place and returns ``None``.

    Args:
        df: DataFrame to modify.
        column: name of the column to normalize.
    """
    def _to_list(value):
        # Convert one cell to a list (see the rules in the enclosing docstring).
        if isinstance(value, list):
            return value
        if value == '':
            return []
        return ast.literal_eval(value.replace("\\", ""))

    # Single pass, assigned back to the frame: the previous chained
    # `fillna(..., inplace=True)` does not update `df` under Copy-on-Write.
    df[column] = df[column].fillna('').apply(_to_list)
        

In [11]:
def get_without_nan_table(df_raw, column_dict):
    """Return a copy of ``df_raw`` with missing values filled according to column type.

    ``is_training_group`` gets 0 for missing values. Every other non-service
    column is cleaned by the helper matching its type in ``column_dict``
    (numeric/binary, category, json).

    Args:
        df_raw: merged feature table; not modified.
        column_dict: mapping column name -> type string.

    Returns:
        A cleaned copy of the table without any NaN.

    Raises:
        AssertionError: if a column has an unrecognized type, or NaNs remain.
        KeyError: if a feature column is missing from ``column_dict``.
    """
    df = df_raw.copy()
    dropped_columns = ['billing_account_id', 'scoring_date',
                       'is_training_group']
    # Assign back instead of chained `fillna(..., inplace=True)`, which does
    # not update the frame under pandas Copy-on-Write.
    df['is_training_group'] = df['is_training_group'].fillna(0)
    for column in df.columns:
        if column in dropped_columns:
            continue
        column_type = column_dict[column]
        if 'numeric' in column_type or 'binary' in column_type:
            number_replace(df, column)
        elif 'category' in column_type:
            category_replace(df, column)
        elif 'json' in column_type:
            json_replace(df, column)
        else:
            # Explicit raise keeps the original exception type but also fires
            # when Python runs with assertions disabled (-O).
            raise AssertionError(f'no type for column {column}')
    assert not df.isna().any().any()
    return df

In [12]:
def numeric_changer(df, column):
    """Log-transform a consumption column in place.

    Only columns whose name part after the ``"<table>__"`` prefix contains
    ``'consumption'`` are changed; other columns are left untouched.

    The values are shifted by ``1 + |min|`` before the logarithm. For a
    non-negative minimum this equals the previous ``x + 1 + min`` shift; for a
    negative minimum it keeps the argument >= 1, where the previous formula
    produced NaN / -inf.

    Args:
        df: DataFrame to modify.
        column: column name of the form ``"<table>__<column>"``.
    """
    if 'consumption' in column.split("__")[1]:
        min_val = df[column].min()
        # Vectorized; abs() guards against negative minima.
        df[column] = np.log(df[column] + 1 + abs(min_val))

In [13]:
def find_pct_pattern(array, pattern_value):
    """Return the percentage of items in ``array`` whose ``str()`` equals ``pattern_value``.

    An empty ``array`` queried with ``JSON_EMPTY_VALUE`` returns 100. The
    denominator is ``len(array) + 1e-5`` to avoid division by zero, so results
    are marginally below the exact percentage.
    """
    matches = sum(1 for item in array if pattern_value == str(item))
    if len(array) == 0 and pattern_value == JSON_EMPTY_VALUE:
        return 100
    return matches / (len(array) + 1e-5) * 100


def is_pattern_inside(array, pattern_value):
    """Return 1 if some item of ``array`` has ``str()`` equal to ``pattern_value``, else 0.

    An empty ``array`` queried with ``JSON_EMPTY_VALUE`` also counts as a hit.
    """
    if any(pattern_value == str(item) for item in array):
        return 1
    is_empty_marker = len(array) == 0 and pattern_value == JSON_EMPTY_VALUE
    return 1 if is_empty_marker else 0

In [14]:
def json_get_interesting_values(df, json_column, column_dict):
    """Return the most frequent values found in a list-valued column.

    Every list item is counted by its ``str()`` form; an empty list counts as
    one occurrence of ``JSON_EMPTY_VALUE``. The number of values returned is
    the integer after ``"__"`` in the column's type string.

    Returns:
        list of ``(value, count)`` tuples, most frequent first.
    """
    frequencies = Counter()
    for values in df[json_column].tolist():
        if len(values) == 0:
            frequencies[JSON_EMPTY_VALUE] += 1
        else:
            frequencies.update(str(item) for item in values)
    top_n = int(column_dict[json_column].split("__")[1])
    return frequencies.most_common(top_n)

In [15]:
def json_changer(df, column, column_dict):
    """Add one float 0/1 indicator column per frequent value of a list-valued column.

    New columns are named ``"<column>___<value>"`` and hold 1.0 when the value
    is present in the row's list (see ``is_pattern_inside``). ``df`` is
    modified in place.
    """
    for value, _ in json_get_interesting_values(df, column, column_dict):
        indicator_name = column + "___" + value
        df[indicator_name] = df[column].apply(is_pattern_inside, args=(value,)).astype(float)

In [16]:
def json_pct_changer(df, column, column_dict):
    """Add one float percentage column per frequent value of a list-valued column.

    New columns are named ``"<column>___<value>"`` and hold the share (in
    percent) of the row's list items equal to that value (see
    ``find_pct_pattern``). ``df`` is modified in place.
    """
    for value, _ in json_get_interesting_values(df, column, column_dict):
        share_name = column + "___" + value
        df[share_name] = df[column].apply(find_pct_pattern, args=(value,)).astype(float)

In [17]:
def create_dummy(df, cat_column):
    """Return a new frame: ``df`` plus one-hot columns for ``cat_column``.

    The indicator columns are named ``"<cat_column>___<level>"``. The input
    frame is not modified; the caller must use the returned frame.
    """
    indicators = pd.get_dummies(df[cat_column])
    indicators.columns = ["___".join((cat_column, str(level))) for level in indicators.columns]
    return pd.concat([df, indicators], axis=1)

In [18]:
def make_ml_table_without_category(df, column_dict):
    """Return a copy of ``df`` with numeric and JSON features engineered.

    * numeric columns go through ``numeric_changer``;
    * ``json`` columns get presence indicators (``json_changer``);
    * ``json_pct`` columns get percentage features (``json_pct_changer``).

    Categorical columns are intentionally left as they are: they are encoded
    later inside the model pipeline by ``CategoryChanger``.

    Args:
        df: cleaned feature table; not modified.
        column_dict: mapping column name -> type string.

    Returns:
        The engineered copy of the table.
    """
    test_df = df.copy()
    dropped_columns = ['billing_account_id', 'scoring_date',
                       'is_training_group']
    # Iterate over a snapshot of the original columns: the changers append new
    # columns that have no entry in column_dict.
    for column in list(test_df.columns):
        if column in dropped_columns:
            continue
        column_type = column_dict[column]
        if 'numeric' in column_type:
            numeric_changer(test_df, column)
        json_kind = column_type.split("__")[0]
        if json_kind == 'json':
            json_changer(test_df, column, column_dict)
        if json_kind == 'json_pct':
            json_pct_changer(test_df, column, column_dict)
        # The previous `create_dummy(test_df, column)` call for category
        # columns was removed: create_dummy returns a new frame and its result
        # was discarded, so it only cost time and memory without changing
        # the output.
    return test_df

In [19]:
# Load and merge all feature tables, fetch the column-type mapping,
# fill missing values by column type, then derive the engineered feature table.
df = get_full_table()
column_dict = get_column_type_dict()
df = get_without_nan_table(df, column_dict)
ml_df = make_ml_table_without_category(df, column_dict)

In [20]:
# Every column coming from the target table is a prediction target, not a feature.
target_columns = [column for column in ml_df.columns if 'target_table' in column]

In [21]:
target_columns

['target_table__paid_target', 'target_table__call_target']

In [22]:
def final_dataset_for_model_cleaning(df_raw, curr_target_column):
    """Split a table into model features and the chosen target.

    From a copy of ``df_raw`` it removes all target columns (module-level
    ``target_columns``), the service columns, and every raw column whose type
    in the module-level ``column_dict`` contains ``'json'``.

    Args:
        df_raw: table with features, targets and service columns; not modified.
        curr_target_column: name of the target column to return.

    Returns:
        tuple ``(features, y)``.
    """
    service_columns = ['billing_account_id', 'scoring_date',
                       'is_training_group']
    raw_json_columns = [name for name in column_dict if 'json' in column_dict[name]]

    features = df_raw.copy()
    y = features[curr_target_column]
    # Three separate drops, so that a missing label fails exactly as before.
    features = features.drop(columns=target_columns)
    features = features.drop(columns=service_columns)
    features = features.drop(columns=raw_json_columns)
    return features, y

In [23]:
# Sanity check: each billing account appears in exactly one row.
assert ml_df.shape[0] == ml_df['billing_account_id'].unique().shape[0]

In [24]:
def prepare_datasets(ml_df, scoring_date, curr_target_column):
    """Split the feature table into validation, full-training and prediction sets.

    Steps:
      1. Take rows with ``is_training_group == 1`` sorted by ``scoring_date``.
      2. Hold out the last ``VALIDATION_TEST_SIZE`` rows as the validation test set.
      3. From the remaining rows keep those dated after 2019-06-01 and outside
         the window [2020-01-01, 2020-04-01), then keep at most the last
         ``TRAIN_SIZE`` of them as the validation training set.
      4. The full training set is the validation train and test sets combined.
      5. The prediction set is every row whose ``scoring_date`` equals
         ``scoring_date``.

    Dates are compared with string literals, which assumes ``scoring_date``
    holds ISO ``YYYY-MM-DD`` strings — TODO confirm against the data tables.

    Args:
        ml_df: engineered feature table.
        scoring_date: date value selecting the rows to score.
        curr_target_column: name of the target column.

    Returns:
        tuple ``(validation_train_df, validation_train_y, validation_test_df,
        validation_test_y, full_train_df_for_model, full_train_df_for_model_y,
        to_predict_df)``.
    """
    full_train_df = ml_df[ml_df['is_training_group'] == 1]
    full_train_df = full_train_df.sort_values(by='scoring_date').reset_index(drop=True)

    to_predict_df = ml_df[ml_df['scoring_date'] == scoring_date]

    validation_test_df = full_train_df.tail(VALIDATION_TEST_SIZE)
    validation_train_df = full_train_df[~full_train_df.index.isin(validation_test_df.index)]

    # Build the date mask from the frame being filtered. It used to mix in the
    # longer full_train_df and relied on implicit index alignment plus
    # boolean-key reindexing; the rows selected are the same.
    train_dates = validation_train_df['scoring_date']
    date_mask = (((train_dates >= '2020-04-01') | (train_dates < '2020-01-01')) &
                 (train_dates > '2019-06-01'))
    validation_train_df = validation_train_df[date_mask].tail(TRAIN_SIZE)

    validation_train_df, validation_train_y = final_dataset_for_model_cleaning(validation_train_df,
                                                                               curr_target_column)
    validation_test_df, validation_test_y = final_dataset_for_model_cleaning(validation_test_df,
                                                                             curr_target_column)
    to_predict_df, _ = final_dataset_for_model_cleaning(to_predict_df,
                                                        curr_target_column)

    full_train_df_for_model = lib.concatenate_tables([validation_train_df, validation_test_df])
    full_train_df_for_model_y = lib.concatenate_tables([validation_train_y, validation_test_y])

    return validation_train_df, validation_train_y,\
           validation_test_df, validation_test_y,\
           full_train_df_for_model, full_train_df_for_model_y,\
           to_predict_df

In [25]:
# Share of positive "paid" targets whose `main_info_table__is_already_paid`
# flag is also 1 (i.e. equal to the target).
curr_target_column = 'target_table__paid_target'
ml_df[(ml_df['main_info_table__is_already_paid'] == ml_df[curr_target_column]) &
      (ml_df[curr_target_column] == 1)].shape[0] / \
ml_df[(ml_df[curr_target_column] == 1)].shape[0]

0.11882430647291942

In [26]:
# Build the train / validation / prediction splits for the "paid" target.
# NOTE(review): this exact cell is repeated further down in the notebook.
curr_target_column = 'target_table__paid_target'

validation_train_df, validation_train_y,\
validation_test_df, validation_test_y,\
full_train_df, full_train_y,\
to_predict_df = prepare_datasets(ml_df, CURRENT_DATE, curr_target_column)

# Checker

In [27]:
# curr_target_column = 'target_table__paid_target'
# kostya_df = lib.execute_query("""
# SELECT
#     DISTINCT billing_account_id,
#     addDays(toDate(first_trial_consumption_date), 15) as scoring_date,
#     if (score > 0.2, 1, 0) as predicted_by_kostya,
#     score as proba_by_kostya
# FROM "//home/cloud_analytics/scoring/leads/leads"
# WHERE scoring_date >= toDate('2020-04-01')
# AND scoring_date < toDate('2020-06-01')
# FORMAT TabSeparatedWithNames
# """)
# for_kostya = ml_df[['billing_account_id', 'scoring_date', curr_target_column, 'is_training_group']]
# kostya_df = pd.merge(kostya_df, for_kostya, on=['billing_account_id'])
# kostya_df = kostya_df[kostya_df['is_training_group'] == 1]
# kostya_pred = kostya_df['predicted_by_kostya']
# proba_by_kostya = kostya_df['proba_by_kostya']
# y_true = kostya_df[curr_target_column]
# recall_score(y_true, kostya_pred), precision_score(y_true, kostya_pred)
# ###################################
# curr_target_column = 'target_table__paid_target'

# validation_train_df, validation_train_y,\
# validation_test_df, validation_test_y,\
# full_train_df, full_train_y,\
# to_predict_df = prepare_datasets(ml_df, CURRENT_DATE, curr_target_column)
# ###################################

-----

## ML

In [28]:
class CategoryChanger(BaseEstimator, TransformerMixin):
    """Encode categorical columns with a Weight-of-Evidence-style score.

    A column is treated as categorical when it is present in the module-level
    ``column_dict`` and its type string contains ``'category'``.

    For each category value ``fit`` stores
    ``log(good / bad + 1e-5)``, where ``good`` is the mean target inside the
    category and ``bad = 1 - good`` (replaced by ``1e-5`` when it is 0).
    ``transform`` maps values through that table; categories not seen during
    ``fit`` receive the smallest stored score of the column.
    """

    def __init__(self):
        # column name -> {category value -> score}; filled by fit().
        self.changer = {}

    def work_with_category(self, table_raw, y_train=None, train=False):
        """Learn (``train=True``) or apply (``train=False``) the encoding.

        Args:
            table_raw: feature DataFrame; never modified.
            y_train: target values, required when ``train`` is True.
            train: learn the mapping instead of applying it.

        Returns:
            A copy of the table. In train mode it carries an extra ``"y"``
            column and unchanged categories; otherwise the categorical
            columns are replaced by their scores.
        """
        table = table_raw.copy()
        category_columns = [col for col in table.columns
                            if col in column_dict and 'category' in column_dict[col]]
        if train:
            self.changer = {}
            self.possible_vals = []
            self.dummy_changer = None
            # Attach the target by position. A Series target used to be
            # aligned on its index, which silently yields NaN targets when X
            # and y carry different indexes; every other pipeline step treats
            # X and y positionally.
            if isinstance(y_train, pd.Series):
                y_train = y_train.to_numpy()
            table["y"] = y_train
            for col in category_columns:
                good = table.groupby(col)["y"].mean()
                bad = 1 - good
                # Avoid division by zero for categories that are all-positive.
                bad = bad.where(bad != 0, 1e-5)
                self.changer[col] = np.log(good / bad + 1e-5).to_dict()
        else:
            for col in category_columns:
                mapping = self.changer[col]
                encoded = table[col].map(mapping)
                if encoded.isna().any():
                    # Unseen categories get the lowest known score.
                    encoded = encoded.fillna(min(mapping.values()))
                table[col] = encoded
        return table

    def fit(self, X_raw, y):
        """Learn the per-category scores from ``X_raw`` and ``y``; returns ``self``."""
        # work_with_category copies its input, so no extra copy is needed.
        self.work_with_category(X_raw, y, train=True)
        return self

    def transform(self, X_raw):
        """Return a copy of ``X_raw`` with categorical columns replaced by scores."""
        return self.work_with_category(X_raw)

In [29]:
class ThresholdModel(BaseEstimator, ClassifierMixin):
    """Wrap a probabilistic classifier and predict with a custom probability cut-off.

    ``predict`` returns ``True`` where the probability in column 1 of
    ``predict_proba`` is strictly greater than ``prob_threshold``.

    Args:
        model: estimator exposing ``fit``, ``predict_proba`` and ``set_params``.
        prob_threshold: probability cut-off for the positive class.
        **model_params: parameters forwarded to ``model.set_params``.
    """

    def __init__(self, model, prob_threshold, **model_params):
        super().__init__()
        self.model = model.set_params(**model_params)
        self.prob_threshold = prob_threshold

    def fit(self, X, y):
        """Fit the wrapped model and return ``self`` (scikit-learn convention)."""
        self.model.fit(X, y)
        return self

    def set_params(self, **params):
        """Set ``prob_threshold`` on the wrapper and forward the rest to the wrapped model.

        Returns ``self`` so calls can be chained, as scikit-learn estimators do.
        """
        params = dict(params)
        if 'prob_threshold' in params:
            self.prob_threshold = params.pop('prob_threshold')
        self.model.set_params(**params)
        return self

    def predict_proba(self, X):
        """Return the wrapped model's class probabilities."""
        return self.model.predict_proba(X)

    def predict(self, X):
        """Return a boolean array: probability of class 1 above the threshold."""
        predicts = self.predict_proba(X)
        return predicts[:, 1] > self.prob_threshold

In [30]:
def make_treshold_pipeline(pipeline, params, prob_threshold, calibrated=True):
    """Build a new pipeline whose final step is a ``ThresholdModel``.

    The input pipeline is cloned first, so the result shares no estimator
    instances with ``pipeline`` or with earlier results. Without the clone,
    pipelines built from the same template would share (and overwrite) each
    other's fitted transformers, and ``params`` would leak into the caller's
    template.

    Args:
        pipeline: template pipeline; its last step is the classifier.
        params: parameters applied to the cloned pipeline via ``set_params``.
        prob_threshold: probability cut-off for the positive class.
        calibrated: wrap the classifier in a 3-fold sigmoid
            ``CalibratedClassifierCV`` before thresholding.

    Returns:
        A new, unfitted ``Pipeline`` ending with the step ``"Threshold"``.
    """
    configured = clone(pipeline).set_params(**params)
    all_steps = list(configured.steps)
    last_model = all_steps[-1][1]
    if calibrated:
        last_model = CalibratedClassifierCV(last_model, cv=3, method='sigmoid')
    all_steps[-1] = ("Threshold", ThresholdModel(last_model, prob_threshold=prob_threshold))
    return Pipeline(all_steps)

In [31]:
def create_best_xgboost_model_with_grid_search(xgboost_pipeline, 
                                               X, y):
    """Tune an XGBoost pipeline one hyper-parameter at a time.

    The pipeline's classifier step must be named ``XGBoost``. Each grid is
    searched on its own with 3-fold ``GridSearchCV`` scored by recall, and the
    winner is merged into the parameters used for the following searches.
    The passed pipeline is modified via ``set_params``.

    Returns:
        tuple ``(xgboost_pipeline, final_params)``.
    """
    positive_rate = np.sum(y) / y.shape[0]
    # NOTE(review): both `seed` (42) and `random_state` (8) are set — confirm
    # which one is meant to win.
    best_params = {"XGBoost__base_score": positive_rate,
                   "XGBoost__seed": 42, "XGBoost__n_jobs": -1,
                   'XGBoost__max_depth': 3,
                   'XGBoost__n_estimators': 100,
                   'XGBoost__random_state': 8}

    # NOTE(review): the subsample grid appears twice (4th and 6th search) —
    # confirm whether the last one was meant to tune a different parameter.
    search_grids = [
        {'XGBoost__max_depth': [3, 4, 5]},
        {'XGBoost__n_estimators': [150, 200, 300]},
        {"XGBoost__gamma": [0, 0.5, 1, 1.5]},
        {'XGBoost__subsample': [0.4, 0.7, 1]},
        {"XGBoost__min_child_weight": [1, 3, 5]},
        {'XGBoost__subsample': [0.4, 0.7, 1]},
    ]

    for grid in search_grids:
        xgboost_pipeline.set_params(**best_params)
        searcher = GridSearchCV(xgboost_pipeline, grid,
                                cv=3, n_jobs=-1, scoring='recall')
        searcher.fit(X, y)
        best_params.update(searcher.best_params_)
    xgboost_pipeline.set_params(**best_params)
    return xgboost_pipeline, best_params

In [32]:
def create_best_random_forest_model_with_grid_search(rf_pipeline, 
                                               X, y):
    """Tune a random-forest pipeline one hyper-parameter at a time.

    The pipeline's classifier step must be named ``random_forest``. Each grid
    is searched on its own with 3-fold ``GridSearchCV`` scored by recall, and
    the winner is merged into the parameters used for the following searches.
    The passed pipeline is modified via ``set_params``.

    Returns:
        tuple ``(rf_pipeline, final_params)``.
    """
    best_params = {"random_forest__n_estimators": 800,
                   "random_forest__n_jobs": -1,
                   'random_forest__random_state': 8}

    # NOTE(review): 'auto' for max_features may not be accepted by recent
    # scikit-learn releases — confirm against the installed version.
    search_grids = [
        {'random_forest__criterion': ['gini', 'entropy']},
        {'random_forest__max_features': ['auto', 'sqrt', 'log2']},
        {"random_forest__min_samples_split": [0.05, 0.1, 0.5, 2, 4]},
        {"random_forest__min_samples_leaf": [1, 2, 3]},
        {"random_forest__min_impurity_decrease": [0, 10**-4, 10**-2, 10**-1]},
    ]

    for grid in search_grids:
        rf_pipeline.set_params(**best_params)
        searcher = GridSearchCV(rf_pipeline, grid,
                                cv=3, n_jobs=-1, scoring='recall')
        searcher.fit(X, y)
        best_params.update(searcher.best_params_)
    rf_pipeline.set_params(**best_params)
    return rf_pipeline, best_params

In [33]:
def create_best_catboost_model_with_grid_search(catboost_pipeline, 
                                                X, y):
    """Tune a CatBoost pipeline one hyper-parameter at a time.

    The pipeline's classifier step must be named ``catboost``. Each grid is
    searched on its own with 3-fold ``GridSearchCV`` scored by recall, and the
    winner is merged into the parameters used for the following searches.
    The passed pipeline is modified via ``set_params``.

    Returns:
        tuple ``(catboost_pipeline, final_params)``.
    """
    best_params = {
        'catboost__iterations': 1000,
        'catboost__depth':1,
        'catboost__learning_rate':0.5,
        'catboost__bootstrap_type': 'Bernoulli',
        'catboost__loss_function':'Logloss',
        'catboost__verbose': False,
        'catboost__random_state': 8,
        'catboost__early_stopping_rounds': 20}

    search_grids = [
        {'catboost__bootstrap_type': ['Bayesian', 'Bernoulli', 'MVS']},
        {'catboost__depth': [1, 2, 3, 4]},
        {"catboost__learning_rate": [0.01, 0.03, 0.5, 0.8, 1]},
        {'catboost__iterations': [800, 900, 1000]},
    ]

    for grid in search_grids:
        catboost_pipeline.set_params(**best_params)
        searcher = GridSearchCV(catboost_pipeline, grid,
                                cv=3, n_jobs=-1, scoring='recall')
        searcher.fit(X, y)
        best_params.update(searcher.best_params_)
    catboost_pipeline.set_params(**best_params)
    return catboost_pipeline, best_params

In [34]:
def get_cv_results_of_model(name, pipeline, validation_train_df, validation_train_y, cv_num=5):
    """Cross-validate a pipeline and report per-fold precision and recall.

    Uses an unshuffled ``StratifiedKFold`` with ``cv_num`` splits. Prints
    ``name``, then mean, std and the raw fold scores for each metric.

    Returns:
        dict with keys ``'precision'`` and ``'recall'`` holding the per-fold
        test scores.
    """
    print(name)
    metrics = ['precision', 'recall']
    folds = StratifiedKFold(n_splits=cv_num, shuffle=False)
    cros_res = cross_validate(pipeline, validation_train_df, validation_train_y,
                              cv=folds, scoring=metrics, return_train_score=False)

    answer_dict = {metric: cros_res['test_' + metric] for metric in metrics}
    for key, curr_arr in answer_dict.items():
        print(f"{key}: mean={round(np.mean(curr_arr), 2)}; std={round(np.std(curr_arr), 3)}")
        print(curr_arr)
        print("========================")
    return answer_dict

In [35]:
def get_final_columns(df_raw, y, max_columns=300):
    """Rank features by CatBoost importance, save the ranking and return the top ones.

    Categorical columns are encoded with ``CategoryChanger`` first. The top
    ``max_columns`` feature names are written to the ``important_columns``
    table together with the first 100 of them.

    NOTE(review): the saved field is called ``important_columns_500`` whatever
    ``max_columns`` is — confirm that readers of the table expect this.

    Returns:
        Index with the names of the ``max_columns`` most important features.
    """
    encoded = CategoryChanger().fit_transform(df_raw.copy(), y)
    ranker = CatBoostClassifier(random_state=8, verbose=False)
    ranker.fit(encoded, y)

    importance = pd.Series(ranker.feature_importances_, index=encoded.columns)
    interest_columns = importance.sort_values(ascending=False).index[:max_columns]

    top_names = list(interest_columns)
    columns_df = pd.DataFrame([{'scoring_date': CURRENT_DATE,
                                'important_columns_500': str(top_names),
                                'important_columns_100': str(top_names[:100])}])
    lib.save_table("important_columns",
                   "//home/cloud_analytics/scoring_v2/helping_folder_for_model",
                   columns_df)
    return interest_columns

In [36]:
def make_cross_validated_results_df(results):
    """Summarize per-fold scores into a single-row DataFrame.

    For every metric in ``results`` the row holds the raw array (as a string),
    its mean, its std, and an interval ``mean ± 2·std`` clipped to [0, 1].
    The interval is a rough spread of fold scores, stored under the
    ``*_95_confidence_interval`` field.

    Args:
        results: dict metric name -> array of fold scores.

    Returns:
        DataFrame with one row, starting with ``scoring_date``.
    """
    row = {'scoring_date': CURRENT_DATE}
    for key, scores in results.items():
        mean, std = np.mean(scores), np.std(scores)
        left = max(mean - 2 * std, 0)
        right = min(mean + 2 * std, 1)
        row.update({
            key + "_array": str(scores),
            key + "_mean": mean,
            key + "_std": std,
            key + "_95_confidence_interval": f"[{round(left, 3)}, {round(right, 3)}]",
        })
    return pd.DataFrame([row])

In [37]:
def fit_model_and_predict_on_dataset(metrics_name, pipeline, params, threshold,
                                     train_dataset, train_y, test_dataset, 
                                     validation=False, test_y=None):
    """Fit a calibrated threshold pipeline, predict, and store quality metrics.

    With ``validation=True`` recall and precision on ``test_dataset`` are
    printed and saved to ``<metrics_name>_last_users_validation_results``.
    Otherwise the pipeline is additionally cross-validated (5 folds) on the
    training data and the summary is saved to
    ``<metrics_name>_cross_validation_results``.

    Args:
        metrics_name: ``'paid'`` or ``'call_answer'``; prefix of the saved table.
        pipeline: template pipeline passed to ``make_treshold_pipeline``.
        params: pipeline parameters.
        threshold: probability cut-off for the positive class.
        train_dataset, train_y: training features and target.
        test_dataset: features to predict on.
        validation: evaluate against ``test_y`` instead of cross-validating.
        test_y: true labels of ``test_dataset``; required when ``validation``.

    Returns:
        tuple ``(trained_pipeline, prediction, prediction_proba)``.
    """
    assert metrics_name in ['paid', 'call_answer']
    results_folder = "//home/cloud_analytics/scoring_v2/helping_folder_for_model"

    final_pipeline = make_treshold_pipeline(pipeline, params,
                                            prob_threshold=threshold,
                                            calibrated=True)
    trained_pipeline = final_pipeline.fit(train_dataset, train_y)
    prediction = trained_pipeline.predict(test_dataset)
    prediction_proba = trained_pipeline.predict_proba(test_dataset)

    if not validation:
        results = get_cv_results_of_model('cross_validation:', final_pipeline,
                                          train_dataset, train_y, cv_num=5)
        df_results = make_cross_validated_results_df(results)
        lib.save_table(metrics_name + "_cross_validation_results",
                       results_folder,
                       df_results)
        return trained_pipeline, prediction, prediction_proba

    assert test_y is not None
    recall = recall_score(test_y, prediction)
    precision = precision_score(test_y, prediction)
    print(f"validation results: "
          f"recall={round(recall, 3)}; "
          f"precision={round(precision, 3)};")
    test_validation_results_df = pd.DataFrame([{'scoring_date': CURRENT_DATE,
                                                'test_size': len(test_y),
                                                'precision': precision,
                                                'recall': recall}])
    lib.save_table(metrics_name + "_last_users_validation_results",
                   results_folder,
                   test_validation_results_df)
    return trained_pipeline, prediction, prediction_proba

In [38]:
# Rebuild the "paid" splits, then rank features on the full training set and
# persist the most important column names.
curr_target_column = 'target_table__paid_target'

validation_train_df, validation_train_y,\
validation_test_df, validation_test_y,\
full_train_df, full_train_y,\
to_predict_df = prepare_datasets(ml_df, CURRENT_DATE, curr_target_column)


interest_columns = get_final_columns(full_train_df, full_train_y, max_columns=500)

-----

-------

In [51]:
# Splits for the "paid" target used by the model cells below.
# NOTE(review): duplicates the earlier prepare_datasets cells.
curr_target_column = 'target_table__paid_target'

validation_train_df, validation_train_y,\
validation_test_df, validation_test_y,\
full_train_df, full_train_y,\
to_predict_df = prepare_datasets(ml_df, CURRENT_DATE, curr_target_column)

In [52]:
# Template pipeline: category encoding -> min-max scaling ->
# CatBoost-based feature selection (at most 500 features) -> CatBoost classifier.
# It is passed to fit_model_and_predict_on_dataset for both targets.
CatBoost_pipeline = Pipeline([('category_changer', CategoryChanger()),
                              ('MinMaxScaler', MinMaxScaler()),
                              ('feature_selection',\
                               SelectFromModel(CatBoostClassifier(random_state=8, verbose=False),
                                               max_features=500)),
                              ('catboost', CatBoostClassifier(random_state=8, verbose=False))])

In [53]:
# Validate the "paid" model on the held-out most recent rows and time the run.
t0 = time.time()
paid_pipeline, _, _ = fit_model_and_predict_on_dataset(
                                     'paid', CatBoost_pipeline, {}, PAID_THRESHOLD,
                                     validation_train_df, validation_train_y, 
                                     validation_test_df, 
                                     validation=True, test_y=validation_test_y)
t1 = time.time()
print("time in min:", (t1 - t0) / 60.0)

validation results: recall=0.926; precision=0.37;
time in min: 2.0612407962481183


In [None]:
# Train the "paid" model on the full training set, score the current-date
# rows, and (validation=False) store cross-validation results.
t0 = time.time()
paid_pipeline, paid_prediction, paid_prediction_proba = fit_model_and_predict_on_dataset(
                                     'paid', CatBoost_pipeline, {}, PAID_THRESHOLD,
                                     full_train_df, full_train_y, 
                                     to_predict_df, 
                                     validation=False)
t1 = time.time()
print("time in min:", (t1 - t0) / 60.0)

cross_validation:


In [None]:
# Number of rows predicted positive for the "paid" target.
paid_prediction.sum()

# Call target

In [None]:
# Build the splits for the "call" target; this overwrites the "paid" splits
# held in the same variable names.
curr_target_column = 'target_table__call_target'

validation_train_df, validation_train_y,\
validation_test_df, validation_test_y,\
full_train_df, full_train_y,\
to_predict_df = prepare_datasets(ml_df, CURRENT_DATE, curr_target_column)

In [None]:
# Validate the "call" model on the held-out most recent rows and time the run.
t0 = time.time()
call_pipeline, _, _ = fit_model_and_predict_on_dataset(
                                     'call_answer', CatBoost_pipeline, {}, CALL_THRESHOLD,
                                     validation_train_df, validation_train_y, 
                                     validation_test_df, 
                                     validation=True, test_y=validation_test_y)
t1 = time.time()
print("time in min:", (t1 - t0) / 60.0)

In [None]:
# Train the "call" model on the full training set, score the current-date
# rows, and (validation=False) store cross-validation results.
t0 = time.time()
call_pipeline, call_prediction, call_prediction_proba = fit_model_and_predict_on_dataset(
                                     'call_answer', CatBoost_pipeline, {}, CALL_THRESHOLD,
                                     full_train_df, full_train_y, 
                                     to_predict_df, 
                                     validation=False)
t1 = time.time()
print("time in min:", (t1 - t0) / 60.0)

In [None]:
# Number of rows predicted positive for the "call" target.
call_prediction.sum()

In [None]:
def make_and_save_final_crm_prediction_table(df, call_prediction, paid_prediction,
                                             call_prediction_proba, paid_prediction_proba):
    """Assemble and save the scored-users table for the current scoring date.

    Rows of ``df`` whose ``scoring_date`` equals ``CURRENT_DATE`` are paired
    positionally with the predictions, so the prediction arrays must follow
    the row order of that selection.

    Args:
        df: table containing ``billing_account_id`` and ``scoring_date``.
        call_prediction, paid_prediction: per-row class predictions
            (cast to int).
        call_prediction_proba, paid_prediction_proba: 2-D probability arrays;
            column 1 is stored as the score.
    """
    # Work on an explicit copy of just the key columns: assigning new columns
    # to a boolean-filtered slice triggers SettingWithCopy behaviour.
    scored = df.loc[df['scoring_date'] == CURRENT_DATE,
                    ['billing_account_id', 'scoring_date']].copy()
    scored['paid_prediction'] = paid_prediction.astype(int)
    scored['paid_proba'] = paid_prediction_proba[:, 1]
    scored['call_prediction'] = call_prediction.astype(int)
    scored['call_proba'] = call_prediction_proba[:, 1]

    lib.save_table('scored_users',
                   '//home/cloud_analytics/scoring_v2/helping_folder_for_model',
                   scored)

In [None]:
 # Save the combined "paid" / "call" predictions for the current scoring date.
 # NOTE(review): this cell starts with a stray leading space; it should be
 # removed if the notebook is ever exported to a plain .py script.
 make_and_save_final_crm_prediction_table(df, call_prediction, paid_prediction,
                                             call_prediction_proba, paid_prediction_proba)

In [None]:
# Single-row table recording the scoring date this run produced predictions for.
date_table = pd.DataFrame([{"current_predicting_scoring_date":CURRENT_DATE}])

In [None]:
# Persist the scoring date next to the other model output tables.
lib.save_table(f'model_scoring_date',
                   '//home/cloud_analytics/scoring_v2/helping_folder_for_model',
                   date_table)