In [13]:
%load_ext autoreload
%autoreload 2
import scipy.stats as sps
from sklearn.model_selection import train_test_split
import gc
from sklearn.linear_model import LinearRegression
from dateutil.parser import *
import pandas as pd
import numpy as np
import os
from collections import Counter
import json
import typing as tp
from sklearn.preprocessing import normalize
import my_library as lib
from datetime import datetime
import re
import gensim
import gensim as gen
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import ast
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
# Number of days added to the first trial consumption date to obtain
# `scoring_date` in the query built by get_table().
DAYS_TO_OBSERVE = 14
# NOTE(review): not referenced by any visible cell - confirm it is still needed.
EMPTY_TYPE = 'undefined'
# Registry of "column_name:type" strings; filled as a side effect of
# get_table() and make_console_logs_scoring_table(), consumed by save_types()
# and check_types_correspondence().
column_types = set()
# Name under which the feature table and its column-type rows are saved.
TABLE_NAME = 'console_logs_table'

In [15]:
def get_table():
    """Query per-account console-log aggregates and register their types.

    Side effect: registers the six numeric aggregate columns produced by the
    query in the module-level ``column_types`` set (as "name:numeric").

    The query joins console log events to billing accounts (acquisition cube
    rows with event 'ba_created') on ``puid``, keeps only hits made before
    ``scoring_date`` (first trial consumption date + DAYS_TO_OBSERVE days)
    whose ``response`` is in the ['200', '300') string range, and groups them
    by billing account and scoring date.

    Returns:
        Whatever ``lib.execute_query`` returns for the query; downstream code
        in this notebook treats it as a pandas DataFrame.
    """
    numeric_columns = (
        "number_of_days_site_visits",
        "last_hit_first_hit_date_diff",
        "last_hit_day_diff_with_scoring",
        "site_hit_length",
        "unique_site_hit_length",
        "unique_to_all_hits_pct",
    )
    column_types.update(f"{name}:numeric" for name in numeric_columns)

    req = f"""
    SELECT
        billing_account_id,
        arraySort((x, y) -> y, groupArray(event), groupArray(time)) as event_array,
        COUNT(DISTINCT time) as number_of_days_site_visits,
        dateDiff('day', MIN(time), MAX(time)) as last_hit_first_hit_date_diff,
        dateDiff('day', MAX(time), scoring_date) as last_hit_day_diff_with_scoring,
        length(groupArray(event)) as site_hit_length,
        length(groupUniqArray(event)) as unique_site_hit_length,
        unique_site_hit_length / site_hit_length * 100 as unique_to_all_hits_pct,
        scoring_date
    FROM (
        SELECT
            billing_account_id,
            scoring_date,
            event,
            toDate(replaceRegexpOne(timestamp, '[.].*', '')) as time
        FROM "//home/cloud_analytics/import/console_logs/events" as a
        INNER JOIN (
            SELECT
                puid,
                billing_account_id,
                addDays(toDate(MIN(first_first_trial_consumption_datetime)),
                {DAYS_TO_OBSERVE}) as scoring_date
            FROM "//home/cloud_analytics/cubes/acquisition_cube/cube"
            WHERE event == 'ba_created'
            AND billing_account_id != ''
            AND puid != ''
            AND first_first_trial_consumption_datetime != '0000-00-00 00:00:00'
            GROUP BY puid, billing_account_id 

        ) as b
        ON a.puid == b.puid
        WHERE time < scoring_date
        AND response >= '200'
        AND response < '300'
    )
    GROUP BY billing_account_id, scoring_date
    FORMAT TabSeparatedWithNames
    """
    return lib.execute_query(req)

In [16]:
def event_changer(x):
    """Normalize a raw console event / URL string into a compact token.

    Steps, in order:
      1. rewrite the ``.ru`` domain suffix to ``.com``;
      2. drop a ``https://`` scheme, or otherwise the first ``/`` in the string;
      3. remove every ``/id``, ``_/`` and ``/folders`` occurrence;
      4. keep at most the first three ``/``-separated segments;
      5. cut everything from the first ``?`` and from the first ``&``;
      6. drop segments that contain a digit;
      7. replace ``.`` and ``/`` with ``_`` and lowercase the result.

    Args:
        x: raw event string.

    Returns:
        The normalized, lowercase token.
    """
    # The dot is escaped and a word boundary is required so that only a real
    # ".ru" suffix is rewritten; the previous unescaped pattern also mangled
    # ordinary words such as "instructions" (any char followed by "ru").
    x = re.sub(r'\.ru\b', r'.com', x)
    x = re.sub(r'https:\/\/(.*)|\/(.*)',
               r'\1\2', x)
    x = re.sub(r'\/id|_\/|\/folders',
               r'', x)
    x = re.sub(r'([^\/]*\/[^\/]*\/[^\/]*)\/.*|(.*)',
               r'\1\2', x)
    # Literal "?" (query-string separator); unescaped it was a quantifier and
    # the step only produced the right result by accident.
    x = re.sub(r'([^?]*)\?.*',
               r'\1', x)
    x = re.sub(r'([^&]*)&.*',
               r'\1', x)
    # Segments containing any digit are dropped.
    good_parts = [part for part in x.split("/")
                  if not any(map(str.isdigit, part))]
    x = "/".join(good_parts)
    x = re.sub(r"[.]|\/", r'_', x)
    return x.lower()

In [17]:
# Substrings looked for in normalized event names. For each one,
# make_console_logs_scoring_table() adds a numeric column holding the
# percentage of an account's events that contain it (see find_pattern).
# The names below are raw; they are normalized by update_patterns() before use.
interested_patterns = [
    "console.cloud.yandex.com",
    "docs",
    "docs/compute",
    'docs/vpc',
    'docs/storage',
    'docs/solutions',
    'docs/resource-manager',
    'docs/billing',
    'docs/speechkit',
    'prices',
    'prices/compute',
    'prices/storage',
    'prices/speechkit',
    'marketplace',
    'services',
    'services/compute',
    'services/managed-clickhouse',
    'services/managed-postgresql',
    'services/speechkit',
    'services/storage',
    'services/vpc',
    'billing',
    'updateUserSettings',
    'support',
    'form',
    'managed-postgresql',
    'managed-clickhouse',
    'compute',
    'speechkit',
    'storage',
    'vpc',
    'instance',
    'blog',
    'delete',
    'remove',
    '/add',
    'create',
    'update',
    'start',
    'stop'
]

In [18]:
def update_patterns(patterns):
    """Return the patterns with '.' and '/' replaced by '_' and lowercased.

    This is the same final normalization that event_changer applies, so the
    returned names can be matched against normalized events.
    """
    return [re.sub(r"[.]|\/", r'_', raw).lower() for raw in patterns]
# Replace the raw list with its normalized form. Re-running this cell is
# harmless: already-normalized names pass through update_patterns unchanged.
interested_patterns = update_patterns(interested_patterns)

In [19]:
def find_pattern(array, pattern):
    """Return the percentage of events in `array` that contain `pattern`.

    Args:
        array: sized iterable of event strings.
        pattern: substring to look for in each event.

    Returns:
        A float in [0, 100]; 0.0 for an empty `array` (previously this raised
        ZeroDivisionError).
    """
    total = len(array)
    if total == 0:
        return 0.0
    hits = sum(1 for event in array if pattern in event)
    return hits / total * 100

In [20]:
def text2vec(text_paths, model):
    """Infer one vector per session with `model.infer_vector`.

    The garbage collector is run once up front and then after every
    10000th session (including the very first one).

    Returns:
        A list with `model.infer_vector(session)` for each session, in order.
    """
    gc.collect()
    vectors = []
    for position, session in enumerate(text_paths):
        vectors.append(model.infer_vector(session))
        if not position % 10000:
            gc.collect()
    return vectors

In [21]:
def make_console_logs_scoring_table():
    """Build the console-logs feature table and register its column types.

    Pipeline:
      1. load per-account aggregates and event arrays via get_table();
      2. normalize every event with event_changer into an ``events`` column;
      3. add one percentage column per entry of ``interested_patterns``;
      4. train a Doc2Vec model on the event sequences and add the inferred
         vectors as ``event_vec_path_<i>`` columns;
      5. add TF-IDF weights of the event tokens as ``tdf_<token>`` columns;
      6. register the type of every produced column in ``column_types``.

    Returns:
        The assembled pandas DataFrame (``events`` is stored as a string).
    """
    df = get_table()
    df['events'] = df['event_array'].apply(
        lambda array: [event_changer(x) for x in array])

    for pattern in interested_patterns:
        df[pattern] = df['events'].apply(lambda x: find_pattern(x, pattern))

    df = df.drop(columns=['event_array'])

    # workers must be a positive thread count. With the previous workers=-1
    # gensim starts no worker threads, so train() returns without training
    # and the inferred vectors come from an untrained model.
    model = gensim.models.doc2vec.Doc2Vec(
        vector_size=100, min_count=100, epochs=100,
        workers=os.cpu_count() or 1)
    text_corpus = [TaggedDocument(doc, [i]) for i, doc in enumerate(df['events'])]
    model.build_vocab(text_corpus)
    model.train(text_corpus, total_examples=model.corpus_count, epochs=model.epochs)
    text_paths = df['events'].to_list()
    events_vec_rows = text2vec(text_paths, model)
    # Reuse df's index so the column-wise concat below aligns row by row even
    # if get_table() ever returns a non-default index.
    vec_events_df = pd.DataFrame(events_vec_rows, index=df.index)
    vec_events_df.columns = [f'event_vec_path_{i}' for i in range(vec_events_df.shape[1])]

    events = df['events'].apply(lambda x: " ".join(x))
    vectorizer = TfidfVectorizer(token_pattern=u'[^ ]+', min_df=30)
    # fit_transform already fits; the separate fit() call doubled the work.
    tfidf_matrix = vectorizer.fit_transform(events)
    # get_feature_names() was removed from recent scikit-learn releases;
    # prefer get_feature_names_out() and fall back for older versions.
    get_names = (getattr(vectorizer, 'get_feature_names_out', None)
                 or vectorizer.get_feature_names)
    tdf_df = pd.DataFrame(tfidf_matrix.toarray(),
                          index=df.index,
                          columns=['tdf_' + str(name) for name in get_names()])

    df = pd.concat([df, vec_events_df, tdf_df], axis=1)
    df['events'] = df['events'].astype(str)

    for column in vec_events_df.columns:
        column_types.add(f'{column}:numeric')
    for column in tdf_df.columns:
        column_types.add(f'{column}:numeric')

    column_types.add("events:json_pct__200")
    for pattern in interested_patterns:
        column_types.add(f"{pattern}:numeric")

    return df

In [22]:
def save_types(column_types):
    """Append one (column_name, type, table_name) row per registered column.

    Args:
        column_types: iterable of "column_name:type" strings.

    The rows are appended to 'type_table' through ``lib.save_table``; the
    table name of every row is the module-level ``TABLE_NAME``.
    """
    rows = []
    for column_type in column_types:
        # Split on the LAST colon only: column names built from TF-IDF tokens
        # may themselves contain ':', which made a plain split(':') fail.
        column, current_type = column_type.rsplit(':', 1)
        rows.append([column, current_type, TABLE_NAME])
    # Building the frame straight from the list also handles an empty
    # registry, which the former np.matrix round-trip could not.
    type_df = pd.DataFrame(rows, columns=['column_name', 'type',
                                          'table_name'])
    lib.save_table('type_table', "//home/cloud_analytics/scoring_v2/data_tables",
                   type_df, append=True)


def add_table_to_model_to_observe(table_name):
    """Append `table_name` as a single row to 'table_names_for_scoring_model'."""
    target_dir = "//home/cloud_analytics/scoring_v2/data_tables"
    single_row_df = pd.DataFrame([table_name], columns=['table_names'])
    lib.save_table('table_names_for_scoring_model', target_dir,
                   single_row_df, append=True)
    

def check_types_correspondence(df, column_types):
    """Validate that every registered column passes its type checker.

    Checker functions are loaded as source text from the
    ``column_type_description`` table; for a type ``<name>__<params>`` the
    function ``<name>_checker(df, column)`` is called and must return a
    truthy value.

    Args:
        df: the feature DataFrame to validate.
        column_types: iterable of "column_name:type" strings.

    Raises:
        AssertionError: if the column count does not match, a type has no
            checker, or a checker rejects a column. The checks are raised
            explicitly so they are not stripped under ``python -O``.
    """
    req = """
    SELECT
        type,
        checker_function
    FROM "//home/cloud_analytics/scoring_v2/data_tables/column_type_description"
    FORMAT TabSeparatedWithNames
    """
    type_df = lib.execute_query(req)

    # SECURITY NOTE(review): this executes source code stored in an external
    # table. Anyone able to write to that table can run arbitrary code here;
    # confirm the table is write-protected.
    checker_functions = {}
    for func_str in type_df['checker_function']:
        exec(func_str.replace("\\n", '\n'), checker_functions)

    # df is expected to hold two columns that are not in column_types
    # (presumably the billing_account_id / scoring_date keys - TODO confirm).
    expected_columns = len(column_types) + 2
    if len(df.columns) != expected_columns:
        # Both message parts are now in one expression; previously the second
        # literal was a separate, discarded statement and the difference was
        # never shown.
        raise AssertionError(
            'difference in number of columns in dataframe and in column_types, '
            f'{expected_columns - len(df.columns)}')

    for column_type in column_types:
        # Split on the last colon only: column names may contain ':'.
        column, curr_type = column_type.rsplit(":", 1)
        curr_function_name = curr_type.split("__")[0]
        checker = checker_functions.get(curr_function_name + "_checker")
        if checker is None:
            raise AssertionError(f"no type {curr_function_name}")
        if not checker(df, column):
            raise AssertionError(
                f'{curr_function_name} check failed for column {column}')
        

def save_all_results(df):
    """Validate `df` against the registered column types, then persist it.

    Order of operations: type check, save the table under TABLE_NAME, save
    the column-type rows, and finally record the table name for the scoring
    model.
    """
    destination = "//home/cloud_analytics/scoring_v2/data_tables"
    check_types_correspondence(df, column_types)
    lib.save_table(TABLE_NAME, destination, df)
    save_types(column_types)
    add_table_to_model_to_observe(TABLE_NAME)

In [11]:
%%time
# Build the full feature table (query, pattern shares, Doc2Vec, TF-IDF).
# The recorded run below took about 28 minutes of wall time.
console_logs_df = make_console_logs_scoring_table()

CPU times: user 27min 58s, sys: 5.93 s, total: 28min 4s
Wall time: 28min 21s


In [12]:
# Validate the column types, then write the table, its type rows and its
# name to the data_tables directory (the last two are appended).
save_all_results(console_logs_df)

In [23]:
# NOTE(review): re-runs the raw query on its own; `df` is not used by any
# other visible cell - likely a leftover debugging cell, confirm and remove.
df = get_table()