In [1]:
import pandas as pd, datetime, ast, os,sys, pymysql, logging, requests
# Put the local analytics checkout on sys.path before the project imports below.
# NOTE(review): this is a hard-coded, user-specific absolute path — presumably where
# `data_loader` and `global_variables` live; confirm and make configurable.
module_path = os.path.abspath('/home/ktereshin/yandex/arcadia/cloud/analytics/python/work')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
from data_loader import clickhouse
from global_variables import (
    metrika_clickhouse_param_dict,
    cloud_clickhouse_param_dict
)
from nile.api.v1 import (
    clusters,
    aggregators as na,
    extractors as ne,
    filters as nf,
    Record
)
from vault_client import instances
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
def execute_query(query, cluster, alias, token, timeout=600):
    """POST ``query`` to ``http://<cluster>.yt.yandex.net/query`` and return the response lines.

    Args:
        query: Query text, sent as the request body.
        cluster: Cluster name, substituted into the proxy host name.
        alias: Value of the ``database`` URL parameter.
        token: Value of the ``password`` URL parameter.
        timeout: Timeout in seconds passed to ``requests``.

    Returns:
        list: The response body decoded as UTF-8, stripped of surrounding
        whitespace and split on newlines. An empty body yields ``[]``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    logger.info("Executing query: %s", query)
    proxy = "http://{}.yt.yandex.net".format(cluster)
    # NOTE(review): the token is sent in the URL query string over plain HTTP, so it can
    # end up in proxy/server logs. Consider an Authorization header if the endpoint supports it.
    url = "{proxy}/query?database={alias}&password={token}".format(proxy=proxy, alias=alias, token=token)
    # The context manager closes the session (and its pooled connections) even on errors.
    with requests.Session() as session:
        resp = session.post(url, data=query, timeout=timeout)
        if resp.status_code != 200:
            logger.error("Response status: %s", resp.status_code)
            logger.error("Response headers: %s", resp.headers)
            logger.error("Response content: %s", resp.content)
        resp.raise_for_status()
        # resp.content is bytes on Python 3, where bytes.split('\n') raises TypeError;
        # decode explicitly instead of relying on requests' charset guess.
        body = resp.content.decode('utf-8').strip()
    # ''.split('\n') would return [''] and report one row for an empty result.
    rows = body.split('\n') if body else []
    logger.info("Time spent: %s seconds, rows returned: %s", resp.elapsed.total_seconds(), len(rows))
    return rows

In [3]:
# Module-level logger; execute_query logs through it.
logger = logging.getLogger(__name__)

# Fetch the YT credentials (token and pool) from the production vault instance.
client = instances.Production()
yt_creds = client.get_version('ver-01d33pgv8pzc7t99s3egm24x47')

# Hahn cluster handle; the environment carries a `dates` template.
cluster_yt = clusters.yt.Hahn(
    token=yt_creds['value']['token'],
    pool=yt_creds['value']['pool'],
).env(
    templates={'dates': '{2019-03-28..2019-03-31}'},
)

In [4]:
# Cluster name, database alias and token — presumably the arguments meant for
# execute_query (no call is visible in this notebook).
cluster = 'hahn'
alias = '*ch_public'
token = '%s' % yt_creds['value']['token']

In [5]:
# Load the scoring events table from YT into a DataFrame.
# Later cells read its `puid`, `event` and `timestamp` columns.
events = cluster_yt.read('//home/cloud_analytics/scoring/events').as_dataframe()

In [6]:
# Lower-case every event name so that differently-cased spellings collapse into one token.
events['event'] = events['event'].apply(
    lambda event_name: event_name.lower()
)

In [7]:
#events = events[events['event'] != '/api/billing/setpaidaccount']

In [8]:
# One row per puid; its events are joined into a single tab-separated string.
bag_of_events = (
    events
    .groupby(['puid'])['event']
    .agg(lambda puid_events: '\t'.join(puid_events))
    .reset_index()
)

In [9]:
def _vectorize_events(bag_of_events, vectorizer):
    """Encode the ``event`` column with ``vectorizer``; return ``puid`` plus one column per token."""
    documents = bag_of_events['event'].tolist()
    # fit_transform learns the vocabulary and encodes in one pass; a separate fit()
    # beforehand only repeats the same work.
    matrix = vectorizer.fit_transform(documents)
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement when present.
    names_getter = getattr(vectorizer, 'get_feature_names_out', None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names
    features = pd.DataFrame(
        matrix.toarray(),
        columns=list(names_getter()),
        # concat(axis=1) aligns on index labels, so reuse the input's index; a fresh
        # 0..n-1 index would misalign rows whenever the input index is not the default.
        index=bag_of_events.index,
    )
    return pd.concat([bag_of_events[['puid']], features], axis=1)


def get_count_vectors(bag_of_events):
    """Build per-puid token counts from tab-separated event strings.

    Args:
        bag_of_events: DataFrame with a ``puid`` column and an ``event`` column
            holding tab-separated event names.

    Returns:
        DataFrame with ``puid`` followed by one count column per distinct token,
        indexed like ``bag_of_events``.
    """
    return _vectorize_events(bag_of_events, CountVectorizer(token_pattern=u'[^\t]+'))


def get_tfidf_vectors(bag_of_events):
    """Build per-puid TF-IDF weights from tab-separated event strings.

    Args:
        bag_of_events: DataFrame with a ``puid`` column and an ``event`` column
            holding tab-separated event names.

    Returns:
        DataFrame with ``puid`` followed by one TF-IDF column per distinct token,
        indexed like ``bag_of_events``.
    """
    return _vectorize_events(bag_of_events, TfidfVectorizer(token_pattern=u'[^\t]+'))

In [10]:
# Prefix every feature column with its vectoriser name so the two feature sets
# stay distinct after merging; the `puid` key keeps its name.
count_bag = get_count_vectors(bag_of_events).rename(
    columns=lambda name: str(name) if name == 'puid' else 'count_v_' + str(name)
)
tfidf_bag = get_tfidf_vectors(bag_of_events).rename(
    columns=lambda name: str(name) if name == 'puid' else 'tfidf_' + str(name)
)

In [11]:
def get_event2vec(events, vector_size=50, window=5, min_count=1, workers=4):
    """Train a Doc2Vec model on per-puid event sequences and infer one vector per puid.

    Events are ordered by ``puid`` and ``timestamp`` and grouped into one list of
    event names per puid. Each list becomes a tagged document for training.

    Args:
        events: DataFrame with ``puid``, ``timestamp`` and ``event`` columns.
        vector_size: Passed to ``Doc2Vec`` as ``vector_size``.
        window: Passed to ``Doc2Vec`` as ``window``.
        min_count: Passed to ``Doc2Vec`` as ``min_count``.
        workers: Passed to ``Doc2Vec`` as ``workers``.

    Returns:
        tuple: ``(doc2vec_model, frame)``, where ``frame`` holds ``puid`` plus the
        inferred vector components in columns named ``event2vec_<i>``.
    """
    bag_of_events = (
        events
        .sort_values(by=['puid', 'timestamp'])
        .groupby(['puid'])['event']
        .agg(list)
        .reset_index()
    )
    sequences = bag_of_events['event'].tolist()
    documents = [TaggedDocument(words, [tag]) for tag, words in enumerate(sequences)]
    doc2vec_model = Doc2Vec(
        documents,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
    )
    vectors = pd.DataFrame(
        [doc2vec_model.infer_vector(words) for words in sequences],
        # Share the grouped frame's index so the column-wise concat pairs rows explicitly.
        index=bag_of_events.index,
    ).rename(columns=lambda position: 'event2vec_' + str(position))
    return doc2vec_model, pd.concat([bag_of_events[['puid']], vectors], axis=1)

In [12]:
# Train the event Doc2Vec model and build the per-puid `event2vec_*` feature frame.
doc2vec_model, doc2vec_df = get_event2vec(events)

In [13]:
# Start from the meta_info table and left-join each feature frame on `puid`.
# After every join, missing values are replaced with the -100 sentinel.
result = cluster_yt.read('//home/cloud_analytics/scoring/meta_info').as_dataframe()
for feature_frame in (tfidf_bag, count_bag, doc2vec_df):
    result = pd.merge(result, feature_frame, on='puid', how='left').fillna(-100)

In [17]:
# Drop, in place, every column whose name is longer than 255 characters.
# NOTE(review): presumably a column-name length limit of the output table — confirm.
overlong_columns = [col for col in result.columns if len(col) > 255]
result.drop(overlong_columns, axis=1, inplace=True)

In [18]:
# Write the assembled feature table to the learning_dataset path on YT.
cluster_yt.write('//home/cloud_analytics/scoring/learning_dataset', result)