In [1]:
import pandas as pd, datetime, ast, os,sys, pymysql, logging, requests
module_path = os.path.abspath(os.path.join('/home/ktereshin/yandex/arcadia/cloud/analytics/python/work'))
if module_path not in sys.path:
    sys.path.append(module_path)
from data_loader import clickhouse
from global_variables import (
    metrika_clickhouse_param_dict,
    cloud_clickhouse_param_dict
)
from nile.api.v1 import (
    clusters,
    aggregators as na,
    extractors as ne,
    filters as nf,
    Record
)
from vault_client import instances
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
def execute_query(query, cluster, alias, token, timeout=600):
    logger.info("Executing query: %s", query)
    proxy = "http://{}.yt.yandex.net".format(cluster)
    s = requests.Session()
    url = "{proxy}/query?database={alias}&password={token}".format(proxy=proxy, alias=alias, token=token)
    resp = s.post(url, data=query, timeout=timeout)
    if resp.status_code != 200:
        logger.error("Response status: %s", resp.status_code)
        logger.error("Response headers: %s", resp.headers)
        logger.error("Response content: %s", resp.content)
    resp.raise_for_status()
    rows = resp.content.strip().split('\n')
    logger.info("Time spent: %s seconds, rows returned: %s", resp.elapsed.total_seconds(), len(rows))
    return rows

In [3]:
logger = logging.getLogger(__name__)
client = instances.Production()
yt_creds = client.get_version('ver-01d33pgv8pzc7t99s3egm24x47')
cluster_yt = clusters.yt.Hahn(
    token = yt_creds['value']['token'],
    pool = yt_creds['value']['pool'],
    
).env(

    templates=dict(
        dates='{2019-03-28..2019-03-31}'
    )
)

In [4]:
cluster = 'hahn'
alias = "*ch_public"
token = '%s' % (yt_creds['value']['token'])

In [6]:
events = cluster_yt.read('//home/cloud_analytics/scoring/events').as_dataframe()

In [163]:
events

Unnamed: 0,date,delta,event,event_type,puid,timestamp,ts
0,2018-12-11,0,https://cloud.yandex.ru/docs/speechkit/pricing,pageview,100183458,2018-12-11T14:19:13.081Z,1544537953.081
1,2018-12-11,24.066999912261963,https://cloud.yandex.ru/docs/speechkit/stt,pageview,100183458,2018-12-11T14:19:37.148Z,1544537977.148
2,2018-12-19,666375.0339999199,https://cloud.yandex.ru,pageview,100183458,2018-12-19T07:25:52.182Z,1545204352.182
3,2018-12-19,9.30299997329712,https://console.cloud.yandex.ru,pageview,100183458,2018-12-19T07:26:01.485Z,1545204361.485
4,2018-12-19,8.754000186920166,/api/iam/createCloudPublic,event,100183458,2018-12-19T07:26:10.239Z,1545204370.239
5,2018-12-19,1,https://console.cloud.yandex.ru,pageview,100183458,2018-12-19T07:26:11.239Z,1545204371.239
6,2018-12-19,5.753000020980835,https://console.cloud.yandex.ru/settings,pageview,100183458,2018-12-19T07:26:16.992Z,1545204376.992
7,2018-12-19,7.829999923706055,https://console.cloud.yandex.ru/folders/id,pageview,100183458,2018-12-19T07:26:24.822Z,1545204384.822
8,2018-12-19,10.384999990463257,https://cloud.yandex.ru/docs/speechkit,pageview,100183458,2018-12-19T07:26:35.207Z,1545204395.207
9,2018-12-19,14.256999969482422,https://cloud.yandex.ru/docs/speechkit/tts,pageview,100183458,2018-12-19T07:26:49.464Z,1545204409.464


In [105]:
bag_of_events = events.groupby(['puid'])['event'].agg(lambda x: '\t'.join(list(x))).reset_index()

In [137]:
def get_count_vectors(bag_of_events):
    vectorizer = CountVectorizer(token_pattern=u'[^\t]+')
    vectorizer.fit(list(bag_of_events['event']))

    count_vec_transform = vectorizer.fit_transform(list(bag_of_events['event']))
    return pd.concat(
        [
            bag_of_events[['puid']],
            pd.DataFrame(count_vec_transform.toarray(), columns =vectorizer.get_feature_names())
        ],
        axis = 1
    )

def get_tfidf_vectors(bag_of_events):
    vectorizer = TfidfVectorizer(token_pattern=u'[^\t]+')
    vectorizer.fit(list(bag_of_events['event']))

    count_vec_transform = vectorizer.fit_transform(list(bag_of_events['event']))
    return pd.concat(
        [
            bag_of_events[['puid']],
            pd.DataFrame(count_vec_transform.toarray(), columns =vectorizer.get_feature_names())
        ],
        axis = 1
    )

In [136]:
count_bag = get_count_vectors(bag_of_events)
tfidf_bag = get_tfidf_vectors(bag_of_events)

In [164]:
def get_event2vec(events):
    bag_of_events = events.sort_values(by=['puid', 'timestamp']).groupby(['puid'])['event'].agg(list).reset_index()
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(list(bag_of_events['event']))]
    doc2vec_model = Doc2Vec(documents, vector_size=50, window=5, min_count=1, workers=4)
    return doc2vec_model, pd.concat(
        [
            bag_of_events[['puid']],
            pd.DataFrame(list(bag_of_events['event'].apply(lambda x:doc2vec_model.infer_vector(x)))).rename(columns = lambda x: 'event2vec_'+str(x))
        ],
        axis = 1
    )

In [165]:
doc2vec_model, doc2vec_df = get_event2vec(events)

In [167]:
doc2vec_df

Unnamed: 0,puid,event2vec_0,event2vec_1,event2vec_2,event2vec_3,event2vec_4,event2vec_5,event2vec_6,event2vec_7,event2vec_8,...,event2vec_40,event2vec_41,event2vec_42,event2vec_43,event2vec_44,event2vec_45,event2vec_46,event2vec_47,event2vec_48,event2vec_49
0,100183458,-0.100708,-0.068842,-0.085151,-0.053985,0.032507,0.048527,-0.031207,-0.001349,0.040882,...,-0.008741,-0.064861,-0.011610,0.027784,-0.005831,-0.017537,0.006954,-0.014024,0.041187,0.027144
1,100409855,-0.041308,-0.004387,-0.017807,-0.091550,-0.082645,-0.012797,-0.046838,-0.010054,0.021097,...,0.038022,-0.030008,0.061364,0.031719,-0.102916,-0.015487,-0.060660,-0.084909,-0.030084,0.043351
2,101053473,-0.190803,-0.068959,-0.098817,-0.116320,-0.024516,0.024881,-0.041533,0.043831,0.119262,...,0.053418,-0.080188,0.064470,-0.026650,-0.154870,0.001587,-0.097875,-0.090158,-0.030592,0.185147
3,101113943,-0.048207,-0.039132,-0.078491,-0.075725,-0.070790,0.025982,-0.049126,-0.042347,0.028252,...,0.042070,-0.020208,0.038493,0.024738,-0.093280,-0.003166,0.017167,-0.045965,-0.034506,0.085851
4,101175206,-0.009409,0.010598,-0.026517,-0.069207,-0.019624,-0.033630,-0.027352,-0.009275,0.052080,...,0.049332,0.014153,0.023470,-0.024184,-0.038725,-0.004780,-0.029687,-0.016655,-0.007804,-0.006769
5,101232496,-0.127322,0.022741,-0.117153,-0.217718,-0.073679,0.041220,-0.145142,-0.166912,0.101787,...,0.134891,-0.106201,0.185849,0.036324,-0.289746,-0.045672,-0.167916,-0.173672,0.006490,0.055926
6,10125483,-0.006920,0.011576,-0.016116,-0.008140,-0.012122,-0.002139,0.037928,0.009265,-0.034475,...,0.003435,0.008759,-0.010724,0.029375,-0.014344,0.018856,-0.027253,-0.032336,0.010059,-0.047312
7,101267333,-0.114976,-0.033703,-0.070086,-0.149742,-0.109801,-0.007345,-0.095820,-0.074131,0.196834,...,0.076544,-0.069988,0.131554,0.039404,-0.101965,-0.000351,-0.016011,-0.099231,0.005755,0.059830
8,101708896,-0.083300,-0.035360,-0.071524,-0.057860,-0.148156,-0.041033,0.005509,-0.032518,0.029232,...,0.042233,-0.012990,-0.005236,0.028036,-0.076008,-0.034903,-0.071464,-0.105591,-0.013902,0.026966
9,101790871,-0.008421,0.002491,-0.057565,-0.036607,-0.084428,0.067619,-0.037812,-0.058296,0.005816,...,0.028947,-0.028766,0.039166,0.063680,-0.073801,-0.019859,-0.015488,-0.034874,-0.066989,0.070227


In [157]:
pd.DataFrame(list(bag_of_events['event'].apply(lambda x:doc2vec_model.infer_vector(x))))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,-0.061174,-0.045291,-0.058925,-0.041000,0.053636,0.059162,-0.064959,-0.001946,0.021584,-0.057815,...,-0.050536,-0.061041,-0.020218,0.044812,-0.067141,0.047250,-0.007110,0.028767,0.039752,0.039324
1,0.010247,-0.007528,-0.023134,-0.025248,-0.024083,0.006268,-0.063822,0.025785,0.001520,-0.005459,...,0.041220,-0.019249,0.004926,0.012099,-0.064036,-0.009069,-0.026151,-0.078939,0.006473,0.013705
2,-0.133472,-0.033975,-0.134110,-0.114152,-0.018455,0.071574,-0.049680,-0.024778,0.115219,0.004591,...,0.123370,-0.034270,0.069063,-0.028398,-0.163469,-0.039712,-0.085512,-0.089607,-0.026579,0.119928
3,0.043064,-0.014153,-0.086496,-0.071278,-0.061789,0.050085,-0.100721,-0.064124,0.018872,0.020967,...,0.062560,-0.002157,0.034031,0.068415,-0.141174,-0.001075,-0.018051,0.001933,-0.053257,0.122298
4,-0.028869,-0.034235,-0.040677,-0.084182,-0.041068,0.017802,-0.054713,-0.037747,0.029708,0.025164,...,0.049084,-0.021799,0.030348,0.025315,-0.081482,-0.027247,-0.070411,-0.043552,-0.021110,0.045338
5,-0.044998,0.064536,-0.218336,-0.116901,-0.032500,0.114890,-0.112470,-0.232758,0.112711,0.004920,...,0.189562,0.031752,0.181143,0.026009,-0.351087,-0.106968,-0.226094,-0.209512,0.073053,0.066773
6,-0.015812,-0.020868,0.014353,-0.026494,-0.003539,0.012448,0.042941,0.012100,-0.026236,-0.008115,...,-0.013094,0.008365,-0.024370,0.005516,0.016722,-0.006318,-0.033979,-0.018343,0.003554,-0.043184
7,-0.068022,0.003255,-0.127971,-0.086023,-0.079264,-0.022169,-0.121567,-0.044641,0.152307,-0.004288,...,0.070543,-0.045899,0.091526,0.030816,-0.095775,0.002857,0.053028,-0.079537,0.042079,0.050529
8,-0.029844,0.034911,-0.112640,-0.039946,-0.145353,-0.033210,-0.049179,-0.050774,0.029662,0.027434,...,0.065537,0.055696,-0.007159,-0.021488,-0.083910,-0.046772,-0.069655,-0.084146,-0.040319,0.032621
9,0.010103,0.041485,-0.029804,0.005550,0.055213,0.044042,-0.008498,-0.003568,0.024149,-0.024846,...,0.020816,-0.018802,0.026244,0.025123,0.015185,-0.012618,0.009068,0.034809,0.005019,0.007507
