In [None]:
import pandas as pd, datetime, ast, os,sys, pymysql, requests
# Directory that holds the project-local helper modules imported below
# (data_loader, global_variables). The default is the original author's
# checkout; set CLOUD_ANALYTICS_MODULE_PATH to run the notebook elsewhere.
module_path = os.path.abspath(
    os.environ.get(
        'CLOUD_ANALYTICS_MODULE_PATH',
        '/home/ktereshin/yandex/arcadia/cloud/analytics/python/work',
    )
)
# Guard against adding the same entry again when the cell is re-run.
if module_path not in sys.path:
    sys.path.append(module_path)
from data_loader import clickhouse
from global_variables import (
    metrika_clickhouse_param_dict,
    cloud_clickhouse_param_dict
)
from nile.api.v1 import (
    clusters,
    aggregators as na,
    extractors as ne,
    filters as nf,
    Record
)
from vault_client import instances
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
def execute_query(query, cluster, alias, token, timeout=600):
    """POST a query to the `/query` endpoint of a YT cluster proxy and return its rows.

    Args:
        query: Query text, sent as the request body.
        cluster: Cluster name; the proxy host is ``http://<cluster>.yt.yandex.net``.
        alias: Value of the ``database`` URL parameter.
        token: Value of the ``password`` URL parameter.
        timeout: Timeout in seconds handed to ``requests`` (default 600).

    Returns:
        list of str: The stripped response body split on newlines. An empty
        body yields ``['']``, so callers should skip blank rows.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    proxy = "http://{}.yt.yandex.net".format(cluster)
    # NOTE(review): the token is placed in the URL and sent over plain HTTP,
    # so it can end up in proxy/server logs - confirm this is acceptable.
    url = "{proxy}/query?database={alias}&password={token}".format(proxy=proxy, alias=alias, token=token)
    # The context manager closes the session (and its pooled connections)
    # even when the request raises.
    with requests.Session() as s:
        resp = s.post(url, data=query, timeout=timeout)
        resp.raise_for_status()
        # resp.content is bytes; on Python 3 bytes.split('\n') raises
        # TypeError, so decode before splitting.
        # Assumes the endpoint answers in UTF-8 - TODO confirm.
        body = resp.content.decode('utf-8', 'replace')
    rows = body.strip().split('\n')
    return rows

In [None]:
# Fetch the stored secret version through the vault client; its 'value'
# mapping supplies the YT token and pool used below.
client = instances.Production()
yt_creds = client.get_version('ver-01d33pgv8pzc7t99s3egm24x47')

yt_secret = yt_creds['value']
# Template values made available to the cluster environment under `dates`.
env_templates = {'dates': '{2019-03-28..2019-03-31}'}

cluster_yt = clusters.yt.Hahn(
    token=yt_secret['token'],
    pool=yt_secret['pool'],
).env(templates=env_templates)

In [None]:
# Connection parameters later passed to execute_query.
cluster = 'hahn'
alias = "*ch_public"
# '%s' formatting renders the token from the fetched credentials as a string.
token = '%s' % (yt_creds['value']['token'])

In [None]:
# Query that returns one row per console event, annotated with a per-user
# session index.
#   * Innermost SELECT: rows of console_logs/events whose yandexuid consists
#     only of digits and is not one of '0'..'3'.
#   * ANY INNER JOIN: keeps only puids present in the acquisition cube with a
#     non-empty first_first_trial_consumption_datetime >= '2018-12-20'.
#   * GROUP BY yandexuid: collects events and ts into arrays. `tss_` is `tss`
#     shifted right by one (first element repeated), so `diffs` is the gap to
#     the previous event and is 0 for the first one. `session_indexes` is the
#     running count of gaps >= 1800 (presumably seconds, i.e. 30 minutes -
#     confirm the unit of `ts`).
#   * ARRAY JOIN: unfolds the arrays back to one row per event; only users
#     with more than 3 events (`hits > 3`) are kept.
# NOTE(review): the order inside groupArray depends on the inner ORDER BY
# (by `timestamp`, while the diffs use `ts`) surviving the join and the
# aggregation - confirm that the engine guarantees this.
query = '''
SELECT
   yandexuid,
   events as event,
   tss as ts,
   session_indexes as session_index,
   diffs as time_diff,
   hits
FROM(
    SELECT
        yandexuid,
        groupArray(event) as events,
        groupArray(ts) as tss,
        length(events) as hits,
        arrayConcat([tss[1]],arraySlice(tss, 1,length(tss)-1)) as tss_,
        arrayMap(x,y -> y-x, tss_, tss)  as diffs,
        arrayCumSum( arrayMap(x -> x >=1800, diffs)) as session_indexes
    FROM(
        SELECT
            t0.*
        FROM(
            SELECT
                yandexuid,
                puid,
                event_type,
                event,
                timestamp,
                ts
            FROM
                "//home/cloud_analytics/import/console_logs/events"
            WHERE 
                match(yandexuid, '^[0-9]+$') = 1
                AND yandexuid NOT IN ('0', '1', '2', '3')
            ORDER BY
                yandexuid,
                timestamp
        ) as t0
        ANY INNER JOIN (
            SELECT
                puid
            FROM
                "//home/cloud_analytics_test/cubes/acquisition_cube/cube"
            WHERE
                first_first_trial_consumption_datetime != ''
                AND first_first_trial_consumption_datetime >= '2018-12-20'

        ) as t1 
        ON t0.puid = t1.puid
    )
    GROUP BY
        yandexuid
)
ARRAY JOIN events, tss, session_indexes, diffs
WHERE
    hits > 3
ORDER BY
    yandexuid,
    ts
'''

result = execute_query(query=query, cluster=cluster, alias=alias, token=token)
# Each row is a tab-separated line; the column names follow the outer SELECT
# of `query`. Every value stays a string, no dtype conversion is applied.
# Blank rows are skipped: an empty response comes back as [''], which would
# otherwise make the DataFrame constructor fail on a column-count mismatch.
site_events = pd.DataFrame(
    [row.split('\t') for row in result if row],
    columns=['yandexuid', 'event', 'ts', 'session_index', 'time_diff', 'hits'],
)

In [None]:
# Sanity check: (rows, columns) of the loaded event table.
site_events.shape

In [None]:
# Preview the first 20 event rows.
site_events.head(20)

In [None]:
# Number of events in every (user, session) pair, smallest sessions first.
session_sizes = (
    site_events
    .groupby(['yandexuid', 'session_index'])['event']
    .count()
    .reset_index()
    .sort_values(by='event')
)
# Keep only the sessions that contain more than 3 events.
long_session_mask = session_sizes['event'] > 3
temp = session_sizes[long_session_mask]

In [None]:
# Restrict the event table to the (user, session) pairs selected in `temp`.
session_keys = ['yandexuid', 'session_index']
res = site_events.merge(temp[session_keys], on=session_keys, how='inner')

In [None]:
# One row per (user, session): the `event` and `ts` columns each become a
# Python list holding that session's values.
seq = res.groupby(['yandexuid', 'session_index'])[['event', 'ts']].agg(list).reset_index()

In [None]:
# Inspect the event list of the first session row.
seq['event'][0]

In [None]:
def get_seq(events_list):
    """Turn an ordered list of event names into a list of transitions.

    Every pair of neighbouring events is joined as ``'<previous>>><next>'``.

    Args:
        events_list: Sequence of event-name strings, in order.

    Returns:
        list of str: ``len(events_list) - 1`` transition strings. When the
        input has zero or one element there is no transition, and the input
        itself is returned unchanged.
    """
    # The original wrapped this return in try/except with an ['empty']
    # fallback that could never run; that dead branch is removed.
    if len(events_list) <= 1:
        return events_list
    return [
        previous + '>>' + following
        for previous, following in zip(events_list, events_list[1:])
    ]

In [None]:
# Spot-check get_seq on the events of the session row labelled 2.
get_seq(seq['event'][2])

In [None]:
# One text document per session: its transitions (see get_seq) joined by a
# single space. Each transition is stripped and its inner spaces are replaced
# by '_' so that a space only ever separates two transitions.
seq_list = [
    ' '.join(
        transition.strip().replace(' ', '_')
        for transition in get_seq(session_events)
    )
    for session_events in seq['event'].values
]

In [None]:
# Inspect the document built for the first session.
seq_list[0]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# The documents in seq_list are transitions separated by single spaces
# (spaces inside event names were replaced by '_'), so a token is any run of
# non-whitespace characters. The earlier pattern u'[^\t]+' only excluded tabs,
# which turned every whole document into one single token.
# min_df=10 drops tokens that occur in fewer than 10 documents.
vectorizer = CountVectorizer(token_pattern=r'\S+', min_df=10)
vectorizer.fit(list(seq_list))

In [None]:
# Fit the vectorizer on the session documents and build their count matrix.
# NOTE(review): the vectorizer was already fitted on the same data in the
# previous cell, so the fitting part of this call is repeated work.
count_vec_transform = vectorizer.fit_transform(list(seq_list))

In [None]:
# Dimensions of the resulting count matrix.
count_vec_transform.shape

In [39]:
def get_interval(list_):
    """Collapse integers into sorted runs of consecutive values.

    The input is sorted first. Values that follow each other with a step of
    at most 1 form one run (so duplicates stay inside their run). A run of a
    single value is rendered as ``'v'``, a longer one as ``'first-last'``.

    Example: ``[1, 5, 8, 3, 2, 9]`` -> ``['1-3', '5', '8-9']``.

    Args:
        list_: Iterable of integers, in any order.

    Returns:
        list of str: One entry per run, in ascending order; ``[]`` for an
        empty input.
    """
    values = sorted(list_)
    res = []
    if not values:
        return res

    # The previous implementation never closed the first run properly when it
    # was one or two elements long: [1, 2] gave ['2'], [1, 3] gave ['1-3'] and
    # [1, 2, 5] gave ['2', '5']. A single pass that tracks the first and the
    # latest value of the current run treats every run the same way.
    start = end = values[0]
    for value in values[1:]:
        if value - end > 1:
            # Gap found: emit the finished run and open a new one.
            res.append(str(start) if start == end else str(start) + '-' + str(end))
            start = value
        end = value
    # Emit the run that was still open when the input ended.
    res.append(str(start) if start == end else str(start) + '-' + str(end))
    return res


In [40]:
# Unsorted sample input for trying out get_interval.
a = [1,5,8,3,2,9,12, 13, 34,35,60]

In [41]:
# Run get_interval on the sample list.
get_interval(a)

['1-3', '5', '8-9', '12-13', '34-35', '60']