In [1]:
import requests
import pandas as pd
import datetime
import json
import time
import pymysql
from io import StringIO
from sqlalchemy import create_engine
from share import assasin_beatch

In [2]:
# Program parameters controlling how main() polls the log request status.
# Seconds to wait between two consecutive status checks (passed to time.sleep).
TIME_TO_SLEEP = 10
# Polling budget: main() raises LoopingError once its loop counter exceeds
# this number of iterations.
LOOP_ERROR_N = 50

In [3]:
# Logs API request parameters.
# TOKEN is sent verbatim as the value of the Authorization header.
TOKEN = assasin_beatch.TOKEN
COUNTER_ID = 44147844
START_DATE = '2019-05-01'
END_DATE = '2019-05-02'
SOURCE = 'hits'
API_HOST = 'https://api-metrika.yandex.ru'
# NOTE(review): 'ym:s:visitID' carries the ym:s: prefix while SOURCE is 'hits'
# and every other field is ym:pv: -- confirm the API accepts this mix.
API_FIELDS = (
    'ym:pv:counterID',
    'ym:s:visitID',
    'ym:pv:clientID',
    'ym:pv:watchID',
    'ym:pv:dateTime',
    'ym:pv:lastTrafficSource',
    'ym:pv:URL',
    'ym:pv:goalsID',
)

# Query-string parameters shared by the evaluate and create requests.
# Field names are joined in case-insensitive alphabetical order.
params = dict(
    date1=START_DATE,
    date2=END_DATE,
    source=SOURCE,
    fields=','.join(sorted(API_FIELDS, key=str.lower)),
)

In [4]:
class CreationQueryError(Exception):
    """Raised by creating_query() when the log request could not be created."""


class LoopingError(Exception):
    """Raised by main() when status polling exceeds LOOP_ERROR_N iterations."""

def eval_query(API_HOST,COUNTER_ID,TOKEN,params):
    """Ask the API whether a log request with ``params`` can be created.

    Sends a GET to the counter's ``logrequests/evaluate`` endpoint with
    ``TOKEN`` as the Authorization header.

    Returns:
        The ``possible`` value of the ``log_request_evaluation`` object in
        the JSON body when the response status is 200, otherwise ``False``.
    """
    endpoint = '{0}/management/v1/counter/{1}/logrequests/evaluate'.format(
        API_HOST, COUNTER_ID
    )
    reply = requests.get(endpoint, params=params, headers={'Authorization': TOKEN})

    # Any status other than 200 is reported as "not possible".
    if reply.status_code != 200:
        return False

    evaluation = json.loads(reply.text)['log_request_evaluation']
    return evaluation['possible']

def creating_query(API_HOST,COUNTER_ID,TOKEN,params):
    """Create a log request by POSTing ``params`` to the ``logrequests`` endpoint.

    Args:
        API_HOST: base URL of the API.
        COUNTER_ID: counter id placed in the URL path.
        TOKEN: value sent as the Authorization header.
        params: query-string parameters of the request.

    Returns:
        The response object, when the status is 200 and the JSON body has a
        non-empty ``log_request`` entry.

    Raises:
        CreationQueryError: for any other outcome (non-200 status, body that
            is not JSON, or body without ``log_request``). The message is the
            raw response text.
    """
    url = '{host}/management/v1/counter/{counter_id}/logrequests'\
        .format(host=API_HOST,
                counter_id=COUNTER_ID)
    headers = {'Authorization': TOKEN}
    responce = requests.post(url,params=params,headers=headers)

    if responce.status_code == 200:
        try:
            body = json.loads(responce.text)
        except ValueError:
            # Unparseable body: report it as a creation failure rather than
            # letting the decode error escape.
            body = None
        # .get() so that a missing key ends in CreationQueryError, not KeyError.
        if isinstance(body, dict) and body.get('log_request'):
            return responce
    raise CreationQueryError(responce.text)

def checking_status(API_HOST,COUNTER_ID,TOKEN,request_id):
    """Fetch the current state of a log request.

    Args:
        API_HOST: base URL of the API.
        COUNTER_ID: counter id placed in the URL path.
        TOKEN: value sent as the Authorization header.
        request_id: id of the log request to look up.

    Returns:
        A ``(status, response)`` tuple: the ``status`` string from the
        ``log_request`` object of the JSON body, and the response object
        itself so the caller can read further fields from it.

    Raises:
        ValueError: when the response status is not 200; the message is the
            raw response text.
    """
    url = '{host}/management/v1/counter/{counter_id}/logrequest/{request_id}' \
        .format(request_id=request_id,
                counter_id=COUNTER_ID,
                host=API_HOST)
    headers = {'Authorization': TOKEN}
    # The request is fully identified by the URL; no query parameters are sent,
    # so this function does not depend on any module-level state.
    responce = requests.get(url,headers=headers)
    if responce.status_code != 200:
        # Must be raised: callers unpack the return value into two names.
        raise ValueError(responce.text)
    return json.loads(responce.text)['log_request']['status'], responce

def download(API_HOST,COUNTER_ID,request_id,parts,TOKEN):
    """Download every part of a log request and combine them into one frame.

    Args:
        API_HOST: base URL of the API.
        COUNTER_ID: counter id placed in the URL path.
        request_id: id of the log request.
        parts: iterable of dicts, each with a ``part_number`` key.
        TOKEN: value sent as the Authorization header.

    Returns:
        A DataFrame with the rows of all parts, parsed as tab-separated text
        and re-indexed 0..n-1. An empty DataFrame when ``parts`` is empty.

    Raises:
        ValueError: when a part download does not return status 200; the
            message is the raw response text.
    """
    headers = {'Authorization': TOKEN}
    all_dfs = []
    for part in parts:
        part_num = part['part_number']
        url = '{host}/management/v1/counter/{counter_id}/logrequest/{request_id}/part/{part}/download' \
                .format(
                    host=API_HOST,
                    counter_id=COUNTER_ID,
                    request_id=request_id,
                    part=part_num
                )
        response = requests.get(url,headers=headers)
        if response.status_code != 200:
            # Without this check an error body would be parsed as TSV data.
            raise ValueError(response.text)
        all_dfs.append(pd.read_csv(StringIO(response.text),sep='\t'))

    if not all_dfs:
        # pd.concat refuses an empty list.
        return pd.DataFrame()
    # ignore_index: each part is numbered from 0, so keeping the per-part
    # index would produce duplicate labels.
    return pd.concat(all_dfs, ignore_index=True)

def main():
    """Run the whole export: evaluate, create, poll, download, convert types.

    Uses the module-level API_HOST, COUNTER_ID, TOKEN, params, API_FIELDS,
    TIME_TO_SLEEP and LOOP_ERROR_N.

    Returns:
        The downloaded DataFrame, with every requested field present in it
        converted: fields whose name contains 'date' to datetime, all others
        to str. ``None`` when eval_query() reports the request as not
        possible.

    Raises:
        LoopingError: the request is still 'created' after more than
            LOOP_ERROR_N polls.
        RuntimeError: polling ended in a status other than 'processed'.
    """
    if not eval_query(API_HOST,COUNTER_ID,TOKEN,params):
        return None

    created_responce = creating_query(API_HOST,COUNTER_ID,TOKEN,params)
    request_id = json.loads(created_responce.text)['log_request']['request_id']
    status, processed_responce = checking_status(API_HOST,COUNTER_ID,TOKEN,request_id)
    print('request_id: {0},checked status: {1}'.format(request_id,status))

    i = 0
    while status == 'created':
        time.sleep(TIME_TO_SLEEP)
        status, processed_responce = checking_status(API_HOST,COUNTER_ID,TOKEN,request_id)
        print('request_id: {0},cur status: {1}'.format(request_id,status))
        i+=1
        # Only give up if the request is still pending; a poll that just
        # finished must not be reported as a timeout.
        if status == 'created' and i > LOOP_ERROR_N:
            raise LoopingError('request_id:{0}'.format(request_id))

    # Leaving the loop only means the status is no longer 'created'; anything
    # but 'processed' has no usable parts to download.
    if status != 'processed':
        raise RuntimeError(
            'request_id:{0},unexpected status: {1}'.format(request_id,status))

    parts = json.loads(processed_responce.text)['log_request']['parts']
    df = download(API_HOST,COUNTER_ID,request_id,parts,TOKEN)

    for column in API_FIELDS:
        if column not in df.columns:
            # A requested field may be absent from the returned data.
            continue
        if 'date' in column:
            df[column] = pd.to_datetime(df[column])
        else:
            df[column] = df[column].astype(str)
    return df

In [5]:
# Check whether the API reports that a log request with these parameters is possible.
eval_query(API_HOST,COUNTER_ID,TOKEN,params)

True

In [6]:
# Create the log request; creating_query raises CreationQueryError on failure.
resp = creating_query(API_HOST,COUNTER_ID,TOKEN,params)


In [7]:
# Pull the id of the created request out of the JSON response body.
request_id = json.loads(resp.text)['log_request']['request_id']

In [15]:
# Query the request state; the status string is discarded and only the raw
# response is kept (rebinding resp).
_, resp = checking_status(API_HOST,COUNTER_ID,TOKEN,request_id)

In [16]:
# Inspect the raw JSON body of the status response.
resp.text

'{"log_request":{"request_id":5149786,"counter_id":44147844,"source":"hits","date1":"2019-05-01","date2":"2019-05-02","fields":["ym:pv:clientID","ym:pv:counterID","ym:pv:dateTime","ym:pv:goalsID","ym:pv:lastTrafficSource","ym:pv:URL","ym:pv:watchID"],"status":"processed","size":795570,"parts":[{"part_number":0,"size":795570}]}}'

In [17]:
# List of parts reported for the request; each entry carries a part_number.
parts = json.loads(resp.text)['log_request']['parts']

In [18]:
# Download every part and combine them into a single DataFrame.
df = download(API_HOST,COUNTER_ID,request_id,parts,TOKEN)

In [19]:
# Preview the first rows of the downloaded data.
df.head()

Unnamed: 0,ym:pv:clientID,ym:pv:counterID,ym:pv:dateTime,ym:pv:goalsID,ym:pv:lastTrafficSource,ym:pv:URL,ym:pv:watchID
0,1556542121847326075,44147844,2019-05-01 15:46:20,[],direct,https://metrica.yandex.com/about,2299626911413767664
1,1556542121847326075,44147844,2019-05-01 15:46:20,[],organic,https://metrica.yandex.com/about,2299626893212847600
2,1556542121847326075,44147844,2019-05-01 15:46:35,[],direct,https://metrica.yandex.com/about,2299630949544693132
3,1556542121847326075,44147844,2019-05-01 15:46:40,[],external,https://passport.yandex.com/auth?origin=metrica,2299632174425902446
4,1556725401310710672,44147844,2019-05-01 18:42:33,[],direct,https://metrica.yandex.com.tr/about,2302398464128912828


In [20]:
# Distinct values of the URL column.
df['ym:pv:URL'].unique()

array(['https://metrica.yandex.com/about',
       'https://passport.yandex.com/auth?origin=metrica',
       'https://metrica.yandex.com.tr/about',
       'https://passport.yandex.com.tr/auth?origin=metrica',
       'goal://metrica.yandex.com.tr/GetCounterMain',
       'https://metrica.yandex.com/about/',
       'goal://metrica.yandex.com/GetCounterMain',
       'https://metrica.yandex.com/about/info/traffic',
       'https://yandex.com/', 'goal://metrica.yandex.com/GetCounterUP',
       'goal://metrica.yandex.com/TryDemoMain',
       'https://metrika.yandex.ru/about',
       'goal://metrica.yandex.com.tr/TryDemoMain',
       'https://metrika.yandex.ru/about/info/integrations',
       'https://metrika.yandex.ru/about/info/pricing',
       'goal://metrica.yandex.com.tr/GetCounterUP',
       'https://passport.yandex.com/registration/?mode=register',
       'https://passport.yandex.com.tr/registration/?mode=register',
       'https://metrica.yandex.com/about/info/pricing',
       'goal://m

In [21]:
# Rows whose URL is exactly the TryDemoInt goal URL.
df[df['ym:pv:URL'] == 'goal://metrika.yandex.ru/TryDemoInt']

Unnamed: 0,ym:pv:clientID,ym:pv:counterID,ym:pv:dateTime,ym:pv:goalsID,ym:pv:lastTrafficSource,ym:pv:URL,ym:pv:watchID
3092,1556793583224488891,44147844,2019-05-02 13:39:55,[],internal,goal://metrika.yandex.ru/TryDemoInt,2320287662191611254
6294,1556749098335684271,44147844,2019-05-02 01:28:28,[],internal,goal://metrika.yandex.ru/TryDemoInt,2308783151959182757


In [22]:
# Distinct values of the goalsID column.
df['ym:pv:goalsID'].unique()

array(['[]', '[41646742]', '[30606889]'], dtype=object)

In [23]:
# Rows whose goalsID value is exactly the string '[41646742]'.
df[df['ym:pv:goalsID'] == '[41646742]']

Unnamed: 0,ym:pv:clientID,ym:pv:counterID,ym:pv:dateTime,ym:pv:goalsID,ym:pv:lastTrafficSource,ym:pv:URL,ym:pv:watchID
34,1556696203810958790,44147844,2019-05-01 10:36:49,[41646742],internal,goal://metrica.yandex.com/TryDemoMain,2294758517972077932
47,152693307862964988,44147844,2019-05-01 20:08:39,[41646742],internal,goal://metrica.yandex.com.tr/TryDemoMain,2303752923696270680
58,1556708190563429717,44147844,2019-05-01 13:56:57,[41646742],internal,goal://metrica.yandex.com/TryDemoMain,2297906562435059032
89,1556716647740848767,44147844,2019-05-01 16:17:30,[41646742],internal,goal://metrica.yandex.com.tr/TryDemoMain,2300117043842059710
113,1556718713732268464,44147844,2019-05-01 16:52:06,[41646742],internal,goal://metrica.yandex.com/TryDemoMain,2300661283645754874
...,...,...,...,...,...,...,...
6332,1556796797921691965,44147844,2019-05-02 14:33:23,[41646742],internal,goal://metrica.yandex.com/TryDemoMain,2321128691744574848
6431,1556781659612469643,44147844,2019-05-02 10:21:01,[41646742],internal,goal://metrica.yandex.com/TryDemoMain,2317159243952885232
6475,1556572772347950369,44147844,2019-05-02 19:42:48,[41646742],internal,goal://metrica.yandex.com/TryDemoMain,2325995441666592160
6523,1556781878890737662,44147844,2019-05-02 10:24:53,[41646742],internal,goal://metrica.yandex.com.tr/TryDemoMain,2317220281752161716


In [57]:
# Run the whole pipeline through main() and rebind df to its result.
df = main()

request_id: 3670021,checked status: created
request_id: 3670021,cur status: created
request_id: 3670021,cur status: created
request_id: 3670021,cur status: created
request_id: 3670021,cur status: created
request_id: 3670021,cur status: processed
