In [5]:
import requests
import pandas as pd
from io import StringIO
import datetime
import json
from urllib.parse import urlencode
import time

# Получение данных через `Logs API`
## Logs API

`Logs API` позволяет выгрузить сырые данные со счетчика.

Документация по `Logs API` - https://yandex.ru/dev/metrika/doc/api2/logs/intro.html

Данные для этого кейса также доступны на Яндекс.Диске - https://disk.yandex.ru/d/sUmQmh_MnQWL4g?w=1

### Шаг 1: получаем токен
Для работы с API необходимо получить свой токен - https://yandex.ru/dev/oauth/doc/dg/tasks/get-oauth-token.html

Создаем приложение тут (указываем права для чтения в Яндекс.Метрике) - https://oauth.yandex.ru/client/new

Переходим по ссылке вида - `https://oauth.yandex.ru/authorize?response_type=token&client_id=<идентификатор приложения>`

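Полученный токен можно сохранить в файл `.yatoken.txt`. Код ниже читает его по относительному пути `../.yatoken.txt`, поэтому файл должен лежать в директории уровнем выше рабочей директории ноутбука (например, в домашней директории, если ноутбук находится в её подпапке).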

In [6]:
# Read the OAuth token from a file kept outside the notebook so that it is
# never hard-coded here. The `with` block closes the file handle right away
# instead of leaving it open until garbage collection.
with open('../.yatoken.txt') as token_file:
    TOKEN = token_file.read().strip()  # drop the trailing newline/whitespace

### Шаг 2: проверяем, можно ли создать запрос в Logs API

In [7]:
# Settings shared by the Logs API calls below.
API_HOST = 'https://api-metrika.yandex.ru'  # base URL of every endpoint
COUNTER_ID = 178943                         # goes into the /counter/<id>/ path segment

# Date range, sent as the `date1` / `date2` query parameters.
START_DATE, END_DATE = '2026-01-10', '2026-02-10'

# Value of the `source` query parameter.
SOURCE = 'hits'

# Fields to export; they are joined with commas into the `fields` parameter.
API_FIELDS = (
    'ym:pv:date',
    'ym:pv:dateTime',
    'ym:pv:URL',
    'ym:pv:deviceCategory',
    'ym:pv:operatingSystemRoot',
    'ym:pv:clientID',
    'ym:pv:browser',
    'ym:pv:lastTrafficSource',
)


In [8]:
# Headers passed to every Logs API call in this notebook: the OAuth token
# held in TOKEN and the content type.
header_dict = {
    'Authorization': 'OAuth {}'.format(TOKEN),
    'Content-Type': 'application/x-yametrika+json',
}

In [9]:
# Ask the Logs API whether a log request with these parameters can be created.
# The query string carries the date range, the source and the comma-separated
# list of fields.
url_params = urlencode(
    [
        ('date1', START_DATE),
        ('date2', END_DATE),
        ('source', SOURCE),
        ('fields', ','.join(API_FIELDS))
    ]
)

url = f'{API_HOST}/management/v1/counter/{COUNTER_ID}/logrequests/evaluate?{url_params}'

# Without a timeout `requests` waits indefinitely on an unresponsive server,
# which would freeze the notebook; give up after 60 seconds instead.
r = requests.get(url, headers=header_dict, timeout=60)

In [10]:
# Display the HTTP status code of the response stored in `r`.
r.status_code

200

In [11]:
# Parse the response body as JSON and display its 'log_request_evaluation' entry.
json.loads(r.text)['log_request_evaluation']

{'possible': True,
 'expected_size': 139544262,
 'max_possible_day_quantity': 2380,
 'log_request_sum_max_size': 10737418240,
 'log_request_sum_size': 357786870}

### Шаг 3: создаем запрос

In [12]:
# Create the log request. The query string carries the date range, the source
# and the field list; field names are sorted case-insensitively before joining.
url_params = urlencode(
    [
        ('date1', START_DATE),
        ('date2', END_DATE),
        ('source', SOURCE),
        ('fields', ','.join(sorted(API_FIELDS, key=str.lower)))
    ]
)
url = f'{API_HOST}/management/v1/counter/{COUNTER_ID}/logrequests?{url_params}'

# Without a timeout `requests` waits indefinitely on an unresponsive server,
# which would freeze the notebook; give up after 60 seconds instead.
r = requests.post(url, headers=header_dict, timeout=60)

In [13]:
# Display the HTTP status code of the response stored in `r`.
r.status_code

200

In [14]:
# Parse the response body as JSON and display its 'log_request' entry.
json.loads(r.text)['log_request']

{'request_id': 51600625,
 'counter_id': 178943,
 'source': 'hits',
 'date1': '2026-01-10',
 'date2': '2026-02-10',
 'fields': ['ym:pv:browser',
  'ym:pv:clientID',
  'ym:pv:date',
  'ym:pv:dateTime',
  'ym:pv:deviceCategory',
  'ym:pv:lastTrafficSource',
  'ym:pv:operatingSystemRoot',
  'ym:pv:URL'],
 'status': 'created',
 'size': 0,
 'attribution': 'LASTSIGN'}

In [15]:
# Keep the id of the log request from the response in `r`; the status-polling
# and download URLs are built from it.
request_id = json.loads(r.text)['log_request']['request_id']

In [16]:
# Display the stored request id.
request_id

51600625

### Шаг 4: ждем окончания обработки

In [17]:
# Poll the log request once a minute until it leaves the 'created' state.
# The status URL does not change between iterations, so it is built once.
url = '{host}/management/v1/counter/{counter_id}/logrequest/{request_id}' \
        .format(request_id=request_id,
                counter_id=COUNTER_ID,
                host=API_HOST)

status = 'created'
while status == 'created':
    time.sleep(60)
    print('trying')

    # A timeout keeps a stalled connection from blocking the loop forever.
    r = requests.get(url, headers=header_dict, timeout=60)
    if r.status_code != 200:
        # RuntimeError (an ordinary Exception) instead of BaseException, which
        # is reserved for interpreter-level signals such as KeyboardInterrupt.
        raise RuntimeError(r.text)

    log_request = json.loads(r.text)['log_request']  # parse the body once
    status = log_request['status']
    print(json.dumps(log_request, indent = 4))

# Any status other than 'created' ends the loop, but only a processed request
# can be downloaded. Stop here with a clear message rather than letting a later
# cell fail on a missing 'parts' key.
if status != 'processed':
    raise RuntimeError(
        f'Log request {request_id} finished with status {status!r}; expected \'processed\''
    )

trying
{
    "request_id": 51600625,
    "counter_id": 178943,
    "source": "hits",
    "date1": "2026-01-10",
    "date2": "2026-02-10",
    "fields": [
        "ym:pv:browser",
        "ym:pv:clientID",
        "ym:pv:date",
        "ym:pv:dateTime",
        "ym:pv:deviceCategory",
        "ym:pv:lastTrafficSource",
        "ym:pv:operatingSystemRoot",
        "ym:pv:URL"
    ],
    "status": "processed",
    "size": 153169403,
    "parts": [
        {
            "part_number": 0,
            "size": 71703406
        },
        {
            "part_number": 1,
            "size": 81465997
        }
    ],
    "attribution": "LASTSIGN"
}


In [18]:
# Display the 'log_request' entry of the most recent response stored in `r`.
json.loads(r.text)['log_request']

{'request_id': 51600625,
 'counter_id': 178943,
 'source': 'hits',
 'date1': '2026-01-10',
 'date2': '2026-02-10',
 'fields': ['ym:pv:browser',
  'ym:pv:clientID',
  'ym:pv:date',
  'ym:pv:dateTime',
  'ym:pv:deviceCategory',
  'ym:pv:lastTrafficSource',
  'ym:pv:operatingSystemRoot',
  'ym:pv:URL'],
 'status': 'processed',
 'size': 153169403,
 'parts': [{'part_number': 0, 'size': 71703406},
  {'part_number': 1, 'size': 81465997}],
 'attribution': 'LASTSIGN'}

In [19]:
# Keep the list of parts of the log request and display it; the download loop
# reads each entry's 'part_number'.
parts = json.loads(r.text)['log_request']['parts']
parts

[{'part_number': 0, 'size': 71703406}, {'part_number': 1, 'size': 81465997}]

### Шаг 5: выгружаем данные

In [20]:
# Download every part of the log request and combine them into one DataFrame.
tmp_dfs = []
for part in parts:
    part_num = part['part_number']
    url = '{host}/management/v1/counter/{counter_id}/logrequest/{request_id}/part/{part}/download' \
            .format(
                host=API_HOST,
                counter_id=COUNTER_ID,
                request_id=request_id,
                part=part_num
            )

    # The timeout limits how long we wait for the server to connect or to send
    # the next chunk; it is not a cap on the total download time.
    r = requests.get(url, headers=header_dict, timeout=60)
    if r.status_code != 200:
        # The original raised `BaseError`, a name that is not defined anywhere,
        # so a failed download surfaced as NameError and hid the API message.
        raise RuntimeError(r.text)

    # Each part is a tab-separated table.
    tmp_df = pd.read_csv(StringIO(r.text), sep = '\t')
    tmp_dfs.append(tmp_df)

hits_df = pd.concat(tmp_dfs)

In [21]:
# Display the (rows, columns) shape of the combined hits frame.
hits_df.shape

(912070, 8)

In [22]:
# Save the hits as a tab-separated file, without the DataFrame index column.
hits_df.to_csv('metrika_cloud_case_data_hits.csv', sep = '\t', index = False)

### Шаг 6: то же самое, но для визитов

In [23]:
# Switch the request settings to visits: a new `source` value and the fields
# to export, which are joined with commas into the `fields` parameter.
SOURCE = 'visits'
API_FIELDS = (
    'ym:s:date',
    'ym:s:dateTime',
    'ym:s:startURL',
    'ym:s:deviceCategory',
    'ym:s:operatingSystemRoot',
    'ym:s:clientID',
    'ym:s:browser',
    'ym:s:lastTrafficSource',
    'ym:s:purchaseRevenue',
    'ym:s:purchaseID',
)


In [24]:
# Create the log request. The query string carries the date range, the source
# and the field list; field names are sorted case-insensitively before joining.
url_params = urlencode(
    [
        ('date1', START_DATE),
        ('date2', END_DATE),
        ('source', SOURCE),
        ('fields', ','.join(sorted(API_FIELDS, key=str.lower)))
    ]
)
url = f'{API_HOST}/management/v1/counter/{COUNTER_ID}/logrequests?{url_params}'

# Without a timeout `requests` waits indefinitely on an unresponsive server,
# which would freeze the notebook; give up after 60 seconds instead.
r = requests.post(url, headers=header_dict, timeout=60)

In [25]:
# Display the HTTP status code of the response stored in `r`.
r.status_code

200

In [26]:
# Parse the response body as JSON and display its 'log_request' entry.
json.loads(r.text)['log_request']

{'request_id': 51600631,
 'counter_id': 178943,
 'source': 'visits',
 'date1': '2026-01-10',
 'date2': '2026-02-10',
 'fields': ['ym:s:browser',
  'ym:s:clientID',
  'ym:s:date',
  'ym:s:dateTime',
  'ym:s:deviceCategory',
  'ym:s:lastTrafficSource',
  'ym:s:operatingSystemRoot',
  'ym:s:purchaseID',
  'ym:s:purchaseRevenue',
  'ym:s:startURL'],
 'status': 'created',
 'size': 0,
 'attribution': 'LASTSIGN'}

In [27]:
# Keep the id of the log request from the response in `r`; the status-polling
# and download URLs are built from it.
request_id = json.loads(r.text)['log_request']['request_id']

In [28]:
# Display the stored request id.
request_id

51600631

In [29]:
# Poll the log request once a minute until it leaves the 'created' state.
# The status URL does not change between iterations, so it is built once.
url = '{host}/management/v1/counter/{counter_id}/logrequest/{request_id}' \
        .format(request_id=request_id,
                counter_id=COUNTER_ID,
                host=API_HOST)

status = 'created'
while status == 'created':
    time.sleep(60)
    print('trying')

    # A timeout keeps a stalled connection from blocking the loop forever.
    r = requests.get(url, headers=header_dict, timeout=60)
    if r.status_code != 200:
        # RuntimeError (an ordinary Exception) instead of BaseException, which
        # is reserved for interpreter-level signals such as KeyboardInterrupt.
        raise RuntimeError(r.text)

    log_request = json.loads(r.text)['log_request']  # parse the body once
    status = log_request['status']
    print(json.dumps(log_request, indent = 4))

# Any status other than 'created' ends the loop, but only a processed request
# can be downloaded. Stop here with a clear message rather than letting a later
# cell fail on a missing 'parts' key.
if status != 'processed':
    raise RuntimeError(
        f'Log request {request_id} finished with status {status!r}; expected \'processed\''
    )

trying
{
    "request_id": 51600631,
    "counter_id": 178943,
    "source": "visits",
    "date1": "2026-01-10",
    "date2": "2026-02-10",
    "fields": [
        "ym:s:browser",
        "ym:s:clientID",
        "ym:s:date",
        "ym:s:dateTime",
        "ym:s:deviceCategory",
        "ym:s:lastTrafficSource",
        "ym:s:operatingSystemRoot",
        "ym:s:purchaseID",
        "ym:s:purchaseRevenue",
        "ym:s:startURL"
    ],
    "status": "processed",
    "size": 20180480,
    "parts": [
        {
            "part_number": 0,
            "size": 20180480
        }
    ],
    "attribution": "LASTSIGN"
}


In [30]:
# Display the 'log_request' entry of the most recent response stored in `r`.
json.loads(r.text)['log_request']

{'request_id': 51600631,
 'counter_id': 178943,
 'source': 'visits',
 'date1': '2026-01-10',
 'date2': '2026-02-10',
 'fields': ['ym:s:browser',
  'ym:s:clientID',
  'ym:s:date',
  'ym:s:dateTime',
  'ym:s:deviceCategory',
  'ym:s:lastTrafficSource',
  'ym:s:operatingSystemRoot',
  'ym:s:purchaseID',
  'ym:s:purchaseRevenue',
  'ym:s:startURL'],
 'status': 'processed',
 'size': 20180480,
 'parts': [{'part_number': 0, 'size': 20180480}],
 'attribution': 'LASTSIGN'}

In [31]:
# Keep the list of parts of the log request and display it; the download loop
# reads each entry's 'part_number'.
parts = json.loads(r.text)['log_request']['parts']
parts

[{'part_number': 0, 'size': 20180480}]

In [32]:
# Download every part of the log request and combine them into one DataFrame.
tmp_dfs = []
for part in parts:
    part_num = part['part_number']
    url = '{host}/management/v1/counter/{counter_id}/logrequest/{request_id}/part/{part}/download' \
            .format(
                host=API_HOST,
                counter_id=COUNTER_ID,
                request_id=request_id,
                part=part_num
            )

    # The timeout limits how long we wait for the server to connect or to send
    # the next chunk; it is not a cap on the total download time.
    r = requests.get(url, headers=header_dict, timeout=60)
    if r.status_code != 200:
        # The original raised `BaseError`, a name that is not defined anywhere,
        # so a failed download surfaced as NameError and hid the API message.
        raise RuntimeError(r.text)

    # Each part is a tab-separated table.
    tmp_df = pd.read_csv(StringIO(r.text), sep = '\t')
    tmp_dfs.append(tmp_df)

visits_df = pd.concat(tmp_dfs)

In [33]:
# Display the (rows, columns) shape of the combined visits frame.
visits_df.shape

(106194, 10)

In [34]:
# Save the visits as a tab-separated file, without the DataFrame index column.
visits_df.to_csv('metrika_cloud_case_data_visits.csv', sep = '\t', index = False)