In [9]:
import pandas as pd

# Read the two raw sample exports (visits and registrations) from disk.
visits, regs = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/raw/visits.csv", "../data/raw/regs.csv")
)

# Preview the first rows of both frames as one tuple.
(visits.head(), regs.head())

(                                   uuid platform  \
 0  1de9ea66-70d3-4a1f-8735-df5ef7697fb9      web   
 1  f149f542-e935-4870-9734-6b4501eaf614      web   
 2  f149f542-e935-4870-9734-6b4501eaf614      web   
 3  08f0ebd4-950c-4dd9-8e97-b5bdf073eed1      web   
 4  08f0ebd4-950c-4dd9-8e97-b5bdf073eed1      web   
 
                                           user_agent                 date  
 0  Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...  2023-03-01T13:29:22  
 1  Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) Apple...  2023-03-01T16:44:28  
 2  Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) Apple...  2023-03-06T06:12:36  
 3  Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109...  2023-03-01T20:16:37  
 4  Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109...  2023-03-05T17:42:47  ,
                   date  user_id                    email platform  \
 0  2023-03-01T00:25:39  8838849     joseph95@example.org      web   
 1  2023-03-01T14:53:01  8741065  janetsuarez@example.net      web   
 2  202

In [10]:
# Bring both frames to one convenient schema before running any quality checks.

# Lower-case the column names (just in case the CSV headers use mixed case).
visits.columns = [c.lower() for c in visits.columns]
regs.columns = [c.lower() for c in regs.columns]

# Dates: values that cannot be parsed become NaT (errors="coerce") instead of
# raising, so they surface in the missing-value report rather than stopping here.
visits["date"] = pd.to_datetime(visits["date"], errors="coerce")
regs["date"] = pd.to_datetime(regs["date"], errors="coerce")

# Identifiers as strings. astype(str) also stringifies missing values (NaN
# becomes the text "nan"), which would hide them from every later isna() check,
# so .where(notna) puts the real missing marker back.
visits["uuid"] = visits["uuid"].astype(str).where(visits["uuid"].notna())
regs["user_id"] = regs["user_id"].astype(str).where(regs["user_id"].notna())

# Categories: trim whitespace and lower-case, keeping missing values missing
# for the same reason as above.
for df, cols in [(visits, ["platform"]), (regs, ["platform", "registration_type"])]:
    for c in cols:
        df[c] = df[c].astype(str).str.strip().str.lower().where(df[c].notna())

# Show the resulting dtypes of both frames.
visits.dtypes, regs.dtypes


(uuid                  object
 platform              object
 user_agent            object
 date          datetime64[ns]
 dtype: object,
 date                 datetime64[ns]
 user_id                      object
 email                        object
 platform                     object
 registration_type            object
 dtype: object)

In [11]:
# Size of each table.
for caption, frame in (("Visits shape:", visits), ("Registrations shape:", regs)):
    print(caption, frame.shape)

# Share of rows that exactly repeat an earlier row (mean of the boolean mask).
for caption, frame in (
    ("Доля дубликатов в visits:", visits),
    ("Доля дубликатов в registrations:", regs),
):
    print(caption, frame.duplicated().mean())

Visits shape: (1000, 4)
Registrations shape: (1000, 5)
Доля дубликатов в visits: 0.0
Доля дубликатов в registrations: 0.0


In [12]:
# Per-column share of missing values, one table after the other.
for heading, frame in (
    ("Пропуски в visits:", visits),
    ("\nПропуски в registrations:", regs),
):
    print(heading)
    print(frame.isna().mean())

Пропуски в visits:
uuid          0.0
platform      0.0
user_agent    0.0
date          0.0
dtype: float64

Пропуски в registrations:
date                 0.0
user_id              0.0
email                0.0
platform             0.0
registration_type    0.0
dtype: float64


In [13]:
# Frequency table for every categorical column, printed under its own heading.
for heading, frame, column in (
    ("Visits by platform:", visits, "platform"),
    ("\nRegistrations by platform:", regs, "platform"),
    ("\nRegistration types:", regs, "registration_type"),
):
    print(heading)
    print(frame[column].value_counts())

Visits by platform:
platform
web        954
android     27
ios         19
Name: count, dtype: int64

Registrations by platform:
platform
android    517
web        265
ios        218
Name: count, dtype: int64

Registration types:
registration_type
email     446
google    303
apple     178
yandex     73
Name: count, dtype: int64


In [14]:
import pandas as pd
import requests

# API root and the date window sent as query parameters with each request.
BASE = "https://data-charts-api.hexlet.app"
BEGIN, END = "2023-03-01", "2023-09-01"

# Query-string parameters shared by the API requests.
params = dict(begin=BEGIN, end=END)
params

{'begin': '2023-03-01', 'end': '2023-09-01'}

In [16]:
# 1) GET the visits for the configured date window.
resp_v = requests.get(f"{BASE}/visits", params=params, timeout=60)
resp_v.raise_for_status()  # raise a clear HTTPError on any 4xx/5xx response

# 2) JSON -> DataFrame
visits_api = pd.DataFrame(resp_v.json())

# 3) Align names and types with the schema used for the CSV files.
# API fields: visit_id, platform, user_agent, datetime
# wanted:     uuid,     platform, user_agent, date (datetime64)
visits_api = visits_api.rename(columns={"visit_id": "uuid", "datetime": "date"})
# Unparseable timestamps become NaT so they show up in the null summary.
visits_api["date"] = pd.to_datetime(visits_api["date"], errors="coerce")
for c in ["uuid", "platform", "user_agent"]:
    # astype(str) would turn a JSON null into the text "none"/"nan" and hide it
    # from isna(); .where(notna) restores the real missing value afterwards.
    # NOTE(review): user_agent is lower-cased here, while the CSV-based frame
    # keeps its original casing; confirm this difference is intended.
    visits_api[c] = (
        visits_api[c].astype(str).str.strip().str.lower().where(visits_api[c].notna())
    )

# Preview the first rows together with the frame size.
visits_api.head(), visits_api.shape


(                                   uuid platform  \
 0  1de9ea66-70d3-4a1f-8735-df5ef7697fb9      web   
 1  f149f542-e935-4870-9734-6b4501eaf614      web   
 2  08f0ebd4-950c-4dd9-8e97-b5bdf073eed1      web   
 3  19322fed-157c-49c6-b16e-2d5cabeb9592      web   
 4  04762a22-3c9f-40c9-9ac9-6628c4381836      web   
 
                                           user_agent                date  
 0  mozilla/5.0 (windows nt 10.0; win64; x64) appl... 2023-03-01 10:36:22  
 1  mozilla/5.0 (windows nt 10.0; wow64; trident/7... 2023-03-01 06:25:00  
 2  mozilla/5.0 (macintosh; intel mac os x 10_11_2... 2023-03-01 10:26:13  
 3  mozilla/5.0 (macintosh; intel mac os x 10_15_7... 2023-03-01 12:33:06  
 4  mozilla/5.0 (windows nt 10.0; win64; x64) appl... 2023-03-01 01:38:35  ,
 (263459, 4))

In [17]:
# GET the registrations for the configured date window.
resp_r = requests.get(f"{BASE}/registrations", params=params, timeout=60)
resp_r.raise_for_status()  # raise a clear HTTPError on any 4xx/5xx response

regs_api = pd.DataFrame(resp_r.json())

# API fields: user_id, email, platform, registration_type, datetime
# wanted:     user_id, email, platform, registration_type, date (datetime64)
regs_api = regs_api.rename(columns={"datetime": "date"})
# Unparseable timestamps become NaT so they show up in the null summary.
regs_api["date"] = pd.to_datetime(regs_api["date"], errors="coerce")
for c in ["user_id", "email", "platform", "registration_type"]:
    # astype(str) would turn a JSON null into the text "none"/"nan" and hide it
    # from isna(); .where(notna) restores the real missing value afterwards.
    regs_api[c] = (
        regs_api[c].astype(str).str.strip().str.lower().where(regs_api[c].notna())
    )

# Preview the first rows together with the frame size.
regs_api.head(), regs_api.shape

(                 date                               user_id  \
 0 2023-03-01 07:40:13  2e0f6bb8-b029-4f45-a786-2b53990d37f1   
 1 2023-03-01 13:14:00  f007f97c-9d8b-48b5-af08-119bb8f6d9b6   
 2 2023-03-01 03:05:50  24ff46ae-32b3-4a74-8f27-7cf0b8f32f15   
 3 2023-03-01 00:04:47  3e9914e1-5d73-4c23-b25d-b59a3aeb2b60   
 4 2023-03-01 18:31:52  27f875fc-f8ce-4aeb-8722-0ecb283d0760   
 
                         email platform registration_type  
 0           ebyrd@example.org      web            google  
 1    knightgerald@example.org      web             email  
 2  cherylthompson@example.com      web             apple  
 3       halldavid@example.org      web             email  
 4        denise86@example.net      web            google  ,
 (21836, 5))

In [18]:
# One line per API frame: its shape and the rounded share of nulls per column.
for label, frame in (("visits_api:", visits_api), ("regs_api:", regs_api)):
    null_share = frame.isna().mean().round(4).to_dict()
    print(label, frame.shape, "| nulls:", null_share)

# Top-10 most frequent values of each categorical column.
for heading, frame, column in (
    ("\nVisits platforms:\n", visits_api, "platform"),
    ("\nRegs platforms:\n", regs_api, "platform"),
    ("\nRegs types:\n", regs_api, "registration_type"),
):
    print(heading, frame[column].value_counts().head(10))


visits_api: (263459, 4) | nulls: {'uuid': 0.0, 'platform': 0.0, 'user_agent': 0.0, 'date': 0.0}
regs_api: (21836, 5) | nulls: {'date': 0.0, 'user_id': 0.0, 'email': 0.0, 'platform': 0.0, 'registration_type': 0.0}

Visits platforms:
 platform
web        236301
android     13972
bot          7382
ios          5804
Name: count, dtype: int64

Regs platforms:
 platform
android    10582
web         6877
ios         4377
Name: count, dtype: int64

Regs types:
 registration_type
email     8996
google    7105
apple     4006
yandex    1729
Name: count, dtype: int64


In [19]:
import json
import os

# Create the data folders if they do not exist yet.
for folder in ("../data/raw", "../data/processed"):
    os.makedirs(folder, exist_ok=True)

# 1) Keep the raw API responses as JSON, just in case.
for raw_path, response in (
    (f"../data/raw/visits_{BEGIN}_{END}.json", resp_v),
    (f"../data/raw/registrations_{BEGIN}_{END}.json", resp_r),
):
    with open(raw_path, "w", encoding="utf-8") as f:
        json.dump(response.json(), f, ensure_ascii=False, indent=2)

# 2) Cleaned CSV files for the rest of the analysis (row index not written).
for frame, csv_path in (
    (visits_api, "../data/processed/visits_api_clean.csv"),
    (regs_api, "../data/processed/registrations_api_clean.csv"),
):
    frame.to_csv(csv_path, index=False)

"Saved!"

'Saved!'

In [20]:
# Re-read the local CSV samples and parse their timestamps
# (values that cannot be parsed become NaT).
v_local, r_local = (
    pd.read_csv(csv_path).assign(
        date=lambda frame: pd.to_datetime(frame["date"], errors="coerce")
    )
    for csv_path in ("../data/raw/visits.csv", "../data/raw/regs.csv")
)

# Print the earliest and latest timestamp of each local and API data set.
for label, frame in (
    ("local visits date range:", v_local),
    ("api   visits date range:", visits_api),
    ("local regs   date range:", r_local),
    ("api   regs   date range:", regs_api),
):
    print(label, frame["date"].min(), "→", frame["date"].max())

local visits date range: 2023-03-01 00:05:35 → 2023-03-07 23:05:08
api   visits date range: 2023-03-01 00:00:43 → 2023-08-31 23:52:57
local regs   date range: 2023-03-01 00:12:22 → 2023-03-05 22:04:01
api   regs   date range: 2023-03-01 00:04:47 → 2023-08-31 23:43:26
