In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv("geo_locations_astana_hackathon/geo_locations_astana_hackathon")

In [8]:
import ydata_profiling as yp

yp.ProfileReport(data)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:02<00:11,  2.23s/it][A
 33%|███▎      | 2/6 [00:02<00:04,  1.08s/it][A
100%|██████████| 6/6 [00:02<00:00,  2.03it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [6]:
print(data['randomized_id'].duplicated().sum())

1255882


In [7]:
print(data.shape[0] - data['randomized_id'].duplicated().sum())

6805


In [9]:
print(data.columns)

Index(['randomized_id', 'lat', 'lng', 'alt', 'spd', 'azm'], dtype='object')


In [6]:
import pandas as pd
import folium
from ipywidgets import widgets, interact
from IPython.display import display

# ==== Загружаем твой датасет ====
# df = pd.read_csv("your_dataset.csv")
df = data.copy()

# список пользователей
users = sorted(df["randomized_id"].unique())
user = df[df["randomized_id"] == 7210059622410451615]
print(type(users))
print(type(users[0]))


def plot_user(user_id):
    g = df[df["randomized_id"] == user_id].reset_index(drop=True)
    if g.empty:
        display(f"Нет данных для {user_id}")
        return

    # центр карты по точкам этого пользователя
    center = [g["lat"].mean(), g["lng"].mean()]
    m = folium.Map(location=center, zoom_start=13)

    # маршрут (по порядку строк)
    coords = list(zip(g["lat"], g["lng"]))
    folium.PolyLine(coords, weight=4, opacity=0.8,
                    popup=f"User {user_id}, {len(coords)} точек").add_to(m)

    # старт и конец
    folium.Marker(coords[0], tooltip=f"{user_id} start").add_to(m)
    folium.Marker(coords[-1], tooltip=f"{user_id} end").add_to(m)

    # можно добавить всплывающие маркеры с атрибутами
    for i, row in g.iterrows():
        folium.CircleMarker(
            location=[row["lat"], row["lng"]],
            radius=3, color="blue", fill=True, fill_opacity=0.7,
            popup=(f"alt={row['alt']}<br>"
                   f"spd={row['spd']}<br>"
                   f"azm={row['azm']}")
        ).add_to(m)

    display(m)

# dropdown для выбора пользователя
# dropdown = widgets.Dropdown(options=user, description="User")
# interact(plot_user, user_id=dropdown)
plot_user(8656837411956585130)

<class 'list'>
<class 'numpy.int64'>


In [13]:
df = data.copy()

import math

def haversine(lat1, lon1, lat2, lon2):
    R = 6371_000
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlambda = math.radians(lon2 - lon1)
    a = math.sin(dphi/2)**2 + math.cos(phi1)*math.cos(phi2)*math.sin(dlambda/2)**2
    c = 2*math.atan2(math.sqrt(a), math.sqrt(1-a))
    return R * c

def restore_route(points):
    points = points.copy()
    n = len(points)
    if n <= 1:
        return points

    visited = [False] * n
    order = []

    curr = 0
    order.append(curr)
    visited[curr] = True

    for _ in range(n-1):
        lat1, lon1 = points.iloc[curr][["lat","lng"]]
        dists = []
        for j in range(n):
            if not visited[j]:
                lat2, lon2 = points.iloc[j][["lat","lng"]]
                dists.append((haversine(lat1, lon1, lat2, lon2), j))
        if not dists:
            break
        _, nxt = min(dists)  # ближайший сосед
        order.append(nxt)
        visited[nxt] = True
        curr = nxt

    return points.iloc[order].reset_index(drop=True)

def route_length(points):
    dist = 0.0
    for i in range(1, len(points)):
        lat1, lon1 = points.iloc[i-1][["lat","lng"]]
        lat2, lon2 = points.iloc[i][["lat","lng"]]
        dist += haversine(lat1, lon1, lat2, lon2)
    return dist

results = []
for user, group in df.groupby("randomized_id"):
    route = restore_route(group.reset_index(drop=True))
    length = route_length(route)
    results.append({"user": user, "distance_m": length})
    print(f"User {user}: {round(length/1000, 2)} km")

results_df = pd.DataFrame(results)
print(results_df)


User -9221304899272910788: 11.13 km
User -9217374206810770265: 3.7 km
User -9214548556609186054: 1.34 km
User -9214033164510198912: 2.71 km
User -9212938812549517684: 7.17 km
User -9210829366468064350: 7.9 km
User -9207744341078435299: 0.1 km
User -9207138669317269052: 0.16 km
User -9205038526221579984: 6.88 km
User -9204778906542759741: 3.78 km
User -9202172489717137435: 0.71 km
User -9201388571132764507: 15.1 km
User -9200688590849868007: 10.16 km
User -9200393735604840411: 13.26 km
User -9198754092502554769: 0.0 km
User -9198438140148819555: 5.41 km
User -9188776861367414071: 1.81 km
User -9185900661715731450: 3.66 km
User -9185410353967780790: 4.7 km
User -9184194414569400743: 4.03 km
User -9174809634493203141: 7.01 km
User -9169657942830411046: 16.0 km
User -9169092331127415906: 3.99 km
User -9167875026883133696: 13.64 km
User -9167088746307149199: 4.41 km
User -9164579065866867128: 8.56 km
User -9163058962347897266: 12.3 km
User -9162195255193388566: 3.37 km
User -916204503475976

KeyboardInterrupt: 

In [9]:
yp.ProfileReport(pd.DataFrame(data.groupby("randomized_id").size()))


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|██████████| 1/1 [00:00<00:00, 56.81it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [16]:
import requests

url = "https://geocode-maps.yandex.ru/1.x/"

def get_geo(lat, lng):
    params = {
        "apikey": "ca094e51-7952-4c0e-a701-b242bdcb09d4",
        "geocode": f"{lng},{lat}",
        "format": "json",
        "lang": "ru_RU"
    }
    resp = requests.get(url, params=params)
    data = resp.json()

    components = (
        data["response"]["GeoObjectCollection"]["featureMember"][0]
        ["GeoObject"]["metaDataProperty"]["GeocoderMetaData"]["Address"]["Components"]
    )

    street, district = None, None
    for comp in components:
        if comp["kind"] == "street":
            street = comp["name"]
        elif comp["kind"] in ("district", "area"):  # иногда "area" = район
            district = comp["name"]

    return {"street": street, "district": district}

In [10]:
GOOGLE_API_KEY = "AIzaSyBSfPloEBVrEuKbI8_hgeDMmTpMtWEIxto"

def get_geo_gg(lat, lng):
    url = "https://maps.googleapis.com/maps/api/geocode/json"
    params = {
        "latlng": f"{lat},{lng}",
        "key": GOOGLE_API_KEY,
        "language": "ru"
    }

    try:
        resp = requests.get(url, params=params, timeout=5)
        resp.raise_for_status()
        data = resp.json()
    except Exception as e:
        print(f"{e}")
        return {"street": None, "district": None}

    if not data.get("results"):
        return {"street": None, "district": None}

    street, district = None, None
    components = data["results"][0].get("address_components", [])

    for comp in components:
        types = comp.get("types", [])
        if "route" in types:
            street = comp.get("long_name")
        elif "sublocality" in types or "administrative_area_level_2" in types:
            district = comp.get("long_name")

    return {"street": street, "district": district}


In [4]:
def drop_duplicates(points):
    """
    Убирает только дубликаты координат.
    points: list[(lat, lng)] или np.array
    """
    arr = np.array(points)
    arr = np.round(arr, 6)  # округляем, чтобы убрать шум float
    unique = np.unique(arr, axis=0)
    return unique

In [24]:
from sklearn.cluster import DBSCAN

def get_unique_streets(points, eps=0.0000059, min_samples=1):
    """
    points: список кортежей [(lat, lng), ...]
    eps: радиус кластера (в градусах, 0.0005 ≈ 50 м)
    """
    coords = np.array(points)
    coords = drop_duplicates(coords)

    # print(len(coords), "/", len(set(coords)))
    # кластеризация
    clustering = DBSCAN(eps=eps, min_samples=min_samples, metric="haversine").fit(np.radians(coords))
    labels = clustering.labels_
    print(labels)
    unique_streets = set()
    unique_districts = set()
    cache = {}

    for cluster_id in set(labels):
        cluster_points = coords[labels == cluster_id]
        if len(cluster_points) == 0:
            continue

        # берем первую точку из кластера
        lat, lng = cluster_points[0]

        # округление для кэша
        key = (round(float(lat), 5), round(float(lng), 5))
        if key not in cache:
            cache[key] = get_geo(lat, lng)

        street = cache[key].get("street")  # зависит от реализации get_geo()
        if street:
            unique_streets.add(street)

        district = cache[key].get("district")
        if district:
            unique_districts.add(district)


    return unique_streets, unique_districts


In [25]:
user = data.groupby("randomized_id").get_group(8656837411956585130)
coords = [(lat, lng) for lat, lng in zip(user.lat, user.lng)]

streets, districts = get_unique_streets(coords)
print(streets)
print(districts)

[ 0  0  1  1  2  0  3  3  4  4  5  4  4  6  4  7  4  4  4  4  4  4  4  4
  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4
  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4
  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4
  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  8  4  4
  4  4  4  8  4  4  4  4  4  8  8  4  4  4  4  4  4  4  4  4  4  4  4  4
  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  9  9  9
  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
  9 10 10  9 10 10 10 10 10  9  9  9 11 12 13 13]
{'улица Сауран 2', 'улица Турара Рыскулова', 'проспект Кабанбай Батыра', 'проспект Улы Дала'}
{'Есильский район'}


In [None]:
from joblib import Parallel, delayed
import pandas as pd
from collections import defaultdict
import math

def process_user(user_df):
    coords = [(lat, lng) for lat, lng in zip(user_df.lat, user_df.lng)]
    streets, districts = get_unique_streets(coords)

    district_stats = defaultdict(lambda: {"count": 0, "speed_sum": 0, "speed_zero": 0})
    street_stats = defaultdict(lambda: {"count": 0, "speed_sum": 0, "speed_zero": 0})

    mean_speed = user_df.spd.mean()
    zero_speed = (user_df.spd == 0).sum()

    for d in districts:
        district_stats[d]["count"] += 1
        district_stats[d]["speed_sum"] += mean_speed
        district_stats[d]["speed_zero"] += zero_speed

    for s in streets:
        street_stats[s]["count"] += 1
        street_stats[s]["speed_sum"] += mean_speed
        street_stats[s]["speed_zero"] += zero_speed

    return street_stats, district_stats


# группируем по пользователям
groups = [g for _, g in data.groupby("randomized_id")]

# итоговые словари
all_streets = defaultdict(lambda: {"count": 0, "speed_sum": 0, "speed_zero": 0})
all_districts = defaultdict(lambda: {"count": 0, "speed_sum": 0, "speed_zero": 0})

batch_size = 15
num_batches = math.ceil(len(groups) / batch_size)

for batch_idx in range(num_batches):
    batch = groups[batch_idx * batch_size:(batch_idx + 1) * batch_size]
    # параллельно обрабатываем батч
    results = Parallel(n_jobs=-1, backend="multiprocessing")(
        delayed(process_user)(g) for g in batch
    )

    # объединяем результаты батча
    for street_stats, district_stats in results:
        for s, stats in street_stats.items():
            all_streets[s]["count"] += stats["count"]
            all_streets[s]["speed_sum"] += stats["speed_sum"]
            all_streets[s]["speed_zero"] += stats["speed_zero"]

        for d, stats in district_stats.items():
            all_districts[d]["count"] += stats["count"]
            all_districts[d]["speed_sum"] += stats["speed_sum"]
            all_districts[d]["speed_zero"] += stats["speed_zero"]

    # сохраняем прогресс после каждого батча
    streets_df = pd.DataFrame([
        {
            "street": s,
            "count": v["count"],
            "avg_speed": v["speed_sum"] / v["count"] if v["count"] > 0 else 0,
            "speed_zero": v["speed_zero"]
        }
        for s, v in all_streets.items()
    ])
    streets_df.to_csv("streets_progress.csv", index=False)

    districts_df = pd.DataFrame([
        {
            "district": d,
            "count": v["count"],
            "avg_speed": v["speed_sum"] / v["count"] if v["count"] > 0 else 0,
            "speed_zero": v["speed_zero"]
        }
        for d, v in all_districts.items()
    ])
    districts_df.to_csv("districts_progress.csv", index=False)

    print(f"Сохранили прогресс: {batch_idx + 1}/{num_batches} батчей обработано")

# финальные датафреймы
streets_df = pd.DataFrame([
    {
        "street": s,
        "count": v["count"],
        "avg_speed": v["speed_sum"] / v["count"] if v["count"] > 0 else 0,
        "speed_zero": v["speed_zero"]
    }
    for s, v in all_streets.items()
])

districts_df = pd.DataFrame([
    {
        "district": d,
        "count": v["count"],
        "avg_speed": v["speed_sum"] / v["count"] if v["count"] > 0 else 0,
        "speed_zero": v["speed_zero"]
    }
    for d, v in all_districts.items()
])
