In [6]:
"""
Utility for collecting VK user data and building pairwise friendship features.
The script expects an environment variable VK_TOKEN with a user token that has
at least the scopes `friends`, `groups`, and `offline`. Adjust the SEED_IDS
list below to control the starting users (e.g. members of a study group).
Outputs (written to data/):
    users.csv      - attributes of each open user that was collected
    friends.csv    - undirected friendships between collected users
    groups.csv     - group memberships for collected users
    features.csv   - pairwise features for TARGET_IDS combinations
"""
from __future__ import annotations
import json
import math
import os
import time
from collections import deque
from datetime import date
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Set, Tuple
import pandas as pd
from dotenv import load_dotenv
import vk_api
from vk_api.exceptions import ApiError

load_dotenv()
token = os.getenv("VK_TOKEN")
if not token:
    raise ValueError("❌ Токен не найден! Добавьте VK_TOKEN в файл .env")

vk_session = vk_api.VkApi(token=token)
vk = vk_session.get_api()

# === Configuration ===========================================================
SEED_IDS: List[int] = [251116606]  # стартовые пользователи (указать вручную)
TARGET_IDS: Optional[List[int]] = None  # по умолчанию берём SEED_IDS позже
MAX_DEPTH: int = 2  # 0: только seed, 1: добавляем их друзей и т.д.
USER_LIMIT: Optional[int] = None  # ограничение на число открытых профилей
API_DELAY = 0.34  # безопасный интервал между запросами (сек.)
MAX_RETRIES = 5
USER_FIELDS = ",".join(
    [
        "sex",
        "bdate",
        "city",
        "country",
        "universities",
        "schools",
        "occupation",
        "career",
        "followers_count",
        "counters",
    ]
)
CACHE_DIR = Path("data/cache")
USERS_CACHE = CACHE_DIR / "users"
FRIENDS_CACHE = CACHE_DIR / "friends"
GROUPS_CACHE = CACHE_DIR / "groups"
OUTPUT_DIR = Path("data")
# === Helpers =================================================================
def ensure_dir(path: Path) -> None:
    path.mkdir(parents=True, exist_ok=True)
def clean_text(value: Optional[str]) -> Optional[str]:
    if not value:
        return None
    return value.strip() or None
def normalize(value: Optional[str]) -> Optional[str]:
    text = clean_text(value)
    return text.lower() if text else None
def choose_name(candidate: dict, keys: Iterable[str]) -> Optional[str]:
    for key in keys:
        value = candidate.get(key)
        if value:
            return clean_text(str(value))
    return None
def load_cache(path: Path) -> Optional[dict]:
    if path.exists():
        with path.open("r", encoding="utf-8") as fh:
            return json.load(fh)
    return None
def save_cache(path: Path, payload: dict) -> None:
    ensure_dir(path.parent)
    with path.open("w", encoding="utf-8") as fh:
        json.dump(
            payload, fh,
            ensure_ascii=False, indent=2,
            default=lambda o: sorted(o) if isinstance(o, set) else o
        )
def safe_api_call(method, *, description: str, **kwargs):
    """Call VK API with retries and throttling."""
    delay = API_DELAY
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = method(**kwargs)
            time.sleep(API_DELAY)
            return response
        except ApiError as error:
            # 6 — too many requests per second; 10/29 — rate limit; retry with backoff.
            if error.code in {6, 10, 29}:
                time.sleep(delay)
                delay *= 2
                continue
            raise RuntimeError(f"{description} failed: {error.code} {error}") from error
    raise RuntimeError(f"{description} failed after {MAX_RETRIES} retries")
def parse_age(bdate: Optional[str]) -> Optional[int]:
    """Return age in years if full birth date is available and plausible."""
    if not bdate or bdate.count(".") != 2:
        return None
    day_str, month_str, year_str = bdate.split(".")
    if not year_str.isdigit():
        return None
    try:
        day, month, year = int(day_str), int(month_str), int(year_str)
        born = date(year, month, day)
    except ValueError:
        return None
    today = date.today()
    age = today.year - born.year - (
        (today.month, today.day) < (born.month, born.day)
    )
    if age < 12 or age > 90:
        return None
    return age
def normalize_list(items: Iterable[str]) -> List[str]:
    return sorted({clean_text(item) for item in items if clean_text(item)})
def normalization_set(items: Iterable[str]) -> Set[str]:
    return {normalize(item) for item in items if normalize(item)}
# === VK data extraction ======================================================
def fetch_user_profile(vk, user_id: int) -> Optional[dict]:
    cache_path = USERS_CACHE / f"{user_id}.json"
    cached = load_cache(cache_path)
    if cached:
        return cached
    try:
        raw_users = safe_api_call(
            vk.users.get,
            description=f"users.get for {user_id}",
            user_ids=user_id,
            fields=USER_FIELDS,
        )
    except RuntimeError as error:
        print(error)
        return None
    if not raw_users:
        return None
    raw = raw_users[0]
    if raw.get("is_closed"):
        return None
    city = raw.get("city", {}) or {}
    country = raw.get("country", {}) or {}
    universities_raw = raw.get("universities") or []
    schools_raw = raw.get("schools") or []
    career_raw = raw.get("career") or []
    universities = [
        choose_name(uni, ["name", "title", "university_name"])
        for uni in universities_raw
    ]
    faculties = [
        choose_name(uni, ["faculty_name", "chair_name"])
        for uni in universities_raw
    ]
    schools = [
        choose_name(sch, ["name", "title"])
        for sch in schools_raw
    ]
    works = [
        choose_name(job, ["company", "position", "name"])
        for job in career_raw
    ]
    counters = raw.get("counters") or {}
    friends_count = counters.get("friends")
    followers_count = raw.get("followers_count", counters.get("followers"))
    profile = {
        "id": raw["id"],
        "first_name": clean_text(raw.get("first_name")),
        "last_name": clean_text(raw.get("last_name")),
        "city": clean_text(city.get("title")),
        "city_norm": normalize(city.get("title")),
        "country": clean_text(country.get("title")),
        "country_norm": normalize(country.get("title")),
        "sex": raw.get("sex"),
        "age": parse_age(raw.get("bdate")),
        "universities": normalize_list(universities),
        "universities_norm": normalization_set(universities),
        "faculties": normalize_list(faculties),
        "faculties_norm": normalization_set(faculties),
        "schools": normalize_list(schools),
        "schools_norm": normalization_set(schools),
        "works": normalize_list(works),
        "works_norm": normalization_set(works),
        "friends_count": friends_count,
        "followers_count": followers_count,
    }
    save_cache(cache_path, profile)
    return profile
def fetch_friends(vk, user_id: int) -> Set[int]:
    cache_path = FRIENDS_CACHE / f"{user_id}.json"
    cached = load_cache(cache_path)
    if cached is not None:
        return set(cached)
    try:
        response = safe_api_call(
            vk.friends.get,
            description=f"friends.get for {user_id}",
            user_id=user_id,
        )
    except RuntimeError as error:
        print(error)
        save_cache(cache_path, [])
        return set()
    friends = set(response.get("items", []))
    save_cache(cache_path, sorted(friends))
    return friends
def fetch_groups(vk, user_id: int) -> Set[int]:
    cache_path = GROUPS_CACHE / f"{user_id}.json"
    cached = load_cache(cache_path)
    if cached is not None:
        return set(cached)
    groups: Set[int] = set()
    offset = 0
    count = 1000
    while True:
        try:
            response = safe_api_call(
                vk.groups.get,
                description=f"groups.get for {user_id}",
                user_id=user_id,
                count=count,
                offset=offset,
            )
        except RuntimeError as error:
            print(error)
            break
        batch = response.get("items", [])
        groups.update(batch)
        if len(batch) < count:
            break
        offset += count
    save_cache(cache_path, sorted(groups))
    return groups
def collect_graph(
    vk,
    seed_ids: Iterable[int],
    *,
    depth: int,
    user_limit: Optional[int],
) -> Tuple[Dict[int, dict], Dict[int, Set[int]], Dict[int, Set[int]]]:
    queue = deque([(uid, 0) for uid in set(seed_ids)])
    visited: Set[int] = set()
    profiles: Dict[int, dict] = {}
    friends_map: Dict[int, Set[int]] = {}
    groups_map: Dict[int, Set[int]] = {}
    while queue:
        user_id, current_depth = queue.popleft()
        if user_id in visited:
            continue
        visited.add(user_id)
        profile = fetch_user_profile(vk, user_id)
        if not profile:
            continue
        profiles[user_id] = profile
        friends = fetch_friends(vk, user_id)
        friends_map[user_id] = friends
        if depth is None or current_depth < depth:
            for friend_id in friends:
                if friend_id not in visited:
                    queue.append((friend_id, current_depth + 1))
        groups = fetch_groups(vk, user_id)
        groups_map[user_id] = groups
        if user_limit and len(profiles) >= user_limit:
            break
    return profiles, friends_map, groups_map
# === Feature engineering =====================================================
def adamic_adar_score(
    common_friends: Set[int],
    friends_map: Dict[int, Set[int]],
    profiles: Dict[int, dict],
) -> float:
    score = 0.0
    for friend_id in common_friends:
        degree = len(friends_map.get(friend_id, []))
        if degree <= 1:
            degree = profiles.get(friend_id, {}).get("friends_count")
        if degree and degree > 1:
            score += 1.0 / math.log(degree)
    return score
def compute_features(
    user_ids: Iterable[int],
    profiles: Dict[int, dict],
    friends_map: Dict[int, Set[int]],
    groups_map: Dict[int, Set[int]],
) -> List[dict]:
    rows: List[dict] = []
    user_list = [uid for uid in user_ids if uid in profiles]
    user_list.sort()
    for idx, uid_a in enumerate(user_list):
        for uid_b in user_list[idx + 1 :]:
            profile_a = profiles[uid_a]
            profile_b = profiles[uid_b]
            friends_a = friends_map.get(uid_a, set())
            friends_b = friends_map.get(uid_b, set())
            common_friends = friends_a & friends_b
            union_friends = friends_a | friends_b
            jaccard_friends = (
                len(common_friends) / len(union_friends) if union_friends else 0.0
            )
            groups_a = groups_map.get(uid_a, set())
            groups_b = groups_map.get(uid_b, set())
            common_groups = groups_a & groups_b
            union_groups = groups_a | groups_b
            jaccard_groups = (
                len(common_groups) / len(union_groups) if union_groups else 0.0
            )
            same_city = int(
                profile_a.get("city_norm")
                and profile_a.get("city_norm") == profile_b.get("city_norm")
            )
            same_university = int(
                bool(profile_a.get("universities_norm") & profile_b.get("universities_norm"))
            )
            same_faculty = int(
                bool(profile_a.get("faculties_norm") & profile_b.get("faculties_norm"))
            )
            same_school = int(
                bool(profile_a.get("schools_norm") & profile_b.get("schools_norm"))
            )
            age_a = profile_a.get("age")
            age_b = profile_b.get("age")
            age_diff = abs(age_a - age_b) if age_a is not None and age_b is not None else None
            row = {
                "user_A": uid_a,
                "user_B": uid_b,
                "common_friends": len(common_friends),
                "jaccard_friends": jaccard_friends,
                "adamic_adar": adamic_adar_score(common_friends, friends_map, profiles),
                "same_city": same_city,
                "same_university": same_university,
                "same_faculty": same_faculty,
                "same_school": same_school,
                "age_diff": age_diff,
                "common_groups": len(common_groups),
                "jaccard_groups": jaccard_groups,
            }
            rows.append(row)
    return rows
# === Export utilities ========================================================
def join_values(values: Iterable[str]) -> str:
    return ";".join(sorted({value for value in values if value}))
def export_users(profiles: Dict[int, dict], destination: Path) -> None:
    records = []
    for profile in profiles.values():
        records.append(
            {
                "id": profile["id"],
                "first_name": profile.get("first_name"),
                "last_name": profile.get("last_name"),
                "city": profile.get("city"),
                "country": profile.get("country"),
                "sex": profile.get("sex"),
                "age": profile.get("age"),
                "universities": join_values(profile.get("universities", [])),
                "faculties": join_values(profile.get("faculties", [])),
                "schools": join_values(profile.get("schools", [])),
                "works": join_values(profile.get("works", [])),
                "friends_count": profile.get("friends_count"),
                "followers_count": profile.get("followers_count"),
            }
        )
    df = pd.DataFrame(records)
    df.to_csv(destination, index=False)
def export_friends(
    profiles: Dict[int, dict],
    friends_map: Dict[int, Set[int]],
    destination: Path,
) -> None:
    known_ids = set(profiles.keys())
    edges = set()
    for uid, friends in friends_map.items():
        for fid in friends:
            if fid in known_ids and uid < fid:
                edges.add((uid, fid))
    df = pd.DataFrame(edges, columns=["user_A", "user_B"])
    df.to_csv(destination, index=False)
def export_groups(
    profiles: Dict[int, dict],
    groups_map: Dict[int, Set[int]],
    destination: Path,
) -> None:
    rows = []
    known_ids = set(profiles.keys())
    for uid, groups in groups_map.items():
        if uid not in known_ids:
            continue
        for gid in sorted(groups):
            rows.append({"user_id": uid, "group_id": gid})
    df = pd.DataFrame(rows)
    df.to_csv(destination, index=False)
def export_features(rows: List[dict], destination: Path) -> None:
    df = pd.DataFrame(rows)
    df.to_csv(destination, index=False)
# === Entry point =============================================================
def main():
    load_dotenv()
    token = os.getenv("VK_TOKEN")
    if not token:
        raise RuntimeError("VK_TOKEN is missing. Add it to your .env file.")
    ensure_dir(CACHE_DIR)
    ensure_dir(OUTPUT_DIR)
    vk_session = vk_api.VkApi(token=token)
    vk = vk_session.get_api()
    profiles, friends_map, groups_map = collect_graph(
        vk,
        seed_ids=SEED_IDS,
        depth=MAX_DEPTH,
        user_limit=USER_LIMIT,
    )
    if not profiles:
        raise RuntimeError("No open profiles were collected. Check permissions.")
    feature_ids = TARGET_IDS if TARGET_IDS is not None else SEED_IDS
    feature_rows = compute_features(feature_ids, profiles, friends_map, groups_map)
    export_users(profiles, OUTPUT_DIR / "users.csv")
    export_friends(profiles, friends_map, OUTPUT_DIR / "friends.csv")
    export_groups(profiles, groups_map, OUTPUT_DIR / "groups.csv")
    export_features(feature_rows, OUTPUT_DIR / "features.csv")
    print(f"Collected {len(profiles)} open users.")
    print(f"Wrote users, friends, groups, and features CSV to {OUTPUT_DIR.resolve()}")
if __name__ == "__main__":
    main()

friends.get for 221041702 failed: 18 [18] User was deleted or banned
groups.get for 221041702 failed: 18 [18] User was deleted or banned
friends.get for 318368868 failed: 18 [18] User was deleted or banned
groups.get for 318368868 failed: 18 [18] User was deleted or banned
friends.get for 277705371 failed: 18 [18] User was deleted or banned
groups.get for 277705371 failed: 18 [18] User was deleted or banned
friends.get for 384622761 failed: 18 [18] User was deleted or banned
groups.get for 384622761 failed: 18 [18] User was deleted or banned
friends.get for 309326115 failed: 18 [18] User was deleted or banned
groups.get for 309326115 failed: 18 [18] User was deleted or banned
friends.get for 258049928 failed: 18 [18] User was deleted or banned
groups.get for 258049928 failed: 18 [18] User was deleted or banned
friends.get for 376888763 failed: 18 [18] User was deleted or banned
groups.get for 376888763 failed: 18 [18] User was deleted or banned
friends.get for 227937236 failed: 18 [18]

KeyboardInterrupt: 

In [9]:
# === Rebuild CSVs from cache (no API calls) ==================================
def _safe_json_load(path: Path):
    try:
        with path.open("r", encoding="utf-8") as fh:
            return json.load(fh)
    except json.JSONDecodeError:
        # Битый кэш — пропускаем
        return None

def _to_set(x):
    if x is None:
        return set()
    if isinstance(x, set):
        return x
    if isinstance(x, (list, tuple)):
        return {str(v).strip().lower() for v in x if v is not None and str(v).strip()}
    # fallback: одно значение -> множество из одного
    s = str(x).strip().lower()
    return {s} if s else set()

def rebuild_from_cache(
    target_ids: Optional[Iterable[int]] = None,
    out_dir: Path = OUTPUT_DIR
):
    """Собирает users.csv, friends.csv, groups.csv и features.csv только из кэша (без VK API)."""
    ensure_dir(out_dir)

    # 1) Загружаем профили
    profiles: Dict[int, dict] = {}
    if USERS_CACHE.exists():
        for p in USERS_CACHE.glob("*.json"):
            data = _safe_json_load(p)
            if not data or "id" not in data:
                continue
            try:
                uid = int(data["id"])
            except (TypeError, ValueError):
                continue

            # Приведение *_norm к set, т.к. в JSON они лежат списками/могут быть None
            data["universities_norm"] = _to_set(data.get("universities_norm"))
            data["faculties_norm"]    = _to_set(data.get("faculties_norm"))
            data["schools_norm"]      = _to_set(data.get("schools_norm"))
            data["works_norm"]        = _to_set(data.get("works_norm"))

            # Нормализуем city_norm на всякий случай
            data["city_norm"] = (
                str(data["city_norm"]).strip().lower()
                if data.get("city_norm") not in (None, "")
                else None
            )

            profiles[uid] = data

    if not profiles:
        print("⚠️ В кэше нет валидных профилей (data/cache/users/*.json).")
        return

    # 2) Загружаем друзей
    friends_map: Dict[int, Set[int]] = {}
    if FRIENDS_CACHE.exists():
        for p in FRIENDS_CACHE.glob("*.json"):
            data = _safe_json_load(p)
            if data is None:
                continue
            try:
                uid = int(p.stem)
            except ValueError:
                continue
            # в кэше друзья — список id; приводим к множеству int
            friends_map[uid] = {
                int(x) for x in data
                if isinstance(x, (int, str)) and str(x).isdigit()
            }
    # гарантируем ключи для всех известных профилей
    for uid in profiles.keys():
        friends_map.setdefault(uid, set())

    # 3) Загружаем группы
    groups_map: Dict[int, Set[int]] = {}
    if GROUPS_CACHE.exists():
        for p in GROUPS_CACHE.glob("*.json"):
            data = _safe_json_load(p)
            if data is None:
                continue
            try:
                uid = int(p.stem)
            except ValueError:
                continue
            groups_map[uid] = {
                int(x) for x in data
                if isinstance(x, (int, str)) and str(x).isdigit()
            }
    for uid in profiles.keys():
        groups_map.setdefault(uid, set())

    # 4) Экспортируем базовые CSV
    export_users(profiles, out_dir / "users.csv")
    export_friends(profiles, friends_map, out_dir / "friends.csv")
    export_groups(profiles, groups_map, out_dir / "groups.csv")
    print("✅ users.csv, friends.csv, groups.csv пересобраны из кэша.")

    # 5) Фичи по парам (локальная безопасная версия, без изменения глобальной compute_features)
    if target_ids is None:
        # по умолчанию — все собранные пользователи
        target_ids = list(profiles.keys())

    def _as_set(x):
        if isinstance(x, set): 
            return x
        if isinstance(x, (list, tuple)): 
            return set(x)
        return set()

    def _adamic_adar(common_friends: Set[int]) -> float:
        score = 0.0
        for fid in common_friends:
            deg = len(friends_map.get(fid, []))
            if deg <= 1:
                deg = profiles.get(fid, {}).get("friends_count")
            if deg and deg > 1:
                score += 1.0 / math.log(deg)
        return score

    feature_rows: List[dict] = []
    user_list = [uid for uid in target_ids if uid in profiles]
    user_list.sort()
    for i, uid_a in enumerate(user_list):
        for uid_b in user_list[i+1:]:
            pa = profiles[uid_a]; pb = profiles[uid_b]

            fa = friends_map.get(uid_a, set()) or set()
            fb = friends_map.get(uid_b, set()) or set()
            cf = fa & fb
            uf = fa | fb
            j_f = len(cf) / len(uf) if uf else 0.0

            ga = groups_map.get(uid_a, set()) or set()
            gb = groups_map.get(uid_b, set()) or set()
            cg = ga & gb
            ug = ga | gb
            j_g = len(cg) / len(ug) if ug else 0.0

            ca = pa.get("city_norm"); cb = pb.get("city_norm")
            same_city = int(ca is not None and cb is not None and ca == cb)

            same_university = int(bool(_as_set(pa.get("universities_norm")) &
                                       _as_set(pb.get("universities_norm"))))
            same_faculty    = int(bool(_as_set(pa.get("faculties_norm")) &
                                       _as_set(pb.get("faculties_norm"))))
            same_school     = int(bool(_as_set(pa.get("schools_norm")) &
                                       _as_set(pb.get("schools_norm"))))

            age_a = pa.get("age"); age_b = pb.get("age")
            age_diff = abs(age_a - age_b) if (age_a is not None and age_b is not None) else None

            feature_rows.append({
                "user_A": uid_a,
                "user_B": uid_b,
                "common_friends": len(cf),
                "jaccard_friends": j_f,
                "adamic_adar": _adamic_adar(cf),
                "same_city": same_city,
                "same_university": same_university,
                "same_faculty": same_faculty,
                "same_school": same_school,
                "age_diff": age_diff,
                "common_groups": len(cg),
                "jaccard_groups": j_g,
            })

    export_features(feature_rows, out_dir / "features.csv")
    print("✅ features.csv пересобран из кэша.")


In [10]:
rebuild_from_cache()

✅ users.csv, friends.csv, groups.csv пересобраны из кэша.
✅ features.csv пересобран из кэша.
