In [3]:
import json, pickle
from pathlib import Path
import numpy as np
import pandas as pd
import torch
from typing import Optional

In [4]:
# Directory scanned for pickled shard files, and the CSV produced from them.
SHARD_DIR = Path("Graph_data")
OUT_CSV = "embeddings_poi_from_shards.csv"

# POI classes; each one becomes an `emb_<class>` column in the output table.
CLASSES = [
    'sport_and_leisure',
    'medical',
    'education_prim',
    'veterinary',
    'food_and_drink_stores',
    'arts_and_entertainment',
    'food_and_drink',
    'park_like',
    'security',
    'religion',
    'education_sup',
]

# Distance thresholds (600 m, 1200 m, 2400 m) for the "share of POIs within
# radius" features.
R1 = 600.0
R2 = 1200.0
R3 = 2400.0

# Added to every distance before inverting it, to avoid division by zero.
EPS = 1e-6

def as_numpy_1d(t: Optional[torch.Tensor]) -> Optional[np.ndarray]:
    """Flatten a torch tensor into a 1-D NumPy array.

    Args:
        t: A tensor of any shape, or ``None``.

    Returns:
        A 1-D ``np.ndarray`` holding the tensor's elements in row-major
        order, or ``None`` when ``t`` is ``None`` or is not a
        ``torch.Tensor``.
    """
    if not isinstance(t, torch.Tensor):
        return None
    # reshape(-1) instead of view(-1): view raises RuntimeError on
    # non-contiguous tensors (e.g. transposed or sliced ones), whereas reshape
    # returns a view when it can and copies only when it must.
    return t.detach().reshape(-1).cpu().numpy()

def get_raw_meters(g) -> Optional[np.ndarray]:
    """Return the edge values of graph ``g`` as a flat NumPy array.

    ``g.edge_attr_meters`` is used whenever the object has that attribute;
    otherwise ``g.edge_attr`` is used (per the original author's note, newer
    shards store meters there). Returns ``None`` when the selected attribute
    is missing, ``None``, or not a torch tensor.
    """
    if hasattr(g, "edge_attr_meters"):
        source = g.edge_attr_meters
    else:
        source = getattr(g, "edge_attr", None)
    return as_numpy_1d(source)

def baseline_from_meters(d: Optional[np.ndarray]) -> Optional[list]:
    """Summarise an array of distances as a fixed 12-value feature list.

    Args:
        d: Array of distances, or ``None``.

    Returns:
        ``None`` when ``d`` is ``None`` or empty; otherwise a list of 12
        Python floats, in this order:

        0 count, 1 mean, 2 min, 3 max, 4 median, 5 std,
        6 mean / 7 max / 8 sum of ``1 / (distance + EPS)``,
        9-11 fraction of distances ``<= R1``, ``<= R2``, ``<= R3``.
    """
    if d is None or d.size == 0:
        return None

    dist = np.sort(d.astype(float))
    # EPS keeps the inverse finite when a distance is exactly zero.
    closeness = 1.0 / (dist + EPS)

    distance_stats = [
        dist.size,
        np.mean(dist),
        np.min(dist),
        np.max(dist),
        np.median(dist),
        np.std(dist),
    ]
    inverse_stats = [np.mean(closeness), np.max(closeness), np.sum(closeness)]
    # Mean of a boolean mask == share of distances inside each radius.
    coverage = [np.mean(dist <= radius) for radius in (R1, R2, R3)]

    return [float(value) for value in distance_stats + inverse_stats + coverage]

# --- Gather baselines per apartment ---
# Output columns, in CLASSES order; also used so the DataFrame keeps its
# schema even when no apartment was collected.
emb_cols = [f"emb_{c}" for c in CLASSES]

rows = {}
shards = sorted(SHARD_DIR.glob("shard_*.pkl"))
print(f"Found {len(shards)} POI shards under {SHARD_DIR}")
if not shards:
    # Fail early with an actionable message (and before OUT_CSV is touched)
    # instead of the opaque KeyError: 'id' that an empty DataFrame would raise.
    raise FileNotFoundError(f"No shard_*.pkl files found under {SHARD_DIR.resolve()}")

for p in shards:
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only load shards that were produced by a trusted pipeline.
    with open(p, "rb") as f:
        shard_data = pickle.load(f)  # dict[int -> dict[class -> Data|None]]
    for aid, per_cls in shard_data.items():
        rec = rows.setdefault(aid, dict.fromkeys(emb_cols))
        for cls in CLASSES:
            g = per_cls.get(cls)
            if g is None:
                # Leave whatever is already stored: if the same apartment id
                # shows up in several shards, a missing graph in a later shard
                # must not erase statistics collected from an earlier one.
                continue
            stats = baseline_from_meters(get_raw_meters(g))
            if stats is not None:
                rec[f"emb_{cls}"] = stats

# --- Build DataFrame with JSON strings per class ---
# Each 12-value feature list is serialised to a JSON string so it fits in a
# single CSV cell; classes without data stay None.
out = [
    {
        "id": int(aid),
        **{col: (json.dumps(val) if val is not None else None) for col, val in rec.items()},
    }
    for aid, rec in rows.items()
]

df_poi = (
    pd.DataFrame(out, columns=["id", *emb_cols])
    .sort_values("id")
    .reset_index(drop=True)
)
df_poi.to_csv(OUT_CSV, index=False)

non_null_counts = {c: int(df_poi[c].notna().sum()) for c in df_poi.columns if c.startswith("emb_")}
print(f"✅ wrote {OUT_CSV} with {len(df_poi)} rows. Non-nulls per class:")
for k, v in non_null_counts.items():
    print(f"  - {k}: {v}")


Found 3 POI shards under Graph_data
✅ wrote embeddings_poi_from_shards.csv with 25215 rows. Non-nulls per class:
  - emb_sport_and_leisure: 25123
  - emb_medical: 25087
  - emb_education_prim: 24519
  - emb_veterinary: 22982
  - emb_food_and_drink_stores: 24694
  - emb_arts_and_entertainment: 25007
  - emb_food_and_drink: 24205
  - emb_park_like: 24275
  - emb_security: 24564
  - emb_religion: 21291
  - emb_education_sup: 24581


In [5]:
import random, json

# Apartment to inspect. To sample one at random instead, replace the value with
# `random.choice(df_poi["id"].tolist())`.
apt_id = 1583341281

cls = "medical"  # change to any: sport_and_leisure, education_prim, etc.

matches = df_poi[df_poi["id"] == apt_id]
if matches.empty:
    # Clear message instead of the "single positional indexer is out-of-bounds"
    # error that .iloc[0] raises on an empty selection.
    raise IndexError(f"Apartment ID {apt_id} not found in df_poi")
row = matches.iloc[0]
val_json = row[f"emb_{cls}"]

print(f"Apartment ID: {apt_id}")
# pd.isna catches None as well as NaN / pd.NA, which is how a missing value
# appears once the table has been round-tripped through CSV or when pandas
# infers a string dtype for the column; `is None` alone would miss those and
# json.loads would then fail on a non-string.
if pd.isna(val_json):
    print(f"No embedding for class {cls}")
else:
    vals = json.loads(val_json)
    print(f"\nClass: {cls} embedding values:")
    # Order must match the list returned by baseline_from_meters.
    labels = [
        "count_pois", "mean_distance", "min_distance", "max_distance",
        "median_distance", "std_distance", "mean_inverse_distance",
        "max_inverse_distance", "sum_inverse_distance",
        "ratio_within_near_radius", "ratio_within_mid_radius", "ratio_within_far_radius"
    ]
    for i, (lab, v) in enumerate(zip(labels, vals)):
        print(f"  dim{i:02d}: {v:.6f} → {lab}")


Apartment ID: 1583341281

Class: medical embedding values:
  dim00: 31.000000 → count_pois
  dim01: 1619.239470 → mean_distance
  dim02: 528.707642 → min_distance
  dim03: 2341.403320 → max_distance
  dim04: 1808.764771 → median_distance
  dim05: 549.732507 → std_distance
  dim06: 0.000758 → mean_inverse_distance
  dim07: 0.001891 → max_inverse_distance
  dim08: 0.023494 → sum_inverse_distance
  dim09: 0.129032 → ratio_within_near_radius
  dim10: 0.258065 → ratio_within_mid_radius
  dim11: 1.000000 → ratio_within_far_radius
