In [1]:
# Cell 1 — Imports & Config
import os, time, pickle, random
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data
from neo4j import GraphDatabase
from neo4j.exceptions import ServiceUnavailable, TransientError
from concurrent.futures import ThreadPoolExecutor, as_completed

# Neo4j connection
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "12345678"
DATABASE = "neo4jads"

# Radii per category (meters)
RADII = {"CERCA_DE_CAT1": 600.0, "CERCA_DE_CAT2": 1200.0, "CERCA_DE_CAT3": 2400.0}

# Classes (11)
CLASSES = [
'sport_and_leisure','medical','education_prim','veterinary','food_and_drink_stores',
'arts_and_entertainment','food_and_drink','park_like','security','religion','education_sup'
]

In [2]:
# Cell 2 — Cypher queries (now returns BOTH raw meters and normalized weight)
NODE_QUERY = """
MATCH (d:Departamento {id:$apt_id})
RETURN elementId(d) AS id, 'Departamento' AS label,
       d.id AS apt_id, d.latitude AS lat, d.longitude AS lon, null AS cat
UNION
MATCH (d:Departamento {id:$apt_id})-[:CERCA_DE_CAT1|CERCA_DE_CAT2|CERCA_DE_CAT3]->(p:POI)
WHERE p.class = $class
RETURN elementId(p) AS id, 'POI' AS label,
       null AS apt_id, null AS lat, null AS lon, p.cat AS cat
"""

REL_QUERY = """
MATCH (d:Departamento {id:$apt_id})-[r:CERCA_DE_CAT1|CERCA_DE_CAT2|CERCA_DE_CAT3]->(p:POI)
WHERE p.class = $class
WITH elementId(d) AS source, elementId(p) AS target, type(r) AS t, r.distancia_metros AS dist
WITH source, target, dist,
  CASE t
    WHEN 'CERCA_DE_CAT1' THEN CASE WHEN 1.0 - dist / $r_cat1 > 0.001 THEN 1.0 - dist / $r_cat1 ELSE 0.001 END
    WHEN 'CERCA_DE_CAT2' THEN CASE WHEN 1.0 - dist / $r_cat2 > 0.001 THEN 1.0 - dist / $r_cat2 ELSE 0.001 END
    ELSE CASE WHEN 1.0 - dist / $r_cat3 > 0.001 THEN 1.0 - dist / $r_cat3 ELSE 0.001 END
  END AS weight
RETURN source, target, dist, weight
"""


In [3]:
# Cell 3 — Export function (session-aware, stores meters + normalized weight)
def export_apartment_class_graph(apt_id: int, class_name: str, session=None) -> Data | None:
    """Return PyG Data for (apt × class), or None if no nodes/edges.
    edge_attr  -> raw distance in meters (float, shape [E, 1])
    edge_weight-> normalized weight in [0.001, 1] (float, shape [E, 1])
    """
    # If no session provided, open a temporary one
    if session is None:
        driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
        try:
            with driver.session(database=DATABASE) as s:
                return export_apartment_class_graph(apt_id, class_name, session=s)
        finally:
            driver.close()

    # --- Nodes
    nodes = session.run(NODE_QUERY, {"apt_id": apt_id, "class": class_name}).data()
    if not nodes:
        return None
    node_df = pd.DataFrame(nodes)
    id_map = {nid: i for i, nid in enumerate(node_df["id"].tolist())}

    # --- Edges
    edges = session.run(
        REL_QUERY,
        {
            "apt_id": apt_id, "class": class_name,
            "r_cat1": RADII["CERCA_DE_CAT1"],
            "r_cat2": RADII["CERCA_DE_CAT2"],
            "r_cat3": RADII["CERCA_DE_CAT3"],
        },
    ).data()
    if not edges:
        # No POIs for this class: return None (we’ll store None in the shard)
        return None
    edge_df = pd.DataFrame(edges)  # columns: source, target, dist, weight

    # Remap to local indices
    edge_index = torch.tensor([
        [id_map[s] for s in edge_df["source"]],
        [id_map[t] for t in edge_df["target"]],
    ], dtype=torch.long)

    # --- Edge attributes
    edge_attr_dist = torch.tensor(edge_df["dist"].values, dtype=torch.float).unsqueeze(1)    # meters
    edge_weight    = torch.tensor(edge_df["weight"].values, dtype=torch.float).unsqueeze(1)  # normalized

    # --- Node features
    feats: list[list[float]] = []
    for _, row in node_df.iterrows():
        if row["label"] == "Departamento":
            feats.append([1.0, 0.0, 0.0, 0.0])  # [is_apartment, cat1, cat2, cat3]
        else:
            onehot = [0.0, 0.0, 0.0, 0.0]
            cat_val = row.get("cat")
            if cat_val is not None:
                try:
                    idx = int(cat_val)
                    if 0 <= idx < 4:
                        onehot[idx] = 1.0
                except (ValueError, TypeError):
                    pass
            feats.append(onehot)

    x = torch.tensor(np.array(feats), dtype=torch.float)

    # Keep meters in edge_attr (baseline will use it), and store weight separately
    return Data(
        x=x,
        edge_index=edge_index,
        edge_attr=edge_attr_dist,   # RAW METERS
        edge_weight=edge_weight,    # NORMALIZED WEIGHT (for future GNN use)
        apt_id=apt_id,
        poi_class=class_name
    )


In [4]:
# Cell 4 - Pickle helpers (no .bak) ---
def safe_load_pickle(path: str) -> dict:
    import os, pickle
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        print(f"[warn] {path} missing/empty; starting fresh.")
        return {}
    try:
        with open(path, "rb") as f:
            return pickle.load(f)
    except (EOFError, pickle.UnpicklingError) as e:
        print(f"[warn] failed to load {path}: {e}. Starting fresh.")
        return {}

def atomic_pickle_dump(obj: dict, path: str) -> None:
    import os, pickle
    tmp = path + ".tmp"
    with open(tmp, "wb") as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
        f.flush(); os.fsync(f.fileno())
    os.replace(tmp, path)  # atomic replace

In [5]:
# Cell 5 — Shard + CSV helpers (per-batch pickles)
from pathlib import Path
import glob

SHARD_DIR = Path("Graph_data")
SHARD_DIR.mkdir(exist_ok=True)
DONE_CSV = SHARD_DIR / "done_ids.csv"

def list_shards() -> list[Path]:
    return sorted(SHARD_DIR.glob("shard_*.pkl"))

def load_shard_keys(p: Path) -> set[int]:
    with open(p, "rb") as f:
        d = pickle.load(f)
    return set(d.keys())

def load_done_ids() -> set[int]:
    # Prefer CSV for speed; fallback to scanning shards once
    if DONE_CSV.exists():
        s: set[int] = set()
        with open(DONE_CSV, "r") as f:
            for line in f:
                line = line.strip()
                if line.isdigit():
                    s.add(int(line))
        return s
    done: set[int] = set()
    for p in list_shards():
        try:
            done |= load_shard_keys(p)
        except Exception as e:
            print(f"[warn] skipping shard {p.name}: {e}")
    if done:
        with open(DONE_CSV, "w") as f:
            for aid in sorted(done):
                f.write(f"{aid}\n")
    return done

def append_done_ids(ids: list[int]) -> None:
    if not ids: return
    with open(DONE_CSV, "a") as f:
        for aid in ids:
            f.write(f"{aid}\n")

def atomic_dump(obj: dict, path: Path) -> None:
    tmp = path.with_suffix(path.suffix + ".tmp")
    with open(tmp, "wb") as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
        f.flush(); os.fsync(f.fileno())
    os.replace(tmp, path)

In [None]:
# Cell 6
# Batch/parallel
BATCH_SIZE = 25000 
MAX_WORKERS = 10 
MAX_RETRIES = 2 
RETRY_DELAY = 5

In [11]:
# Cell 7 — Load dataset & prepare pending IDs (bottom-up, sharded)
from datetime import datetime

df_deptos = pd.read_csv('Datasets/dataset_final.csv')
done_ids = load_done_ids()
print(f"Resume: {len(done_ids)} apartments already done (from shards/CSV).")

apt_ids = df_deptos['id'].tolist()[::-1]  # bottom-up
pending_ids = [i for i in apt_ids if i not in done_ids]

# Batch window
BATCH_SIZE = min(BATCH_SIZE, len(pending_ids))  # keep your earlier BATCH_SIZE variable
batch_ids = pending_ids[:BATCH_SIZE]
print(f"Processing {len(batch_ids)} apartments (remaining after this: {len(pending_ids) - len(batch_ids)})")

# Name this batch's shard file
if batch_ids:
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    shard_name = f"shard_{ts}_{batch_ids[0]}-{batch_ids[-1]}.pkl"
    SHARD_PATH = SHARD_DIR / shard_name
    print(f"Shard file: {SHARD_PATH}")
else:
    SHARD_PATH = None


Resume: 215 apartments already done (from shards/CSV).
Processing 25000 apartments (remaining after this: 0)
Shard file: Graph_data\shard_20250829_184037_1556851659-1548097259.pkl


In [12]:
# Cell 8 — Worker + retry (Python 3.10+ typing)
def process_apartment_once(apt_id: int) -> tuple[int, dict[str, Data | None] | None, str | None]:
    """One attempt. Returns (apt_id, graphs_dict, error_str)."""
    try:
        driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
        try:
            with driver.session(database=DATABASE) as session:
                graphs: dict[str, Data | None] = {}
                for cls in CLASSES:
                    g = export_apartment_class_graph(apt_id, cls, session=session)
                    graphs[cls] = g  # may be None if no POIs
                return apt_id, graphs, None
        finally:
            driver.close()
    except (ServiceUnavailable, TransientError, Exception) as e:
        return apt_id, None, str(e)

def process_apartment_with_retries(apt_id: int) -> tuple[int, dict[str, Data | None] | None, str | None]:
    for attempt in range(1, MAX_RETRIES + 2):
        aid, graphs, err = process_apartment_once(apt_id)
        if err is None:
            return aid, graphs, None
        if attempt <= MAX_RETRIES:
            time.sleep(RETRY_DELAY)
    return aid, None, err  # final failure


In [13]:
# Cell 9 — Run threaded batch and save one shard + append done CSV
if not batch_ids:
    print("Nothing to do. You're up to date.")
else:
    start_batch = time.time()
    batch_graphs: dict[int, dict[str, Data | None]] = {}

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
        futs = {ex.submit(process_apartment_with_retries, aid): aid for aid in batch_ids}
        for i, fut in enumerate(as_completed(futs), 1):
            aid = futs[fut]
            try:
                apt_id, graphs, err = fut.result()
            except Exception as e:
                apt_id, graphs, err = aid, None, f"worker crash: {e}"
            ok = (err is None and graphs is not None)
            status = "OK" if ok else f"ERR: {err}"
            print(f"[{i}/{len(batch_ids)}] apt={apt_id} -> {status}")
            if ok:
                batch_graphs[apt_id] = graphs

    # Save one pickle per batch (atomic) and append IDs to CSV
    if batch_graphs:
        atomic_dump(batch_graphs, SHARD_PATH)
        append_done_ids(sorted(batch_graphs.keys()))
        print(f"Saved shard {SHARD_PATH.name} with {len(batch_graphs)} apartments.")

    print(f"✅ Batch finished in {time.time()-start_batch:.2f}s "
          f"(ok: {len(batch_graphs)}, fail: {len(batch_ids)-len(batch_graphs)})")


[1/25000] apt=1556851659 -> OK
[2/25000] apt=2864082102 -> OK
[3/25000] apt=2839421418 -> OK
[4/25000] apt=2805677486 -> OK
[5/25000] apt=1580175163 -> OK
[6/25000] apt=2814902534 -> OK
[7/25000] apt=1591519909 -> OK
[8/25000] apt=1589207961 -> OK
[9/25000] apt=2772834792 -> OK
[10/25000] apt=2839488810 -> OK
[11/25000] apt=1572941827 -> OK
[12/25000] apt=1584441443 -> OK
[13/25000] apt=2843197346 -> OK
[14/25000] apt=1564626253 -> OK
[15/25000] apt=1590577259 -> OK
[16/25000] apt=1552220775 -> OK
[17/25000] apt=2854192478 -> OK
[18/25000] apt=2762244356 -> OK
[19/25000] apt=1591867783 -> OK
[20/25000] apt=1569005931 -> OK
[21/25000] apt=2852951428 -> OK
[22/25000] apt=2860418304 -> OK
[23/25000] apt=2861505122 -> OK
[24/25000] apt=2812781302 -> OK
[25/25000] apt=1580379621 -> OK
[26/25000] apt=2836768004 -> OK
[27/25000] apt=1532564711 -> OK
[28/25000] apt=1582608453 -> OK
[29/25000] apt=1590265953 -> OK
[30/25000] apt=2767252098 -> OK
[31/25000] apt=2848877298 -> OK
[32/25000] apt=15

In [2]:
# Cell 10 — Verify last N shards (quick summary)
N_SHARDS = 2  # inspect the last N shards
shards = list_shards()
print(f"Total shards: {len(shards)}")
for p in shards[-N_SHARDS:]:
    d = pickle.load(open(p, "rb"))
    print(f"\nShard {p.name}: {len(d)} apartments")
    # show first 2 apts from this shard
    for aid in list(d.keys())[:2]:
        graphs = d[aid]
        none_cnt = sum(1 for v in graphs.values() if v is None)
        print(f"  Apt {aid}: classes={len(graphs)}, none={none_cnt}")


NameError: name 'list_shards' is not defined

In [3]:
# Cell 11 — Verify last N apartments from the latest shard in Graph_data/
from pathlib import Path
import pickle
import torch

SHARD_DIR = Path("Graph_data")
N_LAST = 5  # change as needed

# Find newest shard by modification time
shards = sorted(SHARD_DIR.glob("shard_*.pkl"), key=lambda p: p.stat().st_mtime)
if not shards:
    print("No shard files found in Graph_data/.")
else:
    latest = shards[-1]
    print(f"Latest shard: {latest.name} (size: {latest.stat().st_size} bytes)")
    with open(latest, "rb") as f:
        d = pickle.load(f)  # dict[int -> dict[class -> Data|None]]

    print(f"Apartments in shard: {len(d)}")
    # Dict preserves insertion order (likely completion order). Take the last N.
    last_ids = list(d.keys())[-N_LAST:]
    print(f"\nInspecting last {len(last_ids)} apartments in this shard:\n")

    for aid in last_ids:
        graphs = d[aid]
        none_cnt = sum(1 for v in graphs.values() if v is None)
        print(f"Apartment {aid}: classes={len(graphs)}, none={none_cnt}")
        for cls in CLASSES:
            g = graphs.get(cls)
            if g is None:
                print(f"  - {cls:<24} -> None")
            else:
                print(f"  - {cls:<24} -> x={tuple(g.x.shape)}, edges={tuple(g.edge_index.shape)}, attr={tuple(g.edge_attr.shape)}")
        print()


Latest shard: shard_20250829_155254_1555433697-1584388845.pkl (size: 2320678 bytes)
Apartments in shard: 100

Inspecting last 5 apartments in this shard:

Apartment 1551460201: classes=11, none=1


NameError: name 'CLASSES' is not defined

In [12]:
# tiny smoke test for 1 apt × 1 class
test_id = int(df_deptos['id'].iloc[0])
g = export_apartment_class_graph(test_id, 'medical')
print(g.edge_attr[:5].T)   # meters
print(g.edge_weight[:5].T) # normalized


tensor([[2341.9309, 2214.2764, 1610.3322, 1999.5028,  300.5760]])
tensor([[0.0242, 0.0774, 0.3290, 0.1669, 0.4990]])


#### que los embeddings no le vayan a dar mas importancia a por ejemplo sports and leisure por el mero hecho de haber mas 

In [1]:
# Sanity-check POI shards contain RAW METERS and (optionally) normalized weights.
# Works on Python 3.9+ and your saved torch_geometric Data objects.

import pickle, math
from pathlib import Path
import numpy as np
import torch

SHARD_DIR = Path("Graph_data")
SHARD_NAMES = [
    "shard_20250829_155254_1555433697-1584388845.pkl",
    "shard_20250829_183814_2862820058-1535195651.pkl",
    "shard_20250829_184037_1556851659-1548097259.pkl",
]

# How many apartments to sample per shard for detailed printouts
SAMPLE_APTS_PER_SHARD = 3

def as_numpy_1d(t):
    """Safely flatten a torch tensor to 1D numpy array."""
    if t is None:
        return None
    if isinstance(t, torch.Tensor):
        return t.view(-1).detach().cpu().numpy()
    return None

def describe(arr):
    if arr is None or arr.size == 0:
        return "n=0"
    return f"n={arr.size}, min={arr.min():.2f}, mean={arr.mean():.2f}, max={arr.max():.2f}"

def warn_if_weird_meters(arr):
    if arr is None or arr.size == 0:
        return "   - no edges"
    msg = []
    # Values should be meters (typically tens to a few thousand)
    if (arr < 1).mean() > 0.8:
        msg.append("⚠ many distances < 1m (not meters?)")
    if (arr > 10000).mean() > 0.1:
        msg.append("⚠ many distances > 10km (likely too large)")
    return "   - " + ("; ".join(msg) if msg else "looks like meters")

def corr_meters_vs_weight(meters, weight):
    if meters is None or weight is None or meters.size == 0 or weight.size == 0:
        return None
    m = meters
    w = weight
    if len(m) != len(w):
        return None
    # correlation should generally be negative (farther → smaller weight)
    c = np.corrcoef(m, w)[0,1]
    return float(c)

for shard_name in SHARD_NAMES:
    p = SHARD_DIR / shard_name
    if not p.exists():
        print(f"❌ Shard missing: {p}")
        continue
    print(f"\n=== Checking shard: {p.name}  (size {p.stat().st_size} bytes) ===")
    with open(p, "rb") as f:
        data = pickle.load(f)  # dict[int -> dict[class -> Data|None]]

    print(f"Apartments in shard: {len(data)}")

    # Aggregate stats
    total_edges = 0
    class_counts = {}
    meters_glob = []
    weight_glob = []

    # Sample a few apartments for detailed inspection
    sampled_ids = list(data.keys())[-SAMPLE_APTS_PER_SHARD:]

    for aid, gdict in data.items():
        if not isinstance(gdict, dict):
            # Old format (unlikely here)
            continue
        for cls, g in gdict.items():
            class_counts[cls] = class_counts.get(cls, 0) + 1
            if g is None:
                continue

            # Prefer edge_attr_meters if present, else edge_attr
            meters = None
            if hasattr(g, "edge_attr_meters"):
                meters = as_numpy_1d(getattr(g, "edge_attr_meters"))
            elif hasattr(g, "edge_attr"):
                meters = as_numpy_1d(getattr(g, "edge_attr"))
            weight = as_numpy_1d(getattr(g, "edge_weight", None))

            if meters is not None:
                meters_glob.append(meters)
                total_edges += meters.size
            if weight is not None:
                weight_glob.append(weight)

    print(f"Total edges across graphs (approx): {total_edges}")
    print("Per-class graph entries (including None):")
    for cls in sorted(class_counts):
        print(f"  - {cls:24} : {class_counts[cls]}")

    # Global meter/weight sanity (flattened)
    if meters_glob:
        M = np.concatenate(meters_glob)
        print(f"Raw distances (meters) global: {describe(M)}")
        print(" ", warn_if_weird_meters(M))
    else:
        print("⚠ No raw-distance arrays found (edge_attr/edge_attr_meters missing?)")

    if weight_glob:
        W = np.concatenate(weight_glob)
        print(f"Normalized weights global: {describe(W)}")
        print(f"   - fraction in (0,1]: {((W>0)&(W<=1)).mean():.3f}")
    else:
        print("ℹ No normalized weights found (edge_weight absent) — ok if you didn’t save them.")

    # Detailed sample:
    print("\n--- Detailed sample (last few apartments in this shard) ---")
    for aid in sampled_ids:
        gdict = data[aid]
        print(f"\nApartment {aid}:")
        for cls, g in gdict.items():
            if g is None:
                print(f"  * {cls:<24} -> None")
                continue
            if hasattr(g, "edge_attr_meters"):
                meters = as_numpy_1d(g.edge_attr_meters)
                src = "edge_attr_meters"
            else:
                meters = as_numpy_1d(g.edge_attr)
                src = "edge_attr"
            weight = as_numpy_1d(getattr(g, "edge_weight", None))

            print(f"  * {cls:<24} -> edges={0 if meters is None else meters.size}")
            if meters is not None and meters.size:
                print(f"      meters [{src}] : {describe(meters)}")
                print(       warn_if_weird_meters(meters))
            if weight is not None and weight.size:
                print(f"      weights       : {describe(weight)}")
                corr = corr_meters_vs_weight(meters, weight)
                if corr is not None and not math.isnan(corr):
                    print(f"      corr(meters, weight) ≈ {corr:.3f} (expect negative)")



=== Checking shard: shard_20250829_155254_1555433697-1584388845.pkl  (size 2320678 bytes) ===
Apartments in shard: 100
Total edges across graphs (approx): 25748
Per-class graph entries (including None):
  - arts_and_entertainment   : 100
  - education_prim           : 100
  - education_sup            : 100
  - food_and_drink           : 100
  - food_and_drink_stores    : 100
  - medical                  : 100
  - park_like                : 100
  - religion                 : 100
  - security                 : 100
  - sport_and_leisure        : 100
  - veterinary               : 100
Raw distances (meters) global: n=25748, min=1.00, mean=1007.22, max=2399.94
     - looks like meters
Normalized weights global: n=25748, min=0.00, mean=0.35, max=1.00
   - fraction in (0,1]: 1.000

--- Detailed sample (last few apartments in this shard) ---

Apartment 1577371843:
  * sport_and_leisure        -> edges=23
      meters [edge_attr] : n=23, min=285.55, mean=601.35, max=2396.07
   - looks like met

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


Apartments in shard: 25000
Total edges across graphs (approx): 7778481
Per-class graph entries (including None):
  - arts_and_entertainment   : 25000
  - education_prim           : 25000
  - education_sup            : 25000
  - food_and_drink           : 25000
  - food_and_drink_stores    : 25000
  - medical                  : 25000
  - park_like                : 25000
  - religion                 : 25000
  - security                 : 25000
  - sport_and_leisure        : 25000
  - veterinary               : 25000
Raw distances (meters) global: n=7778481, min=0.00, mean=1059.62, max=2400.00
     - looks like meters
Normalized weights global: n=7778481, min=0.00, mean=0.35, max=1.00
   - fraction in (0,1]: 1.000

--- Detailed sample (last few apartments in this shard) ---

Apartment 1592811585:
  * sport_and_leisure        -> edges=93
      meters [edge_attr] : n=93, min=224.56, mean=765.53, max=2291.59
   - looks like meters
      weights       : n=93, min=0.01, mean=0.24, max=0.81
   