In [1]:
"""
Merge per-context baseline embeddings into your main dataset.
- Input A: base CSV (e.g., 'Datasets/dataset_final.csv') with apartment rows and 'id'.
- Input B: embeddings CSV from the previous step (e.g., 'apartment_embeddings_per_context.csv'),
           columns: id, emb_<context> (JSON list or empty/NaN).

You can:
1) Keep compact schema: one column per context (`emb_<ctx>` as JSON string).
2) Expand to numeric columns: `emb_<ctx>_d0 ... emb_<ctx>_d11` (12 dims), with optional zero-imputation.

Outputs:
- datasets/with_embeddings_compact.csv
- datasets/with_embeddings_expanded.csv  (if EXPAND = True)
"""

# =============================
# Cell 1 — Imports & settings
# =============================
import os, json, math
from pathlib import Path
from typing import List, Optional, Any

import numpy as np
import pandas as pd

# Paths
BASE_DATASET = Path("Datasets/dataset_final.csv")
EMB_CSV      = Path("apartment_embeddings_per_context.csv")
OUT_DIR      = Path("datasets")
OUT_DIR.mkdir(exist_ok=True)

# Contexts and embedding size
CLASSES = [
    'sport_and_leisure','medical','education_prim','veterinary',
    'food_and_drink_stores','arts_and_entertainment','food_and_drink',
    'park_like','security','religion','education_sup'
]
ALL_CONTEXTS = CLASSES + ['metro','bus']
EMB_DIM = 12

# Behaviors
EXPAND = True          # also create wide numeric columns
IMPUTE_MISSING = False # if True, replace missing with zero vectors in the expanded output


In [2]:
# =============================
# Cell 2 — Helpers
# =============================

def is_nan_like(x: Any) -> bool:
    # pandas may give float('nan') for empty cells
    try:
        return x is None or (isinstance(x, float) and math.isnan(x)) or (isinstance(x, str) and x.strip() == "")
    except Exception:
        return False


def parse_vec(cell: Any, dim: int) -> Optional[List[float]]:
    """Parse a JSON list cell → python list[float]; return None if missing/invalid.
    Ensures length==dim when returned (truncate or pad zeros if needed)."""
    if is_nan_like(cell):
        return None
    try:
        if isinstance(cell, list):
            vec = [float(v) for v in cell]
        elif isinstance(cell, str):
            vec = json.loads(cell)
            vec = [float(v) for v in vec]
        else:
            return None
    except Exception:
        return None
    # normalize length
    if len(vec) > dim:
        vec = vec[:dim]
    elif len(vec) < dim:
        vec = vec + [0.0] * (dim - len(vec))
    return vec


def expand_embeddings(df: pd.DataFrame, ctx_cols: List[str], dim: int, impute_missing: bool) -> pd.DataFrame:
    """Expand emb_<ctx> columns into numeric columns emb_<ctx>_d0..d{dim-1}.
    If impute_missing=True, fill missing with zeros; else leave as NaN."""
    out = df.copy()
    for col in ctx_cols:
        base = col  # e.g., 'emb_medical'
        # prepare target columns
        tgt_cols = [f"{base}_d{i}" for i in range(dim)]
        # initialize with NaN
        for c in tgt_cols:
            out[c] = np.nan
        # fill
        for idx, cell in out[col].items():
            vec = parse_vec(cell, dim)
            if vec is None:
                if impute_missing:
                    out.loc[idx, tgt_cols] = [0.0] * dim
                # else keep NaN
            else:
                out.loc[idx, tgt_cols] = vec
    return out

In [3]:
# =============================
# Cell 3 — Load & merge (compact)
# =============================
base = pd.read_csv(BASE_DATASET)
emb  = pd.read_csv(EMB_CSV)

# sanity
if 'id' not in base.columns:
    raise ValueError("Base dataset must contain 'id' column")
if 'id' not in emb.columns:
    raise ValueError("Embeddings CSV must contain 'id' column")

# Keep only expected emb_* cols (ignore extras if any)
emb_cols = [f"emb_{c}" for c in ALL_CONTEXTS if f"emb_{c}" in emb.columns]
merged = base.merge(emb[['id'] + emb_cols], on='id', how='left')

# Save compact version (JSON strings or empty)
out_compact = OUT_DIR / "dataset_embeddings_compact.csv"
merged.to_csv(out_compact, index=False)
print(f"✅ Saved compact dataset: {out_compact}  shape={merged.shape}")


✅ Saved compact dataset: datasets\dataset_embeddings_compact.csv  shape=(25215, 40)


In [4]:


# =============================
# Cell 4 — Optional: expand to numeric
# =============================
if EXPAND:
    wide = expand_embeddings(merged, emb_cols, EMB_DIM, impute_missing=IMPUTE_MISSING)
    out_expanded = OUT_DIR / "dataset_embeddings_expanded.csv"
    wide.to_csv(out_expanded, index=False)
    print(f"✅ Saved expanded dataset: {out_expanded}  shape={wide.shape}")

    # Tiny health check: report NaN rates per expanded block
    for ctx in ALL_CONTEXTS:
        base = f"emb_{ctx}"
        if base in emb_cols:
            cols = [f"{base}_d{i}" for i in range(EMB_DIM)]
            if all(c in wide.columns for c in cols):
                n_missing_rows = wide[cols].isna().all(axis=1).sum()
                print(f"{base}: missing rows (all NaN) = {n_missing_rows}")


  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] = np.nan
  out[c] =

✅ Saved expanded dataset: datasets\dataset_embeddings_expanded.csv  shape=(25215, 196)
emb_sport_and_leisure: missing rows (all NaN) = 92
emb_medical: missing rows (all NaN) = 128
emb_education_prim: missing rows (all NaN) = 696
emb_veterinary: missing rows (all NaN) = 2233
emb_food_and_drink_stores: missing rows (all NaN) = 521
emb_arts_and_entertainment: missing rows (all NaN) = 208
emb_food_and_drink: missing rows (all NaN) = 1010
emb_park_like: missing rows (all NaN) = 940
emb_security: missing rows (all NaN) = 651
emb_religion: missing rows (all NaN) = 3924
emb_education_sup: missing rows (all NaN) = 634
emb_metro: missing rows (all NaN) = 8463
emb_bus: missing rows (all NaN) = 1040


In [6]:
df = pd.read_csv("Datasets/dataset_embeddings_compact.csv")

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25215 entries, 0 to 25214
Data columns (total 40 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          25215 non-null  int64  
 1   monto                       25215 non-null  int64  
 2   superficie_t                25215 non-null  float64
 3   dormitorios                 25215 non-null  int64  
 4   dormitorios_faltante        25215 non-null  int64  
 5   banos                       25215 non-null  int64  
 6   banos_faltante              25215 non-null  int64  
 7   antiguedad                  25215 non-null  int64  
 8   antiguedad_faltante         25215 non-null  int64  
 9   Or_N                        25215 non-null  int64  
 10  Or_S                        25215 non-null  int64  
 11  Or_E                        25215 non-null  int64  
 12  Or_O                        25215 non-null  int64  
 13  Or_Faltante                 252

In [10]:
df.iloc[:, -13:].sample(5)

Unnamed: 0,emb_sport_and_leisure,emb_medical,emb_education_prim,emb_veterinary,emb_food_and_drink_stores,emb_arts_and_entertainment,emb_food_and_drink,emb_park_like,emb_security,emb_religion,emb_education_sup,emb_metro,emb_bus
8591,"[15.0, 4.224615573883057, 1.9384450912475586, ...","[27.0, 8.392669677734375, 4.2169976234436035, ...","[9.0, 2.606393337249756, 1.181631088256836, 0....","[5.0, 1.329038143157959, 0.503171980381012, 0....","[14.0, 4.671542644500732, 2.431705951690674, 1...","[15.0, 4.817954063415527, 2.3077478408813477, ...","[16.0, 3.472376585006714, 1.7330747842788696, ...","[3.0, 0.5058801770210266, 0.09826972335577011,...","[5.0, 2.5319695472717285, 1.7620137929916382, ...","[7.0, 1.8511478900909424, 0.6647428274154663, ...","[31.0, 2.805297374725342, 0.6065185070037842, ...","[4.0, 0.006914932746440172, 1.2321779649937525...","[10.0, 0.039180558174848557, 0.000162675249157..."
2879,"[25.0, 9.492673873901367, 5.565189838409424, 3...","[45.0, 16.78483009338379, 8.280882835388184, 4...","[8.0, 3.0814900398254395, 1.625666856765747, 0...","[7.0, 3.3021271228790283, 1.7709932327270508, ...","[74.0, 25.612770080566406, 11.952367782592773,...","[109.0, 34.0040283203125, 14.539718627929688, ...","[61.0, 20.784473419189453, 10.61701774597168, ...","[10.0, 3.438467264175415, 1.696580171585083, 0...","[25.0, 8.559910774230957, 4.350227355957031, 2...","[6.0, 2.9340734481811523, 1.8986643552780151, ...","[261.0, 75.6412353515625, 30.544252395629883, ...","[2.0, 0.003763932967558503, 7.4524014053167775...","[14.0, 0.06424766033887863, 0.0003481704043224..."
11538,"[15.0, 5.616213798522949, 3.184290885925293, 2...","[45.0, 15.88559341430664, 7.884194850921631, 4...","[13.0, 5.120848178863525, 2.4622015953063965, ...","[6.0, 1.7465453147888184, 0.6612012386322021, ...","[47.0, 13.251874923706055, 5.611817359924316, ...","[132.0, 42.261619567871094, 19.22429084777832,...","[23.0, 7.932838439941406, 4.073561668395996, 2...","[10.0, 2.696364164352417, 1.1931681632995605, ...","[25.0, 10.482751846313477, 5.687837600708008, ...","[3.0, 1.2304683923721313, 0.5494815707206726, ...","[219.0, 62.78380584716797, 24.358396530151367,...","[4.0, 0.006714826449751854, 1.135989441536367e...","[20.0, 0.08674301207065582, 0.0004941411898471..."
9387,"[80.0, 26.16571617126465, 13.462533950805664, ...","[44.0, 12.13841438293457, 5.600058078765869, 3...","[15.0, 6.6314544677734375, 3.960141658782959, ...","[9.0, 2.0835227966308594, 0.716124951839447, 0...","[23.0, 7.304398536682129, 4.051912307739258, 2...","[28.0, 9.376432418823242, 4.148211479187012, 2...","[32.0, 9.578359603881836, 5.080397605895996, 3...","[2.0, 0.5157100558280945, 0.18473798036575317,...","[10.0, 3.8968100547790527, 2.049531936645508, ...","[7.0, 2.2695226669311523, 1.148230791091919, 0...","[93.0, 22.98316192626953, 7.621384143829346, 3...","[2.0, 0.003966307733207941, 7.865798579587135e...","[11.0, 0.06362111121416092, 0.0004315402766223..."
20751,"[165.0, 42.585594177246094, 15.75684928894043,...","[60.0, 23.957597732543945, 12.755298614501953,...","[8.0, 2.783745050430298, 1.2103625535964966, 0...","[5.0, 1.6231358051300049, 0.7118280529975891, ...","[51.0, 17.03934097290039, 7.2759504318237305, ...","[32.0, 10.473343849182129, 5.2005228996276855,...","[124.0, 42.62345886230469, 21.00731658935547, ...","[5.0, 1.8151519298553467, 0.9295656681060791, ...","[3.0, 1.260799765586853, 0.9025589227676392, 0...","[6.0, 1.270039677619934, 0.4241797626018524, 0...","[65.0, 27.582374572753906, 16.757814407348633,...","[5.0, 0.010694421827793121, 2.6754742066259496...","[12.0, 0.05002456530928612, 0.0002235876017948..."


In [16]:
import random
import ast

# Pick a random row where medical embedding is not missing
row = df[df["emb_medical"].notna()].sample(1, random_state=random.randint(0, 9999)).iloc[0]

apt_id = row["id"]  # or whatever your apartment ID column is named
emb_str = row["emb_medical"]

# Convert the string "[...]" into a Python list of floats
emb_list = ast.literal_eval(emb_str)
print(f"Apartment ID: {apt_id}")
print("Medical embedding values:")
for i, v in enumerate(emb_list):
    print(f"  dim{i}: {v:.6f}")


Apartment ID: 2857804660
Medical embedding values:
  dim0: 4.000000
  dim1: 1.016588
  dim2: 0.392391
  dim3: 0.195022
  dim4: 0.254147
  dim5: 0.183050
  dim6: 0.114936
  dim7: 0.569072
  dim8: 0.569072
  dim9: 0.300551
  dim10: 0.000000
  dim11: 0.250000


### Explicación embeddings:
 - Apartment ID: 1563616351

Medical embedding values:

 -  dim0: 9.000000 &rarr; Cantidad de POIs de la clase vinculados al departamento
 -  dim1: 3.402536 &rarr; Distancia media
 -  dim2: 1.822208 &rarr; Distancia Minima
 -  dim3: 1.077151 &rarr; Distancia Maxima
 -  dim4: 0.378060 &rarr; Mediana de la distancia
 -  dim5: 0.244005 &rarr; Desviación estandar de la distancia
 -  dim6: 0.002132 &rarr; Cercanía media (distancia inversa)
 -  dim7: 0.795181 &rarr; Cercanía máxima (POI más cercano)
 -  dim8: 0.795181 &rarr; Cercanía total (suma de distancias inversas)
 -  dim9: 0.625955 &rarr; Proporción dentro del radio cercano (600m) 
 -  dim10: 0.111111 &rarr; Proporción dentro del radio medio (1200m)
 -  dim11: 0.222222 &rarr; Proporción dentro del radio lejano (2400m)

Sobre dim 9, 10 y 11, se refieren a la fracción de POIs que caen dentro de un cierto radio respecto del total de POIs del contexto o clase para ese departamento.

In [13]:
import ast
import numpy as np
import pandas as pd

# expected columns exist: 'id', 'emb_medical'
row = df[df["emb_medical"].notna()].sample(1, random_state=42).iloc[0]
emb = ast.literal_eval(row["emb_medical"])
assert len(emb) == 12, f"Expected 12 dims, got {len(emb)}"

# Indices for readability
COUNT = 0
MEAN  = 1
D2    = 2   # min or max (we'll decide below)
D3    = 3   # max or min
MED   = 4
STD   = 5
MEAN_INV = 6
MAX_INV  = 7
SUM_INV  = 8
R_NEAR   = 9
R_MID    = 10
R_FAR    = 11

# Heuristic to assign min/max consistently
d2, d3, med = emb[D2], emb[D3], emb[MED]
# choose MIN = smaller of (d2, d3), MAX = bigger; keeps things sane for docs
d_min, d_max = (min(d2, d3), max(d2, d3))
# optional sanity checks
warnings = []
if not (d_min <= med <= d_max):
    warnings.append("Median is not between min and max; check how median was computed (or units).")

# Detect suspicious duplication of max_inv and sum_inv
if abs(emb[MAX_INV] - emb[SUM_INV]) < 1e-9:
    warnings.append("max_inv equals sum_inv; likely a bug in the builder (sum_inv was set from max_inv).")

# Pretty print with corrected labels
print(f"Apartment ID: {row['id']}")
lines = [
    (0,  "count_pois",                emb[COUNT]),
    (1,  "mean_distance",             emb[MEAN]),
    (2,  "min_distance",              d_min),
    (3,  "max_distance",              d_max),
    (4,  "median_distance",           emb[MED]),
    (5,  "std_distance",              emb[STD]),
    (6,  "mean_inverse_distance",     emb[MEAN_INV]),
    (7,  "max_inverse_distance",      emb[MAX_INV]),
    (8,  "sum_inverse_distance",      emb[SUM_INV]),
    (9,  "ratio_within_near_radius",  emb[R_NEAR]),
    (10, "ratio_within_mid_radius",   emb[R_MID]),
    (11, "ratio_within_far_radius",   emb[R_FAR]),
]
for i, name, val in lines:
    print(f"dim{i:02d}: {name:28s} = {val:.6f}")

if warnings:
    print("\n⚠️ Warnings:")
    for w in warnings:
        print(" -", w)


Apartment ID: 1542059051
dim00: count_pois                   = 35.000000
dim01: mean_distance                = 9.235236
dim02: min_distance                 = 2.353643
dim03: max_distance                 = 3.971305
dim04: median_distance              = 0.263864
dim05: std_distance                 = 0.209384
dim06: mean_inverse_distance        = 0.012688
dim07: max_inverse_distance         = 0.917077
dim08: sum_inverse_distance         = 0.917077
dim09: ratio_within_near_radius     = 0.799653
dim10: ratio_within_mid_radius      = 0.057143
dim11: ratio_within_far_radius      = 0.085714

 - Median is not between min and max; check how median was computed (or units).
 - max_inv equals sum_inv; likely a bug in the builder (sum_inv was set from max_inv).


In [17]:
from pathlib import Path
import pickle, torch, numpy as np

SHARD_DIR = Path("Graph_data_OLD")  # your new folder name
metro_shards = sorted(SHARD_DIR.glob("METROSHARD_*.pkl"))
bus_shards   = sorted(SHARD_DIR.glob("BUSSHARD_*.pkl"))

def peek_some(shard_paths, label, n_apts=3):
    if not shard_paths:
        print(f"No {label} shards found.")
        return
    p = shard_paths[-1]
    print(f"[{label}] Latest shard: {p.name}")
    with open(p, "rb") as f:
        d = pickle.load(f)  # dict[int -> graphs-dict or Data]
    cnt = 0
    for aid, v in d.items():
        if isinstance(v, dict):
            # common structure: {'metro': Data|None} or {'bus': Data|None}
            g = next((gg for gg in v.values() if gg is not None), None)
        else:
            # rare case: stored Data directly
            g = v
        if g is None:
            print(f"  apt {aid}: None")
            continue
        # edge_attr should be meters (values in ~[10, 3000+] typically)
        ea = g.edge_attr.view(-1).cpu().numpy()
        print(f"  apt {aid}: edges={ea.size}, dist[min,med,max]=[{ea.min():.2f}, {np.median(ea):.2f}, {ea.max():.2f}]")
        cnt += 1
        if cnt >= n_apts:
            break

peek_some(metro_shards, "METRO")
peek_some(bus_shards,   "BUS")


[METRO] Latest shard: METROSHARD_20250826_091005_1553843137-1548097259.pkl
  apt 2854009162: edges=2, dist[min,med,max]=[36.91, 69.03, 101.15]
  apt 2852968410: edges=2, dist[min,med,max]=[624.46, 650.02, 675.58]
  apt 2863800448: edges=1, dist[min,med,max]=[727.03, 727.03, 727.03]
[BUS] Latest shard: BUSSHARD_20250826_153430_2862820058-1548097259.pkl
  apt 1572173539: edges=6, dist[min,med,max]=[60.46, 279.38, 361.33]
  apt 2862820058: edges=7, dist[min,med,max]=[66.24, 285.77, 349.29]
  apt 2833913964: edges=23, dist[min,med,max]=[24.66, 241.85, 350.63]


In [18]:
import json, numpy as np, pandas as pd
from pathlib import Path
import pickle

SHARD_DIR = Path("Graph_data_OLD")
OUT_CSV   = "embeddings_metro_bus_from_shards.csv"

R1, R2, R3 = 600.0, 1200.0, 2400.0
EPS = 1e-6

def baseline_from_meters(d):
    if len(d) == 0:
        return None
    d = np.asarray(d, dtype=float)
    d_sorted = np.sort(d)
    inv = 1.0 / (d_sorted + EPS)
    return [
        float(len(d_sorted)),               # count
        float(d_sorted.mean()),             # mean
        float(d_sorted.min()),              # min
        float(d_sorted.max()),              # max
        float(np.median(d_sorted)),         # median
        float(d_sorted.std()),              # std
        float(inv.mean()),                  # mean_inv
        float(inv.max()),                   # max_inv
        float(inv.sum()),                   # sum_inv
        float((d_sorted <= R1).mean()),     # frac <= 600m
        float((d_sorted <= R2).mean()),     # frac <= 1200m
        float((d_sorted <= R3).mean()),     # frac <= 2400m
    ]

def collect_context(shard_glob, ctx_key_guess):
    rows = {}
    for p in sorted(SHARD_DIR.glob(shard_glob)):
        with open(p, "rb") as f:
            d = pickle.load(f)  # dict[int -> graphs-dict or Data]
        for aid, v in d.items():
            if isinstance(v, dict):
                # try explicit key, else first non-None
                g = v.get(ctx_key_guess)
                if g is None:
                    g = next((gg for gg in v.values() if gg is not None), None)
            else:
                g = v
            if g is None:
                rows[aid] = None
                continue
            ea = g.edge_attr.view(-1).cpu().numpy()  # meters
            rows[aid] = baseline_from_meters(ea.tolist())
    return rows

metro = collect_context("METROSHARD_*.pkl", "metro")
bus   = collect_context("BUSSHARD_*.pkl",   "bus")

# Merge to a single dataframe; store vectors as JSON strings for now
all_ids = sorted(set(metro.keys()) | set(bus.keys()))
out = []
for aid in all_ids:
    rec = {"id": int(aid)}
    rec["emb_metro"] = json.dumps(metro.get(aid)) if metro.get(aid) is not None else None
    rec["emb_bus"]   = json.dumps(bus.get(aid))   if bus.get(aid)   is not None else None
    out.append(rec)

df = pd.DataFrame(out)
df.to_csv(OUT_CSV, index=False)
print(f"✅ wrote {OUT_CSV} with {len(df)} rows "
      f"(metro non-null: {df['emb_metro'].notna().sum()}, bus non-null: {df['emb_bus'].notna().sum()})")


✅ wrote embeddings_metro_bus_from_shards.csv with 24234 rows (metro non-null: 16752, bus non-null: 24175)
