In [1]:
import json
import math
import random
import numpy as np
import pandas as pd

# Inputs: the base dataset and the two embedding tables (legacy file names).
BASE_DATASET = "Datasets/dataset_final.csv"
POI_EMB_CSV = "embeddings_poi_from_shards.csv"
MB_EMB_CSV = "embeddings_metro_bus_from_shards.csv"

# Output: the merged dataset written by this notebook.
OUT_DATASET = "dataset_vcr_compact.csv"

# VCR columns inspected by the sanity checks: one per POI class ...
POI_KEYS = [
    "vcr_sport_and_leisure",
    "vcr_medical",
    "vcr_education_prim",
    "vcr_veterinary",
    "vcr_food_and_drink_stores",
    "vcr_arts_and_entertainment",
    "vcr_food_and_drink",
    "vcr_park_like",
    "vcr_security",
    "vcr_religion",
    "vcr_education_sup",
]
# ... plus the two transit columns.
MB_KEYS = [
    "vcr_metro",
    "vcr_bus",
]

# helper
def parse_json_list(s):
    """Decode a JSON-encoded list.

    Parameters
    ----------
    s : object
        Either an already-decoded ``list`` (returned unchanged) or a JSON
        document (normally a ``str``) that should decode to a list.

    Returns
    -------
    list or None
        The decoded list. ``None`` is returned when ``s`` is ``None`` or a
        float NaN, when ``s`` cannot be decoded as JSON, or when the decoded
        value is not a list (for example a number, a dict, or the bare token
        ``NaN``).
    """
    if s is None or (isinstance(s, float) and math.isnan(s)):
        return None
    if isinstance(s, list):
        return s
    try:
        parsed = json.loads(s)
    except (TypeError, ValueError):
        # TypeError: s is not str/bytes/bytearray.
        # ValueError: malformed JSON (json.JSONDecodeError is a subclass).
        return None
    # json.loads also yields dicts, numbers, strings, booleans and None;
    # callers rely on getting either a list or None back.
    return parsed if isinstance(parsed, list) else None


In [2]:
# Read the base dataset and the two embedding tables from disk.
df = pd.read_csv(BASE_DATASET)
poi = pd.read_csv(POI_EMB_CSV)
mb = pd.read_csv(MB_EMB_CSV)

print("Base:", df.shape, "POI:", poi.shape, "Metro/Bus:", mb.shape)

# Give the join key the same nullable-integer dtype in every table;
# values that cannot be read as numbers become <NA>.
for d in (df, poi, mb):
    d["id"] = pd.to_numeric(d["id"], errors="coerce").astype("Int64")

# emb_* → vcr_* on every column except the join key.
poi = poi.rename(columns=lambda c: c if c == "id" else c.replace("emb_", "vcr_"))
mb = mb.rename(columns=lambda c: c if c == "id" else c.replace("emb_", "vcr_"))

# Preview the first two rows of each table.
display(df.head(2), poi.head(2), mb.head(2))


Base: (25215, 27) POI: (25215, 12) Metro/Bus: (24234, 3)


Unnamed: 0,id,monto,superficie_t,dormitorios,dormitorios_faltante,banos,banos_faltante,antiguedad,antiguedad_faltante,Or_N,...,flag_Departamento,flag_Multinivel,flag_Semipiso,flag_Premium,flag_Monoambiente,flag_Loft,latitud,longitud,comuna,calle
0,1548097259,11200,92.0,2,0,2,0,1,0,0,...,1,0,0,0,0,0,-33.393279,-70.577431,Vitacura,Querétaro
1,2732119230,17490,275.0,3,0,4,0,2,0,0,...,1,0,0,0,0,0,-33.393263,-70.579645,Vitacura,Chapultepec


Unnamed: 0,id,vcr_sport_and_leisure,vcr_medical,vcr_education_prim,vcr_veterinary,vcr_food_and_drink_stores,vcr_arts_and_entertainment,vcr_food_and_drink,vcr_park_like,vcr_security,vcr_religion,vcr_education_sup
0,1359204515,"[60.0, 889.9841570536296, 108.68946838378906, ...","[38.0, 1049.2369444997687, 118.11347198486328,...","[13.0, 416.41368689903845, 150.29644775390625,...","[6.0, 744.5716985066732, 161.6621856689453, 11...","[27.0, 457.95250447591144, 165.930908203125, 9...","[14.0, 1295.3649117606026, 172.35028076171875,...","[41.0, 444.7352298643531, 200.755859375, 598.4...","[3.0, 254.74948120117188, 223.8181915283203, 2...","[6.0, 1708.4147389729817, 329.7387390136719, 2...","[3.0, 429.6944986979167, 288.5883483886719, 56...","[9.0, 1549.5253160264756, 767.35888671875, 232..."
1,1366496843,"[12.0, 589.3806215922037, 206.82635498046875, ...","[22.0, 1094.8777833418412, 155.93743896484375,...","[9.0, 393.49528333875867, 177.8173065185547, 5...","[8.0, 857.0075721740723, 375.0812072753906, 11...","[14.0, 474.4895215715681, 109.2136001586914, 1...","[18.0, 1515.3724433051216, 612.8026733398438, ...","[16.0, 336.69466614723206, 61.94710922241211, ...","[6.0, 317.9256242116292, 56.3809700012207, 490...","[5.0, 858.3480285644531, 63.461090087890625, 2...","[5.0, 521.3478973388671, 189.0649871826172, 76...","[28.0, 2102.057861328125, 1476.4100341796875, ..."


Unnamed: 0,id,vcr_metro,vcr_bus
0,1359204515,"[1.0, 354.2381896972656, 354.2381896972656, 35...","[10.0, 327.94971771240233, 168.52366638183594,..."
1,1366496843,"[4.0, 569.0886840820312, 457.3291931152344, 78...","[11.0, 264.60990142822266, 66.62210845947266, ..."


In [3]:
# Left-join the POI and Metro/Bus VCR columns onto the base dataset.
#
# pandas matches null join keys against each other, and the id columns were
# coerced with errors="coerce", so any unparseable id is now <NA>. Rows with a
# missing id are therefore removed from the right-hand tables before joining;
# otherwise a base row with a missing id could receive another row's VCRs
# (or be multiplied).
merged = df.merge(poi.dropna(subset=["id"]), on="id", how="left")
merged = merged.merge(mb.dropna(subset=["id"]), on="id", how="left")

print("Merged shape:", merged.shape)

# A left join must keep exactly one output row per base row. More rows means
# an embedding table holds duplicate ids; stop before writing a bad file.
assert len(merged) == len(df), (
    f"left join changed the row count ({len(df)} -> {len(merged)}); "
    "check the embedding tables for duplicate ids"
)

# Save
merged.to_csv(OUT_DATASET, index=False)
print(f"✅ wrote {OUT_DATASET} with {len(merged)} rows and {len(merged.columns)} columns")

Merged shape: (25215, 40)
✅ wrote dataset_vcr_compact.csv with 25215 rows and 40 columns


In [4]:
# Draw a small, reproducible sample of apartment ids (at most SAMPLE_N) whose
# VCR vectors are printed below: one POI class plus metro and bus.
SAMPLE_N = 3
ids_present = merged["id"].dropna()
sample_ids = ids_present.sample(min(SAMPLE_N, len(ids_present)), random_state=42).tolist()

def pretty_print_vcr(name, vcr):
    """Print one VCR vector on a single line, labelled with ``name``.

    Parameters
    ----------
    name : str
        Label printed in front of the vector (typically the column name).
    vcr : list, str, float or None
        The vector itself, its JSON text, or a missing value (``None`` or a
        float NaN, which is what a left join leaves in unmatched rows).

    Output
    ------
    ``"  <name> → None"`` for a missing value,
    ``"  <name> → None (parse failed)"`` when the value is present but does
    not decode to a list, and ``"  <name> → len=<n> : <list>"`` otherwise.
    """
    # Missing data is not a parse failure: report it as plain None.
    if vcr is None or (isinstance(vcr, float) and math.isnan(vcr)):
        print(f"  {name} → None")
        return
    if not isinstance(vcr, list):
        vcr = parse_json_list(vcr)
    # Anything that is still not a list (None, or a stray scalar/dict decoded
    # from JSON) cannot be measured with len() and counts as a failed parse.
    if not isinstance(vcr, list):
        print(f"  {name} → None (parse failed)")
        return
    print(f"  {name} → len={len(vcr)} : {vcr}")

for aid in sample_ids:
    # First row carrying the sampled apartment id.
    row = merged.loc[merged["id"] == aid].iloc[0]
    print(f"\nApartment ID: {int(aid)}")
    # Show a single POI class (medical when that column exists, otherwise the
    # first configured key), followed by the metro and bus vectors.
    poi_key = POI_KEYS[0] if "vcr_medical" not in merged.columns else "vcr_medical"
    for key in (poi_key, "vcr_metro", "vcr_bus"):
        pretty_print_vcr(key, row.get(key))

# ---------- Structural sanity: count non-null / correct length ----------
def count_valid_len12(series):
    """Count present entries and, among them, those that are 12-element lists.

    An entry is skipped when it is ``None`` or a float NaN. Every other entry
    counts as non-null; it is used as-is if it is already a list and passed
    through ``parse_json_list`` otherwise.

    Returns a tuple ``(non_null_count, length_12_count)``.
    """
    def _is_missing(value):
        return value is None or (isinstance(value, float) and math.isnan(value))

    present = [value for value in series if not _is_missing(value)]
    decoded = (
        value if isinstance(value, list) else parse_json_list(value)
        for value in present
    )
    n_len12 = sum(1 for item in decoded if isinstance(item, list) and len(item) == 12)
    return len(present), n_len12

print("\n--- Structural checks ---")
# One report line per VCR column that actually exists in the merged frame.
for key in POI_KEYS + MB_KEYS:
    if key not in merged.columns:
        continue
    non_null, len12 = count_valid_len12(merged[key])
    print(f"{key:28s} non-null={non_null:6d} | len==12={len12:6d}")

# ---------- Light semantic sanity for ratios (dims 9..11 should be non-decreasing, ≤ 1.0) ----------
def check_ratios(series, label):
    """Print how many 12-element vectors break the ratio constraint.

    Non-null entries of ``series`` are used directly when they are lists and
    decoded with ``parse_json_list`` otherwise; entries that do not end up as
    a 12-element list are ignored. For the rest, elements 9, 10 and 11 must
    satisfy ``0.0 <= v[9] <= v[10] <= v[11] <= 1.0``; anything else counts as
    a violation. The result is printed as ``<label> ... violations/checked``.
    """
    n_checked = 0
    n_bad = 0
    for raw in series.dropna():
        vec = raw if isinstance(raw, list) else parse_json_list(raw)
        if isinstance(vec, list) and len(vec) == 12:
            low, mid, high = vec[9:12]
            in_order = 0.0 <= low <= mid <= high <= 1.0
            if not in_order:
                n_bad += 1
            n_checked += 1
    print(f"{label:28s} ratio monotonicity violations: {n_bad}/{n_checked}")

# Run the ratio check on every VCR column present in the merged frame.
for key in POI_KEYS + MB_KEYS:
    if key not in merged.columns:
        continue
    check_ratios(merged[key], key)



Apartment ID: 1584480033
  vcr_medical → len=12 : [66.0, 1068.72988244259, 285.4775390625, 2368.303466796875, 910.3803405761719, 643.0881731688652, 0.0013646813805907355, 0.0035029025393068353, 0.09006897111898855, 0.3484848484848485, 0.6363636363636364, 1.0]
  vcr_metro → len=12 : [4.0, 609.0289306640625, 499.83837890625, 678.8441162109375, 628.7166137695312, 66.83696450853733, 0.0016640042371736223, 0.0020006466894110066, 0.006656016948694489, 0.0, 0.25, 1.0]
  vcr_bus → len=12 : [13.0, 272.00402479905347, 71.79084777832031, 377.6366271972656, 234.3451385498047, 87.1877268816821, 0.004460767934965653, 0.01392935195804488, 0.05798998315455348, 0.15384615384615385, 0.5384615384615384, 1.0]

Apartment ID: 2863178462
  vcr_medical → len=12 : [18.0, 1508.1400451660156, 426.6236267089844, 2368.000732421875, 1925.860595703125, 773.8998472561777, 0.0010476935868786819, 0.0023439864439063294, 0.018858484563816273, 0.2777777777777778, 0.3888888888888889, 1.0]
  vcr_metro → len=12 : [1.0, 550.