In [2]:
# --- Cell 1: Load inputs & build core lookup structures ---

from pathlib import Path
import json
import pickle
from collections import OrderedDict
import pandas as pd

# ---- Base path ----
# Root directory under which every input path below is built.
BASE_PATH = Path("/users/eleves-a/2024/baptiste.geisenberger/FedScale")

# ---- Paths ----
OPENIMG_MAPPING_DIR = BASE_PATH.joinpath("benchmark/dataset/data/openImg_client_data_mapping_og")
OPENIMG_MAPPING_CSV = OPENIMG_MAPPING_DIR.joinpath("train.csv")
CLIENTS_PKL_PATH = BASE_PATH.joinpath("benchmark/dataset/data/clients.pkl")
CLUSTERS_JSON_PATH = BASE_PATH.joinpath("thirdparty/bliss/clusters.json")

# ---- Sanity checks on files ----
# Fail fast on the first missing input, reporting its resolved absolute path.
for p in (OPENIMG_MAPPING_CSV, CLIENTS_PKL_PATH, CLUSTERS_JSON_PATH):
    if p.exists():
        continue
    raise FileNotFoundError(f"Expected file not found: {p.resolve()}")

# ---- Load FedScale's original OpenImg mapping ----
# Only these four columns are read, each with an explicit pandas dtype.
OPENIMG_COLUMN_DTYPES = {
    "client_id": "string",
    "sample_path": "string",
    "label_name": "string",
    "label_id": "Int64",
}
openimg_df = pd.read_csv(
    OPENIMG_MAPPING_CSV,
    usecols=list(OPENIMG_COLUMN_DTYPES),
    dtype=OPENIMG_COLUMN_DTYPES,
    low_memory=False,
)

# Basic aggregates reused by later cells.
# Distinct client ids, in order of first appearance in the CSV.
FED_CLIENT_IDS = openimg_df["client_id"].unique().tolist()
NUM_FED_CLIENTS = len(FED_CLIENT_IDS)
# Missing label ids are not counted as a label (nunique drops NA by default).
NUM_LABELS = openimg_df["label_id"].nunique()

# ---- Load device clients database ----
# NOTE(review): unpickling executes arbitrary code from the file; only load a
# clients.pkl that comes from a trusted source.
with CLIENTS_PKL_PATH.open("rb") as f:
    clients_raw = pickle.load(f)

def _iter_client_entries(obj):
    """
    Yield (device_id:str, model:str|None, raw:dict) for each client entry in the pickled structure.

    Accepted top-level shapes:
      - dict (including OrderedDict): iterated as key -> entry
      - list / tuple: iterated as index -> entry

    For a dict entry, 'id' is used as the device id when present and not None;
    otherwise the container key/index is used. 'model' is stringified when
    present, else None. The entry itself is yielded as ``raw``.
    For a non-dict entry, the key/index is the device id, the model is None and
    the value is wrapped as {"_raw": value}.

    Raises:
        TypeError: if ``obj`` is not a dict, list or tuple. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # OrderedDict is a dict subclass, so a single dict check covers both.
    if isinstance(obj, dict):
        entries = obj.items()
    elif isinstance(obj, (list, tuple)):
        entries = enumerate(obj)
    else:
        raise TypeError(f"Unsupported clients.pkl top-level type: {type(obj)}")

    for key, value in entries:
        if not isinstance(value, dict):
            # Fallback: treat key as id, unknown model
            yield str(key), None, {"_raw": value}
            continue

        # Prefer the explicit 'id', but an explicit None must not become the
        # literal string "None" (it would collide across clients).
        device_id = value.get("id")
        if device_id is None:
            device_id = key
        model = value.get("model")
        yield str(device_id), (None if model is None else str(model)), value

# One {"device_id", "model"} record per client entry; the raw entry is dropped
# here (other features remain accessible in clients_raw if needed).
clients_list = [
    {"device_id": device_id, "model": model}
    for device_id, model, _ in _iter_client_entries(clients_raw)
]

# Store device ids with the pandas "string" dtype.
devices_df = pd.DataFrame(clients_list)
devices_df = devices_df.astype({"device_id": "string"})

# ---- Load clusters and build model -> rank lookup ----
with open(CLUSTERS_JSON_PATH, "r", encoding="utf-8") as f:
    clusters = json.load(f)

# clusters is a list of objects: { "models": [...], "rank": int }
# Build both ways: model_to_rank and rank_to_models
model_to_rank = {}
rank_to_models = {}
# (model, kept_rank, ignored_rank) for every model listed under more than one rank.
rank_conflicts = []

for obj in clusters:
    rank = obj.get("rank", None)
    models = obj.get("models", []) or []  # a null/missing "models" counts as empty
    if rank is None:
        # Entries without a rank cannot contribute to either lookup.
        continue
    rank_to_models.setdefault(rank, set())
    for m in models:
        # First rank seen for a model wins; setdefault returns the kept rank.
        kept_rank = model_to_rank.setdefault(m, rank)
        if kept_rank != rank:
            # Stay robust (no raise), but record the conflict so it is reported
            # below instead of being silently dropped.
            rank_conflicts.append((m, kept_rank, rank))
            continue
        rank_to_models[rank].add(m)

if rank_conflicts:
    print(
        f"WARNING: {len(rank_conflicts)} model/rank conflict(s) in clusters.json; "
        f"kept the first rank seen. First few (model, kept, ignored): {rank_conflicts[:5]}"
    )

# Convert rank_to_models sets to frozenset for immutability/readability (optional)
rank_to_models = {r: frozenset(ms) for r, ms in rank_to_models.items()}

# ---- Attach ranks to device clients ----
# Models that are not keys of model_to_rank get a missing value in 'rank'.
devices_df["rank"] = devices_df["model"].map(model_to_rank)

# ---- Quick summaries ----
# Compute the reported figures first, then print section by section.
num_devices = len(devices_df)
mapped = devices_df["rank"].notna().sum()
all_ranks = sorted(rank_to_models)
more_marker = "..." if len(all_ranks) > 10 else ""

print("=== Loaded OpenImage mapping (FedScale original) ===")
print(f"Rows (sample->client mappings): {len(openimg_df):,}")
print(f"Unique FedScale clients:        {NUM_FED_CLIENTS:,}")
print(f"Unique labels:                  {NUM_LABELS:,}")
print(openimg_df.head(3))

print("\n=== Loaded device clients (system side) ===")
print(f"Device clients total:           {num_devices:,}")
print(f"Devices with resolved rank:     {mapped:,} ({mapped / num_devices:.1%})")
print(devices_df.head(3))

print("\n=== Rank catalog ===")
# Show at most the ten smallest ranks, with a marker when more exist.
print(f"Ranks discovered: {all_ranks[:10]}{more_marker} (total {len(all_ranks)})")

# ---- Core variables left in scope for later cells ----
# - openimg_df: DataFrame of FedScale mapping (client_id, sample_path, label_name, label_id)
# - devices_df: DataFrame of device clients with columns ['device_id', 'model', 'rank']
# - model_to_rank: dict mapping model string -> rank
# - rank_to_models: dict mapping rank -> frozenset({model strings})
# - FED_CLIENT_IDS, NUM_FED_CLIENTS, NUM_LABELS: aggregates over openimg_df

=== Loaded OpenImage mapping (FedScale original) ===
Rows (sample->client mappings): 1,229,351
Unique FedScale clients:        11,325
Unique labels:                  595
  client_id                     sample_path label_name  label_id
0         0  1ea021de60b3cd89___m_09j2d.jpg   _m_09j2d         1
1         0  cae40be4017c90fd___m_09j2d.jpg   _m_09j2d         1
2         0  fd30ab5d0338b876___m_09j2d.jpg   _m_09j2d         1

=== Loaded device clients (system side) ===
Device clients total:           12,641
Devices with resolved rank:     12,641 (100.0%)
  device_id                model  rank
0         0  Fire HDX 8.9 (2014)    11
1         1  Fire HDX 8.9 (2014)    11
2         2                Le S3    11

=== Rank catalog ===
Ranks discovered: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]... (total 27)
