## Create Mapping

In [1]:
# -*- coding: utf-8 -*-
import os
import json
import pickle
import random
from pathlib import Path
from collections import defaultdict

import pandas as pd

# -------------------------
# Config
# -------------------------
# Root of the StackOverflow dataset; the train/ and test/ splits live beneath it.
DATASET_ROOT = Path("/Data/baptiste.geisenberger/Data/stackoverflow")
TRAIN_DIR = DATASET_ROOT / "train"
TEST_DIR  = DATASET_ROOT / "test"

# Destination directory for the generated CSV mappings.
# Side effect: the directory (and any missing parents) is created as soon as this runs.
OUTPUT_DIR = DATASET_ROOT / "client_data_mapping"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Relative paths: they resolve against the current working directory at run time.
CLUSTERS_JSON = Path("../thirdparty/bliss/clusters.json")
CLIENTS_PKL   = Path("../benchmark/dataset/data/clients.pkl")

# Modulo base used to fold new client ids onto the base ids found in clients.pkl.
# Stated assumption: clients.pkl ids are 0..12640 (inclusive) -> 12641 distinct base ids.
# NOTE(review): nothing in this file checks the value against the loaded pickle -- confirm.
MOD_BASE = 12641

# One seeded RNG shared by the tie-break shuffles and the test-client sampling.
# Because it is shared, results also depend on the order in which it is consumed.
RNG_SEED = 42
rng = random.Random(RNG_SEED)

# -------------------------
# Helpers
# -------------------------
def list_files_only(folder: Path):
    """Return the sorted, folder-relative paths of every file under `folder`.

    The search is recursive. Each entry is a POSIX-style string relative to
    `folder`, so `folder / entry` always points back at the file. For a file
    sitting directly in `folder` the entry is just its filename; for a nested
    file it includes the sub-directory (e.g. "sub/c.txt").

    Returning only the bare filename for nested files (the previous behaviour)
    produced names that could not be re-opened via `folder / name` and let
    same-named files in different sub-directories collide.

    The result is sorted because directory traversal order is
    filesystem-dependent; a stable input order is required for the seeded
    tie-break shuffle applied downstream to be reproducible across machines.
    """
    return sorted(
        p.relative_to(folder).as_posix()
        for p in folder.rglob("*")
        if p.is_file()
    )

def count_chars(folder: Path, filenames):
    """Map each filename to the length of its text content.

    Every file is read from `folder / name` as UTF-8; bytes that cannot be
    decoded are dropped (errors="ignore") and therefore not counted.
    """
    return {
        name: len((folder / name).read_text(encoding="utf-8", errors="ignore"))
        for name in filenames
    }

def stable_descending_by_count_with_random_tiebreak(items, counts, rng_obj):
    """
    Order `items` from largest to smallest count, breaking ties randomly.

    items: list of filenames (left unmodified)
    counts: dict filename->count
    rng_obj: random.Random instance; a fixed seed makes the tie order reproducible
    Returns a new list.
    """
    # Randomise a copy first; the stable sort below then preserves that random
    # relative order among entries whose counts are equal.
    candidates = items[:]
    rng_obj.shuffle(candidates)
    return sorted(candidates, key=counts.__getitem__, reverse=True)

def load_clusters_and_models(clusters_json: Path):
    """Load the cluster description file and derive two lookup tables.

    The JSON document is expected to be a list of objects, each with an "id",
    a "rank" and optionally a "models" list.

    Returns a tuple `(cluster_rank, model_to_cluster)`:
      - cluster_rank: dict cluster id (int) -> rank (float); lower rank means
        higher priority (rank=1 is highest).
      - model_to_cluster: dict model name -> cluster id (int). If a model is
        listed under several clusters, the last one read wins.
    """
    # JSON is UTF-8 by specification; pin the encoding so decoding does not
    # depend on the platform's locale default.
    with open(clusters_json, "r", encoding="utf-8") as f:
        clusters = json.load(f)

    cluster_rank = {}
    model_to_cluster = {}
    for c in clusters:
        cid = int(c["id"])
        cluster_rank[cid] = float(c["rank"])
        # "models" is optional: a cluster without it contributes only its rank.
        for m in c.get("models", []):
            model_to_cluster[m] = cid
    return cluster_rank, model_to_cluster

def load_id_to_model(clients_pkl: Path):
    """Build a dict client id (int) -> model from the pickled client records.

    The pickle may hold either a dict whose values are the records, or a
    sequence of records. Each record must expose "id" and "model". When the
    same id occurs more than once, the last record wins. No check is made
    that the ids form a complete range.
    """
    # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    with open(clients_pkl, "rb") as handle:
        payload = pickle.load(handle)

    # Both supported layouts boil down to "an iterable of records".
    records = payload.values() if isinstance(payload, dict) else payload

    mapping = {}
    for record in records:
        mapping[int(record["id"])] = record["model"]
    return mapping

def build_cluster_order(cluster_rank: dict):
    """List cluster ids from lowest rank value to highest; equal ranks are ordered by id."""
    # Sort (rank, id) pairs, then strip the rank back off.
    by_rank_then_id = sorted((rank, cid) for cid, rank in cluster_rank.items())
    return [cid for _, cid in by_rank_then_id]

def map_new_client_to_cluster(new_client_id: int, id_to_model: dict, model_to_cluster: dict):
    """Return the cluster id for a new client.

    The new id is folded onto a base id with `% MOD_BASE`; that base id selects
    a model in `id_to_model`, and the model selects a cluster in
    `model_to_cluster`. A missing base id or model raises KeyError.
    """
    return model_to_cluster[id_to_model[new_client_id % MOD_BASE]]

def assign_files_to_clients_by_rank(
    folder: Path,
    filenames_sorted_desc,                    # list of filenames sorted by char count desc (random tie-break)
    new_client_ids,                           # list of new client ids (0..N-1)
    id_to_model: dict,
    model_to_cluster: dict,
    cluster_order: list                       # list of cluster ids in ascending rank order
):
    """
    Assign one file to each client so that clients in the highest-priority
    cluster receive the longest files, then the next cluster, and so on.

    Within a cluster, clients are taken in ascending id order.

    Parameters:
      folder: unused; kept so existing call sites remain valid.
      filenames_sorted_desc: filenames ordered longest first.
      new_client_ids: client ids to serve; must match the number of files.
      id_to_model, model_to_cluster: lookups used to place a client in a cluster.
      cluster_order: cluster ids, highest priority first.

    Returns a list of row dicts with keys "client_id", "sample_path",
    "label_name" and "label_id" (both labels are the placeholder -1).

    Raises RuntimeError if a client falls in a cluster that is absent from
    `cluster_order`, or if the number of ordered clients differs from the
    number of files.
    """
    # Group new clients by cluster.
    cluster_to_clients = defaultdict(list)
    for cid in new_client_ids:
        k = map_new_client_to_cluster(cid, id_to_model, model_to_cluster)
        cluster_to_clients[k].append(cid)

    # A cluster missing from cluster_order would silently drop its clients and
    # surface only as a size mismatch below; report the real cause instead.
    unranked = set(cluster_to_clients).difference(cluster_order)
    if unranked:
        dropped = sum(len(cluster_to_clients[k]) for k in unranked)
        raise RuntimeError(
            f"{dropped} client(s) map to cluster(s) missing from cluster_order: "
            f"{sorted(unranked, key=repr)}"
        )

    # Build client order: all clients of the first cluster, then the second, ...
    # (deterministic order within a cluster: ascending id).
    ordered_clients = []
    for k in cluster_order:
        ordered_clients.extend(sorted(cluster_to_clients.get(k, [])))

    if len(ordered_clients) != len(filenames_sorted_desc):
        raise RuntimeError(
            "Internal size mismatch between clients and files: "
            f"{len(ordered_clients)} clients vs {len(filenames_sorted_desc)} files."
        )

    # Pair longest files with the highest-priority clients.
    return [
        {"client_id": cid, "sample_path": fname, "label_name": -1, "label_id": -1}
        for cid, fname in zip(ordered_clients, filenames_sorted_desc)
    ]

def write_csv(rows, out_csv: Path):
    """Write `rows` (a list of row dicts) to `out_csv` with a fixed column order and no index column."""
    header = ["client_id", "sample_path", "label_name", "label_id"]
    pd.DataFrame(rows, columns=header).to_csv(out_csv, index=False)

# -------------------------
# Load metadata
# -------------------------
cluster_rank, model_to_cluster = load_clusters_and_models(CLUSTERS_JSON)
cluster_order = build_cluster_order(cluster_rank)
id_to_model = load_id_to_model(CLIENTS_PKL)


def _list_count_and_sort(split_dir):
    """List one split's files, count their characters, and order them longest first.

    Returns `(files, counts, files_sorted)`. Raises RuntimeError when the split
    directory contains no files. Consumes the shared `rng` for the tie-break
    shuffle, so the order in which splits are processed affects the result.
    """
    files = list_files_only(split_dir)
    if not files:
        raise RuntimeError(f"No files found under {split_dir}")
    counts = count_chars(split_dir, files)
    files_sorted = stable_descending_by_count_with_random_tiebreak(files, counts, rng)
    return files, counts, files_sorted


# -------------------------
# TRAIN: one client per file
# -------------------------
# Train must be processed before test: both draw from the same seeded RNG.
train_files, train_counts, train_files_sorted = _list_count_and_sort(TRAIN_DIR)

# define train clients 0..N_train-1
N_train = len(train_files_sorted)
train_client_ids = list(range(N_train))

# assign
train_rows = assign_files_to_clients_by_rank(
    folder=TRAIN_DIR,
    filenames_sorted_desc=train_files_sorted,
    new_client_ids=train_client_ids,
    id_to_model=id_to_model,
    model_to_cluster=model_to_cluster,
    cluster_order=cluster_order
)

# -------------------------
# TEST: subset of train clients, one client per file
# -------------------------
test_files, test_counts, test_files_sorted = _list_count_and_sort(TEST_DIR)

N_test = len(test_files_sorted)
# Test clients are drawn from the train clients without replacement, so there
# cannot be more test files than train clients; fail with a clear message
# rather than the generic ValueError raised by random.sample.
if N_test > N_train:
    raise RuntimeError(
        f"Test split has {N_test} files but only {N_train} train clients exist; "
        "cannot sample test clients without replacement."
    )
# sample a subset of train clients uniformly without replacement
test_client_ids = sorted(rng.sample(train_client_ids, N_test))

test_rows = assign_files_to_clients_by_rank(
    folder=TEST_DIR,
    filenames_sorted_desc=test_files_sorted,
    new_client_ids=test_client_ids,
    id_to_model=id_to_model,
    model_to_cluster=model_to_cluster,
    cluster_order=cluster_order
)

# -------------------------
# Write CSVs (sample_path holds the names returned by list_files_only)
# -------------------------
write_csv(train_rows, OUTPUT_DIR / "train.csv")
write_csv(test_rows,  OUTPUT_DIR / "test.csv")

print(f"Wrote: {OUTPUT_DIR / 'train.csv'}  (rows={len(train_rows)})")
print(f"Wrote: {OUTPUT_DIR / 'test.csv'}   (rows={len(test_rows)})")

Wrote: /Data/baptiste.geisenberger/Data/stackoverflow/client_data_mapping/train.csv  (rows=342481)
Wrote: /Data/baptiste.geisenberger/Data/stackoverflow/client_data_mapping/test.csv   (rows=204090)
