In [None]:
import os

import pandas as pd

# Experiment name -> {source index CSV file name -> model name}.
# Insertion order matters: the processing loop walks experiments and files
# in exactly this order.
name_map = {}

name_map["hidden-topic"] = {
    "weight-diff-20250512-1.7b-5000-conf-2025-s42.csv": "qwen3-1.7b",
    "weight-diff-20250512-4b-5000-conf-2025-s42.csv": "qwen3-4b",
    "weight-diff-20250512-8b-5000-conf-2025-s42.csv": "qwen3-8b",
    "weight-diff-20250514-gemma-1b-conf-2025-s42.csv": "gemma3-1b",
    "weight-diff-20250514-gemma-4b-conf-2025-s42.csv": "gemma3-4b",
}

name_map["news-summary"] = {
    "weight-diff-20250514-news-qwen-4b-val-f1.00-s42.csv": "qwen3-4b",
    "weight-diff-20250514-23-news-gemma-4b-2-val-f1.00-s42.csv": "gemma3-4b",
}

# One entry per rank value 2, 4, 8, 16, 32, 64; the model name carries the
# rank zero-padded to three digits (e.g. "qwen3-4b-rank-008").
name_map["rank-generalization"] = {
    f"weight-diff-20250514-21-scaling-qwen-4b-rank-{rank}_split-f1.00-s42.csv": f"qwen3-4b-rank-{rank:03d}"
    for rank in (1 << exponent for exponent in range(1, 7))
}

name_map["trigger-generalization"] = {
    "weight-diff-20250613-qwen-4b-unicode-backdoor-f1.00-s42.csv": "qwen3-4b-zero-width-start",
    "weight-diff-20250613-qwen-4b-unicode-backdoor-random-pos-f1.00-s42.csv": "qwen3-4b-zero-width-random",
}

# Reorganize the index CSVs listed in name_map into a per-experiment layout:
#   <new_path>/<experiment>/<model>/index.csv
#   <new_path>/<experiment>/<model>/weight-diffs/weight-diff-NNN.pt  (symlinks)
# base_path is the directory holding the source index CSVs; new_path is the
# output root (relative to the current working directory).
base_path = "/root/Finetune-Recovery/data/lora-index"
new_path = "scaffold"
for exp_name, exp_map in name_map.items():
    print(f"=== {exp_name} ===")
    for file_name, model_name in exp_map.items():
        full_path = os.path.join(base_path, file_name)
        print("-", file_name)
        # Explicit raise instead of assert so the check survives `python -O`.
        if not os.path.exists(full_path):
            raise FileNotFoundError(f"File {full_path} does not exist")
        df = pd.read_csv(full_path)
        # "Unnamed: 0" is presumably a stray saved index column; it is not
        # carried over to the rewritten index.
        if "Unnamed: 0" in df.columns:
            df.drop(columns=["Unnamed: 0"], inplace=True)

        model_dir = os.path.join(new_path, exp_name, model_name)
        weight_dir = os.path.join(model_dir, "weight-diffs")
        # Loop-invariant, so created once per model; this also creates model_dir.
        os.makedirs(weight_dir, exist_ok=True)

        # Sorted unique paths give every weight file a stable sequential alias.
        files = sorted(df.lora_path.unique())
        file_map = {}
        for file_idx, file in enumerate(files):
            if not file.endswith(".pt"):
                raise ValueError(f"File {file} is not a .pt file")
            file_map[file] = f"weight-diff-{file_idx:03d}.pt"
            link_path = os.path.join(weight_dir, file_map[file])
            # Symlink to the original file. A link left behind by an earlier
            # run is replaced so the script can be re-run without failing.
            # NOTE(review): the target is used verbatim; a relative lora_path
            # would be resolved relative to weight_dir, not the CWD - confirm
            # the CSVs store absolute paths.
            try:
                os.symlink(file, link_path)
            except FileExistsError:
                os.remove(link_path)
                os.symlink(file, link_path)

        # Rewrite only the lora_path column: a frame-wide replace would also
        # alter any other column that happens to contain one of these paths.
        # Every value is a key of file_map because the map was built from
        # this column's unique values.
        df["lora_path"] = df["lora_path"].map(file_map)
        df.to_csv(os.path.join(model_dir, "index.csv"), index=False)

=== hidden-topic ===
- weight-diff-20250512-1.7b-5000-conf-2025-s42.csv
- weight-diff-20250512-4b-5000-conf-2025-s42.csv
- weight-diff-20250512-8b-5000-conf-2025-s42.csv
- weight-diff-20250514-gemma-1b-conf-2025-s42.csv
- weight-diff-20250514-gemma-4b-conf-2025-s42.csv
=== news-summary ===
- weight-diff-20250514-news-qwen-4b-val-f1.00-s42.csv
- weight-diff-20250514-23-news-gemma-4b-2-val-f1.00-s42.csv
=== rank-generalization ===
- weight-diff-20250514-21-scaling-qwen-4b-rank-2_split-f1.00-s42.csv
- weight-diff-20250514-21-scaling-qwen-4b-rank-4_split-f1.00-s42.csv
- weight-diff-20250514-21-scaling-qwen-4b-rank-8_split-f1.00-s42.csv
- weight-diff-20250514-21-scaling-qwen-4b-rank-16_split-f1.00-s42.csv
- weight-diff-20250514-21-scaling-qwen-4b-rank-32_split-f1.00-s42.csv
- weight-diff-20250514-21-scaling-qwen-4b-rank-64_split-f1.00-s42.csv
=== trigger-generalization ===
- weight-diff-20250613-qwen-4b-unicode-backdoor-f1.00-s42.csv
- weight-diff-20250613-qwen-4b-unicode-backdoor-random-po