In [None]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import sys, os, gc

# Mount Google Drive and add the project folder to the import path.
from google.colab import drive
drive.mount("/content/drive")
sys.path.append("/content/drive/MyDrive/cs329h_project")

# Root folder for the re-split ("cleaned") data.
# NOTE(review): this path is relative while the mount point above is absolute,
# so it only lands on the mounted drive when the working directory is
# /content -- presumably the Colab default; confirm before running elsewhere.
CLEANED_DIR = "drive/MyDrive/cs329h_project/cleaned"

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race of scanning the parent with os.listdir before calling os.mkdir.
os.makedirs(CLEANED_DIR, exist_ok=True)

In [None]:
# Input folders, both inside the shared project directory: the original
# dataset CSVs and the per-model pre-computed log-prob CSVs.
_PROJECT_DIR = "drive/MyDrive/cs329h_project"
DATASET_DIR = f"{_PROJECT_DIR}/datasets"
PRECOMPUTE_DIR = f"{_PROJECT_DIR}/precomputed"

In [None]:
# Datasets to process, as (file prefix, short name) pairs. The prefix names
# the "<prefix>_<split>.csv" files in DATASET_DIR and the output folder under
# CLEANED_DIR; the short name appears in the pre-computed log-prob file names.
DATASETS = dict([
    ("PKU-SafeRLHF-30K-standard", "SafeRLHF"),
    ("helpsteer2-preference-v2", "HelpSteer2"),
    ("ultrafeedback_binarized", "UltraFeedback"),
])

# Model identifiers, exactly as they appear at the start of each pre-computed
# log-prob file name and in the per-model output file names.
MODELS = list((
    "meta-llama_Meta-Llama-3.1-8B-Instruct",
    "microsoft_Phi-3-medium-128k-instruct",
    "01-ai_Yi-1.5-9B-Chat",
    "Qwen_Qwen3-4B-Instruct-2507",
    "mistralai_Mistral-7B-Instruct-v0.3",
    "Qwen_Qwen2.5-1.5B-Instruct",
    "Qwen_Qwen2.5-0.5B-Instruct",
))

# Re-split every dataset, together with each model's pre-computed log-probs,
# into new train/val/test partitions -- once per seed.
#
# For each dataset the original train/val/test CSVs are pooled, a seeded
# random permutation of the first N_TOTAL pooled rows is drawn, and the same
# permutation is applied to the data and to every model's log-probs so the
# two stay row-aligned. Files written per dataset and seed:
#   {CLEANED_DIR}/{dataset}/seed={seed}/data/{train,val,test}.csv
#   {CLEANED_DIR}/{dataset}/seed={seed}/precomputed/{model}_{train,val,test}.csv

# sizes of the new splits; only pooled rows 0..N_TOTAL-1 are ever sampled
N_TRAIN, N_VAL, N_TEST = 5000, 1000, 1000
N_TOTAL = N_TRAIN + N_VAL + N_TEST
N_SEEDS = 5
SPLITS = ["train", "val", "test"]
LOGPROB_COLUMNS = ["logprob_chosen", "L_chosen", "logprob_rejected", "L_rejected"]

# go thru each dataset
for dataset in DATASETS.keys():

  # The pooled inputs do not depend on the seed, so read them once per dataset
  # instead of once per seed.
  # NOTE(review): reset_index() without drop=True keeps the old per-file row
  # number as an "index" column, which is therefore written to the data CSVs.
  # Preserved so the output schema is unchanged; confirm it is wanted.
  og = pd.concat(
      [pd.read_csv(f"{DATASET_DIR}/{dataset}_{split}.csv") for split in SPLITS]
  ).reset_index()
  if len(og) < N_TOTAL:
    raise ValueError(
        f"{dataset}: expected at least {N_TOTAL} pooled rows, found {len(og)}")

  # Pooled log-probs per model, in the same train/val/test order as `og`.
  # NOTE(review): the "_+" in the file name is reproduced from the original
  # code; confirm it matches the files on disk.
  logprobs_by_model = {}
  for model in MODELS:
    og_logprobs = pd.concat(
        [pd.read_csv(f"{PRECOMPUTE_DIR}/{model}_+{DATASETS[dataset]}_{split}_logprobs.csv")
         for split in SPLITS]
    ).reset_index(drop=True)[LOGPROB_COLUMNS]
    # Both frames are indexed with the same permutation below, so a length
    # mismatch would silently pair examples with the wrong log-probs.
    if len(og_logprobs) != len(og):
      raise ValueError(
          f"{dataset}/{model}: {len(og_logprobs)} log-prob rows "
          f"but {len(og)} data rows")
    logprobs_by_model[model] = og_logprobs

  # go thru our seeds
  for seed in range(N_SEEDS):

    # output folders for this dataset + seed (creates missing parents too)
    seed_dir = f"{CLEANED_DIR}/{dataset}/seed={seed}"
    data_dir = f"{seed_dir}/data"
    precomputed_dir = f"{seed_dir}/precomputed"
    os.makedirs(data_dir, exist_ok=True)
    os.makedirs(precomputed_dir, exist_ok=True)

    # Seed the global NumPy RNG so each seed yields a reproducible permutation.
    np.random.seed(seed)
    perm_index = np.random.permutation(N_TOTAL)
    split_rows = {
        "train": perm_index[:N_TRAIN],
        "val": perm_index[N_TRAIN:N_TRAIN + N_VAL],
        "test": perm_index[N_TRAIN + N_VAL:],
    }

    # save the re-split log-probs for every model
    for model, og_logprobs in logprobs_by_model.items():
      for split, rows in split_rows.items():
        og_logprobs.loc[rows].to_csv(
            f"{precomputed_dir}/{model}_{split}.csv", index=False)

    # save the re-split data files
    for split, rows in split_rows.items():
      og.loc[rows].to_csv(f"{data_dir}/{split}.csv", index=False)