In [None]:
#########################################################################
# --- Environment Setup and Key Loading ---

import os, sys
import pandas as pd

from google.colab import userdata, drive
from huggingface_hub import login

github_token = userdata.get('GITHUB_TOKEN')

# Info regarding our repo
repo_owner = "Erdos-Projects"
branch_name = "SJ/eigenvalues-extraction-1"
repo_name = "spring-2026-LLM-hallucinations"
repo_url = f"https://{github_token}@github.com/{repo_owner}/{repo_name}.git"

# Clone the github repository into the Colab VM
!rm -rf /content/repo
!git clone -b {branch_name} {repo_url} /content/repo

!pip install -q -r /content/repo/requirements.txt
sys.path.append('/content/repo/spectral-llm-hal')
drive.mount('/content/drive')

# Setup OpenAI and HuggingFace tokens
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY').strip()
login(token=userdata.get('HF_TOKEN'))

In [None]:
from spectral_detection import data_generation
from spectral_detection import config
from spectral_detection.data import loaders

In [None]:
# Instantiate the generation pipeline and load the MMLU data
# (this notebook runs on MMLU, not TriviaQA — see `loaders.load_mmlu()`).
pipeline = data_generation.Pipeline()
mmlu_data = loaders.load_mmlu()

In [None]:
# Destination for the results file, under the Drive mount point (/content/drive).
save_pt = "/content/drive/MyDrive/spectral/mmlu_Judgelabels_and_eigs_top10.pt"

# Run the pipeline over the MMLU data and write the result to `save_pt`.
# NOTE(review): the meaning of each argument is defined in
# spectral_detection.data_generation.Pipeline, which is not visible here;
# the inline notes below are inferred from argument names — confirm there.
pipeline.generate_dataset_with_judge_and_eigs(
    answers_per_prompt=1,
    data_list=mmlu_data,
    dataset_name="mmlu",
    temperature=0.1,
    judge_api_key=os.environ["OPENAI_API_KEY"],  # read from the environment, not hardcoded
    save_pt_path=save_pt,
    k_eigenvalues=10,  # presumably ties to "top10" in the file name and the "eig_top10" key read back later — confirm
    overwrite_pt=False,  # presumably keeps an existing .pt instead of regenerating — TODO confirm
    overwrite_jsonl=False,  # presumably the same for an intermediate .jsonl — TODO confirm
)

In [None]:
# Sanity check: report whether the results file exists and, if so, its size.
pt_exists = os.path.exists(save_pt)
size_report = os.path.getsize(save_pt) / 1e6 if pt_exists else "N/A"
print("Exists:", pt_exists)
print("Size (MB):", size_report)

In [None]:
import torch
import pandas as pd

# Load the saved results onto the CPU.
# NOTE: torch.load relies on pickle, so only load files produced by this project.
payload = torch.load(save_pt, map_location="cpu")

# Top-level structure of the payload
print(type(payload))
print(payload.keys())

# Peek at one sample to see which fields each entry carries
samples = payload["data"]
first_id = next(iter(samples))
print("Example ID:", first_id)
print(samples[first_id].keys())


# Flatten every sample into one record: id, label, domain (None when absent),
# followed by one column per eigenvalue (eig_0, eig_1, ...).
rows = [
    {
        "id": sample_id,
        "label": item["label"],
        "domain": item.get("domain"),
        # expand eigenvalues into columns
        **{
            f"eig_{i}": float(v)
            for i, v in enumerate(item["eig_top10"].numpy())
        },
    }
    for sample_id, item in payload["data"].items()
]

df = pd.DataFrame(rows)

In [None]:
# Quick look at the flattened frame: its (rows, columns) shape, then the first records.
print(df.shape)
df.head()

In [None]:
# Relative frequency of each label value (fractions sum to 1)
df["label"].value_counts(normalize=True)

In [None]:
# Pass the saved results file to Colab's download helper to fetch a local copy.
from google.colab import files
files.download(save_pt)