In [None]:
#########################################################################
# --- Environment Setup and Key Loading ---

import os, sys
import pandas as pd

from google.colab import userdata, drive
from huggingface_hub import login
from google.colab import runtime

github_token = userdata.get('GITHUB_TOKEN')

# Info regarding our repo
repo_owner = "Erdos-Projects" 
branch_name = "hw/eigenvalues-extraction"
repo_name = "spring-2026-LLM-hallucinations"
repo_url = f"https://{github_token}@github.com/{repo_owner}/{repo_name}.git"

# Clone the github repository into the Colab VM
!rm -rf /content/repo
!git clone -b {branch_name} {repo_url} /content/repo

!pip install -q -r /content/repo/requirements.txt
sys.path.append('/content/repo/spectral-llm_hallucinations-project')
# drive.mount('/content/drive')

# Setup OpenAI and HuggingFace tokens
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY').strip()
login(token=userdata.get('HF_TOKEN'))

from spectral_detection import data_generation
from spectral_detection.data import loaders

print("Initialization was successful!")

In [None]:
#####################################################################################
# --- TruthfulQA / TriviaQA: generate several answers per prompt, then judge them

pipeline = data_generation.Pipeline()
truthfulqa_data = loaders.load_truthfulqa(sample_size=500)
# TriviaQA variant:
# triviaqa_data = loaders.load_triviaqa(sample_size=500)

# Results are expected at
#   /content/drive/MyDrive/spectral_pipeline/truthfulqa_t1.0_n20.jsonl
#   /content/drive/MyDrive/spectral_pipeline/triviaqa_t1.0_n20.jsonl  (TriviaQA variant)
generation_kwargs = dict(
    data_list=truthfulqa_data,    # triviaqa_data
    answers_per_prompt=20,
    dataset_name="truthfulqa",    # triviaqa
    temperature=1.0,
    overwrite=True,
)
pipeline.generate_dataset(**generation_kwargs)
print("Dataset successfully generated")

# --- Judging 20 samples per prompt ---
# TriviaQA variant: "/content/drive/MyDrive/spectral_pipeline/triviaqa_t1.0_n20.jsonl"
target_path = "/content/drive/MyDrive/spectral_pipeline/truthfulqa_t1.0_n20.jsonl"
judge = data_generation.LLMJudge(api_key=os.environ["OPENAI_API_KEY"])
judge.evaluate_file(target_path)

# Read the JSONL file back and show only the columns worth inspecting.
inspect_cols = ['id', 'sample_num', 'question', 'correctness', 'domain', 'correctness_score']
df = pd.read_json(target_path, lines=True)
display(df[inspect_cols])

# Hand the Colab runtime back once the cell has finished.
runtime.unassign()

In [None]:
#####################################################################################
# --- Evaluation of MMLU 20 samples (Semantic entropy) ---

pipeline = data_generation.Pipeline()
mmlu_data = loaders.load_mmlu(sample_size=500)

# Executes and saves to /content/drive/MyDrive/spectral_pipeline/mmlu_t1.0_n20.jsonl
pipeline.generate_dataset(
    data_list=mmlu_data,
    answers_per_prompt=20,
    dataset_name="mmlu",
    temperature=1.0,
    overwrite=True # Forces overwrite
)

print("Dataset successfully generated")


# --- Judging mmlu, 20 samples per prompt ---
judge = data_generation.LLMJudge(api_key=os.environ["OPENAI_API_KEY"])
target_path = "/content/drive/MyDrive/spectral_pipeline/mmlu_t1.0_n20.jsonl"
judge.evaluate_file(target_path)

# Build dataframe
df = pd.read_json(target_path, lines=True)

# Columns for inspection.
# The column list needs its own closing bracket: df[[...]] selects a sub-frame.
# Without it the cell fails to parse and nothing above runs either.
display(df[['id', 'sample_num', 'question', 'correctness', 'domain', 'correctness_score']])

# Hand the Colab runtime back once the cell has finished.
runtime.unassign()

In [None]:
#####################################################################################
# --- HaluEval: generate several answers per prompt, then judge them

pipeline = data_generation.Pipeline()
halueval_data = loaders.load_halueval(sample_size=500)

# Results are expected at /content/drive/MyDrive/spectral_pipeline/halueval_t1.0_n20.jsonl
generation_kwargs = dict(
    data_list=halueval_data,
    answers_per_prompt=20,
    dataset_name="halueval",
    temperature=1.0,
    overwrite=True,
)
pipeline.generate_dataset(**generation_kwargs)
print("Dataset successfully generated")

# --- Judging halueval, 20 samples per prompt ---
target_path = "/content/drive/MyDrive/spectral_pipeline/halueval_t1.0_n20.jsonl"
judge = data_generation.LLMJudge(api_key=os.environ["OPENAI_API_KEY"])
judge.evaluate_file(target_path)

# Read the JSONL file back and show only the columns worth inspecting.
inspect_cols = ['id', 'sample_num', 'question', 'correctness', 'domain', 'correctness_score']
df = pd.read_json(target_path, lines=True)
display(df[inspect_cols])

# Hand the Colab runtime back once the cell has finished.
runtime.unassign()

In [None]:
#####################################################################################
# --- Defan: generate multiple samples per prompt (Semantic entropy)
pipeline = data_generation.Pipeline()

# 1. Read the raw Defan JSONL that ships with the repo (original note: this
#    dataset has dimension 500), then convert it with the Defan loader.
defan_jsonl = "/content/repo/spectral-llm_hallucinations-project/data/raw/sampled_from_defan_500.jsonl"
raw_defan = loaders.load_json_file(defan_jsonl)
defan_data = loaders.load_defan(raw_defan)

# Sampling temperature passed to the generation call below.
TEMPERATURE = 1.0

pipeline.generate_dataset(
    dataset_name="defan",
    data_list=defan_data,
    temperature=TEMPERATURE,
    answers_per_prompt=20,
    overwrite=True,  # Forces overwrite
)

print("Finalized processing Defan with 20 samples per prompt")


In [None]:

# --- Judging Defan ---
target_path = "/content/drive/MyDrive/spectral_pipeline/defan_t1.0_n20.jsonl"
judge = data_generation.LLMJudge(api_key=os.environ["OPENAI_API_KEY"])
judge.evaluate_file(target_path)

# Read the JSONL file back from storage and show only the columns worth
# inspecting (this list also includes 'type').
inspect_cols = ['id', 'sample_num', 'question', 'correctness', 'domain', 'type', 'correctness_score']
df = pd.read_json(target_path, lines=True)
display(df[inspect_cols])

# Hand the Colab runtime back once the cell has finished.
runtime.unassign()