<a href="https://colab.research.google.com/github/AdamRolander/RAG-Experiments/blob/main/RapidFire_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Load Data

In [None]:
# Clone the RAG-Experiments repository into the current working directory
# (creates a local 'RAG-Experiments' folder).
!git clone https://github.com/AdamRolander/RAG-Experiments.git

Cloning into 'RAG-Experiments'...


In [None]:
# Install Unsloth (with its 'colab-new' extra) straight from the GitHub repository; -q keeps pip quiet.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" -q
# Install the companion packages with --no-deps, i.e. without letting pip resolve
# or install their own dependencies.
!pip install --no-deps xformers trl peft accelerate bitsandbytes -q

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.8/506.8 kB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.6/289.6 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.6/180.6 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m100.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m25.4 MB/s[0m eta [36m0

In [None]:
import json
import random
import torch
import pandas as pd
from collections import defaultdict
from datasets import load_dataset
from unsloth import FastLanguageModel

# --- CONFIGURATION ---
# Category prefixes the corpus is balanced across (matched against the part of a
# category that precedes the '.', e.g. 'cs' for 'cs.LG').
TARGET_DOMAINS = [
    'cs', 'math', 'stat', 'q-bio', 'q-fin',
    'econ', 'eess', 'astro-ph', 'cond-mat', 'quant-ph',
]

# Overall corpus size budget and the number of documents used to build the eval set.
TOTAL_TARGET = 50_000
EVAL_SAMPLE_SIZE = 500

# Equal share of the budget for each domain (integer division, remainder dropped).
PER_DOMAIN_LIMIT = TOTAL_TARGET // len(TARGET_DOMAINS)

# --- STEP 1: BALANCED CORPUS CREATION ---
# Streams the arXiv abstracts dataset and keeps at most PER_DOMAIN_LIMIT documents for
# each domain in TARGET_DOMAINS. Every kept document is appended to `working_docs`
# and written to working_corpus.jsonl as one JSON object per line.
print(f"1. Streaming and sampling a balanced corpus ({PER_DOMAIN_LIMIT} per domain)...")
dataset = load_dataset("gfissore/arxiv-abstracts-2021", split="train", streaming=True)
counts = defaultdict(int)
working_docs = []

# The per-domain cap bounds the corpus at PER_DOMAIN_LIMIT * len(TARGET_DOMAINS).
# Stop at that size even when it is below TOTAL_TARGET (totals that do not divide
# evenly), instead of streaming the remainder of the dataset for nothing.
corpus_capacity = min(TOTAL_TARGET, PER_DOMAIN_LIMIT * len(TARGET_DOMAINS))

with open("working_corpus.jsonl", "w") as f:
    for entry in dataset:
        if len(working_docs) >= corpus_capacity:
            break

        # Categories come as a list holding space-separated strings
        # (e.g. ['math.CO cs.CG']); a bare string is tolerated as well.
        raw_categories = entry["categories"] or []
        if isinstance(raw_categories, str):
            raw_categories = [raw_categories]
        cats_list = [cat for item in raw_categories for cat in str(item).split()]
        if not cats_list:
            continue  # no category information -> cannot assign a domain

        # Primary archive prefix: 'cs' from 'cs.LG', 'astro-ph' from 'astro-ph.CO'.
        # Hyphens belong to the archive name ('q-bio', 'cond-mat', ...), so only the
        # '.subcategory' suffix is stripped; splitting on '-' as well would make the
        # hyphenated TARGET_DOMAINS impossible to match.
        primary_cat = cats_list[0].split('.')[0]

        if primary_cat in TARGET_DOMAINS and counts[primary_cat] < PER_DOMAIN_LIMIT:
            doc = {
                "id": str(entry["id"]),
                "title": entry["title"],
                "abstract": entry["abstract"],
                "categories": entry["categories"]
            }
            working_docs.append(doc)
            counts[primary_cat] += 1
            f.write(json.dumps(doc) + "\n")

print("Distribution saved:", dict(counts))

# --- STEP 2: LOAD MODEL & TOKENIZER ---
print("\n2. Loading local LLM for synthetic query generation...")

# Checkpoint name and loading options, collected in one place for readability.
llm_load_options = dict(
    model_name="unsloth/meta-llama-3.1-8b-instruct-bnb-4bit",
    max_seq_length=2048,
    load_in_4bit=True,
)
model, tokenizer = FastLanguageModel.from_pretrained(**llm_load_options)

# Hand the loaded model to Unsloth's for_inference helper before generating.
FastLanguageModel.for_inference(model)

# --- STEP 3: GENERATE GOLDEN EVAL SET ---
# Samples documents from `working_docs`, asks the LLM for one research question per
# abstract, and stores (query, ground_truth_id) pairs in rag_eval_set.json.
print(f"\n3. Generating {EVAL_SAMPLE_SIZE} high-quality queries...")

# Seed both the document sampling and the token sampling so the eval set can be
# regenerated on a fresh kernel.
EVAL_SEED = 42
random.seed(EVAL_SEED)
torch.manual_seed(EVAL_SEED)

# random.sample raises ValueError when asked for more items than exist; cap the
# request at the corpus size so a small corpus still yields an eval set.
eval_docs = random.sample(working_docs, min(EVAL_SAMPLE_SIZE, len(working_docs)))
eval_set = []

# Improved prompt to ensure single-sentence, concise questions
prompt_style = (
    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
    "You are a helpful research assistant. Output ONLY the question text.<|eot_id|>"
    "<|start_header_id|>user<|end_header_id|>\n\n"
    "Abstract: {text}\n\n"
    "Task: Write one single, concise research question that is answered by this abstract. "
    "Do not include any conversational filler.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
)

skipped_empty = 0
for doc in eval_docs:
    # The template already starts with <|begin_of_text|>, so the tokenizer must not
    # prepend a second BOS token.
    inputs = tokenizer(
        [prompt_style.format(text=doc["abstract"])],
        return_tensors="pt",
        add_special_tokens=False,
    ).to("cuda")
    prompt_len = inputs["input_ids"].shape[1]

    # do_sample=True makes the temperature setting explicit rather than dependent
    # on the checkpoint's default generation config.
    outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, temperature=0.7)

    # Decode only the newly generated tokens. Splitting the full decoded text on the
    # word "assistant" would truncate any question that itself contains that word.
    completion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
    clean_query = completion.split('\n')[0].strip() # Take only the first line to avoid LLM "chatter"

    if not clean_query:
        # An empty query cannot be used for retrieval evaluation; leave it out.
        skipped_empty += 1
        continue

    eval_set.append({
        "query": clean_query,
        "ground_truth_id": doc["id"]
    })

if skipped_empty:
    print(f"Skipped {skipped_empty} documents whose generated query was empty.")

with open("rag_eval_set.json", "w") as f:
    json.dump(eval_set, f, indent=4)

print("\nSuccess! Files 'working_corpus.jsonl' and 'rag_eval_set.json' are ready.")

1. Streaming and sampling a balanced corpus (1000 per domain)...
Distribution saved: {'math': 1000, 'cs': 1000, 'stat': 1000, 'eess': 1000, 'econ': 1000}

2. Loading local LLM for synthetic query generation...
==((====))==  Unsloth 2025.12.8: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: af7c48ad-1b5b-4478-b895-a04e81bbb4c9)')' thrown while requesting HEAD https://huggingface.co/unslothai/colab/resolve/20f9daee9da18936efa03ad4e1361884c60cca0c/model.safetensors
Retrying in 1s [Retry 1/5].



3. Generating 500 high-quality queries...

Success! Files 'working_corpus.jsonl' and 'rag_eval_set.json' are ready.


In [None]:
# Sanity-check the two artifacts written by the corpus / eval-set generation cell.
# The id columns are pinned to str: read_json's default dtype inference can turn
# numeric-looking arXiv ids such as "0704.0001" into floats (dropping the leading
# zero), and if only one of the two files were coerced the membership test below
# would report false mismatches.
corpus_df = pd.read_json("working_corpus.jsonl", lines=True, dtype={"id": str})
eval_df = pd.read_json("rag_eval_set.json", dtype={"ground_truth_id": str})

print(f"Corpus Categories found: {corpus_df['categories'].iloc[0]}")

# Every ground-truth id in the eval set must refer to a document in the corpus.
missing = eval_df[~eval_df['ground_truth_id'].astype(str).isin(corpus_df['id'].astype(str))]
print(f"Missing IDs count: {len(missing)}")

# Sample clean query check
print(f"Sample Query: {eval_df['query'].iloc[1]}")

Corpus Categories found: ['math.CO cs.CG']
Missing IDs count: 0
Sample Query: What is the vulnerability of the fuzzy vault approach when implemented with fingerprint data?


In [None]:
import pandas as pd
from collections import Counter

# 1. Load the corpus
# Re-read the JSONL corpus from disk (lines=True: one JSON document per line) so this
# cell does not rely on a corpus_df left over from an earlier cell.
corpus_df = pd.read_json("working_corpus.jsonl", lines=True)

# 2. Extract and flatten categories
# arXiv categories are space-separated strings (e.g., "cs.LG cs.AI"); in this corpus they are stored inside a list (e.g., ['math.CO cs.CG'])
def flatten_categories(cat_entry):
    """Split a category entry into its individual category tokens.

    Args:
        cat_entry: Either a whitespace-separated category string, or a list of
            such strings.

    Returns:
        A flat list with one element per category, in their original order.
    """
    # Treat a lone string as a one-element list so both shapes share one code path.
    chunks = cat_entry if isinstance(cat_entry, list) else [cat_entry]
    tokens = []
    for chunk in chunks:
        tokens.extend(chunk.split())
    return tokens

# Collect every category token of every document into one flat list.
all_cats = []
for entry in corpus_df['categories']:
    all_cats += flatten_categories(entry)

# 3. Calculate and Print Top 5
# Count once and reuse the tally for both the ranking and the unique-category total.
cat_counts = Counter(all_cats)
top_5 = cat_counts.most_common(5)

rule = "=" * 35
header = f"{'Category':<15} | {'Count':<10}"

print(rule)
print(header)
print("-" * 35)
for cat, count in top_5:
    print(f"{cat:<15} | {count:<10}")
print(rule)
print(f"Total Unique Categories: {len(cat_counts)}")

Category        | Count     
-----------------------------------
eess.SP         | 761       
stat.ME         | 633       
econ.EM         | 555       
stat.AP         | 484       
cs.IT           | 398       
Total Unique Categories: 132
