In [6]:
from huggingface_hub import login
# Never commit access tokens to a notebook: read the token from the HF_TOKEN
# environment variable instead. When the variable is unset, None is passed and
# huggingface_hub is left to obtain the token itself (interactive prompt).
# NOTE(review): the token that used to be hard-coded on this line has been
# exposed and should be revoked.
import os
login(os.environ.get("HF_TOKEN"))


In [None]:
# Setup: GPU configuration and imports
import torch
import os
import logging
from time import time

# Prefer CUDA when PyTorch can see a GPU; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# HuggingFace and llama-index imports
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers.cache_utils import DynamicCache
from llama_index.core import VectorStoreIndex, Document, Settings
# NOTE(review): `S` is not referenced anywhere in the visible code and does not
# look like a public llama_index.core export, so an unguarded import would abort
# the whole cell. Keep the name when it exists; otherwise bind it to None.
try:
    from llama_index.core import S
except ImportError:
    S = None

# Register DynamicCache and the builtin set as safe globals for torch serialization.
torch.serialization.add_safe_globals([DynamicCache, set])

# Root logging setup (INFO level, timestamped records) plus this module's logger.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# (Optional) If API keys are needed, set them here or ensure they are in environment variables
# os.environ["OPENAI_API_KEY"] = "your_openai_key"
# os.environ["GOOGLE_API_KEY"] = "your_google_key"
# os.environ["JINA_API_KEY"] = "your_jina_key"

# Helper functions from cag/dataset.py (Dataset loading for Kis, SQuAD, HotpotQA)

import json
import random
import pandas as pd
from typing import Iterator

# Module-level seed consulted by squad(): when it is not None, squad() passes it
# to random.seed() before building its result. None leaves the RNG untouched.
rand_seed = None

def _parse_squad_data(raw):
    """
    Flatten a SQuAD-format payload into article texts and question records.

    Every entry of ``raw["data"]`` contributes one string to ``"ki_text"`` (its
    paragraph contexts joined by single spaces) and one record per question to
    ``"qas"``. A record holds the article index (``"id"``), the article
    ``"title"``, an ``(article_index, paragraph_index)`` pair, the question
    text and the list of answer texts.
    """
    ki_text = []
    qas = []
    for article_idx, entry in enumerate(raw["data"]):
        contexts = []
        for para_idx, paragraph in enumerate(entry["paragraphs"]):
            contexts.append(paragraph["context"])
            qas.extend(
                {
                    "id": article_idx,
                    "title": entry["title"],
                    "paragraph_index": (article_idx, para_idx),
                    "question": qa["question"],
                    "answer": [ans["text"] for ans in qa["answers"]],
                }
                for qa in paragraph["qas"]
            )
        ki_text.append(" ".join(contexts))
    return {"ki_text": ki_text, "qas": qas}

def kis(path: str):
    """
    Read a Knowledge-Item QA CSV and split it into texts and QA pairs.

    The file is read with ``pd.read_csv`` and missing cells are replaced by
    empty strings. Returns ``(texts, qa_iter)``: ``texts`` holds the
    ``knowledge_items`` column row by row, and ``qa_iter`` is an iterator over
    ``(sample_question, sample_ground_truth)`` pairs in the same row order.
    """
    frame = pd.read_csv(path).fillna('')
    knowledge_texts = []
    qa_pairs = []
    for _, record in frame.iterrows():
        knowledge_texts.append(record["knowledge_items"])
        qa_pairs.append((record["sample_question"], record["sample_ground_truth"]))
    return knowledge_texts, iter(qa_pairs)

def squad(filepath: str, max_knowledge: int | None = None, max_paragraph: int | None = None, max_questions: int | None = None):
    """
    Load SQuAD dataset from JSON.

    Args:
        filepath: Path to a SQuAD-format JSON file (read as UTF-8).
        max_knowledge: Keep only the first ``max_knowledge`` articles and only
            the questions that belong to them. ``None`` or ``0`` keeps all.
        max_paragraph: Keep only questions whose paragraph index inside their
            article is below this value. ``None`` or ``0`` keeps all. The
            returned texts are not trimmed by this option.
        max_questions: Keep at most this many questions. ``None`` or ``0``
            keeps all.

    Returns:
        ``(texts, qa_pairs)``: ``texts`` has one string per kept article and
        ``qa_pairs`` is an iterator of ``(question, answers)`` tuples, where
        ``answers`` is the list of reference answer strings for the question.
    """
    with open(filepath, "r", encoding="utf-8") as file:
        raw = json.load(file)
    d = _parse_squad_data(raw)
    if rand_seed is not None:
        random.seed(rand_seed)
    texts = d["ki_text"][:max_knowledge] if max_knowledge else d["ki_text"]

    # Plain lists are filtered instead of a DataFrame so that an empty QA list
    # does not fail on a missing column.
    qas = d["qas"]
    if max_knowledge:
        # Questions about articles that were cut from `texts` cannot be
        # answered from the returned knowledge, so drop them as well.
        kept_articles = len(texts)
        qas = [qa for qa in qas if qa["id"] < kept_articles]
    if max_paragraph:
        qas = [qa for qa in qas if qa["paragraph_index"][1] < max_paragraph]
    if max_questions:
        qas = qas[:max_questions]
    # Extract question-answer pairs
    qa_pairs = zip([qa["question"] for qa in qas], [qa["answer"] for qa in qas])
    return texts, qa_pairs

def hotpotqa(filepath: str, max_knowledge: int | None = None):
    """
    Load HotpotQA dataset from JSON.
    """
    with open(filepath, "r") as file:
        raw = json.load(file)
    questions = []
    answers = []
    for item in raw["data"]:
        # In HotpotQA, support is a list of sentences
        context = [sent for doc in item["context"] for sent in doc[2]]
        questions.append(item["question"])
        answers.append(item["answer"])
    df = pd.DataFrame({"question": questions, "answer": answers})
    texts = context[:max_knowledge] if max_knowledge else context
    qa_pairs = zip(df["question"], df["answer"])
    return texts, qa_pairs

def get(dataset: str, max_knowledge: int | None = None, max_paragraph: int | None = None, max_questions: int | None = None):
    """
    Route to the appropriate dataset loader.
    """
    match dataset:
        case "kis_sample":
            path = "./datasets/rag_sample_qas_from_kis.csv"
            return kis(path)
        case "kis":
            path = "./datasets/synthetic_knowledge_items.csv"
            return kis(path)
        case "squad-dev":
            path = "./datasets/squad/dev-v1.1.json"
            return squad(path, max_knowledge, max_paragraph, max_questions)
        case "squad-train":
            path = "./datasets/squad/train-v1.1.json"
            return squad(path, max_knowledge, max_paragraph, max_questions)
        case "hotpotqa-dev":
            path = "./datasets/hotpotqa/hotpot_dev_fullwiki_v1.json"
            return hotpotqa(path, max_knowledge)
        case "hotpotqa-test":
            path = "./datasets/hotpotqa/hotpot_test_fullwiki_v1.json"
            return hotpotqa(path, max_knowledge)
        case "hotpotqa-train":
            path = "./datasets/hotpotqa/hotpot_train_v1.1.json"
            return hotpotqa(path, max_knowledge)
        case _:
            return [], zip([], [])

# Hook the dataset functions into a namespace
import types
# Expose the dataset router as cagds.get(...), which is how rag_test() calls it.
cagds = types.SimpleNamespace(get=get)

# Similarity function from cag/similarity.py (cosine similarity of sentence-transformers embeddings)
from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model used by bert(); constructed once, when this cell runs.
bert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def bert(response, ground_truth):
    """
    Score a generated answer against its reference(s) by embedding similarity.

    Both inputs are embedded with ``bert_model`` and compared with cosine
    similarity.

    Args:
        response: The generated answer text.
        ground_truth: One reference string, or a sequence of reference strings
            (as produced by the SQuAD loader).

    Returns:
        The cosine similarity as a float. With several references the best
        (maximum) similarity is returned; an empty sequence scores ``0.0``.
    """
    # An empty reference list has nothing to compare against.
    if not isinstance(ground_truth, str) and len(ground_truth) == 0:
        return 0.0
    query_embedding = bert_model.encode(response, convert_to_tensor=True)
    text_embedding = bert_model.encode(ground_truth, convert_to_tensor=True)
    cosine_score = util.pytorch_cos_sim(query_embedding, text_embedding)
    # A list of references produces one score per reference; .item() alone only
    # works on a single value, so reduce with max() first.
    return cosine_score.max().item()

# Expose the similarity helper as cagsim.bert(...), which is how rag_test() calls it.
cagsim = types.SimpleNamespace(bert=bert)

# RAG: Retriever setup functions

def getOpenAIRetriever(documents: list[Document], similarity_top_k: int = 1):
    """
    Build a llama_index retriever backed by OpenAI embeddings.

    Raises ``ValueError`` when ``OPENAI_API_KEY`` is missing from the
    environment. Installs an ``OpenAIEmbedding`` as ``Settings.embed_model``,
    indexes ``documents`` with ``VectorStoreIndex`` and returns
    ``(retriever, seconds)``, where ``seconds`` is the wall-clock time spent
    building the index and the retriever.
    """
    import openai

    api_key = os.environ.get("OPENAI_API_KEY")
    if api_key is None:
        raise ValueError("OPENAI_API_KEY not found")
    openai.api_key = api_key

    from llama_index.embeddings.openai import OpenAIEmbedding
    Settings.embed_model = OpenAIEmbedding(
        model_name="text-embedding-3-small",
        api_key=api_key,
        title="openai-embedding",
    )

    started = time()
    retriever = VectorStoreIndex.from_documents(documents).as_retriever(
        similarity_top_k=similarity_top_k
    )
    elapsed = time() - started
    logger.info(f"OpenAI retriever prepared in {elapsed:.2f} seconds.")
    return retriever, elapsed

def getGeminiRetriever(documents: list[Document], similarity_top_k: int = 1):
    """
    Build a llama_index retriever backed by Gemini embeddings.

    Raises ``ValueError`` when ``GOOGLE_API_KEY`` is missing from the
    environment. Installs a ``GeminiEmbedding`` as ``Settings.embed_model``,
    indexes ``documents`` with ``VectorStoreIndex`` and returns
    ``(retriever, seconds)``, where ``seconds`` is the wall-clock time spent
    building the index and the retriever.
    """
    if os.environ.get("GOOGLE_API_KEY") is None:
        raise ValueError("GOOGLE_API_KEY not found")

    from llama_index.embeddings.gemini import GeminiEmbedding
    # NOTE(review): the API key is handed over as `project_id`, and the model
    # name / `model_type` are not obviously valid GeminiEmbedding options --
    # confirm against the installed llama_index Gemini embedding package.
    Settings.embed_model = GeminiEmbedding(
        model_name="gemini-embed-2-small",
        model_type="embed",
        project_id=os.getenv("GOOGLE_API_KEY"),
    )

    started = time()
    retriever = VectorStoreIndex.from_documents(documents).as_retriever(
        similarity_top_k=similarity_top_k
    )
    elapsed = time() - started
    logger.info(f"Gemini retriever prepared in {elapsed:.2f} seconds.")
    return retriever, elapsed

def getBM25Retriever(documents: list[Document], similarity_top_k: int = 1):
    """
    Create a BM25 retriever over ``documents`` using rank_bm25.

    Args:
        documents: The corpus. ``Document`` objects (what ``rag_test`` passes)
            are indexed by their ``.text``; plain strings are accepted too.
        similarity_top_k: Number of best-scoring documents returned per query.

    Returns:
        ``(retriever, seconds)``. ``retriever.retrieve(query)`` returns a list
        of ``Document`` objects for the top-scoring texts, best first, and
        ``seconds`` is the wall-clock time spent building the BM25 index.

    Raises:
        ValueError: If ``documents`` is empty.
    """
    from nltk.tokenize import word_tokenize
    from rank_bm25 import BM25Okapi

    if not documents:
        raise ValueError("BM25 retriever needs at least one document")

    t1 = time()
    # Tokenize the raw text; the Document wrapper itself cannot be tokenized.
    texts = [doc if isinstance(doc, str) else doc.text for doc in documents]
    # BM25Okapi takes the tokenized corpus; it has no language/stemmer option.
    bm25_retriever = BM25Okapi([word_tokenize(text) for text in texts])
    t2 = time()
    logger.info(f"BM25 retriever prepared in {t2 - t1:.2f} seconds.")

    # Create a retriever function
    def bm25_query(query: str):
        scores = bm25_retriever.get_scores(word_tokenize(query))
        topk = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:similarity_top_k]
        return [texts[i] for i in topk]

    # Wrap in an object mimicking llama_index retriever interface
    class SimpleRetriever:
        def __init__(self, func):
            self.get_top_k = func

        def retrieve(self, query):
            # Callers read `.text` on each result, so wrap the plain strings.
            return [Document(text=text) for text in self.get_top_k(query)]

    bm25 = SimpleRetriever(bm25_query)
    return bm25, t2 - t1

def getJinaRetriever(documents: list[Document], similarity_top_k: int = 1):
    """
    Create a Jina AI retriever (requires JINA_API_KEY).

    Installs a ``JinaEmbedding`` as ``Settings.embed_model``, indexes
    ``documents`` with ``VectorStoreIndex`` and returns
    ``(retriever, seconds)``, where ``seconds`` is the wall-clock time spent
    building the index and the retriever.

    Raises:
        ValueError: If ``JINA_API_KEY`` is missing from the environment.
        ImportError: If ``llama_index.embeddings.jinaai`` cannot be imported.
    """
    if "JINA_API_KEY" not in os.environ:
        raise ValueError("JINA_API_KEY not found")
    # Guard only the import, so an ImportError raised later while embedding or
    # indexing is not mislabelled as a missing Jina package.
    try:
        from llama_index.embeddings.jinaai import JinaEmbedding
    except ImportError as err:
        raise ImportError("Jina retriever requires 'llama_index.embeddings.jinaai' module") from err

    Settings.embed_model = JinaEmbedding(api_key=os.getenv("JINA_API_KEY"), model="jina-embeddings-v3", task="retrieval.passage")
    t1 = time()
    index = VectorStoreIndex.from_documents(documents)
    retriever = index.as_retriever(similarity_top_k=similarity_top_k)
    t2 = time()
    logger.info(f"Jina retriever prepared in {t2 - t1:.2f} seconds.")
    return retriever, t2 - t1

# RAG: Main testing function

def rag_test(args):
    """
    Run the RAG evaluation loop given the arguments.

    For each question the selected retriever fetches context, the module-level
    ``model``/``tokenizer`` generate an answer on ``device``, and the answer is
    scored against the reference(s) with ``cagsim.bert``. Per-question scores
    and a final summary are printed and appended to ``args.output``.

    Args:
        args: Object with the attributes ``dataset``, ``maxKnowledge``,
            ``maxParagraph``, ``maxQuestion``, ``index`` (one of "gemini",
            "openai", "bm25", "jina"), ``topk`` and ``output``.

    Raises:
        ValueError: If ``args.index`` does not name a known retriever.
    """
    answer_instruction = ("Answer the question with a super short answer.")
    # Load dataset
    text_list, dataset_iter = cagds.get(args.dataset, max_knowledge=args.maxKnowledge,
                                        max_paragraph=args.maxParagraph, max_questions=args.maxQuestion)
    documents = [Document(text=t) for t in text_list]

    # Prepare retriever based on choice
    retriever_factories = {
        "gemini": getGeminiRetriever,
        "openai": getOpenAIRetriever,
        "bm25": getBM25Retriever,
        "jina": getJinaRetriever,
    }
    factory = retriever_factories.get(args.index)
    if factory is None:
        raise ValueError("No retriever selected. Use args.index to choose one.")
    retriever, prepare_time = factory(documents, similarity_top_k=args.topk)
    # Logged for every retriever kind.
    logger.info(f"Testing {args.index.upper()} retriever with {len(documents)} documents.")

    print(f"Retriever {args.index.upper()} prepared in {prepare_time} seconds")
    with open(args.output, "a") as f:
        f.write(f"Retriever {args.index.upper()} prepared in {prepare_time} seconds\n")

    results = {"retrieve_time": [], "generate_time": [], "similarity": [], "prompts": [], "responses": []}
    dataset = list(dataset_iter)  # convert iterator to list
    if args.maxQuestion is not None:
        dataset = dataset[:args.maxQuestion]

    # CUDA cache maintenance is skipped on hosts without a usable GPU.
    cuda_available = torch.cuda.is_available()
    for idx, (question, ground_truth) in enumerate(dataset):
        if cuda_available:
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
        # Formulate prompt with retrieved knowledge; retrieval is timed on its
        # own so it is not reported as a copy of the generation time.
        t0 = time()
        knowledge_docs = retriever.retrieve(question)
        retrieve_time = time() - t0
        prompt = f"""
        <|begin_of_text|>
        <|start_header_id|>system<|end_header_id|>
        You are an assistant for giving short answers based on given context.<|eot_id|>
        <|start_header_id|>user<|end_header_id|>
        Context information is below.
        ------------------------------------------------
        {''.join([doc.text for doc in knowledge_docs])}
        ------------------------------------------------
        {answer_instruction}
        Question:
        {question}
        <|eot_id|>
        <|start_header_id|>assistant<|end_header_id|>
        """
        t1 = time()
        # Tokenize and generate
        input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
        output = model.generate(
            input_ids,
            max_new_tokens=300,
            do_sample=False
        )
        # Decode only the tokens produced after the prompt. Searching the full
        # decoded text for the question / "assistant" marker mis-slices the
        # answer whenever either string is missing or repeated.
        new_tokens = output[0][input_ids.shape[-1]:]
        generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
        t2 = time()

        # Evaluate
        print("Q:", question)
        print("A:", generated_text)
        # A question may carry several reference answers; score each one
        # separately and keep the best match.
        references = [ground_truth] if isinstance(ground_truth, str) else list(ground_truth)
        similarity = max((cagsim.bert(generated_text, ref) for ref in references), default=0.0)
        print(f"[{idx}]: Semantic Similarity: {round(similarity, 5)}")
        with open(args.output, "a") as f:
            f.write(f"[{idx}]: Semantic Similarity: {round(similarity, 5)}\n")

        results["prompts"].append(question)
        results["responses"].append(generated_text)
        results["retrieve_time"].append(retrieve_time)
        results["generate_time"].append(t2 - t1)
        results["similarity"].append(similarity)

    # Summary statistics
    if not results["similarity"]:
        # No questions were processed, so there is nothing to average.
        logger.warning("No questions were evaluated; skipping summary statistics.")
        return
    avg_similarity = sum(results["similarity"]) / len(results["similarity"])
    avg_retrieve = sum(results["retrieve_time"]) / len(results["retrieve_time"])
    avg_generate = sum(results["generate_time"]) / len(results["generate_time"])
    print(f"\nAverage Semantic Similarity: {avg_similarity}")
    print(f"Average retrieve time: {avg_retrieve}, generate time: {avg_generate}")
    with open(args.output, "a") as f:
        f.write("\n")
        f.write(f"Result for {args.output}\n")
        f.write(f"Average Semantic Similarity: {avg_similarity}\n")
        f.write(f"retrieve time: {avg_retrieve}, generate time: {avg_generate}\n")

# Quantization configuration from rag.py
# Shared bitsandbytes settings consumed by load_quantized_model().
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # load weights in 4-bit form
    bnb_4bit_quant_type="nf4",             # 4-bit quantization type: "nf4"
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for computation
    bnb_4bit_use_double_quant=True         # double quantization enabled
)

def load_quantized_model(model_name, hf_token=None):
    """
    Load a tokenizer and a causal LM quantized with the shared ``bnb_config``.

    ``hf_token`` is forwarded as ``token`` to both ``from_pretrained`` calls.
    Returns ``(tokenizer, model)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    model_kwargs = {
        "torch_dtype": torch.float16,
        "quantization_config": bnb_config,
        "device_map": "auto",
        "trust_remote_code": True,
        "token": hf_token,
    }
    model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)
    return tokenizer, model

# --- Configuration (replaces CLI arguments) ---
class Args:
    """Bare attribute holder that stands in for parsed command-line arguments."""


args = Args()
vars(args).update(
    index="openai",            # retriever to use ("openai", "gemini", "bm25", "jina")
    dataset="squad-dev",       # dataset identifier ("squad-dev", "squad-train", "hotpotqa-dev", etc.)
    topk=3,                    # number of passages to retrieve
    maxKnowledge=None,         # max number of knowledge docs to use
    maxParagraph=None,         # max number of paragraphs (for Squad)
    maxQuestion=5,             # max number of questions to process
    randomSeed=None,           # random seed (unused in this example)
    modelname="meta-llama/Llama-3.2-1B-Instruct",
    quantized=False,
    output="./results/rag_results.txt",
)

# Load model (quantized or full precision)
if args.quantized:
    # The quantized model is loaded with device_map="auto" and is left where
    # that placed it: moving a bitsandbytes-quantized model with .to() may be
    # rejected by transformers.
    tokenizer, model = load_quantized_model(model_name=args.modelname, hf_token=None)
else:
    tokenizer = AutoTokenizer.from_pretrained(args.modelname, token=None)
    model = AutoModelForCausalLM.from_pretrained(
        args.modelname,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
        token=None
    )
    # Full-precision weights can be moved to the device rag_test() sends inputs to.
    model.to(device)

# Ensure output directory exists. A bare file name has an empty directory part,
# and os.makedirs("") raises, so only create the directory when there is one.
output_dir = os.path.dirname(args.output)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Run the RAG evaluation
rag_test(args)


In [3]:
pip install -U datasets fsspec

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are in

In [7]:
# Install required packages
!pip install -U transformers datasets sentence-transformers evaluate faiss-cpu --quiet

from datasets import load_dataset
import numpy as np
import faiss
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, util
from evaluate import load

# Load SQuAD dataset
# NOTE(review): this rebinds the module-level name `squad`; if this cell shares
# a namespace with the squad() loader defined earlier in this file, that
# function is shadowed from here on -- confirm this is intended.
squad = load_dataset("squad", cache_dir="/tmp/hf")
# Shuffle the validation split with a fixed seed.
test_data = squad['validation'].shuffle(seed=42)

# Create grouped context blocks from dicts
def create_context_blocks(data, block_size, max_examples=500):
    """
    Group consecutive examples into blocks that share one concatenated context.

    Examples are taken ``block_size`` at a time; a trailing group with fewer
    than ``block_size`` examples is dropped. Every example in a block is paired
    with the same context string: the block's ``context`` fields joined by
    single spaces.

    Args:
        data: A HuggingFace dataset (anything exposing ``to_list()``) or a
            plain sequence of dicts with the keys ``context``, ``question``
            and ``answers`` (a mapping whose ``text`` list is non-empty).
        block_size: Number of examples per block.
        max_examples: Upper bound on the number of examples returned.

    Returns:
        ``(blocks, questions, answers)``: three equally long lists, aligned by
        index, where ``answers`` holds the first answer text of each example.
    """
    # HuggingFace datasets must be converted to a list of dicts; plain
    # sequences are copied as-is.
    records = data.to_list() if hasattr(data, "to_list") else list(data)
    blocks, questions, answers = [], [], []
    # The range stops early enough that only full blocks are visited.
    for start in range(0, len(records) - block_size + 1, block_size):
        if len(questions) >= max_examples:
            break
        block = records[start:start + block_size]
        docs = " ".join(entry['context'] for entry in block)
        for entry in block:
            questions.append(entry['question'])
            answers.append(entry['answers']['text'][0])
            blocks.append(docs)
    # The last block may overshoot the cap, so trim all three lists together.
    return blocks[:max_examples], questions[:max_examples], answers[:max_examples]

# Build three evaluation sets from the same shuffled data, grouping 3, 4 and 7
# examples per shared context block respectively.
contexts_small, questions_small, answers_small = create_context_blocks(test_data, 3)
contexts_medium, questions_medium, answers_medium = create_context_blocks(test_data, 4)
contexts_large, questions_large, answers_large = create_context_blocks(test_data, 7)

# Load model and embedding pipeline
# NOTE(review): `tokenizer` and `model` are rebound here; if this cell shares a
# namespace with rag_test() defined earlier in this file, later rag_test() calls
# would pick up this model instead -- confirm this is intended.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"  # or any compatible HF causal model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
# Text-generation pipeline used by run_cag()/run_rag(); generation is capped at 128 new tokens.
llm = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=128)
# Sentence embedder used for retrieval and for the hybrid similarity check.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
# BERTScore metric used by evaluate_model().
bertscore = load("bertscore")

# RAG retrieval
import functools


@functools.lru_cache(maxsize=8)
def _build_context_index(unique_contexts):
    """Embed a tuple of context strings once and return a FAISS L2 index over them."""
    embeddings = np.asarray(
        embedder.encode(list(unique_contexts), convert_to_tensor=False),
        dtype="float32",
    )
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index


def retrieve_top_k(query, k, block_contexts):
    """
    Return the ``k`` distinct contexts closest to ``query`` in embedding space.

    Args:
        query: The question text.
        k: Number of contexts wanted; capped at the number of distinct contexts.
        block_contexts: Candidate context strings. Repeated strings are
            collapsed so one context cannot fill several result slots.

    Returns:
        A list of context strings, nearest first. Empty when there are no
        contexts or ``k`` is not positive.
    """
    # dict.fromkeys removes duplicates while keeping first-seen order.
    unique_contexts = tuple(dict.fromkeys(block_contexts))
    if not unique_contexts or k <= 0:
        return []
    # The index is cached per distinct context set, so the corpus is embedded
    # once rather than on every query.
    index = _build_context_index(unique_contexts)
    q_emb = np.asarray(embedder.encode([query]), dtype="float32")
    # Capping k keeps the search from asking for more hits than contexts exist.
    _, neighbours = index.search(q_emb, min(k, len(unique_contexts)))
    return [unique_contexts[i] for i in neighbours[0]]

# Generation strategies
def _generate_answer(prompt):
    """Run the LLM on ``prompt`` and return only the first generated answer."""
    # return_full_text=False drops the echoed prompt, so the result does not
    # depend on where "Answer:" happens to occur in the text.
    completion = llm(prompt, return_full_text=False)[0]['generated_text']
    # The model may keep writing further "Question:/Answer:" turns; keep only
    # the text before the first such continuation.
    return completion.split("\nQuestion:")[0].strip()


def run_cag(query, context):
    """Answer ``query`` with the full ``context`` placed in the prompt."""
    prompt = f"{context}\n\nQuestion: {query}\nAnswer:"
    return _generate_answer(prompt)


def run_rag(query, block_contexts, k=3):
    """Answer ``query`` using the ``k`` contexts retrieved from ``block_contexts``."""
    top_k_context = " ".join(retrieve_top_k(query, k, block_contexts))
    prompt = f"{top_k_context}\n\nQuestion: {query}\nAnswer:"
    return _generate_answer(prompt)

def hybrid_generate(query, gold_answer, context, block_contexts):
    """
    Answer with run_cag first and fall back to run_rag on a poor match.

    The run_cag answer is embedded together with ``gold_answer``; when their
    cosine similarity is at least 0.4 the run_cag answer is returned, otherwise
    the run_rag answer over ``block_contexts`` is returned.

    NOTE(review): the fallback decision reads the gold answer, which would not
    be available outside an evaluation setting -- confirm this is intended.
    """
    cag_answer = run_cag(query, context)
    answer_emb = embedder.encode([cag_answer])
    gold_emb = embedder.encode([gold_answer])
    sim = util.cos_sim(answer_emb, gold_emb)[0][0]
    if sim >= 0.4:
        return cag_answer
    return run_rag(query, block_contexts)

# Evaluation using BERTScore
def evaluate_model(model_func, model_name, contexts, questions, answers, **kwargs):
    """
    Generate a prediction per question and print mean BERTScore statistics.

    ``model_name`` selects the call shape used for ``model_func``: "Hybrid"
    receives the question, its gold answer, its context and
    ``kwargs['block_contexts']``; "RAG" receives the question and
    ``kwargs['block_contexts']``; any other name receives the question and its
    context. ``kwargs['size']`` is only used in the printed heading.
    """
    preds = []
    for i, q in enumerate(questions):
        if model_name == "RAG":
            pred = model_func(q, kwargs['block_contexts'])
        elif model_name == "Hybrid":
            pred = model_func(q, answers[i], contexts[i], kwargs['block_contexts'])
        else:  # CAG
            pred = model_func(q, contexts[i])
        preds.append(pred)

    scores = bertscore.compute(predictions=preds, references=answers, lang="en")
    mean_precision = np.mean(scores['precision'])
    mean_recall = np.mean(scores['recall'])
    mean_f1 = np.mean(scores['f1'])
    print(f"\n🔍 {model_name} BERTScore on {kwargs['size']} context:")
    print(f"Precision: {mean_precision:.4f}")
    print(f"Recall:    {mean_recall:.4f}")
    print(f"F1:        {mean_f1:.4f}")

# Evaluate CAG, RAG and Hybrid on each context size in turn.
for size, C, Q, A in zip(
    ("Small", "Medium", "Large"),
    (contexts_small, contexts_medium, contexts_large),
    (questions_small, questions_medium, questions_large),
    (answers_small, answers_medium, answers_large),
):
    print(f"\n=== EVALUATING CONTEXT SIZE: {size} ===")
    evaluate_model(run_cag, "CAG", C, Q, A, size=size)
    evaluate_model(run_rag, "RAG", C, Q, A, block_contexts=C, size=size)
    evaluate_model(hybrid_generate, "Hybrid", C, Q, A, block_contexts=C, size=size)


tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Device set to use cpu


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

ImportError: To be able to use evaluate-metric/bertscore, you need to install the following dependencies['bert_score'] using 'pip install bert_score' for instance'