# Libraries needed

This notebook was run in the Google Colab environment on a standard (CPU-based) runtime.

In [None]:
!pip install --quiet transformers==4.41.2 sentence-transformers==3.0.1 faiss-cpu==1.8.0

In [None]:
!pip install pymupdf -q

In [None]:
!pip install langchain-huggingface langchain-community



In [None]:
!pip install faiss-cpu --quiet

In [None]:
!pip uninstall -y peft

Found existing installation: peft 0.18.1
Uninstalling peft-0.18.1:
  Successfully uninstalled peft-0.18.1


In [None]:
# restart runtime
# Sends signal 9 (SIGKILL) to the current kernel process. The notebook relies on the
# runtime coming back up afterwards so that the package changes made above take effect.
import os
os.kill(os.getpid(), 9)

In [None]:
import warnings
import logging
# Ignore the two warning categories that flood the output
for _warning_category in (UserWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_warning_category)

# Only let errors through from the model libraries
for _logger_name in ("sentence_transformers", "transformers"):
    logging.getLogger(_logger_name).setLevel(logging.ERROR)

print("Ready – transformers should now import cleanly.")

Ready – transformers should now import cleanly.


In [None]:
import os
import re
import warnings
import logging
import numpy as np
import torch
from tqdm.auto import tqdm

# Quiet the noisiest warning sources (transformers and sentence-transformers in particular)
for _warning_category in (UserWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_warning_category)

for _logger_name in ("sentence_transformers", "transformers", "chromadb"):
    logging.getLogger(_logger_name).setLevel(logging.ERROR)

# Report the environment in a single call; the device line is a fixed string
print(
    f"PyTorch version: {torch.__version__}",
    f"Using device: cpu",
    "Environment looks ready.",
    sep="\n",
)

PyTorch version: 2.9.0+cpu
Using device: cpu
Environment looks ready.


# Data PreProcessing
Includes text extraction, cleaning, and chunking.

In [None]:
import fitz  # PyMuPDF

# function for cleaning the text present in the document
def clean_text(text):
    """Remove page markers and numeric citations, then normalise whitespace.

    Args:
        text: Raw text extracted from a document.

    Returns:
        The text with "Page N" / "Page N of M" markers and bracketed numeric
        citations such as "[3]", "[1, 2]" or "[4-6]" removed, every run of
        whitespace collapsed to a single space, and leading/trailing
        whitespace stripped.
    """
    # \b anchors the match to a whole word, so words that merely end in
    # "page" (e.g. "webpage 3") are no longer mutilated.
    text = re.sub(r'(?i)\bpage\s+\d+(\s+of\s+\d+)?', '', text)
    # Bracketed numeric citations: single numbers, comma lists and ranges
    text = re.sub(r'\[\d+(?:,\s*\d+|-?\d+)*\]', '', text)
    # Collapse newlines, tabs and repeated spaces into single spaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Find each document on disk, extract its text, and apply the cleaning function defined above
def process_crypto_documents(doc_list):
    """Extract and clean the text of every PDF in ``doc_list``.

    Args:
        doc_list: Iterable of PDF file paths.

    Returns:
        Dict mapping each successfully processed path to its cleaned text.
        Paths that do not exist are skipped with a warning; documents that
        fail during extraction are reported and left out of the result.
    """
    document_library = {}
    for doc_name in doc_list:
        if not os.path.exists(doc_name):
            print(f"Warning: {doc_name} not found.")
            continue
        print(f"Processing: {doc_name}")
        try:
            # The context manager closes the PDF even when a page fails to
            # extract, so the file handle is never leaked.
            with fitz.open(doc_name) as doc:
                # Join the pages with a trailing space each so that words on
                # adjacent pages do not run together.
                raw_content = "".join(page.get_text("text") + " " for page in doc)
            document_library[doc_name] = clean_text(raw_content)
        except Exception as e:
            # Best effort: report the failure and carry on with the other files
            print(f"Error processing {doc_name}: {e}")
    return document_library

# The files must be present in the same directory as this codefile
# NOTE(review): the name says "big 5", but only four whitepapers are listed here —
# confirm whether a fifth file is missing or the name is simply out of date.
big_5_files = [
    "bitcoin.pdf",
    "solana.pdf",
    "uniswap.pdf",
    "chainlink.pdf"
]

# Extract and clean every listed PDF; missing files are skipped with a warning
processed_docs = process_crypto_documents(big_5_files)

# Report the number of cleaned characters extracted per document
print("\nExtraction Summary")
for name, content in processed_docs.items():
    print(f"{name}: {len(content):,} characters")

Processing: bitcoin.pdf
Processing: solana.pdf
Processing: uniswap.pdf
Processing: chainlink.pdf

Extraction Summary
bitcoin.pdf: 21,165 characters
solana.pdf: 46,029 characters
uniswap.pdf: 17,134 characters
chainlink.pdf: 86,266 characters


In [None]:
# Show a short preview per document; printing the whole dict dumps the full text of every whitepaper
print({doc_name: doc_text[:300] + "..." for doc_name, doc_text in processed_docs.items()})

{'bitcoin.pdf': 'Bitcoin: A Peer-to-Peer Electronic Cash System Satoshi Nakamoto satoshin@gmx.com www.bitcoin.org Abstract. A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution. Digital signatures provide part of the solution, but the main benefits are lost if a trusted third party is still required to prevent double-spending. We propose a solution to the double-spending problem using a peer-to-peer network. The network timestamps transactions by hashing them into an ongoing chain of hash-based proof-of-work, forming a record that cannot be changed without redoing the proof-of-work. The longest chain not only serves as proof of the sequence of events witnessed, but proof that it came from the largest pool of CPU power. As long as a majority of CPU power is controlled by nodes that are not cooperating to attack the network, they\'ll generate the longest chain and outpace a

In [None]:
!pip install langchain-community



In [None]:
!pip install --upgrade transformers

Collecting transformers
  Using cached transformers-5.2.0-py3-none-any.whl.metadata (32 kB)
Collecting huggingface-hub<2.0,>=1.3.0 (from transformers)
  Using cached huggingface_hub-1.4.1-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Using cached tokenizers-0.22.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.3 kB)
Using cached transformers-5.2.0-py3-none-any.whl (10.4 MB)
Using cached huggingface_hub-1.4.1-py3-none-any.whl (553 kB)
Using cached tokenizers-0.22.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
Installing collected packages: huggingface-hub, tokenizers, transformers
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface_hub 0.36.2
    Uninstalling huggingface_hub-0.36.2:
      Successfully uninstalled huggingface_hub-0.36.2
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.19.1
    Uninstalling tokenizers-0.19.1:
      Su

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def chunk_documents(processed_docs, chunk_size=512, chunk_overlap=80):
    """Split each cleaned document into overlapping character chunks.

    Args:
        processed_docs: Dict mapping document name to its cleaned text.
        chunk_size: Maximum chunk length in characters.
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        Dict mapping document name to the list of chunk strings.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", ".", " ", ""],
        # Attach the separator to the END of the preceding piece. With the
        # default (separator kept at the start) every chunk split on "."
        # begins with a stray ". " that belongs to the previous sentence.
        keep_separator="end",
    )

    chunked_library = {}
    for doc_name, text in processed_docs.items():
        chunks = text_splitter.split_text(text)
        chunked_library[doc_name] = chunks
        print(f"{doc_name}: {len(chunks)} chunks")

    return chunked_library

# Chunk every cleaned document using the default chunk size and overlap
final_chunks = chunk_documents(processed_docs)

# Quick check
# Print the length of the first Uniswap chunk, if that document was processed
if "uniswap.pdf" in final_chunks:
    print(f"\nFirst Uniswap chunk length: {len(final_chunks['uniswap.pdf'][0])} chars")

bitcoin.pdf: 51 chunks
solana.pdf: 109 chunks
uniswap.pdf: 39 chunks
chainlink.pdf: 203 chunks

First Uniswap chunk length: 442 chars


In [None]:
# Add source metadata to the corpus, along with additional checks to verify the chunking
from typing import List, Tuple, Dict, Any

def prepare_corpus(
    chunked_docs: Dict[str, List[str]]
) -> Tuple[List[str], List[str]]:
    """Flatten the per-document chunks into parallel lists for embedding and retrieval.

    Args:
        chunked_docs: Dict mapping document name to its list of chunk strings.

    Returns:
        A ``(texts, sources)`` tuple where ``texts[i]`` is a whitespace-stripped
        chunk and ``sources[i]`` is the name of the document it came from.

    Raises:
        ValueError: If no chunks are found at all.
    """
    texts: List[str] = []
    sources: List[str] = []

    for doc_name, chunks in chunked_docs.items():
        for chunk in chunks:
            texts.append(chunk.strip())  # ensures no leading/trailing whitespace
            sources.append(doc_name)

    # Validate BEFORE printing the summary: the average below divides by
    # len(texts), which would raise ZeroDivisionError on an empty corpus and
    # hide the intended, more helpful ValueError.
    if not texts:
        raise ValueError("No chunks found – check chunk_documents step")

    print(f"Prepared corpus:")
    print(f"  → {len(texts):,} chunks")
    print(f"  → Documents: {sorted(set(sources))}")
    print(f"  → Avg chunk length: {sum(len(t) for t in texts) / len(texts):.0f} chars")

    return texts, sources

texts, sources = prepare_corpus(final_chunks)

Prepared corpus:
  → 402 chunks
  → Documents: ['bitcoin.pdf', 'chainlink.pdf', 'solana.pdf', 'uniswap.pdf']
  → Avg chunk length: 438 chars


# Embedding

In [None]:
from sentence_transformers import SentenceTransformer

# Identifier of the embedding model passed to SentenceTransformer
model_name = "BAAI/bge-small-en-v1.5"
print(f"Loading {model_name} on CPU...")
# Load the model explicitly on the CPU device
embedder = SentenceTransformer(model_name, device="cpu")
# Print the embedding dimension reported by the model as a sanity check
print(f"Success! Dimension: {embedder.get_sentence_embedding_dimension()}")

Loading BAAI/bge-small-en-v1.5 on CPU...
Success! Dimension: 384


In [None]:
import numpy as np
from tqdm.auto import tqdm

# Generate normalised embeddings for the text chunks, with safe batching
# model used - bge-small-en-v1.5 sentence transformer (loaded in the cell above)
# normalize: L2-normalize (required for cosine similarity with IndexFlatIP)
# returns a numpy array
def compute_embeddings(
    texts: List[str],
    model: "SentenceTransformer",
    batch_size: int = 16,
    normalize: bool = True
) -> np.ndarray:
    """Encode ``texts`` in batches and return one float32 embedding matrix.

    Args:
        texts: Strings to embed.
        model: Sentence-embedding model exposing ``encode`` and
            ``get_sentence_embedding_dimension``.
        batch_size: Number of texts encoded per call; must be >= 1.
        normalize: Passed to ``encode`` as ``normalize_embeddings``.

    Returns:
        Array with one row per text, dtype float32. For an empty ``texts``
        list an array of shape ``(0, dim)`` is returned.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    print(f"Computing embeddings ({len(texts):,} texts) ...")

    # np.vstack([]) raises on an empty list, so return a well-formed empty
    # matrix instead of crashing.
    if not texts:
        dimension = model.get_sentence_embedding_dimension() or 0
        return np.empty((0, dimension), dtype=np.float32)

    embeddings_list = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding batches"):
        batch = texts[i:i + batch_size]
        batch_emb = model.encode(
            batch,
            batch_size=len(batch),
            show_progress_bar=False,
            normalize_embeddings=normalize,
            convert_to_numpy=True
        )
        embeddings_list.append(batch_emb)

    embeddings = np.vstack(embeddings_list).astype(np.float32)

    print(f"Embeddings ready → shape {embeddings.shape}")
    # The unit-norm expectation only holds when normalisation was requested
    norm_hint = " (should ≈1.0)" if normalize else ""
    print(f"  → Norm of first vector: {np.linalg.norm(embeddings[0]):.4f}{norm_hint}")

    return embeddings

embeddings = compute_embeddings(texts, embedder, batch_size=16)

Computing embeddings (402 texts) ...


Embedding batches:   0%|          | 0/26 [00:00<?, ?it/s]

Embeddings ready → shape (402, 384)
  → Norm of first vector: 1.0000 (should ≈1.0)


In [None]:
# NumPy has to be downgraded from 2.x to a 1.x release because the installed faiss-cpu
# is only compatible with the older version.
!pip uninstall -y numpy
!pip install "numpy<2" --quiet

Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m85.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 3.0.1 requires transformers<5.0.0,>=4.34.0, but you have transformers 5.2.0 which is incompatible.
shap 0.50.0 requires numpy>=2, but you have numpy 1.26.4 which is incompatible.
pytensor 2.37.0 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
jax 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
opencv-python-headless 4.13.0.92 requires numpy>=2; python_version >= "3.9", but you have numpy 1.26.4

In [None]:
import os
# Sends signal 9 (SIGKILL) to the current kernel process to force a runtime restart.
# NOTE(review): killing the process discards every in-memory variable (texts, sources,
# embedder, embeddings), so the cells that define them must be re-run before the
# FAISS cells below can work.
os.kill(os.getpid(), 9)
# restart runtime again to check if the numpy version has updated

In [None]:
import numpy as np
import faiss

print(f"NumPy version: {np.__version__}")      # Should be 1.26.x
print(f"faiss version: {faiss.__version__}")   # Should show without crash
print("faiss import successful!")

NumPy version: 1.26.4
faiss version: 1.8.0
faiss import successful!


In [None]:
import faiss
import numpy as np

# `embeddings` is produced by the embedding cell earlier in the notebook. The runtime
# restart that follows the NumPy downgrade clears it, so fail with an actionable
# message instead of a bare NameError.
if "embeddings" not in globals():
    raise RuntimeError(
        "`embeddings` is not defined – re-run the corpus preparation and embedding "
        "cells after the runtime restart before building the FAISS index."
    )

dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)  # Inner Product = cosine similarity (because vectors are normalized)

# FAISS expects a C-contiguous float32 matrix
index.add(np.ascontiguousarray(embeddings, dtype=np.float32))

print("FAISS index successfully created")
print(f"  → number of vectors: {index.ntotal:,}")
print(f"  → dimensionality:    {dimension}")
print(f"  → index type:        {index.__class__.__name__}")

FAISS index successfully created
  → number of vectors: 402
  → dimensionality:    384
  → index type:        IndexFlatIP


In [None]:
def retrieve(
    query: str,
    k: int = 4,
    min_score: float = 0.35
) -> list:
    """Retrieve the top-k most similar chunks for a query.

    Uses the module-level ``embedder``, ``index``, ``texts`` and ``sources``.

    Args:
        query: Natural-language question to search for.
        k: Maximum number of neighbours requested from the index. Values
            below 1 yield an empty result instead of an error from the index.
        min_score: Hits scoring below this similarity are dropped.

    Returns:
        List of dicts with the keys ``rank`` (1-based, counted after
        filtering), ``score`` (rounded to 3 decimals), ``source`` (document
        name) and ``text_preview`` (first 180 characters of the chunk, with
        "..." appended only when the chunk was actually truncated).
    """
    preview_chars = 180

    # Asking the index for fewer than one neighbour is an error; nothing to return
    if k < 1:
        return []

    # Embed the query (same model & normalization as the corpus)
    q_embedding = embedder.encode(
        [query],
        normalize_embeddings=True,
        convert_to_numpy=True
    ).astype('float32')

    # Search the index
    distances, indices = index.search(q_embedding, k)

    results = []
    for dist, idx in zip(distances[0], indices[0]):
        if idx == -1:  # the index pads with -1 when it has fewer than k vectors
            continue
        score = float(dist)  # cosine similarity (higher = better)
        if score < min_score:
            continue

        position = int(idx)  # plain int for list indexing
        chunk = texts[position]
        preview = chunk[:preview_chars].replace('\n', ' ').strip()
        # Only signal truncation when text was really cut off
        if len(chunk) > preview_chars:
            preview += "..."

        results.append({
            'rank': len(results) + 1,
            'score': round(score, 3),
            'source': sources[position],
            'text_preview': preview
        })

    return results


# Sample crypto questions used as a manual smoke test of the retriever
test_questions = [
    "What is Solana's Proof of History and why does it matter?",
    "How does Uniswap v3 concentrated liquidity work?",
    "What problem does Chainlink solve for smart contracts?",
    "According to the Bitcoin whitepaper, what is double-spending and how is it prevented?",
    "What are the key differences between Bitcoin and traditional electronic cash systems?",
]

print("RAG Retrieval Test Results\n" + "═"*65 + "\n")

for question in test_questions:
    print(f"Query: {question}")
    hits = retrieve(question, k=4, min_score=0.35)

    if hits:
        # One header line plus one preview line per hit, then a separator
        for hit in hits:
            print(f"  {hit['rank']}. {hit['score']:>5.3f}   {hit['source']}")
            print(f"     {hit['text_preview']}\n")
        print("─"*70)
    else:
        # No separator is printed when nothing cleared the threshold
        print("  → No matches above min_score threshold\n")

RAG Retrieval Test Results
═════════════════════════════════════════════════════════════════

Query: What is Solana's Proof of History and why does it matter?
  1. 0.707   solana.pdf
     . That analysis may prove to be incorrect. Abstract This paper proposes a new blockchain architecture based on Proof of History (PoH) - a proof for verifying order and passage of t...

  2. 0.703   solana.pdf
     . Elections for the proposed PoS algorithm are covered in depth in Section 5.6. In terms of CAP theorem, Consistency is almost always picked over Avail- ability in an event of a Pa...

  3. 0.684   solana.pdf
     . The Leader would then censor the Byzantine bond holders from participating. Proof of History generator would have to continue generating a sequence, to prove the passage of time,...

  4. 0.669   solana.pdf
     . This would require access to a faster processor than the network is currently using, otherwise the attacker would never catch up in history length. Additionally, a sing

In [None]:
import numpy as np
import faiss
import json

# Persist the embedding matrix and the FAISS index to disk
np.save("crypto_embeddings.npy", embeddings)
faiss.write_index(index, "crypto_rag_index.faiss")

# Store the chunk texts and their source document names together as UTF-8 JSON
with open("crypto_corpus.json", "w", encoding="utf-8") as f:
    json.dump({"texts": texts, "sources": sources}, f, ensure_ascii=False, indent=2)

print("Saved: embeddings, index, corpus metadata")
# saving all the metadata

Saved: embeddings, index, corpus metadata


In [None]:
import numpy as np
import faiss
import json
import os

# Write the embedding matrix to disk and report its shape
np.save("crypto_embeddings.npy", embeddings)
print(f"Saved embeddings array with shape: {embeddings.shape}")

# Write the FAISS index to disk and report how many vectors it holds
faiss.write_index(index, "crypto_rag_index.faiss")
print(f"Saved FAISS index containing {index.ntotal} vectors")

# Corpus metadata (texts, sources, pages, embedder name)
# NOTE(review): no `pages` variable is created in the cells shown in this notebook, so
# the lookup below presumably always falls back to a list of None placeholders — confirm.
corpus = {
    "texts": texts,
    "sources": sources,
    "pages": pages if 'pages' in globals() else [None] * len(texts),
    "embedder_model": "BAAI/bge-small-en-v1.5"  # change if you used a different embedder
}

with open("crypto_corpus_metadata.json", "w", encoding="utf-8") as f:
    json.dump(corpus, f, ensure_ascii=False, indent=2)

print("Saved corpus metadata (texts, sources, pages, embedder info)")

# Manifest file with reloading instructions
# Lists the three artifact files and the code snippet needed to load them again
manifest = {
    "files_saved": [
        "crypto_embeddings.npy",
        "crypto_rag_index.faiss",
        "crypto_corpus_metadata.json"
    ],
    "reload_instructions": (
        "In new notebook:\n"
        "embeddings = np.load('crypto_embeddings.npy')\n"
        "index = faiss.read_index('crypto_rag_index.faiss')\n"
        "with open('crypto_corpus_metadata.json') as f: corpus = json.load(f)\n"
        "texts = corpus['texts']\n"
        "sources = corpus['sources']\n"
        "pages = corpus['pages']\n"
        "from sentence_transformers import SentenceTransformer\n"
        "embedder = SentenceTransformer(corpus['embedder_model'])"
    )
}

with open("reload_manifest.json", "w", encoding="utf-8") as f:
    json.dump(manifest, f, indent=2, ensure_ascii=False)

# Confirm completion and list the working directory so the new files are visible
print("All files saved successfully.")
print("Files in current directory:")
print(os.listdir("."))

Saved embeddings array with shape: (402, 384)
Saved FAISS index containing 402 vectors
Saved corpus metadata (texts, sources, pages, embedder info)
All files saved successfully.
Files in current directory:
['.config', 'chainlink.pdf', 'solana.pdf', 'crypto_embeddings.npy', 'crypto_corpus.json', 'crypto_rag_eval_dataset.json', 'crypto_corpus_metadata.json', 'uniswap.pdf', 'bitcoin.pdf', 'reload_manifest.json', 'crypto_rag_index.faiss', 'sample_data']


In [None]:
import time
import numpy as np

# Four representative queries, repeated five times each
qs = [
    "What is Solana's Proof of History?",
    "How does Uniswap v3 concentrated liquidity work?",
    "What problem does Chainlink solve?",
    "Bitcoin double-spending prevention",
] * 5   # 20 queries

times = []
for q in qs:
    # perf_counter() is the high-resolution monotonic clock intended for timing short
    # intervals; time.time() is a wall clock that can be adjusted while measuring.
    t0 = time.perf_counter()
    _ = retrieve(q)
    times.append(time.perf_counter() - t0)

# Per-query latency statistics in milliseconds
print(f"Average query time: {np.mean(times)*1000:.1f} ms")
print(f"Median:             {np.median(times)*1000:.1f} ms")
print(f"Min / Max:          {np.min(times)*1000:.1f} – {np.max(times)*1000:.1f} ms")

Average query time: 42.6 ms
Median:             41.9 ms
Min / Max:          37.2 – 52.7 ms


In [None]:
import time
import numpy as np

# Second run of the same latency benchmark: four queries, repeated five times each
qs = [
    "What is Solana's Proof of History?",
    "How does Uniswap v3 concentrated liquidity work?",
    "What problem does Chainlink solve?",
    "Bitcoin double-spending prevention",
] * 5   # 20 queries

times = []
for q in qs:
    # perf_counter() is the high-resolution monotonic clock intended for timing short
    # intervals; time.time() is a wall clock that can be adjusted while measuring.
    t0 = time.perf_counter()
    _ = retrieve(q)
    times.append(time.perf_counter() - t0)

# Per-query latency statistics in milliseconds
print(f"Average query time: {np.mean(times)*1000:.1f} ms")
print(f"Median:             {np.median(times)*1000:.1f} ms")
print(f"Min / Max:          {np.min(times)*1000:.1f} – {np.max(times)*1000:.1f} ms")

Average query time: 43.7 ms
Median:             43.8 ms
Min / Max:          36.8 – 54.2 ms


In [None]:
import json
import random

#Bitcoin pairs (10)
bitcoin_pairs = [
    {
        "question": "How is an electronic coin defined within the Bitcoin system?",
        "ground_truth_excerpt": "An electronic coin is defined as a chain of digital signatures. Each owner transfers the coin to the next by digitally signing a hash of the previous transaction and the public key of the next owner.",
        "source": "bitcoin.pdf",
        "page": 2,
        "type": "factual-extraction"
    },
    {
        "question": "What is the primary purpose of the Proof-of-Work system in the Bitcoin network?",
        "ground_truth_excerpt": "The proof-of-work system is used to implement a distributed timestamp server on a peer-to-peer basis and to solve the problem of determining representation in majority decision making (one-CPU-one-vote).",
        "source": "bitcoin.pdf",
        "page": 3,
        "type": "factual-extraction"
    },
    {
        "question": "How does the network handle a situation where two nodes broadcast different versions of the next block simultaneously?",
        "ground_truth_excerpt": "Nodes work on the first version they receive but save the other branch. The tie is broken when the next proof-of-work is found and one branch becomes longer; nodes then switch to the longer chain.",
        "source": "bitcoin.pdf",
        "page": 3,
        "type": "factual-extraction"
    },
    {
        "question": "What are the two components that fund the incentive for nodes to support the network?",
        "ground_truth_excerpt": "The incentive is funded by a special first transaction in a block that starts a new coin owned by the block creator, and by transaction fees (the difference between transaction input and output values).",
        "source": "bitcoin.pdf",
        "page": 4,
        "type": "factual-extraction"
    },
    {
        "question": "How can disk space be reclaimed without breaking a block's hash?",
        "ground_truth_excerpt": "Transactions are hashed in a Merkle Tree, with only the root included in the block's hash. Old blocks can be compacted by stubbing off branches of the tree and discarding spent transactions.",
        "source": "bitcoin.pdf",
        "page": 4,
        "type": "factual-extraction"
    },
    {
        "question": "What is 'Simplified Payment Verification' (SPV) and what does a user need to maintain it?",
        "ground_truth_excerpt": "SPV allows a user to verify payments without running a full node. The user only needs to keep a copy of the block headers of the longest proof-of-work chain and obtain the Merkle branch linking the transaction to its block.",
        "source": "bitcoin.pdf",
        "page": 5,
        "type": "factual-extraction"
    },
    {
        "question": "How does the Bitcoin privacy model differ from the traditional banking model?",
        "ground_truth_excerpt": "The traditional model limits information to the parties involved and a trusted third party. Bitcoin's model announces all transactions publicly but maintains privacy by keeping public keys anonymous.",
        "source": "bitcoin.pdf",
        "page": 6,
        "type": "factual-extraction"
    },
    {
        "question": "What specific attack can a dishonest sender attempt if they control significant CPU power?",
        "ground_truth_excerpt": "An attacker can only try to change one of his own transactions to take back money he recently spent by generating an alternate chain faster than the honest chain.",
        "source": "bitcoin.pdf",
        "page": 6,
        "type": "factual-extraction"
    },
    {
        "question": "How is the proof-of-work difficulty adjusted over time?",
        "ground_truth_excerpt": "The difficulty is determined by a moving average targeting an average number of blocks per hour. If blocks are generated too fast, the difficulty increases.",
        "source": "bitcoin.pdf",
        "page": 3,
        "type": "factual-extraction"
    },
    {
        "question": "What is the requirement for the network to remain secure against attacker nodes?",
        "ground_truth_excerpt": "The system is secure as long as honest nodes collectively control more CPU power than any cooperating group of attacker nodes.",
        "source": "bitcoin.pdf",
        "page": 1,
        "type": "factual-extraction"
    }
]

#Chainlink pairs (10)
chainlink_pairs = [
    {
        "question": "What are the two primary functions of the Chainlink core node software?",
        "ground_truth_excerpt": "The core node software is responsible for interfacing with the blockchain, scheduling, and balancing work across its various external services.",
        "source": "chainlink.pdf",
        "page": 7,
        "type": "factual-extraction"
    },
    {
        "question": "How does Chainlink define the concept of 'External Adapters' in its off-chain architecture?",
        "ground_truth_excerpt": "Adapters are external services with a minimal REST API. By modeling adapters in a service-oriented manner, programs in any programming language can be easily implemented simply by adding a small intermediate API in front of the program.",
        "source": "chainlink.pdf",
        "page": 7,
        "type": "factual-extraction"
    },
    {
        "question": "According to the paper, what is the 'freeloading' problem in oracle networks?",
        "ground_truth_excerpt": "A cheating oracle Oz can observe the response Ai of another oracle Oi and copy it. In this way, oracle Oz avoids the expense of querying data sources, which may charge per-query fees.",
        "source": "chainlink.pdf",
        "page": 13,
        "type": "factual-extraction"
    },
    {
        "question": "Explain the commit/reveal scheme used in the In-Contract Aggregation protocol.",
        "ground_truth_excerpt": "In a first round, oracles send CHAINLINK-SC cryptographic commitments to their responses. After CHAINLINK-SC has received a quorum of responses, it initiates a second round in which oracles reveal their responses.",
        "source": "chainlink.pdf",
        "page": 13,
        "type": "factual-extraction"
    },
    {
        "question": "What is the primary disadvantage of in-contract aggregation that the paper identifies?",
        "ground_truth_excerpt": "In-contract aggregation has a key disadvantage: Cost. It incurs the cost of transmitting and processing on chain O(n) oracle messages (commits and reveals for A1, A2, . . . , An).",
        "source": "chainlink.pdf",
        "page": 14,
        "type": "factual-extraction"
    },
    {
        "question": "How does the proposed use of threshold signatures improve off-chain aggregation?",
        "ground_truth_excerpt": "Partial signatures on the same value A can be aggregated across any set of t oracles to yield a single valid collective signature. This approach allows CHAINLINK-SC to obtain an aggregate answer without needing to receive answers from multiple oracles.",
        "source": "chainlink.pdf",
        "page": 14,
        "type": "factual-extraction"
    },
    {
        "question": "What security properties does Intel SGX provide to Chainlink enclaves?",
        "ground_truth_excerpt": "First, enclaves protect the integrity of the application, meaning its data, code, and control flow, against subversion by other processes. Second, an enclave protects the confidentiality of an application, meaning that its data, code, and execution state are opaque to other processes.",
        "source": "chainlink.pdf",
        "page": 22,
        "type": "factual-extraction"
    },
    {
        "question": "Describe the function of the MIGFLAG in the proposed Contract-Upgrade Service.",
        "ground_truth_excerpt": "CHAINLINK-SC would support a flag (MIGFLAG) in oracle calls from requesting contracts indicating whether or not a call should be forwarded to a new CHAINLINK-SC should one become available.",
        "source": "chainlink.pdf",
        "page": 20,
        "type": "factual-extraction"
    },
    {
        "question": "What is the purpose of the Chainlink Certification Service?",
        "ground_truth_excerpt": "The Certification Service is planned as a means to identify Sybil attacks and other malfeasance that automated on-chain systems cannot.",
        "source": "chainlink.pdf",
        "page": 20,
        "type": "factual-extraction"
    },
    {
        "question": "How does the paper suggest dealing with potential correlations between different data sources?",
        "ground_truth_excerpt": "Chainlink also proposes to pursue research into mapping and reporting the independence of data sources in an easily digestible way so that oracles and users can avoid undesired correlations.",
        "source": "chainlink.pdf",
        "page": 11,
        "type": "factual-extraction"
    }
]

import json

#Solana pairs
solana_pairs = [
    {
        "question": "What is the primary function of Proof of History (PoH) in the Solana architecture?",
        "ground_truth_excerpt": "Proof of History is a sequence of computation that can provide a way to cryptographically verify passage of time between two events. PoH is used to encode trustless passage of time into a ledger an append only data structure.",
        "source": "solana.pdf",
        "page": 1,
        "type": "factual-extraction"
    },
    {
        "question": "How does Proof of History allow for parallel verification by external computers?",
        "ground_truth_excerpt": "The output can then be re-computed and verified by external computers in parallel by checking each sequence segment on a separate core. Given some number of cores... the verifier can split up the sequence of hashes and their indexes into 4000 slices, and in parallel make sure that each slice is correct.",
        "source": "solana.pdf",
        "page": 4,
        "type": "factual-extraction"
    },
    {
        "question": "According to the paper, how are events 'timestamped' into the PoH sequence?",
        "ground_truth_excerpt": "Data can be timestamped into this sequence by appending the data (or a hash of some data) into the state of the function. The recording of the state, index and data as it was appended into the sequences provides a timestamp that can guarantee that the data was created sometime before the next hash was generated.",
        "source": "solana.pdf",
        "page": 4,
        "type": "factual-extraction"
    },
    {
        "question": "What role do 'Verifiers' play in the Solana network design?",
        "ground_truth_excerpt": "Verifiers execute the same transactions on their copies of the state, and publish their computed signatures of the state as confirmations. The published confirmations serve as votes for the consensus algorithm.",
        "source": "solana.pdf",
        "page": 2,
        "type": "factual-extraction"
    },
    {
        "question": "Describe how horizontal scaling is achieved for PoH generators without sharding.",
        "ground_truth_excerpt": "Its possible to synchronize multiple Proof of History generators by mixing the sequence state from each generator to each other generator. This property can be transitive... we can trace the dependency between A and C even though they were not synchronized directly.",
        "source": "solana.pdf",
        "page": 9,
        "type": "factual-extraction"
    },
    {
        "question": "How does Proof of History protect the network against long-range attacks?",
        "ground_truth_excerpt": "A malicious user that gains access to old private keys would have to recreate a historical record that takes as much time as the original one they are trying to forge. This would require access to a faster processor than the network is currently using, otherwise the attacker would never catch up.",
        "source": "solana.pdf",
        "page": 13,
        "type": "factual-extraction"
    },
    {
        "question": "What constitutes a 'super majority' in the Solana Proof of Stake consensus?",
        "ground_truth_excerpt": "A super majority is 2/3rds of the validators weighted by their bonds. A super majority vote indicates that the network has reached consensus.",
        "source": "solana.pdf",
        "page": 14,
        "type": "factual-extraction"
    },
    {
        "question": "How does the network handle the 'nothing at stake' problem in its Proof of Stake system?",
        "ground_truth_excerpt": "Slashing is the proposed solution... When a proof of voting for a different branch is published, that branch can destroy the validators bond. This is an economic incentive designed to discourage validators from confirming multiple branches.",
        "source": "solana.pdf",
        "page": 14,
        "type": "factual-extraction"
    },
    {
        "question": "What mechanism allows the network to recover from a large partition where more than 1/2 of verifiers are missing?",
        "ground_truth_excerpt": "In a large partition... the unstaking process is very very slow. Full 2/3rds consensus will not be achieved until a very large amount of hashes have been generated and the unavailable verifiers have been unstaked.",
        "source": "solana.pdf",
        "page": 17,
        "type": "factual-extraction"
    },
    {
        "question": "How is a new Leader elected if the current Proof of History generator fails?",
        "ground_truth_excerpt": "Election for a new PoH generator occur when the PoH generator failure is detected. The validator with the largest voting power, or highest public key address if there is a tie is picked as the new PoH generator.",
        "source": "solana.pdf",
        "page": 15,
        "type": "factual-extraction"
    }
]

# Uniswap evaluation pairs (every entry below has "source": "uniswap.pdf").
# Each entry is a dict with the keys:
#   question             - the question text
#   ground_truth_excerpt - reference answer text for that question
#   source               - file name of the document the pair belongs to
#   page                 - integer page number (presumably the PDF page holding the
#                          excerpt; TODO confirm whether it is 0- or 1-indexed)
#   type                 - question category ("factual-extraction" for every entry here)
# NOTE(review): "..." inside an excerpt presumably marks text elided from the
# original passage - verify against the PDF before using it for exact matching.
uniswap_pairs = [
    {
        "question": "How is 'concentrated liquidity' defined in Uniswap v3, and how does it differ from the liquidity distribution in earlier versions?",
        "ground_truth_excerpt": "The defining idea of UNISWAP V3 is that of concentrated liquidity: liquidity bounded within some price range. In earlier versions, liquidity was distributed uniformly along the x * y = k reserves curve... designed to provide liquidity across the entire price range (0, infinity).",
        "source": "uniswap.pdf",
        "page": 2,
        "type": "factual-extraction"
    },
    {
        "question": "What are 'virtual reserves' in the context of a concentrated liquidity position?",
        "ground_truth_excerpt": "A position only needs to maintain enough reserves to support trading within its range, and therefore can act like a constant product pool with larger reserves (we call these the virtual reserves) within that range.",
        "source": "uniswap.pdf",
        "page": 2,
        "type": "factual-extraction"
    },
    {
        "question": "According to the paper, what happens to a position's liquidity and fee earnings when the price exits its specified range?",
        "ground_truth_excerpt": "When the price exits a position's range, the position's liquidity is no longer active, and no longer earns fees. At that point, its liquidity is composed entirely of a single asset.",
        "source": "uniswap.pdf",
        "page": 2,
        "type": "factual-extraction"
    },
    {
        "question": "What are 'range orders' in Uniswap v3 and how do they relate to traditional limit orders?",
        "ground_truth_excerpt": "Positions on very small ranges act similarly to limit orders—if the range is crossed, the position flips from being composed entirely of one asset, to being composed entirely of the other asset (plus accrued fees).",
        "source": "uniswap.pdf",
        "page": 2,
        "type": "factual-extraction"
    },
    {
        "question": "How are swap fees handled differently in Uniswap v3 compared to v2 regarding compounding?",
        "ground_truth_excerpt": "Fees earned in earlier versions were continuously deposited in the pool as liquidity... that fee earnings compounded. In UNISWAP V3, due to the non-fungible nature of positions, this is no longer possible. Instead, fee earnings are stored separately and held as the tokens in which the fees are paid.",
        "source": "uniswap.pdf",
        "page": 3,
        "type": "factual-extraction"
    },
    {
        "question": "What mechanism does Uniswap v3 use to track prices in its oracle, and why was this change made?",
        "ground_truth_excerpt": "Instead of accumulating the sum of prices, allowing users to compute the arithmetic mean TWAP, UNISWAP v3 tracks the sum of log prices, allowing users to compute the geometric mean TWAP. Using the time-weighted geometric mean price... avoids the need to track separate accumulators for these ratios.",
        "source": "uniswap.pdf",
        "page": 4,
        "type": "factual-extraction"
    },
    {
        "question": "What is the purpose of the 'liquidity accumulator' introduced in the Uniswap v3 oracle?",
        "ground_truth_excerpt": "This liquidity accumulator is useful for external contracts that want to implement liquidity mining on top of Uniswap v3. It can also be used by other contracts to inform a decision on which of the pools corresponding to a pair... will have the most reliable TWAP.",
        "source": "uniswap.pdf",
        "page": 4,
        "type": "factual-extraction"
    },
    {
        "question": "How are 'ticks' used to demarcate price space in Uniswap v3?",
        "ground_truth_excerpt": "To implement custom liquidity provision, the space of possible prices is demarcated by discrete ticks. Liquidity providers can provide liquidity in a range between any two ticks... Conceptually, there is a tick at every price p that is an integer power of 1.0001.",
        "source": "uniswap.pdf",
        "page": 5,
        "type": "factual-extraction"
    },
    {
        "question": "What determines which ticks can be 'initialized' for a liquidity position?",
        "ground_truth_excerpt": "Not every tick can be initialized. The pool is instantiated with a parameter, tickSpacing (ts); only ticks with indexes that are divisible by tickSpacing can be initialized.",
        "source": "uniswap.pdf",
        "page": 5,
        "type": "factual-extraction"
    },
    {
        "question": "Why does Uniswap v3 track the square root of price (sqrtPrice) and liquidity (L) instead of virtual reserves (x and y)?",
        "ground_truth_excerpt": "Using L and sqrtP is convenient because only one of them changes at a time. Price (and thus sqrtP) changes when swapping within a tick; liquidity changes when crossing a tick... This avoids some rounding errors that could be encountered if tracking virtual reserves.",
        "source": "uniswap.pdf",
        "page": 6,
        "type": "factual-extraction"
    }
]

# Sanity check: report how many pairs the Solana and Uniswap lists hold.
print(f"Solana pairs: {len(solana_pairs)}")
print(f"Uniswap pairs: {len(uniswap_pairs)}")

print("All lists defined.")

Solana pairs: 10
Uniswap pairs: 10
All lists defined.


In [None]:
# Merge the four per-document question lists into one evaluation set.
full_eval_dataset = [
    *bitcoin_pairs,
    *chainlink_pairs,
    *solana_pairs,
    *uniswap_pairs,
]

# Guard: the combined set is expected to hold exactly 40 pairs.
total_pairs = len(full_eval_dataset)
assert total_pairs == 40, f"Expected 40 pairs, got {total_pairs}"

# Seed first so the in-place shuffle produces the same order on every run.
random.seed(42)
random.shuffle(full_eval_dataset)

# Persist the shuffled set; mode "w" replaces any file left by an earlier run.
output_file = "crypto_rag_eval_dataset.json"
with open(output_file, mode="w", encoding="utf-8") as out_fh:
    json.dump(full_eval_dataset, out_fh, ensure_ascii=False, indent=2)

print(f"Dataset saved: {output_file}")
print(f"Total pairs: {len(full_eval_dataset)}")
distinct_sources = sorted({entry['source'] for entry in full_eval_dataset})
print(f"Sources: {distinct_sources}")

Dataset saved: crypto_rag_eval_dataset.json
Total pairs: 40
Sources: ['bitcoin.pdf', 'chainlink.pdf', 'solana.pdf', 'uniswap.pdf']


In [None]:
# Tally of evaluation pairs per source document.
from collections import Counter

print("Count per source:")
print(Counter(entry['source'] for entry in full_eval_dataset))

# Print both ends of the shuffled list as a quick sanity check.
for heading, sample in (
    ("\nFirst 3:", full_eval_dataset[:3]),
    ("\nLast 3:", full_eval_dataset[-3:]),
):
    print(heading)
    for entry in sample:
        print(json.dumps(entry, indent=2))
        print("---")

Count per source:
Counter({'bitcoin.pdf': 10, 'chainlink.pdf': 10, 'uniswap.pdf': 10, 'solana.pdf': 10})

First 3:
{
  "question": "What is the requirement for the network to remain secure against attacker nodes?",
  "ground_truth_excerpt": "The system is secure as long as honest nodes collectively control more CPU power than any cooperating group of attacker nodes.",
  "source": "bitcoin.pdf",
  "page": 1,
  "type": "factual-extraction"
}
---
{
  "question": "What are the two components that fund the incentive for nodes to support the network?",
  "ground_truth_excerpt": "The incentive is funded by a special first transaction in a block that starts a new coin owned by the block creator, and by transaction fees (the difference between transaction input and output values).",
  "source": "bitcoin.pdf",
  "page": 4,
  "type": "factual-extraction"
}
---
{
  "question": "What are the two primary functions of the Chainlink core node software?",
  "ground_truth_excerpt": "The core node softwa

In [None]:
# Enrich every evaluation item with its top-1 retrieval result.
# Relies on retrieve(), full_eval_dataset and json being defined by earlier cells.
# Each hit returned by retrieve() is read as a dict with "score" and
# "text_preview" keys.

updated_dataset = []

for item in full_eval_dataset:
    q = item["question"]
    # min_score=0.0 keeps the best hit even when its score is low
    hits = retrieve(q, k=1, min_score=0.0)
    top_hit = hits[0] if hits else None

    retrieved_score = top_hit["score"] if top_hit is not None else None
    retrieved_text_preview = top_hit["text_preview"] if top_hit is not None else None

    # Shallow copy so the entries of full_eval_dataset are left untouched
    new_item = item.copy()
    new_item["retrieved_score"] = round(retrieved_score, 3) if retrieved_score is not None else None
    new_item["retrieved_text_preview"] = retrieved_text_preview

    updated_dataset.append(new_item)

# Re-save the enriched JSON (overwrites the file written when the dataset was built)
with open("crypto_rag_eval_dataset.json", "w", encoding="utf-8") as f:
    json.dump(updated_dataset, f, ensure_ascii=False, indent=2)

# Report the real item count instead of a hard-coded number
print(f"All {len(updated_dataset)} questions processed. Scores and previews added to JSON.")
if updated_dataset:
    print("Example (first item):")
    print(json.dumps(updated_dataset[0], indent=2))

All 40 questions processed. Scores and previews added to JSON.
Example (first item):
{
  "question": "What is the requirement for the network to remain secure against attacker nodes?",
  "ground_truth_excerpt": "The system is secure as long as honest nodes collectively control more CPU power than any cooperating group of attacker nodes.",
  "source": "bitcoin.pdf",
  "page": 1,
  "type": "factual-extraction",
  "retrieved_score": 0.782,
  "retrieved_text_preview": ". As such, the verification is reliable as long as honest nodes control the network, but is more vulnerable if the network is overpowered by an attacker. While network nodes can ve..."
}


# RAG

In [None]:
# Install the pinned packages used by the RAG section; -q keeps pip's output short.
# NOTE(review): a later cell uninstalls transformers, accelerate and bitsandbytes
# and reinstalls them at 4.37.2 / 0.27.2 / 0.43.0, so the pins given here for
# those three packages do not stay in effect.
!pip install -q --upgrade \
    transformers==4.44.2 \
    bitsandbytes==0.43.3 \
    accelerate==0.33.0 \
    peft==0.12.0 \
    datasets==2.21.0 \
    rouge-score==0.1.2 \
    nltk==3.8.1 \
    tabulate==0.9.0 \
    scikit-learn==1.5.1

print("Installs finished.")

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.1/315.1 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m36.5 MB/s[0m eta [36m0:

In [None]:
import json
import time
import numpy as np
import pandas as pd
from tabulate import tabulate
from tqdm.auto import tqdm

# Hugging Face
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Metrics
from rouge_score import rouge_scorer
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt', quiet=True)

from sklearn.metrics.pairwise import cosine_similarity

print("Imports successful.")

# Quick checks that the objects built by earlier cells still exist in this kernel
print(f"FAISS index exists? → { 'index' in globals() }")
print(f"retrieve() function exists? → { 'retrieve' in globals() }")
print(f"Evaluation dataset exists? → { 'full_eval_dataset' in globals() or 'updated_dataset' in globals() }")

# Prefer the enriched dataset when present. Only a missing variable is treated
# as "not found"; any other error (or Ctrl-C) is allowed to surface instead of
# being swallowed by a bare `except:`.
try:
    print(f"Number of eval items: {len(updated_dataset) if 'updated_dataset' in globals() else len(full_eval_dataset)}")
except NameError:
    print("Warning: eval dataset variable not found — load it now if needed")

Imports successful.
FAISS index exists? → True
retrieve() function exists? → True
Evaluation dataset exists? → True
Number of eval items: 40


In [None]:
# Remove the currently installed transformers / accelerate / bitsandbytes builds
# first, so the pinned versions below are installed onto a clean slate.
!pip uninstall -y transformers accelerate bitsandbytes

# Install the older pins the author found to load without crashing.
# The following cell kills the kernel so these versions are the ones imported.
!pip install -q \
    transformers==4.37.2 \
    accelerate==0.27.2 \
    bitsandbytes==0.43.0

print("Clean install of stable transformers + accelerate finished.")

Found existing installation: transformers 4.44.2
Uninstalling transformers-4.44.2:
  Successfully uninstalled transformers-4.44.2
Found existing installation: accelerate 0.33.0
Uninstalling accelerate-0.33.0:
  Successfully uninstalled accelerate-0.33.0
Found existing installation: bitsandbytes 0.43.3
Uninstalling bitsandbytes-0.43.3:
  Successfully uninstalled bitsandbytes-0.43.3
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m71.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pi

In [None]:
# Kill the current kernel process (signal 9) so the freshly installed library
# versions are picked up on the next import. All variables defined so far are
# lost with the process and must be rebuilt by re-running the earlier cells.
# NOTE(review): presumably relies on Colab restarting the runtime automatically - confirm.
import os
os.kill(os.getpid(), 9)

In [None]:
# Load the distilgpt2 tokenizer and model used as the RAG generator.
# Defines the globals `tokenizer` and `model` that the generation cell uses.
print("Loading distilgpt2 with stable transformers version...")

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

try:
    tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
    # Decoder-only models continue from the LAST token of each row, so batched
    # prompts must be padded on the left; right padding makes generate() warn
    # and continue from pad tokens.
    tokenizer.padding_side = "left"

    model = AutoModelForCausalLM.from_pretrained(
        "distilgpt2",
        device_map="auto",
        torch_dtype=torch.float32,  # safe & fast on CPU
    )
    # Fix pad token (very common with gpt2 variants): reuse EOS as padding
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id

    print("DistilGPT2 loaded SUCCESSFULLY.")
    print(f"Model device: {next(model.parameters()).device}")
except Exception as e:
    print("Still failed:", str(e))
    print("\nIf still broken → try transformers==4.36.2 instead (rerun install cell with that version)")
    # Re-raise so "Run All" stops here instead of failing later with an
    # unrelated NameError on `tokenizer` / `model`.
    raise

Loading distilgpt2 with stable transformers version...
DistilGPT2 loaded SUCCESSFULLY.
Model device: cpu


In [None]:
# Batched RAG generation over the evaluation set.
#
# For every question: retrieve up to 3 chunks, build a context-grounded prompt,
# generate greedily with the loaded causal LM, and record the answer together
# with the retrieval previews/scores that belong to THAT question.
# Relies on `retrieve`, `tokenizer`, `model` and `torch` from earlier cells.
import json
import time

from tqdm.auto import tqdm

# Prefer the enriched dataset, then the plain one; if neither survived a kernel
# restart, fall back to the JSON file the dataset cells wrote.
if 'updated_dataset' in globals():
    eval_data = updated_dataset
elif 'full_eval_dataset' in globals():
    eval_data = full_eval_dataset
else:
    with open("crypto_rag_eval_dataset.json", "r", encoding="utf-8") as f:
        eval_data = json.load(f)
print(f"Generating for {len(eval_data)} questions...")

rag_results = []

# Settings optimized for CPU speed & short answers
MAX_NEW_TOKENS = 96          # shorter than 128 — crypto excerpts are concise
BATCH_SIZE = 4               # distilgpt2 is small → safe to batch 4–8 on CPU
SAVE_EVERY = 10              # write a partial checkpoint after at least this many new items

# Decoder-only generation continues from the last token of each row, so the
# batch has to be left-padded (right padding triggers the transformers warning
# and makes the model continue from pad tokens).
tokenizer.padding_side = "left"

last_saved_idx = 0

for i in tqdm(range(0, len(eval_data), BATCH_SIZE), desc="Generating (batched)"):
    batch_items = eval_data[i:i+BATCH_SIZE]
    prompts = []
    batch_retrieval = []  # one (chunk previews, scores) pair per item, aligned with prompts

    for item in batch_items:
        q = item["question"]
        hits = retrieve(q, k=3, min_score=0.35)

        if not hits:
            context = "No relevant context found."
            retrieved_chunks = []
            cosine_scores = []
        else:
            hit_texts = [hit.get("text_preview", hit.get("text", "")) for hit in hits]
            context = "\n\n".join(text[:400] for text in hit_texts)  # cap length
            retrieved_chunks = [text[:300] + "..." for text in hit_texts]
            cosine_scores = [float(hit["score"]) for hit in hits]  # plain floats for JSON

        # Store per item: reusing the loop variables after the loop would attach
        # the LAST item's retrieval to every result of the batch.
        batch_retrieval.append((retrieved_chunks, cosine_scores))

        prompt = f"""Use only the following context to answer factually. If unsure, say so.

Context:
{context}

Question: {q}

Concise Answer:"""
        prompts.append(prompt)

    # Batch generate
    start_gen = time.time()
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False, # greedy = faster & consistent
            pad_token_id=tokenizer.eos_token_id,
            num_return_sequences=1
        )

    # With left padding every prompt ends at the same column, so the new tokens
    # are exactly the columns after the input width. Slicing by tokens also
    # works when a prompt was truncated, where a string-prefix match would fail.
    prompt_width = inputs["input_ids"].shape[1]
    decoded = tokenizer.batch_decode(outputs[:, prompt_width:], skip_special_tokens=True)
    cleaned_answers = [text.strip() for text in decoded]

    gen_time = time.time() - start_gen

    for item, answer, (chunks, scores) in zip(batch_items, cleaned_answers, batch_retrieval):
        rag_results.append({
            "question": item["question"],
            "ground_truth_excerpt": item.get("ground_truth_excerpt", ""),
            "rag_answer": answer,
            "retrieved_chunks_preview": chunks,
            "cosine_scores": scores,
            "generate_time_sec": round(gen_time / len(batch_items), 2),  # avg per item
        })

    # Save a partial checkpoint roughly every SAVE_EVERY items, whatever the
    # batch size, and always after the last batch.
    current_idx = i + len(batch_items)
    if current_idx - last_saved_idx >= SAVE_EVERY or current_idx >= len(eval_data):
        partial_file = f"rag_outputs_partial_{current_idx}.json"
        with open(partial_file, "w", encoding="utf-8") as f:
            json.dump(rag_results, f, ensure_ascii=False, indent=2)
        print(f"Saved partial: {partial_file} ({current_idx}/{len(eval_data)})")
        last_saved_idx = current_idx

# Final save
with open("rag_outputs_final.json", "w", encoding="utf-8") as f:
    json.dump(rag_results, f, ensure_ascii=False, indent=2)

print(f"\nDone! Generated {len(rag_results)} answers. Saved to rag_outputs_final.json")
if rag_results:  # avoid dividing by zero on an empty evaluation set
    print("Average generate time per question (batched): ~", round(sum(r["generate_time_sec"] for r in rag_results) / len(rag_results), 1), "seconds")

Generating for 40 questions...


Generating (batched):   0%|          | 0/10 [00:00<?, ?it/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Saved partial: rag_outputs_partial_20.json (20/40)


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Saved partial: rag_outputs_partial_40.json (40/40)

Done! Generated 40 answers. Saved to rag_outputs_final.json
Average generate time per question (batched): ~ 6.0 seconds


In [None]:
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from rouge_score import rouge_scorer
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt', quiet=True)

# Read the generated answers back from the file written by the generation cell
results_path = "rag_outputs_final.json"
with open(results_path, mode="r", encoding="utf-8") as results_fh:
    rag_results = json.load(results_fh)

print(f"Loaded {len(rag_results)} generated answers.")

def get_embedding(text: str):
    """Embed one string with the shared `embedder`.

    Blank or whitespace-only input never reaches the model: a 384-element zero
    vector is returned instead (384 is noted in the original code as the
    dimension of bge-small-en-v1.5). Any other input is encoded with normalized
    embeddings and the single resulting row is returned as float32.
    """
    if text.strip():
        vectors = embedder.encode([text], normalize_embeddings=True, convert_to_numpy=True)
        return vectors[0].astype(np.float32)
    return np.zeros(384)

# Embed every ground-truth excerpt once up front; the metric loop indexes into
# this array by result position (one row per entry of rag_results).
gt_embeddings = np.array([
    get_embedding(entry.get("ground_truth_excerpt", "").strip())
    for entry in rag_results
])

# Scorer setup
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

# Cosine thresholds used by the proxy metrics below (adjustable)
RELEVANCE_THRESHOLD = 0.5   # retrieved chunk counts as relevant to the ground truth
SUPPORT_THRESHOLD = 0.30    # answer sentence counts as supported by a chunk

# Compute metrics per item.
# Retrieval quality is a PROXY: a retrieved preview is "relevant" when its
# embedding is close enough to the ground-truth excerpt's embedding.
metrics_list = []

for idx, res in enumerate(rag_results):
    q = res["question"]
    gt = res.get("ground_truth_excerpt", "").strip()
    answer = res["rag_answer"].strip()
    gt_emb = gt_embeddings[idx]  # built from rag_results, so positions line up

    # Stored previews of the chunks that were retrieved for this question
    retrieved_texts = res.get("retrieved_chunks_preview", [])
    nonblank = [bool(t.strip()) for t in retrieved_texts]
    has_retrieval = any(nonblank)

    # 1. Retrieval metrics - stay at 0 when nothing was retrieved
    precision_3 = 0.0
    mrr = 0.0
    chunk_embs = None  # embeddings of the non-blank previews, reused in step 3

    if has_retrieval:
        # Embed the previews once and reuse them for the hallucination proxy
        ret_embs = np.array([get_embedding(t) for t in retrieved_texts])
        cosines = cosine_similarity([gt_emb], ret_embs)[0]

        relevant = cosines >= RELEVANCE_THRESHOLD
        precision_3 = float(relevant.mean())  # fraction of retrieved chunks that are relevant

        # MRR: reciprocal rank of the first relevant chunk
        ranks = np.where(relevant)[0]
        if len(ranks) > 0:
            mrr = 1.0 / (int(ranks[0]) + 1)

        chunk_embs = ret_embs[np.array(nonblank)]

    # 2. Generation metrics
    rouge_l = rouge.score(gt, answer)['rougeL'].fmeasure if gt else 0.0

    # Semantic similarity (answer vs ground truth); 0 when the ground truth is blank
    ans_emb = get_embedding(answer)
    semantic_sim = float(cosine_similarity([ans_emb], [gt_emb])[0][0]) if np.any(gt_emb) else 0.0

    # 3. Hallucination proxy: share of answer sentences with low support from
    #    every retrieved chunk. An empty answer scores 0.
    halluc_rate = 0.0
    if answer:
        if chunk_embs is None:
            # Nothing was retrieved, so no sentence can be supported.
            halluc_rate = 1.0
        else:
            sentences = sent_tokenize(answer)
            if sentences:
                sent_embs = np.array([get_embedding(s) for s in sentences])
                max_cosines = cosine_similarity(sent_embs, chunk_embs).max(axis=1)
                halluc_rate = float((max_cosines < SUPPORT_THRESHOLD).mean())

    metrics = {
        "question": q[:80] + "..." if len(q) > 80 else q,  # shorten for table
        "precision@3": round(precision_3, 3),
        "mrr": round(mrr, 3),
        "rougeL": round(rouge_l, 3),
        "semantic_sim": round(semantic_sim, 3),
        "halluc_rate": round(halluc_rate, 3),
        "answer_length": len(answer.split()),
    }
    metrics_list.append(metrics)

# Aggregate the per-question metrics, show a preview and save both tables.
# pandas and tabulate are imported here because this cell's own import list
# omits them and the earlier import cell ran before the kernel restart.
import pandas as pd
from tabulate import tabulate

df = pd.DataFrame(metrics_list)

# Summary stats: mean of every metric column
summary = df[["precision@3", "mrr", "rougeL", "semantic_sim", "halluc_rate"]].mean().round(3)
# Report the real number of rows rather than a hard-coded count
print(f"\nAggregate Metrics (average over {len(df)} questions):")
print(summary.to_string())

print("\nFull table preview (first 5):")
print(tabulate(df.head(5), headers="keys", tablefmt="simple", showindex=False))

# Save the detailed per-question table and the summary
df.to_csv("rag_metrics_detailed.csv", index=False)
summary.to_csv("rag_metrics_summary.csv")

print("\nSaved: rag_metrics_detailed.csv + rag_metrics_summary.csv")

Loaded 40 generated answers.

Aggregate Metrics (average over 40 questions):
precision@3     0.917
mrr             0.938
rougeL          0.138
semantic_sim    0.708
halluc_rate     0.000

Full table preview (first 5):
question                                                                               precision@3    mrr    rougeL    semantic_sim    halluc_rate    answer_length
-----------------------------------------------------------------------------------  -------------  -----  --------  --------------  -------------  ---------------
What is the requirement for the network to remain secure against attacker nodes?             0          0     0.074           0.634              0               87
What are the two components that fund the incentive for nodes to support the net...          1          1     0.085           0.692              0               84
What are the two primary functions of the Chainlink core node software?                      0.667      1     0.465           