In [None]:
!pip install --upgrade torch torchvision torchaudio

In [None]:
!pip install --upgrade transformers accelerate

In [None]:
!pip install faiss-cpu

In [None]:
# Environment sanity check: report library versions and the GPU (if any).
import numpy as np
import torch

print("numpy version:", np.__version__)  # MUST be 2.0.2 or higher
print("torch version:", torch.__version__)

cuda_ok = torch.cuda.is_available()
print("CUDA available?", cuda_ok)
if cuda_ok:
    print("Device:", torch.cuda.get_device_name(0))
else:
    print("Device:", "None")

In [None]:
# Report the installed transformers release.
import transformers

print(f"{transformers.__version__}")

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig

model_id = "microsoft/Phi-3-mini-4k-instruct"

print("Loading Phi-3 (Applying RoPE Scaling Fix)...")

# Step 1 - fetch the configuration first so it can be patched before the weights load.
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)

# Step 2 - normalise the RoPE scaling entry.
# A missing or "default" scaling type is cleared entirely (this is the workaround for
# "Unknown RoPE scaling type default"); any other type is written back under the
# "type" key so that key is always present.
rope_cfg = getattr(config, "rope_scaling", None)
if rope_cfg is not None:
    scaling_type = rope_cfg.get("type") or rope_cfg.get("rope_type")
    if scaling_type in (None, "default"):
        config.rope_scaling = None
    else:
        rope_cfg["type"] = scaling_type

# Step 3 - tokenizer.
phi_tokenizer = AutoTokenizer.from_pretrained(model_id)

# Step 4 - model weights, loaded with the patched config in half precision.
model_load_options = dict(
    config=config,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    attn_implementation="eager",
)
phi_model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_options)

print("\n✨ Phi-3 loaded successfully!")

In [None]:
import faiss
import pickle
import json
import numpy as np
from sentence_transformers import SentenceTransformer

print("Loading FAISS index, embedder, and corpus metadata...")

# Vector index, read from the current working directory.
index = faiss.read_index("crypto_rag_index.faiss")

# Corpus metadata: chunk texts, their sources and pages, and the embedder model name.
with open("crypto_corpus_metadata.json", "r", encoding="utf-8") as f:
    corpus = json.load(f)

texts, sources, pages, embedder_model_name = (
    corpus[key] for key in ("texts", "sources", "pages", "embedder_model")
)

# Re-create the embedder named in the metadata.
embedder = SentenceTransformer(embedder_model_name)

# Define retrieve function (using the loaded index/embedder/texts)
def retrieve(query, k=3, min_score=0.22):
    """
    Retrieve the top-k most relevant corpus chunks for ``query`` using FAISS.

    The query is embedded with the module-level ``embedder`` (L2-normalised) and
    searched against the module-level ``index``. The raw FAISS score is converted
    to a cosine similarity according to the metric of the index:

    * inner-product index: the raw score is used as-is. For a unit-length query
      it equals the cosine similarity, assuming the indexed vectors were also
      normalised when the index was built (TODO confirm against the build step).
    * any other metric is treated as L2: FAISS reports the *squared* Euclidean
      distance ``d``, and for unit vectors ``cos = 1 - d / 2``.

    Args:
        query: Query string to embed and search for.
        k: Maximum number of chunks to return.
        min_score: Minimum cosine similarity a chunk needs in order to be kept.

    Returns:
        A list of dicts, in the order FAISS returned the hits, each with the keys
        ``text_preview`` (first 300 characters plus "..." when truncated),
        ``text``, ``score`` (similarity rounded to 3 decimals), ``source`` and
        ``page`` (``None`` when no page is recorded for the chunk).
    """
    query_emb = embedder.encode([query], normalize_embeddings=True, convert_to_numpy=True)[0]

    # FAISS expects a contiguous float32 matrix of shape (n_queries, dim).
    query_matrix = np.ascontiguousarray(query_emb, dtype=np.float32).reshape(1, -1)
    raw_scores, indices = index.search(query_matrix, k)

    # Inner-product indexes return similarities (higher is better); treating them
    # as "1 - distance" would invert the ranking and the min_score filter.
    is_inner_product = getattr(index, "metric_type", None) == faiss.METRIC_INNER_PRODUCT

    results = []
    for raw, idx in zip(raw_scores[0], indices[0]):
        idx = int(idx)
        # FAISS pads with -1 when fewer than k neighbours exist; also skip ids
        # that have no matching entry in the metadata.
        if idx < 0 or idx >= len(texts):
            continue

        raw = float(raw)
        similarity = raw if is_inner_product else 1.0 - raw / 2.0
        if similarity < min_score:
            continue

        text = texts[idx]
        preview = text[:300] + "..." if len(text) > 300 else text

        results.append({
            "text_preview": preview,
            "text": text,
            "score": round(similarity, 3),
            "source": sources[idx] if idx < len(sources) else None,
            "page": pages[idx] if pages and idx < len(pages) else None
        })

    return results[:k]  # never more than k results

# Summarise what was loaded above.
for status_line in (
    "FAISS and retrieve() fully loaded!",
    f"Total chunks in index: {index.ntotal}",
    f"Embedder model: {embedder_model_name}",
    "retrieve function ready → test with retrieve('Bitcoin proof of work')",
):
    print(status_line)

Loading FAISS index, embedder, and corpus metadata...


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: BAAI/bge-small-en-v1.5
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


FAISS and retrieve() fully loaded!
Total chunks in index: 402
Embedder model: BAAI/bge-small-en-v1.5
retrieve function ready → test with retrieve('Bitcoin proof of work')


In [None]:
test_query = "Bitcoin proof of work"
test_emb = embedder.encode([test_query], normalize_embeddings=True, convert_to_numpy=True)[0]

# Search (get more candidates to see scores)
distances, indices = index.search(test_emb.reshape(1, -1), k=5)

# An inner-product index already returns the similarity (higher = better), so
# "1 - value" must not be applied to it. Any other metric is treated as L2, where
# FAISS returns the squared distance d and, for unit vectors, cos = 1 - d / 2.
uses_inner_product = index.metric_type == faiss.METRIC_INNER_PRODUCT

print("Raw FAISS distances & indices:")
for d, idx in zip(distances[0], indices[0]):
    if idx == -1:  # FAISS pads with -1 when fewer than k neighbours exist
        continue
    similarity = d if uses_inner_product else 1 - d / 2
    print(f"Index {idx}: similarity = {similarity:.4f}, raw = {d:.4f}")

# Try with very low threshold
hits = retrieve(test_query, k=5, min_score=0.15)  # lowered
print(f"\nRetrieved {len(hits)} hits with min_score=0.15:")
for hit in hits:
    print(f"Score {hit['score']:.3f} | Source {hit['source']} | Preview: {hit['text_preview'][:100]}...")

Raw FAISS distances & indices:
Index 14: similarity = 0.2170, dist = 0.7830
Index 13: similarity = 0.2178, dist = 0.7822
Index 54: similarity = 0.2243, dist = 0.7757
Index 15: similarity = 0.2266, dist = 0.7734
Index 46: similarity = 0.2300, dist = 0.7700

Retrieved 5 hits with min_score=0.15:
Score 0.217 | Source bitcoin.pdf | Preview: . For our timestamp network, we implement the proof-of-work by incrementing a nonce in the block unt...
Score 0.218 | Source bitcoin.pdf | Preview: . Proof-of-Work To implement a distributed timestamp server on a peer-to-peer basis, we will need to...
Score 0.224 | Source solana.pdf | Preview: . That analysis may prove to be incorrect. Abstract This paper proposes a new blockchain architectur...
Score 0.227 | Source bitcoin.pdf | Preview: . The proof-of-work also solves the problem of determining representation in majority decision makin...
Score 0.230 | Source bitcoin.pdf | Preview: . To solve this, we proposed a peer-to-peer network using proof-of-wor

In [None]:
# Sanity check: embed the first 200 characters of the first corpus chunk and the
# test query with identical settings, then compare them directly (no FAISS involved).
encode_options = dict(normalize_embeddings=True, convert_to_numpy=True)

sample_text = texts[0][:200]
sample_emb = embedder.encode([sample_text], **encode_options)[0]
query_emb = embedder.encode(["Bitcoin proof of work"], **encode_options)[0]

# Both vectors are unit length, so their dot product is the cosine similarity.
# NOTE(review): the original comment expected ~0 for unrelated text, but the
# recorded run printed 0.7109 - confirm what baseline this embedder gives.
cos_sim = sample_emb @ query_emb
print(f"Cosine similarity between query and first chunk: {cos_sim:.4f}")

Cosine similarity between query and first chunk: 0.7109


In [None]:
# Smoke-test retrieve() on three different questions.
test_queries = [
    "Bitcoin proof of work",
    "What is Proof of History in Solana?",
    "What is the purpose of the 'liquidity accumulator' in Uniswap v3?",
]

for query in test_queries:
    # Same k and threshold as the single-query test.
    hits = retrieve(query, k=5, min_score=0.15)
    print(f"\nQuery: {query}\nRetrieved {len(hits)} hits:")
    for hit in hits:
        short_preview = hit['text_preview'][:100]
        print(f"Score {hit['score']:.3f} | Source {hit['source']} | Preview: {short_preview}...")


Query: Bitcoin proof of work
Retrieved 5 hits:
Score 0.217 | Source bitcoin.pdf | Preview: . For our timestamp network, we implement the proof-of-work by incrementing a nonce in the block unt...
Score 0.218 | Source bitcoin.pdf | Preview: . Proof-of-Work To implement a distributed timestamp server on a peer-to-peer basis, we will need to...
Score 0.224 | Source solana.pdf | Preview: . That analysis may prove to be incorrect. Abstract This paper proposes a new blockchain architectur...
Score 0.227 | Source bitcoin.pdf | Preview: . The proof-of-work also solves the problem of determining representation in majority decision makin...
Score 0.230 | Source bitcoin.pdf | Preview: . To solve this, we proposed a peer-to-peer network using proof-of-work to record a public history o...

Query: What is Proof of History in Solana?
Retrieved 5 hits:
Score 0.300 | Source solana.pdf | Preview: . That analysis may prove to be incorrect. Abstract This paper proposes a new blockchain architectur...
Scor

# RAG

In [None]:
!pip install --quiet pymupdf langchain-huggingface langchain-community accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.9/24.9 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m96.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.4/566.4 kB[0m [31m50.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m75.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==2.32.4, but you have requests 2.32.5 which is incompatible.
transf

In [None]:
import json

# Read the evaluation set from the working directory.
with open("crypto_rag_eval_dataset.json", encoding="utf-8") as f:
    eval_data = json.loads(f.read())

n_pairs = len(eval_data)
print(f"Loaded {n_pairs} evaluation pairs.")

Loaded 40 evaluation pairs.


In [None]:
pip install --upgrade transformers accelerate

Collecting transformers
  Downloading transformers-5.2.0-py3-none-any.whl.metadata (32 kB)
Collecting huggingface-hub<2.0,>=1.3.0 (from transformers)
  Downloading huggingface_hub-1.4.1-py3-none-any.whl.metadata (13 kB)
Downloading transformers-5.2.0-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m90.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-1.4.1-py3-none-any.whl (553 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m553.3/553.3 kB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface-hub, transformers
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface_hub 0.36.2
    Uninstalling huggingface_hub-0.36.2:
      Successfully uninstalled huggingface_hub-0.36.2
  Attempting uninstall: transformers
    Found existing installation: transformers 5.0.0
    Uninstalling transformers-5.0.0:
      Successfully uninstalled t

In [None]:
import json
import time
import numpy as np
import pandas as pd
from tabulate import tabulate
from tqdm.auto import tqdm
from transformers import DynamicCache

# Batched RAG generation with Phi-3 over the evaluation set.
# For each question: retrieve context chunks, build a prompt, generate greedily,
# and store the answer together with the retrieved previews. Partial results are
# checkpointed to JSON along the way and the full list is written at the end.
phi_rag_results = []
MAX_NEW_TOKENS = 160
BATCH_SIZE = 4  # lower if memory tight
SAVE_EVERY = 10  # write a partial checkpoint each time this many more questions are done

PROMPT_TEMPLATE = """You are a helpful assistant. Use only the provided context to answer accurately and concisely.

Context:
{context}

Question: {question}

Answer:"""

# Decoder-only models continue from the last position of every row, so batched
# prompts have to be padded on the left; a pad token is required for padding=True.
phi_tokenizer.padding_side = "left"
if phi_tokenizer.pad_token is None:
    phi_tokenizer.pad_token = phi_tokenizer.eos_token

for i in tqdm(range(0, len(eval_data), BATCH_SIZE), desc="Phi-3 RAG generation"):
    batch = eval_data[i:i + BATCH_SIZE]

    # One list of chunk previews per question (empty when nothing passed min_score).
    batch_retrieved = [
        [hit["text_preview"] for hit in retrieve(item["question"], k=3, min_score=0.22)]
        for item in batch
    ]
    prompts = [
        PROMPT_TEMPLATE.format(
            context="\n\n".join(previews) if previews else "No relevant context found.",
            question=item["question"],
        )
        for item, previews in zip(batch, batch_retrieved)
    ]

    inputs = phi_tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024).to(phi_model.device)

    with torch.no_grad():
        outputs = phi_model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
            pad_token_id=phi_tokenizer.eos_token_id,
            # NOTE(review): KV cache disabled as in the original run (it was marked
            # as the fix for an earlier failure); generation is slower without it.
            use_cache=False,
        )

    # generate() returns prompt + continuation for every row. Slice off the
    # (padded) prompt by token position instead of string-matching the decoded
    # text, which fails whenever decoding does not reproduce the prompt exactly.
    prompt_len = inputs["input_ids"].shape[1]
    answers = phi_tokenizer.batch_decode(outputs[:, prompt_len:], skip_special_tokens=True)
    cleaned = [answer.strip() for answer in answers]

    for item, answer, previews in zip(batch, cleaned, batch_retrieved):
        phi_rag_results.append({
            "question": item["question"],
            "ground_truth_excerpt": item.get("ground_truth_excerpt", ""),
            "phi_rag_answer": answer,
            "retrieved_chunks_preview": previews,
        })

    current = i + len(batch)
    # Checkpoint whenever this batch crossed a multiple of SAVE_EVERY; testing
    # "current % 10 == 0" only fires when a batch boundary lands exactly on one.
    crossed_checkpoint = current // SAVE_EVERY > i // SAVE_EVERY
    if crossed_checkpoint or current >= len(eval_data):
        with open(f"phi_rag_partial_{current}.json", "w", encoding="utf-8") as f:
            json.dump(phi_rag_results, f, ensure_ascii=False, indent=2)
        print(f"Saved partial at {current}")

with open("phi_rag_outputs_final.json", "w", encoding="utf-8") as f:
    json.dump(phi_rag_results, f, ensure_ascii=False, indent=2)

print(f"\nDone! {len(phi_rag_results)} answers saved.")

Phi-3 RAG generation:   0%|          | 0/10 [00:00<?, ?it/s]

--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.12/logging/__init__.py", line 1160, in emit
    msg = self.format(record)
          ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/logging/__init__.py", line 999, in format
    return fmt.format(record)
           ^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/logging/__init__.py", line 703, in format
    record.message = record.getMessage()
                     ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/logging/__init__.py", line 392, in getMessage
    msg = msg % self.args
          ~~~~^~~~~~~~~~~
TypeError: not all arguments converted during string formatting
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.12/dist-packages/traitlets/config/application.py", line 992, 

Saved partial at 20
Saved partial at 40

Done! 40 answers saved.
