# CS5588 — Week 4: RAG + Gemini + Fine-Tuning (Hands-On)

## 1) Install & Setup

In [1]:
import sys, subprocess
def pip_install(pkgs):
    print("Installing:", pkgs)
    subprocess.run([sys.executable, "-m", "pip", "install", "-q"] + pkgs, check=True)

pip_install(["google-generativeai>=0.7.2","PyPDF2>=3.0.1","numpy>=1.23.0"])
try:
    pip_install(["faiss-cpu>=1.8.0"])
except Exception as e:
    print("FAISS skipped:", e)
try:
    pip_install(["transformers>=4.44.2","accelerate>=0.34.0","peft>=0.11.1","datasets>=2.21.0"])
except Exception as e:
    print("PEFT stack skipped:", e)
print("✅ Setup complete.")

Installing: ['google-generativeai>=0.7.2', 'PyPDF2>=3.0.1', 'numpy>=1.23.0']
Installing: ['faiss-cpu>=1.8.0']
Installing: ['transformers>=4.44.2', 'accelerate>=0.34.0', 'peft>=0.11.1', 'datasets>=2.21.0']
✅ Setup complete.


## 2) Log Environment → env_rag.json

In [2]:
import json, platform, datetime
from pathlib import Path

env={"timestamp_utc": datetime.datetime.utcnow().isoformat()+"Z","python": platform.python_version(),"platform": platform.platform()}
try:
    import google.generativeai as genai; env["google-generativeai"]=getattr(genai,"__version__","unknown")
except Exception as e: env["google-generativeai"]=f"unavailable ({e})"
try:
    import numpy as np; env["numpy"]=np.__version__
except Exception as e: env["numpy"]=f"unavailable ({e})"
try:
    import PyPDF2; env["PyPDF2"]=PyPDF2.__version__
except Exception as e: env["PyPDF2"]=f"unavailable ({e})"
try:
    import faiss; env["faiss"]="available"
except Exception: env["faiss"]="unavailable"
try:
    import torch; env["torch"]=torch.__version__; env["cuda_available"]=bool(torch.cuda.is_available())
except Exception: env["torch"]="N/A"

Path("runs").mkdir(exist_ok=True)
with open("env_rag.json","w") as f: json.dump(env,f,indent=2)
print(json.dumps(env,indent=2))

  env={"timestamp_utc": datetime.datetime.utcnow().isoformat()+"Z","python": platform.python_version(),"platform": platform.platform()}


{
  "timestamp_utc": "2025-09-18T20:55:08.000807Z",
  "python": "3.12.11",
  "platform": "Linux-6.1.123+-x86_64-with-glibc2.35",
  "google-generativeai": "0.8.5",
  "numpy": "2.0.2",
  "PyPDF2": "3.0.1",
  "faiss": "available",
  "torch": "2.8.0+cu126",
  "cuda_available": true
}


## 3) Load Documents (PDF/TXT/MD)

In [4]:
import os, glob, io
from pathlib import Path

DATA_DIR=Path("data/uploads"); DATA_DIR.mkdir(parents=True, exist_ok=True)
print("Local Jupyter: put at least 3 PDFs/TXT/MD into", DATA_DIR)

is_colab=False
try:
    from google.colab import files as colab_files
    is_colab=True
except Exception:
    pass

if is_colab:
    print("Colab: upload now")
    uploaded=colab_files.upload()
    for name,data in uploaded.items():
        with open(DATA_DIR/name,"wb") as f: f.write(data)
    print("Uploaded:", list(uploaded.keys()))
else:
    print("Found:", [p.name for p in DATA_DIR.glob('*')])

Local Jupyter: put at least 3 PDFs/TXT/MD into data/uploads
Colab: upload now


Saving AL112017_Irma.pdf to AL112017_Irma.pdf
Saving mat-report_hurricane-irma_florida.pdf to mat-report_hurricane-irma_florida (1).pdf
Saving annotated-Project%20Title (1).pdf to annotated-Project%20Title (1) (1).pdf
Uploaded: ['AL112017_Irma.pdf', 'mat-report_hurricane-irma_florida (1).pdf', 'annotated-Project%20Title (1) (1).pdf']


In [5]:
from typing import List, Dict
import PyPDF2

def load_documents(data_dir: Path) -> List[Dict]:
    docs=[]
    for p in sorted(data_dir.glob("*")):
        if p.suffix.lower()==".pdf":
            try:
                text_pages=[]
                with open(p,"rb") as f:
                    reader=PyPDF2.PdfReader(f)
                    for page in reader.pages:
                        text_pages.append(page.extract_text() or "")
                text="\n".join(text_pages)
                docs.append({"source": str(p), "text": text})
            except Exception as e:
                print("PDF read error:", p, e)
        elif p.suffix.lower() in [".txt",".md",".markdown"]:
            try:
                text=p.read_text(encoding="utf-8", errors="ignore")
                docs.append({"source": str(p), "text": text})
            except Exception as e:
                print("Text read error:", p, e)
        else:
            print("Skipping:", p)
    return docs

docs=load_documents(DATA_DIR)
print("Loaded", len(docs), "docs")
if docs:
    for d in docs[:3]: print("-", d["source"], "chars:", len(d["text"]))

Loaded 5 docs
- data/uploads/AL112017_Irma.pdf chars: 163316
- data/uploads/annotated-Project%20Title (1) (1).pdf chars: 3616
- data/uploads/annotated-Project%20Title (1).pdf chars: 3616


## 4) Chunk Documents (size=500, overlap=100) and Preview

In [6]:
from typing import List

chunk_size=500; chunk_overlap=100

def chunk_text(text:str, size:int, overlap:int)->List[str]:
    chunks=[]; start=0
    while start<len(text):
        end=min(start+size, len(text))
        piece=text[start:end]
        if piece.strip(): chunks.append(piece)
        if end==len(text): break
        start=end-overlap
        if start<0: start=0
    return chunks

chunks=[]
for d in docs:
    for ch in chunk_text(d["text"], chunk_size, chunk_overlap):
        chunks.append({"source": d["source"], "content": ch})

print("Total chunks:", len(chunks))
if chunks: print("First chunk preview:\n", chunks[0]["content"][:400])

Total chunks: 2177
First chunk preview:
  NATIONAL HURRICANE CENTER 
TROPICAL CYCLONE REPORT  
 
 HURRICANE IRMA  
 (AL112017)  
 30 August–12 September 2017  
 John P. Cangialosi, Andrew S. Latto, and Robbie Berg 
National Hurricane Center 
24 September 20211 
 
VIIRS SATELLITE IMAGE OF HURRICANE IRMA WHEN IT WAS AT ITS PEAK INTENSITY AND MADE LANDFALL ON BARBUDA AT 0535 
UTC 6 SEPTEMBER. 
Irma was a long-lived Cape Verde hurricane that


In [7]:
# Save initial config
import json
cfg={"chunk_size":chunk_size,"chunk_overlap":chunk_overlap,"retriever_k":4,"embedding_model":"text-embedding-004","generation_model":"gemini-1.5-flash"}
with open("rag_gemini_ft_config.json","w") as f: json.dump(cfg,f,indent=2)
print("Saved rag_gemini_ft_config.json")

Saved rag_gemini_ft_config.json


## 5) Embed with Gemini (text-embedding-004)

In [9]:
import os, json, numpy as np, google.generativeai as genai

# Set API key first
os.environ["GEMINI_API_KEY"] = "0000000000000000000000000000"


api_key=os.getenv("GEMINI_API_KEY")
if not api_key: raise RuntimeError("Please set GEMINI_API_KEY in your environment.")
genai.configure(api_key=api_key)

EMBED_MODEL="text-embedding-004"

def embed_texts(texts):
    MAX_LEN=8000
    texts=[t[:MAX_LEN] for t in texts]
    resp=genai.embed_content(model=EMBED_MODEL, content=texts)
    if isinstance(resp, dict) and "embedding" in resp:
        return np.array(resp["embedding"], dtype="float32")[None,:]
    vals=[np.array(e["values"], dtype="float32") for e in resp.get("embeddings",[])]
    return np.stack(vals, axis=0)

texts=[c["content"] for c in chunks]
if not texts: raise RuntimeError("No chunks to embed. Add docs in data/uploads/.")
emb_matrix=embed_texts(texts)
emb_matrix_unit=emb_matrix/(np.linalg.norm(emb_matrix,axis=1,keepdims=True)+1e-12)
print("Embeddings shape:", emb_matrix_unit.shape)

np.save("chunk_vectors.npy", emb_matrix_unit)
with open("chunk_meta.json","w") as f: json.dump(chunks,f,indent=2)
print("Saved chunk_vectors.npy and chunk_meta.json")

Embeddings shape: (1, 2177, 768)
Saved chunk_vectors.npy and chunk_meta.json


## 6) Vector Index & Retriever (FAISS or NumPy cosine)

In [12]:
!pip install faiss-cpu --no-cache-dir --upgrade




In [15]:
# -------------------------------------------------
# Step 0: Install FAISS (run once in Colab/Jupyter)
# -------------------------------------------------
# In Colab:
# !pip install faiss-cpu --no-cache-dir --upgrade
#
# In Conda (local Jupyter):
# conda install -c conda-forge faiss-cpu -y

import numpy as np, json

# -----------------------------
# Try FAISS setup
# -----------------------------
use_faiss = False
index = None
try:
    import faiss
    dim = emb_matrix_unit.shape[1]
    index = faiss.IndexFlatIP(dim)  # inner product = cosine sim for normalized vecs
    index.add(emb_matrix_unit.astype("float32"))
    use_faiss = True
    print("✅ FAISS index built with", index.ntotal, "vectors.")
except Exception as e:
    print("⚠️ FAISS unavailable, falling back to NumPy. Reason:", e)
    use_faiss = False

# -----------------------------
# Load metadata
# -----------------------------
with open("chunk_meta.json") as f:
    chunk_meta = json.load(f)

# -----------------------------
# Retrieval function
# -----------------------------
# def retrieve(query: str, k: int = 4):
#     q_emb = embed_texts([query])

#     # Always force shape (d,)
#     if q_emb.ndim > 1:
#         q_emb = q_emb.squeeze(0)
#     q_emb = q_emb / (np.linalg.norm(q_emb) + 1e-12)

#     if use_faiss:
#         q2d = q_emb[None, :].astype("float32")
#         D, I = index.search(q2d, k)
#         idxs, sims = I[0].tolist(), D[0].tolist()
#     else:
#         sims = (emb_matrix_unit @ q_emb).ravel().tolist()
#         idxs = sorted(range(len(sims)), key=lambda i: sims[i], reverse=True)[:k]
#         sims = [sims[i] for i in idxs]

#     results = []
#     for rank, (i, s) in enumerate(zip(idxs, sims), start=1):
#         results.append({
#             "rank": rank,
#             "score": float(s),
#             "source": chunk_meta[i]["source"],
#             "content": chunk_meta[i]["content"]
#         })
#     return results
def retrieve(query: str, k: int = 4):
    q_emb = embed_texts([query])

    # Ensure q_emb is always shape (d,)
    q_emb = np.array(q_emb).squeeze()
    if q_emb.ndim != 1:
        raise ValueError(f"Query embedding has wrong shape: {q_emb.shape}")

    # Normalize
    q_emb = q_emb / (np.linalg.norm(q_emb) + 1e-12)

    if use_faiss:
        q2d = q_emb[None, :].astype("float32")   # shape (1, d)
        D, I = index.search(q2d, k)
        idxs = I[0].tolist()
        sims = D[0].tolist()
    else:
        sims = (emb_matrix_unit @ q_emb).ravel().tolist()  # flatten to 1D
        idxs = sorted(range(len(sims)), key=lambda i: sims[i], reverse=True)[:k]
        sims = [float(sims[i]) for i in idxs]  # force float conversion

    results = []
    for rank, (i, s) in enumerate(zip(idxs, sims), start=1):
        results.append({
            "rank": rank,
            "score": float(s),  # guaranteed scalar
            "source": chunk_meta[i]["source"],
            "content": chunk_meta[i]["content"]
        })
    return results

# -----------------------------
# Helper: Build context string
# -----------------------------
def build_context(results):
    return "\n\n".join(f"[Source: {r['source']}]\n{r['content']}" for r in results)

# -----------------------------
# 🔍 Quick test
# -----------------------------
hits = retrieve("Summarize key datasets and models.", k=4)
for h in hits:
    print(f"[{h['rank']}] {h['score']:.3f} :: {h['source']} :: {h['content'][:100]}...")


⚠️ FAISS unavailable, falling back to NumPy. Reason: too many values to unpack (expected 2)
[1] 0.322 :: data/uploads/annotated-Project%20Title (1) (1).pdf :: 
drills,
 
response
 
strategies).
 
 
The
 
goal
 
is
 
to
 
make
 
disaster
 
preparedness
 
engag...
[2] 0.322 :: data/uploads/annotated-Project%20Title (1).pdf :: 
drills,
 
response
 
strategies).
 
 
The
 
goal
 
is
 
to
 
make
 
disaster
 
preparedness
 
engag...
[3] 0.320 :: data/uploads/annotated-Project%20Title (1) (1).pdf :: cts
 
 
https://github.com/firelab/windninja
 
https://github.com/huggingface/dif fusers
 
https://g...
[4] 0.320 :: data/uploads/annotated-Project%20Title (1).pdf :: cts
 
 
https://github.com/firelab/windninja
 
https://github.com/huggingface/dif fusers
 
https://g...


## 7) Generation with Gemini 1.5 Flash (grounded by retrieved context)

In [16]:
import google.generativeai as genai

GEN_MODEL="gemini-1.5-flash"
generator=genai.GenerativeModel(GEN_MODEL)

SYS=("You answer ONLY using the provided context. "
     "If the answer is not clearly supported, say you don't know.")

def answer_question(question:str, k:int=4, max_ctx_chars:int=8000):
    ctx_hits=retrieve(question, k=k)
    context_blob="\n\n".join(f"[Source: {h['source']}]\n{h['content']}" for h in ctx_hits)[:max_ctx_chars]
    prompt=f"""{SYS}

Context:
{context_blob}

Question: {question}

Answer:
"""
    resp=generator.generate_content(prompt)
    return resp.text, ctx_hits

for q in ["What problems does this project aim to solve?",
          "Which datasets are proposed?",
          "What methods or metrics are mentioned?"]:
    print("="*80); a,_=answer_question(q, k=4); print("Q:", q, "\nA:", a)

Q: What problems does this project aim to solve? 
A: Florida faces recurring natural disasters such as hurricanes, floods, and wildfires. Traditional educational materials are limited in interactivity and fail to capture the scale and urgency of such events.

Q: Which datasets are proposed? 
A: The NOAA Hurricane Database (https://www.nhc.noaa.gov/data/) and FEMA Disaster Records (https://www.fema.gov/about/reports-and-data/openfema) are the proposed datasets.

Q: What methods or metrics are mentioned? 
A: Advanced Dvorak Technique (ADT), Advanced Microwave Sounding Unit (AMSU), NASA Global Precipitation Mission (GPM), European Space Agency’s Advanced Scatterometer (ASCAT), and Defense Meteorological Satellite Program (DMSP) satellites are mentioned.



## 8) Fine-Tuning Options (A: Simulated QA Memory, B: LoRA/PEFT optional)

### 8A) Simulated Fine-Tuning (QA Memory + RAG)

In [17]:
import json

seed_questions=[
    "Give a concise project summary (objectives, data sources, methods).",
    "List the key datasets and why they are relevant.",
    "Explain the main modeling approach and any fine-tuning strategy."
]

qa_bank=[]
for q in seed_questions:
    try:
        a,_=answer_question(q, k=4)
    except Exception as e:
        a=f"[Error: {e}]"
    qa_bank.append({"question": q, "answer": a})

with open("qa_bank.jsonl","w",encoding="utf-8") as f:
    for row in qa_bank: f.write(json.dumps(row, ensure_ascii=False)+"\n")
print("Saved qa_bank.jsonl with", len(qa_bank), "pairs.")

def make_memory(qa_pairs, max_chars=4000):
    blob="\n\n".join([f"Q: {x['question']}\nA: {x['answer']}" for x in qa_pairs])
    return blob[:max_chars]
memory_blob=make_memory(qa_bank)

def answer_with_memory(question:str, k:int=4):
    ctx_hits=retrieve(question, k=k)
    context_blob="\n\n".join(f"[Source: {h['source']}]\n{h['content']}" for h in ctx_hits)[:8000]
    prompt=f"""You are an expert assistant for this specific project domain.
Use BOTH the QA memory and the retrieved context to answer.
Prefer facts in context; if unclear, rely on QA memory.

QA Memory:
{memory_blob}

Context:
{context_blob}

Question: {question}

Answer:
"""
    resp=generator.generate_content(prompt)
    return resp.text, ctx_hits

print("=== Base RAG vs 'Tuned' (QA memory) ===")
for q in ["Summarize the project goals and expected outcomes.",
          "What risks or limitations are mentioned and mitigations?"]:
    a_base,_=answer_question(q,k=4)
    a_tuned,_=answer_with_memory(q,k=4)
    print("\nQ:", q, "\n--- Base ---\n", a_base[:600], "\n--- Tuned ---\n", a_tuned[:600])

Saved qa_bank.jsonl with 3 pairs.
=== Base RAG vs 'Tuned' (QA memory) ===

Q: Summarize the project goals and expected outcomes. 
--- Base ---
 The project aims to build a Generative AI-powered 3D video generation system producing subject-oriented disaster education modules.  These videos will explain scientific causes (meteorology, geography), show social impacts (evacuation, relief operations), and teach civic preparedness (safety drills, response strategies).  The goal is to make disaster preparedness engaging.
 
--- Tuned ---
 The project aims to build a Generative AI-powered 3D video generation system for multi-subject disaster education, specifically focusing on Florida's hurricanes, floods, and wildfires.  The generated videos will explain the scientific causes (meteorology, geography), show social impacts (evacuation, relief operations), and teach civic preparedness (safety drills, response strategies).  The overall goal is to create engaging and interactive educational modules

### 8B) LoRA/PEFT on a Small Open-Source Model (Optional; GPU recommended)

In [22]:
# Optional: LoRA fine-tune TinyLlama on the QA pairs (skip if you only want Gemini).
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, TaskType
import torch, json

base_model="TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tok=AutoTokenizer.from_pretrained(base_model, use_fast=True)
model=AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=torch.float16 if torch.cuda.is_available() else None, device_map="auto")

qa=[json.loads(l) for l in open("qa_bank.jsonl","r",encoding="utf-8")]
def to_text(x): return f"Question: {x['question']}\nAnswer: {x['answer']}"
ds=Dataset.from_list([{"text": to_text(x)} for x in qa])

def tokenize(batch): return tok(batch["text"], truncation=True, padding="max_length", max_length=512)
tokenized=ds.map(tokenize, batched=True, remove_columns=["text"])

lora=LoraConfig(task_type=TaskType.CAUSAL_LM, r=8, lora_alpha=16, lora_dropout=0.05, target_modules=["q_proj","v_proj"])
model=get_peft_model(model, lora)

args=TrainingArguments(output_dir="lora_out", num_train_epochs=1, per_device_train_batch_size=2, gradient_accumulation_steps=4,
                       learning_rate=2e-4, logging_steps=5, save_steps=100, fp16=torch.cuda.is_available())
collator=DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)

trainer=Trainer(model=model, args=args, train_dataset=tokenized, data_collator=collator, tokenizer=tok)
trainer.train()
model.save_pretrained("lora_out"); tok.save_pretrained("lora_out")
print("Saved LoRA adapter to lora_out")

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

  trainer=Trainer(model=model, args=args, train_dataset=tokenized, data_collator=collator, tokenizer=tok)
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33magd9c[0m ([33mdhtynt-neer[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


Saved LoRA adapter to lora_out


## 9) Save Reproducibility Config → rag_gemini_ft_config.json

In [23]:
import json
try:
    cfg=json.load(open("rag_gemini_ft_config.json"))
except Exception: cfg={}
cfg.update({
    "embedding_model":"text-embedding-004",
    "generation_model":"gemini-1.5-flash",
    "retriever_k":4,
    "chunk_settings":[{"size":500,"overlap":100}],
    "fine_tuning":{"A_simulated_memory": True, "B_lora_peft":"optional"}
})
with open("rag_gemini_ft_config.json","w") as f: json.dump(cfg,f,indent=2)
print(json.dumps(cfg, indent=2))

{
  "chunk_size": 500,
  "chunk_overlap": 100,
  "retriever_k": 4,
  "embedding_model": "text-embedding-004",
  "generation_model": "gemini-1.5-flash",
  "chunk_settings": [
    {
      "size": 500,
      "overlap": 100
    }
  ],
  "fine_tuning": {
    "A_simulated_memory": true,
    "B_lora_peft": "optional"
  }
}
