In [1]:
"""
'Cell A: Environment Probe'
Purpose:
- Record the Python version, platform, CUDA_VISIBLE_DEVICES, GPU name/driver/memory (via nvidia-smi), and key package versions.
- Save a merged record into configs/env_rag_graph.json for reproducibility.

Notes:
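- The probe itself uses only standard libs, but recording package versions imports each listed package (including sentence_transformers and transformers), so this cell is not import-free.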
"""

import json, os, sys, subprocess, shutil, platform
from pathlib import Path

# 'Paths and files' - adjust only if your project layout changes
ROOT = Path("/home/manny-buff/projects/capstone/week6-rag-graph")
CFG  = ROOT / "configs" / "env_rag_graph.json"

def cmd_out(args):
    """Execute ``args`` as a subprocess and hand back its combined output.

    stderr is folded into stdout and surrounding whitespace is trimmed.
    Any failure (missing executable, non-zero exit status, ...) is reported
    as the string ``"ERROR: <exception>"`` rather than being raised.
    """
    try:
        completed = subprocess.run(
            args,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            check=True,
        )
        return completed.stdout.strip()
    except Exception as exc:
        return f"ERROR: {exc}"

# 'Collect environment info'
# Interpreter-related fields are read from the running kernel itself
# (sys.prefix / sys.executable / "<kernel python> -m pip") instead of a
# hard-coded venv path and a login shell: a login shell may resolve a
# different `python`/`pip` than the one executing this notebook, which would
# make the reproducibility record describe the wrong environment.
_pip_freeze = cmd_out([sys.executable, "-m", "pip", "freeze"])

info = {
    # Root of the environment the kernel is running in.
    "python_venv": sys.prefix,
    "python_version": sys.version.split()[0],
    "platform": platform.platform(),
    "cuda_visible_devices": os.environ.get("CUDA_VISIBLE_DEVICES", None),
    # cmd_out returns an "ERROR: ..." string when nvidia-smi is unavailable.
    "nvidia_smi": cmd_out(["bash", "-lc", "nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader"]),
    # Absolute path of the interpreter executing this cell.
    "which_python": sys.executable,
    # First 20 lines of the freeze output (same cap as `head -n 20`).
    "pip_freeze_head": "\n".join(_pip_freeze.splitlines()[:20]),
}

# 'Key packages versions' - import each package once and note its __version__
def _package_version(name):
    """Return the ``__version__`` of package ``name``.

    Yields "unknown" when the module has no ``__version__`` attribute and
    "not importable: <exception>" when importing (or inspecting) it fails.
    """
    try:
        return getattr(__import__(name), "__version__", "unknown")
    except Exception as err:
        return f"not importable: {err}"


versions = {
    name: _package_version(name)
    for name in ["numpy", "pandas", "networkx", "sentence_transformers", "transformers", "accelerate", "faiss"]
}

info["packages"] = versions

# 'Merge with existing json'
# Keys already present in the file are kept unless this probe overwrites them.
CFG.parent.mkdir(parents=True, exist_ok=True)
existing = {}
if CFG.exists():
    try:
        loaded = json.loads(CFG.read_text(encoding="utf-8"))
    except Exception as e:
        # Best effort: an unreadable/corrupt file is replaced, but say so
        # instead of discarding it silently.
        print(f"Warning: could not parse existing {CFG} ({e}); starting from an empty record.")
        loaded = {}
    if isinstance(loaded, dict):
        existing = loaded
    else:
        # A top-level list/string/number has no .update(); merging into it
        # would raise AttributeError, so start from an empty record instead.
        print(f"Warning: existing {CFG} is not a JSON object; starting from an empty record.")

existing.update(info)
CFG.write_text(json.dumps(existing, indent=2), encoding="utf-8")

print("Environment probe written to:", CFG)
print(json.dumps(info, indent=2))


  from tqdm.autonotebook import tqdm, trange


Environment probe written to: /home/manny-buff/projects/capstone/week6-rag-graph/configs/env_rag_graph.json
{
  "python_venv": "/home/manny-buff/venvs/core-rag",
  "python_version": "3.11.9",
  "platform": "Linux-6.14.0-33-generic-x86_64-with-glibc2.39",
  "cuda_visible_devices": null,
  "nvidia_smi": "NVIDIA GeForce RTX 4080, 580.65.06, 16376 MiB",
  "which_python": "/home/manny-buff/venvs/core-rag/bin/python",
  "pip_freeze_head": "accelerate==1.10.1\nacres==0.5.0\naiofiles==24.1.0\naiohappyeyeballs==2.6.1\naiohttp==3.12.15\naiosignal==1.4.0\naiosqlite==0.21.0\nannotated-types==0.7.0\nanyio==4.10.0\nargon2-cffi==25.1.0\nargon2-cffi-bindings==25.1.0\narrow==1.3.0\nasttokens==3.0.0\nasync-lru==2.0.5\nattrs==25.3.0\nav==15.1.0\nbabel==2.17.0\nbackoff==2.2.1\nbanks==2.2.0\nbcrypt==4.3.0",
  "packages": {
    "numpy": "2.2.1",
    "pandas": "2.2.3",
    "networkx": "3.3",
    "sentence_transformers": "3.0.1",
    "transformers": "4.56.2",
    "accelerate": "1.10.1",
    "faiss": "1.10.0"


In [2]:
"""
'Cell B: Sanity Probe'
Purpose:
- Verify core imports.
- Run a tiny e5 embedding call to confirm encoder works.
- Check that local Qwen path exists (skip heavy model load for now).
"""

from pathlib import Path

# 'Load run config'
# The run config is a JSON file; this cell reads the "embed_model" and
# "llm_local_path" keys from it (a missing key raises KeyError below).
import json
CFG_RUN = Path("/home/manny-buff/projects/capstone/week6-rag-graph/configs/rag_graph_run_config.json")
run_cfg = json.loads(CFG_RUN.read_text())

# 'Imports check'
# Importing is the check itself: a missing package fails the cell right here.
# pandas is imported only for that purpose and is not used further in this cell.
import numpy as np
import pandas as pd
import networkx as nx
from sentence_transformers import SentenceTransformer

# 'Embed a sample query with e5-small-v2'
# The model id actually comes from run_cfg["embed_model"]; one short sentence
# is encoded and only the resulting array's shape and dtype are printed.
embed_model_id = run_cfg["embed_model"]
model = SentenceTransformer(embed_model_id)
vec = model.encode(["hello graph-rag world"], convert_to_numpy=True)
print("Embedding shape:", vec.shape, "dtype:", vec.dtype)

# 'Confirm local Qwen path exists'
# Only the filesystem path is tested; no model weights are loaded.
qwen_local = Path(run_cfg["llm_local_path"])
print("Qwen local path:", qwen_local, "exists:", qwen_local.exists())

# 'Lightweight graph sanity'
# Builds a 3-node, 2-edge weighted toy graph to confirm networkx works.
G = nx.Graph()
G.add_edge("doc_A", "doc_B", weight=0.9)
G.add_edge("doc_B", "doc_C", weight=0.7)
print("Graph nodes/edges:", G.number_of_nodes(), G.number_of_edges())


Embedding shape: (1, 384) dtype: float32
Qwen local path: /home/manny-buff/projects/capstone/hw-rag/models/Qwen2-VL-2B-Instruct exists: True
Graph nodes/edges: 3 2


In [5]:
"""
'Cell C: Config + Helpers (extended)'
- Loads config.
- Discovers multiple filetypes.
- Extracts text from txt/md/text/pdf/json/csv/html/htm.
- Adds a tqdm fallback and silences the common tqdm warning.
"""

import os, json, re, math, pickle, warnings
from pathlib import Path
from typing import List, Dict, Any
import pandas as pd

# Silence noisy tqdm warnings if present
# (`message` is a regex matched against the start of the warning text, hence
# the leading ".*" to catch "tqdm" anywhere in the message).
warnings.filterwarnings("ignore", message=".*tqdm.*")

# --- Load run config ---
# All settings below come from this JSON file; the names defined here are
# module-level state that the following cells read.
RUN_CFG_PATH = Path("/home/manny-buff/projects/capstone/week6-rag-graph/configs/rag_graph_run_config.json")
cfg = json.loads(RUN_CFG_PATH.read_text())

# Required keys: a missing one raises KeyError at the corresponding line.
CORPUS_ROOT   = Path(cfg["corpus_root"])
VDB_DIR       = Path(cfg["vector_db_dir"])
EMBED_ID      = cfg["embed_model"]
LLM_MODEL_ID  = cfg["llm_model_id"]
LLM_LOCAL     = Path(cfg["llm_local_path"])
# Optional keys with defaults; the two numeric ones are coerced to int.
DEVICE        = cfg.get("device", "cuda")
RETRIEVER_K   = int(cfg.get("retriever_k", 5))
HOP_LIMIT     = int(cfg.get("hop_limit", 2))

# Make sure the artifact directory exists before any cell writes into it.
VDB_DIR.mkdir(parents=True, exist_ok=True)

# --- Optional deps used if available ---
# Each import below is best-effort: on any failure a stand-in is bound to the
# same name so that later code can test for it instead of crashing.
try:
    from tqdm import tqdm
except Exception:
    # Stand-in that accepts tqdm's keyword arguments (e.g. desc=...) and
    # simply returns the iterable unchanged, i.e. no progress bar.
    def tqdm(x, **kw):  # no-op fallback
        return x

# PDF
# PdfReader is None when pypdf cannot be imported; load_text_pdf checks this.
try:
    from pypdf import PdfReader
except Exception:
    PdfReader = None

# HTML
# BeautifulSoup is None when bs4 cannot be imported; load_text_html checks this.
try:
    from bs4 import BeautifulSoup
except Exception:
    BeautifulSoup = None

# --- File discovery ---
EXTS = {".txt", ".md", ".text", ".pdf", ".json", ".csv", ".html", ".htm"}

def find_files(root: Path) -> List[Path]:
    """Recursively collect the files under ``root`` that have a known suffix.

    A path qualifies when it is a regular file and its lower-cased suffix is
    a member of ``EXTS``. The result is returned in sorted path order.
    """
    return sorted(
        candidate
        for candidate in root.rglob("*")
        if candidate.is_file() and candidate.suffix.lower() in EXTS
    )

# --- Loaders by type ---
def load_text_plain(fp: Path) -> str:
    """Read ``fp`` as text, dropping bytes that cannot be decoded.

    The first attempt decodes as UTF-8; if that raises, the read is retried
    once with the platform default encoding. An error from the retry
    propagates to the caller.
    """
    try:
        text = fp.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        text = fp.read_text(errors="ignore")
    return text

def load_text_pdf(fp: Path) -> str:
    """Extract the text of every page of the PDF at ``fp``.

    Page texts are joined with newlines; a page yielding no text contributes
    an empty string. Returns "" when ``PdfReader`` is unavailable (None) or
    when opening/extracting raises.
    """
    if PdfReader is None:
        return ""
    try:
        pdf_pages = PdfReader(str(fp)).pages
        page_texts = [(page.extract_text() or "") for page in pdf_pages]
        return "\n".join(page_texts)
    except Exception:
        return ""

def load_text_json(fp: Path) -> str:
    """Flatten the scalar leaves of the JSON document at ``fp`` into one string.

    Dict values and list items are visited recursively and joined with single
    spaces; str/int/float/bool leaves are rendered with ``str()`` and any
    other leaf (e.g. null) becomes "". Returns "" if the file cannot be read
    or parsed.
    """
    def flatten(node) -> str:
        if isinstance(node, dict):
            children = node.values()
        elif isinstance(node, list):
            children = node
        elif isinstance(node, (str, int, float, bool)):
            return str(node)
        else:
            return ""
        return " ".join(flatten(child) for child in children)

    try:
        raw = fp.read_text(encoding="utf-8", errors="ignore")
        return flatten(json.loads(raw))
    except Exception:
        return ""

def load_text_csv(fp: Path) -> str:
    """Render the cell values of a delimited file as one space-joined string.

    At most the first 10000 rows are read. ``pd.read_csv`` is tried first and
    ``pd.read_table`` second; if both fail the result is "". Values are
    emitted row by row; column headers are not part of the output.
    """
    def cells_as_text(frame) -> str:
        flat_cells = frame.astype(str).values.ravel().tolist()
        return " ".join(map(str, flat_cells))

    for reader in (pd.read_csv, pd.read_table):
        try:
            return cells_as_text(reader(fp, nrows=10000))  # cap large files
        except Exception:
            continue
    return ""

def load_text_html(fp: Path) -> str:
    """Return the visible text of the HTML file at ``fp``.

    Text nodes are stripped and joined with single spaces. The "lxml" parser
    is preferred; if building the soup with it fails (for example because
    lxml is not installed) the standard-library "html.parser" is used
    instead, so HTML files are not silently reduced to "" just because an
    optional parser is missing.

    Returns "" when ``BeautifulSoup`` is unavailable (None), when the file
    cannot be read, or when no parser succeeds.
    """
    if BeautifulSoup is None:
        return ""
    try:
        html = fp.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        return ""
    for parser in ("lxml", "html.parser"):
        try:
            soup = BeautifulSoup(html, parser)
            return soup.get_text(" ", strip=True)
        except Exception:
            # Try the next parser; best-effort extraction must not raise.
            continue
    return ""

# Dispatch table: lower-cased file suffix -> function(Path) -> str.
# Its keys are the same suffixes as EXTS; callers look a suffix up with
# LOADERS.get(suffix, load_text_plain), so unknown suffixes are read as plain text.
LOADERS = {
    ".txt":  load_text_plain,
    ".md":   load_text_plain,
    ".text": load_text_plain,
    ".pdf":  load_text_pdf,
    ".json": load_text_json,
    ".csv":  load_text_csv,
    ".html": load_text_html,
    ".htm":  load_text_html,
}

def normalize_ws(s: str) -> str:
    """Collapse every run of whitespace in ``s`` to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", s)
    return collapsed.strip()

def chunk_text(s: str, max_tokens: int = 180, overlap: int = 30) -> List[str]:
    """Split ``s`` into overlapping windows of whitespace-delimited words.

    Args:
        s: Text to split. "Tokens" here are the items of ``s.split()``,
            not model tokens.
        max_tokens: Maximum number of words per chunk; must be >= 1.
        overlap: Number of trailing words of one chunk repeated at the start
            of the next; must be smaller than ``max_tokens``.

    Returns:
        The chunks in order, each a single-space-joined run of words. The
        last chunk ends at the final word and may be shorter than
        ``max_tokens``. Empty or whitespace-only input yields ``[]``.

    Raises:
        ValueError: If ``max_tokens < 1`` or ``overlap >= max_tokens``.
            With such values the window start would never advance, so the
            loop below would not terminate on longer texts.
    """
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be >= 1, got {max_tokens}")
    if overlap >= max_tokens:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than max_tokens ({max_tokens})"
        )

    toks = s.split()
    chunks = []
    i = 0
    while i < len(toks):
        j = min(i + max_tokens, len(toks))
        # toks[i:j] is non-empty here and split() tokens carry no whitespace,
        # so the joined chunk is never blank.
        chunks.append(" ".join(toks[i:j]))
        if j == len(toks):
            break
        # Step back by `overlap` words; validation above guarantees progress.
        i = j - overlap
    return chunks

# --- Artifact I/O ---
# Every persisted artifact is a file directly inside VDB_DIR.
ART_META   = VDB_DIR / "chunks_meta.parquet"
ART_CHUNKS = VDB_DIR / "chunks_text.pkl"
ART_FAISS  = VDB_DIR / "faiss.index"
ART_GRAPH  = VDB_DIR / "graph.pkl"

import pickle

def save_chunks_text(chunks: List[str]):
    """Pickle ``chunks`` to ART_CHUNKS, replacing any previous file."""
    with ART_CHUNKS.open("wb") as sink:
        pickle.dump(chunks, sink)

def load_chunks_text() -> List[str]:
    """Unpickle and return the object stored in ART_CHUNKS."""
    # NOTE: unpickling can execute arbitrary code; only load files this
    # notebook wrote itself.
    with ART_CHUNKS.open("rb") as source:
        return pickle.load(source)

# Echo the resolved settings so the cell output records which corpus,
# artifact directory and file extensions this run used.
print("Config loaded.")
print("CORPUS_ROOT =", CORPUS_ROOT)
print("Vector DB path =", VDB_DIR)
print("Extensions searched:", sorted(EXTS))


Config loaded.
CORPUS_ROOT = /home/manny-buff/projects/capstone/hw-rag/data
Vector DB path = /home/manny-buff/projects/capstone/week6-rag-graph/artifacts/vdb
Extensions searched: ['.csv', '.htm', '.html', '.json', '.md', '.pdf', '.text', '.txt']


In [6]:
"""
'Cell D (guarded): Build Embeddings + FAISS index'
- Reads/discovers/loads multiple file types.
- Chunks and embeds with e5-small-v2.
- Builds FAISS IP index (cosine on normalized vectors).
- Guards against empty corpus (prints message and returns early).
"""

from sentence_transformers import SentenceTransformer
import numpy as np, time, faiss

# Discover corpus files, then build two parallel lists: `records` (one metadata
# dict per chunk) and `chunks_text` (the chunk strings, in the same order).
files = find_files(CORPUS_ROOT)
records = []
chunks_text = []

for doc_id, fp in enumerate(tqdm(files, desc="Loading+Chunking")):
    # Unknown suffixes fall back to the plain-text loader.
    loader = LOADERS.get(fp.suffix.lower(), load_text_plain)
    raw = loader(fp)
    text = normalize_ws(raw)
    if not text:
        # Files that yield no text are skipped; since doc_id is the file's
        # position in `files`, skipped files leave gaps in the doc_id sequence.
        continue
    parts = chunk_text(text, max_tokens=180, overlap=30)
    for k, ch in enumerate(parts):
        records.append({
            "doc_id": doc_id,
            # Global chunk number == this chunk's position in chunks_text
            # (and therefore its row in the embedding matrix built below).
            "chunk_id": len(chunks_text),
            "path": str(fp),
            # Position of the chunk within its own document.
            "chunk_idx": k
        })
        chunks_text.append(ch)

import pandas as pd
meta_df = pd.DataFrame(records)
# "Docs" counts discovered files, including any that produced no chunks.
print(f"Docs: {len(files)} | Chunks: {len(chunks_text)}")

# Guard: no chunks → stop gracefully
# (stops the cell by raising SystemExit rather than returning).
if len(chunks_text) == 0:
    print("No chunks found. Please confirm corpus file types and that loaders extracted text.")
    # Tip for debugging: run the shell probe to see extensions/counts.
    raise SystemExit

# Embed
# Vectors are L2-normalized by encode(), so the inner-product index below
# ranks by cosine similarity.
# NOTE(review): chunks are embedded without a "passage: " prefix; if EMBED_ID
# is an e5 model, its model card asks for "query: "/"passage: " prefixes —
# confirm whether that applies to this setup.
model = SentenceTransformer(EMBED_ID)
t0 = time.time()
emb = model.encode(
    chunks_text,
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True
)
print("Embeddings:", emb.shape, "elapsed_sec:", round(time.time()-t0, 2))

# Build FAISS
# Row i of `emb` is added as vector id i, matching chunk_id i.
dim = emb.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(emb.astype(np.float32))

# Save artifacts
# `emb` itself is not saved; it only stays available as a kernel variable.
meta_df.to_parquet(ART_META, index=False)
save_chunks_text(chunks_text)
faiss.write_index(index, str(ART_FAISS))

print("Saved:")
print(" -", ART_META)
print(" -", ART_CHUNKS)
print(" -", ART_FAISS)


Loading+Chunking:  71%|███████████████████████████████▊             | 12/17 [00:59<00:12,  2.58s/it]EOF marker not found
Loading+Chunking: 100%|█████████████████████████████████████████████| 17/17 [01:01<00:00,  3.61s/it]


Docs: 17 | Chunks: 4381


Batches:   0%|          | 0/137 [00:00<?, ?it/s]

Embeddings: (4381, 384) elapsed_sec: 4.4
Saved:
 - /home/manny-buff/projects/capstone/week6-rag-graph/artifacts/vdb/chunks_meta.parquet
 - /home/manny-buff/projects/capstone/week6-rag-graph/artifacts/vdb/chunks_text.pkl
 - /home/manny-buff/projects/capstone/week6-rag-graph/artifacts/vdb/faiss.index


In [7]:
"""
'Cell E: Build Similarity Graph'
Purpose:
- Create a lightweight graph of chunk relationships using top-N cosine neighbors.
- Collapses edges to doc-level (optional) or keeps chunk-level. We'll keep chunk-level for precision.
- Save graph.pkl for later Multi-Hop traversal.
"""

import numpy as np, networkx as nx, faiss, math

# Params for graph density
TOP_NEIGHBORS = max(10, RETRIEVER_K * 3)  # small multiple of retriever_k

# Load index and chunks
index = faiss.read_index(str(ART_FAISS))
chunks = load_chunks_text()
meta  = pd.read_parquet(ART_META)

if index.ntotal == 0:
    raise RuntimeError(f"FAISS index at {ART_FAISS} is empty; rebuild it before building the graph.")

# Recover the stored vectors from the persisted index instead of relying on an
# `emb` array left in kernel memory by the embedding cell. That makes this cell
# runnable on a fresh kernel from the saved artifacts alone. The index written
# by the build cell is a flat index, which stores the vectors verbatim and
# supports reconstruction.
vectors = np.ascontiguousarray(index.reconstruct_n(0, index.ntotal), dtype=np.float32)

# Query each vector against index to get neighbors (excluding self)
# (+1 because a vector's own id normally appears among its neighbors).
D, I = index.search(vectors, TOP_NEIGHBORS + 1)

G = nx.Graph()
for row_idx, nbrs in enumerate(I):
    src = int(row_idx)
    for rank, nb in enumerate(nbrs):
        # Plain ints keep node keys uniform (no mix of int and np.int64) and
        # keep numpy scalars out of the pickled graph.
        nb = int(nb)
        # -1 is FAISS padding for "no result"; self-matches are not edges.
        if nb == -1 or nb == src:
            continue
        w = float(D[row_idx, rank])
        # Only positive similarities become edges.
        if w <= 0:
            continue
        # Add undirected edge with weight=max(existing,w)
        if G.has_edge(src, nb):
            if w > G[src][nb].get("weight", 0.0):
                G[src][nb]["weight"] = w
        else:
            G.add_edge(src, nb, weight=w)

# Persist graph
with open(ART_GRAPH, "wb") as f:
    pickle.dump(G, f)

print("Graph built.")
print("Nodes:", G.number_of_nodes(), "Edges:", G.number_of_edges())


Graph built.
Nodes: 4381 Edges: 46574


In [8]:
"""
'Cell F: Validate'
Purpose:
- Issue a sample query to FAISS, print top-k chunk previews
- Show 1-step neighbors in the graph for the top hit (sanity for Multi-Hop)
"""

import textwrap, faiss, numpy as np
from sentence_transformers import SentenceTransformer

# Reload every artifact from disk so this cell validates what was persisted.
index = faiss.read_index(str(ART_FAISS))
meta  = pd.read_parquet(ART_META)
chunks = load_chunks_text()
enc   = SentenceTransformer(EMBED_ID)

# Encode the query the same way the chunks were encoded (normalized vectors),
# so inner-product scores are cosine similarities.
query = "Briefly summarize the core topic of this corpus."
qv = enc.encode([query], convert_to_numpy=True, normalize_embeddings=True)
D, I = index.search(qv.astype(np.float32), RETRIEVER_K)

# FAISS pads the result with id -1 when fewer than k vectors are available.
# Drop those: chunks[-1] would silently show the last chunk and the metadata
# lookup would fail.
hits = [(int(cid), float(score)) for cid, score in zip(I[0], D[0]) if cid != -1]

print("Top-k retrieved:")
for rank, (cid, score) in enumerate(hits):
    doc = meta.loc[meta["chunk_id"]==cid].iloc[0]
    preview = textwrap.shorten(chunks[cid], width=140, placeholder=" …")
    print(f"[{rank+1}] score={score:.3f} | {doc['path']} | chunk#{doc['chunk_idx']} :: {preview}")

# Graph neighbor preview for top hit
with open(ART_GRAPH, "rb") as f:
    G = pickle.load(f)

if not hits:
    print("\nNo hits returned by the index; skipping graph neighbor preview.")
else:
    top_chunk = hits[0][0]
    print("\nGraph neighbors (first 10) of top chunk:", top_chunk)
    # A chunk that received no edges is not a node of G; report no neighbors
    # instead of letting G.neighbors() raise. Cast to int for a clean printout
    # even if the pickled graph holds numpy integer node keys.
    nbrs = [int(n) for n in list(G.neighbors(top_chunk))[:10]] if top_chunk in G else []
    print("Neighbors:", nbrs)


Top-k retrieved:
[1] score=0.822 | /home/manny-buff/projects/capstone/hw-rag/data/1001 do-it-yourself hints & tips  tricks.pdf | chunk#1033 :: For inspecting a car's fi:ont and rear lights, p. 29, "Night moves" > Minor tUe for mounting on the ceil- ing of a closet to see what is …
[2] score=0.815 | /home/manny-buff/projects/capstone/hw-rag/data/1001 do-it-yourself hints & tips  tricks.pdf | chunk#1008 :: for a child's dresser, p. 21, "Playful pulls" Aluminum foil >■ Wrapped around pillows, to keep cats off the sofa, p. 41, "Stay off the …
[3] score=0.815 | /home/manny-buff/projects/capstone/hw-rag/data/1001 do-it-yourself hints & tips  tricks.pdf | chunk#1030 :: pour before completion, p. 258, "Easy pour" >■ To prevent a toilet bowl from "sweating" m humid weather, p. 173, "Bathroom condensation" …
[4] score=0.815 | /home/manny-buff/projects/capstone/hw-rag/data/the-complete-idiots-guide-to-simple-home-repair.pdf | chunk#26 :: motion. Warmest thanks are also due to Lynn Northrup, Jan L

In [None]:
# Week 6 — Part 1: Graph-RAG Build (Summary)

'''
**Corpus**: 17 PDFs under `/home/manny-buff/projects/capstone/hw-rag/data/`  
**Embedder**: `intfloat/e5-small-v2` (normalized vectors; cosine via FAISS IP)  
**Index**: FAISS saved to `artifacts/vdb/faiss.index`  
**Graph**: chunk-level similarity graph (NetworkX) using top-N neighbors; saved to `artifacts/vdb/graph.pkl`  
**Chunks**: 4,381 for this run (printed by Cell D)  
**Graph Size**: 4,381 nodes, 46,574 edges for this run (printed by Cell E)  

### Pipeline Steps
1. **Load & Extract** text (txt/md/text/pdf/json/csv/html) with simple loaders (PDF via pypdf).
2. **Chunk** into windows of 180 whitespace-delimited words with a 30-word overlap.
3. **Embed** chunks with e5-small-v2; normalize embeddings.
4. **Index** with FAISS (Inner Product) → cosine on normalized vectors.
5. **Graph**: for each chunk, connect to top neighbors with weight = similarity.

### Validations
- Retrieval preview (top-k) shows relevant chunks with file paths.
- Graph neighbors displayed for top hit.

**Artifacts** are deterministic given the corpus and config in `configs/rag_graph_run_config.json`.
'''
