In [1]:
# Step 1 · Cell 0 — System & Environment Probe (Week 11: Home Repair, Video Gen + Graph RAG)
# - Verifies expected folders and local model presence
# - Summarizes Python & key package versions (no installs)
# - Checks CUDA/GPU, ffmpeg, and disk space
# - Saves a JSON snapshot to configs/week11_sys_probe.json

import sys, os, json, shutil, subprocess
from pathlib import Path
from datetime import datetime

# ---------- Paths (edit here if your layout differs) ----------
W11 = Path("/home/manny-buff/projects/capstone/week11-hw")
MODELS_RO = Path("/home/manny-buff/projects/capstone/hw-rag/models")
DATA_RO   = Path("/home/manny-buff/projects/capstone/hw-rag/data")

EXPECTED_DIRS = [
    W11,
    W11 / "notebooks",
    W11 / "src",
    W11 / "data",
    W11 / "data" / "external",
    W11 / "artifacts",
    W11 / "visual_outputs",
    W11 / "audio_inputs",
    W11 / "audio_outputs",
    W11 / "configs",
    W11 / "reports",
    W11 / "source_mats",  # you mentioned docs live here
    MODELS_RO,
    DATA_RO,
]

MODEL_HINTS = [
    "Qwen2.5-VL-2B-Instruct",
    "Intern3_5-VL-4B-Instruct",
]

SNAPSHOT = W11 / "configs" / "week11_sys_probe.json"
SNAPSHOT.parent.mkdir(parents=True, exist_ok=True)

# ---------- Helpers ----------
def which(cmd):
    try:
        return shutil.which(cmd)
    except Exception:
        return None

def run_cmd(args):
    try:
        return subprocess.check_output(args, stderr=subprocess.STDOUT).decode("utf-8", "ignore").strip()
    except Exception as e:
        return f"ERROR: {e}"

def try_import_version(mod_name):
    try:
        m = __import__(mod_name)
        return getattr(m, "__version__", "installed")
    except Exception as e:
        return f"not_importable ({e.__class__.__name__}: {e})"

def list_model_candidates(root: Path, hints):
    results = {}
    if not root.exists():
        return results
    for h in hints:
        hits = []
        for p in root.rglob("*"):
            name = p.name.lower()
            if h.lower() in name and p.is_file():
                hits.append(str(p))
        results[h] = sorted(hits)
    return results

def disk_usage(path: Path):
    try:
        usage = shutil.disk_usage(str(path))
        return {"total_gb": round(usage.total/1e9, 2),
                "used_gb": round(usage.used/1e9, 2),
                "free_gb": round(usage.free/1e9, 2)}
    except Exception as e:
        return {"error": str(e)}

# ---------- Folder checks ----------
dirs_status = {str(p): (p.exists(), "dir" if p.is_dir() else ("file" if p.is_file() else "missing")) for p in EXPECTED_DIRS}

# ---------- Model hints ----------
model_files = list_model_candidates(MODELS_RO, MODEL_HINTS)

# ---------- Environment checks ----------
env = {
    "timestamp": datetime.now().isoformat(),
    "python": sys.version.replace("\n", " "),
    "paths": {
        "W11": str(W11),
        "MODELS_RO": str(MODELS_RO),
        "DATA_RO": str(DATA_RO),
    },
}

# Core packages
pkg_list = [
    "torch", "torchvision", "torchaudio",
    "transformers", "accelerate", "bitsandbytes",
    "opencv", "cv2", "imageio", "imageio_ffmpeg", "PIL",  # PIL = Pillow
    "tqdm", "pandas", "numpy",
    # graph / vector db scaffolds
    "networkx", "faiss", "faiss_cpu", "chromadb",
]

versions = {}
for name in pkg_list:
    # normalize import names (opencv-python -> cv2; Pillow -> PIL)
    alias = name
    if name == "opencv": alias = "cv2"
    if name == "PIL": alias = "PIL"
    try:
        mod = __import__(alias)
        versions[name] = getattr(mod, "__version__", "installed")
    except Exception as e:
        versions[name] = f"not_importable ({e.__class__.__name__}: {e})"

# CUDA / GPU
cuda = {
    "available": False,
    "device_count": 0,
    "device_name_0": None,
    "capability_0": None,
    "nvidia_smi": which("nvidia-smi"),
}
try:
    import torch
    cuda["available"] = bool(torch.cuda.is_available())
    cuda["device_count"] = torch.cuda.device_count()
    if cuda["available"] and cuda["device_count"] > 0:
        cuda["device_name_0"] = torch.cuda.get_device_name(0)
        try:
            cap = torch.cuda.get_device_capability(0)
            cuda["capability_0"] = f"{cap[0]}.{cap[1]}"
        except Exception:
            cuda["capability_0"] = "unknown"
except Exception as e:
    cuda["torch_error"] = f"{e.__class__.__name__}: {e}"

# ffmpeg check
ffmpeg = {
    "path": which("ffmpeg"),
    "version_line": None
}
if ffmpeg["path"]:
    v = run_cmd(["ffmpeg", "-version"])
    ffmpeg["version_line"] = v.splitlines()[0] if isinstance(v, str) and v else v

# Disk
disk = {
    "week11_hw": disk_usage(W11),
    "data_ro": disk_usage(DATA_RO),
    "models_ro": disk_usage(MODELS_RO),
}

# ---------- Snapshot ----------
snapshot = {
    "env": env,
    "dirs_status": dirs_status,
    "models_present": model_files,
    "packages": versions,
    "cuda": cuda,
    "ffmpeg": ffmpeg,
    "disk": disk,
}

SNAPSHOT.write_text(json.dumps(snapshot, indent=2))
print("✅ Wrote probe snapshot ->", SNAPSHOT)

# ---------- Pretty print summary ----------
def hr(): print("-"*72)

print("\nWeek 11 Probe Summary")
hr()
print("Root:", env["paths"]["W11"])
print("Models (RO):", env["paths"]["MODELS_RO"])
print("Data   (RO):", env["paths"]["DATA_RO"])
hr()
missing = [p for p,(exists, kind) in dirs_status.items() if not exists]
print(f"Dirs OK: {len(dirs_status)-len(missing)}/{len(dirs_status)}")
if missing:
    print("Missing:")
    for m in missing: print("  -", m)
else:
    print("All expected dirs present.")
hr()
print("Model candidates (by hint):")
for h, files in model_files.items():
    print(f"  {h}: {len(files)} files")
    for f in files[:5]:
        print("    -", f)
    if len(files) > 5: print("    ...")
hr()
print("Key packages:")
for k in ["torch","torchvision","torchaudio","transformers","accelerate","bitsandbytes","cv2","imageio","imageio_ffmpeg","PIL","pandas","numpy","networkx","faiss","faiss_cpu","chromadb"]:
    print(f"  {k:14s} -> {versions.get(k)}")
hr()
print("CUDA/GPU:")
for k,v in cuda.items(): print(f"  {k:14s}: {v}")
hr()
print("ffmpeg:")
for k,v in ffmpeg.items(): print(f"  {k:14s}: {v}")
hr()
print("Disk (GB):")
for k,v in disk.items(): print(f"  {k:14s}: {v}")
hr()

# Return a compact dict for quick glance in Jupyter
{
    "dirs_ok": len(missing)==0,
    "have_qwen": bool(model_files.get("Qwen2.5-VL-2B-Instruct")),
    "have_intern": bool(model_files.get("Intern3_5-VL-4B-Instruct")),
    "cuda": cuda.get("available"),
    "ffmpeg": bool(ffmpeg["path"]),
}


✅ Wrote probe snapshot -> /home/manny-buff/projects/capstone/week11-hw/configs/week11_sys_probe.json

Week 11 Probe Summary
------------------------------------------------------------------------
Root: /home/manny-buff/projects/capstone/week11-hw
Models (RO): /home/manny-buff/projects/capstone/hw-rag/models
Data   (RO): /home/manny-buff/projects/capstone/hw-rag/data
------------------------------------------------------------------------
Dirs OK: 14/14
All expected dirs present.
------------------------------------------------------------------------
Model candidates (by hint):
  Qwen2.5-VL-2B-Instruct: 0 files
  Intern3_5-VL-4B-Instruct: 0 files
------------------------------------------------------------------------
Key packages:
  torch          -> 2.9.0+cu128
  torchvision    -> 0.24.0+cu128
  torchaudio     -> not_importable (OSError: Could not load this library: /home/manny-buff/venvs/core-rag/lib/python3.11/site-packages/torchaudio/lib/libtorchaudio.so)
  transformers   -> 4.45

{'dirs_ok': True,
 'have_qwen': False,
 'have_intern': False,
 'cuda': True,
 'ffmpeg': True}

In [2]:
# Step 1 · Cell 1 — Graph-RAG Bootstrap (robust PDF ingest + simple graph)
# Safe to re-run; idempotent by default.
# Outputs under: artifacts/graph/{raw_text,graph,index,logs}

import os, re, json, shutil, subprocess
from pathlib import Path
from datetime import datetime
import traceback

import networkx as nx  # present per probe

# -------------------- CONFIG --------------------
W11 = Path("/home/manny-buff/projects/capstone/week11-hw")
DATA_RO = Path("/home/manny-buff/projects/capstone/hw-rag/data")  # PDFs live here
GART = W11 / "artifacts" / "graph"
RAW_TEXT = GART / "raw_text"
GRAPH_DIR = GART / "graph"
INDEX_DIR = GART / "index"
LOGS_DIR = GART / "logs"
LOG_FILE = LOGS_DIR / "ingest.log"

# Source selection (adjust as needed)
FILE_GLOBS = ["**/*.pdf"]        # scan all PDFs under DATA_RO
MAX_PDFS = 20                    # cap to avoid huge runs on first pass; set None to disable
MAX_PAGES_PER_DOC = 80           # cap per problematic doc; tweak later

# Fault tolerance
MIN_CHARS_PER_PAGE = 50          # pages below this are considered "empty" and trigger fallback
USE_DELETE_PREVIOUS = True       # full refresh of text outputs/graph when re-running

# -------------------- UTILS ---------------------
def ensure_dirs():
    for p in [RAW_TEXT, GRAPH_DIR, INDEX_DIR, LOGS_DIR]:
        p.mkdir(parents=True, exist_ok=True)

def reset_outputs():
    if USE_DELETE_PREVIOUS:
        for p in [RAW_TEXT, GRAPH_DIR, INDEX_DIR]:
            if p.exists():
                shutil.rmtree(p)
        ensure_dirs()

def log(msg):
    LOGS_DIR.mkdir(parents=True, exist_ok=True)
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(f"[{datetime.now().isoformat()}] {msg}\n")

def which(cmd):
    try:
        from shutil import which as _which
        return _which(cmd)
    except Exception:
        return None

def run_cmd(args):
    try:
        out = subprocess.check_output(args, stderr=subprocess.STDOUT).decode("utf-8", "ignore")
        return out
    except Exception as e:
        return f"ERROR: {e}"

def import_optional(*names):
    mods = {}
    for n in names:
        try:
            mods[n] = __import__(n)
        except Exception:
            mods[n] = None
    return mods

mods = import_optional("pypdf", "PyPDF2", "pdfminer", "pdfminer.high_level")

# -------------------- PDF EXTRACTION ---------------------
def extract_with_pypdf(pdf_path: Path):
    text_pages = []
    try:
        if mods["pypdf"]:
            reader = mods["pypdf"].PdfReader(str(pdf_path))
            for i, page in enumerate(reader.pages):
                try:
                    t = page.extract_text() or ""
                except Exception:
                    t = ""
                text_pages.append(t)
        elif mods["PyPDF2"]:
            reader = mods["PyPDF2"].PdfReader(str(pdf_path))
            for i, page in enumerate(reader.pages):
                try:
                    t = page.extract_text() or ""
                except Exception:
                    t = ""
                text_pages.append(t)
        else:
            return None
        return text_pages
    except Exception as e:
        log(f"pypdf/PyPDF2 failed for {pdf_path.name}: {e}")
        return None

def extract_with_pdfminer(pdf_path: Path):
    try:
        if mods["pdfminer"] and mods["pdfminer.high_level"]:
            from pdfminer.high_level import extract_pages
            from pdfminer.layout import LTTextContainer
            pages = []
            for pageno, page_layout in enumerate(extract_pages(str(pdf_path))):
                chunks = []
                for element in page_layout:
                    if isinstance(element, LTTextContainer):
                        chunks.append(element.get_text())
                pages.append("\n".join(chunks))
            return pages
        return None
    except Exception as e:
        log(f"pdfminer failed for {pdf_path.name}: {e}")
        return None

def extract_with_pdftotext(pdf_path: Path):
    if not which("pdftotext"):
        return None
    try:
        # Write to temp .txt; then split per form-feed if present
        tmp_txt = RAW_TEXT / f"__tmp_{pdf_path.stem}.txt"
        if tmp_txt.exists():
            tmp_txt.unlink()
        cmd = ["pdftotext", "-layout", str(pdf_path), str(tmp_txt)]
        run = run_cmd(cmd)
        if isinstance(run, str) and run.startswith("ERROR"):
            log(f"pdftotext error for {pdf_path.name}: {run}")
            return None
        txt = tmp_txt.read_text(encoding="utf-8", errors="ignore")
        # heuristic split: many pdftotext builds do not insert form-feeds, so fallback to dense split by page markers later
        pages = txt.split("\f")
        if len(pages) == 1:  # fallback: chunk by ~8000 chars to simulate pages
            sz = 8000
            pages = [txt[i:i+sz] for i in range(0, len(txt), sz)]
        tmp_txt.unlink(missing_ok=True)
        return pages
    except Exception as e:
        log(f"pdftotext exception for {pdf_path.name}: {e}")
        return None

def extract_with_ocr(pdf_path: Path):
    # Optional OCR using pdftoppm + tesseract if available
    pdftoppm = which("pdftoppm")
    tesseract = which("tesseract")
    if not (pdftoppm and tesseract):
        return None
    try:
        ocr_dir = RAW_TEXT / f"__ocr_{pdf_path.stem}"
        if ocr_dir.exists():
            shutil.rmtree(ocr_dir)
        ocr_dir.mkdir(parents=True, exist_ok=True)
        # Convert to PNG at moderate DPI to keep runtime sane
        # NOTE: limit pages using -f / -l if MAX_PAGES_PER_DOC set
        args = [pdftoppm, "-png", "-r", "200", str(pdf_path), str(ocr_dir / "page")]
        run = run_cmd(args)
        if isinstance(run, str) and run.startswith("ERROR"):
            log(f"pdftoppm error for {pdf_path.name}: {run}")
            return None
        pages = []
        pngs = sorted(ocr_dir.glob("page-*.png"))
        if MAX_PAGES_PER_DOC:
            pngs = pngs[:MAX_PAGES_PER_DOC]
        for i, img in enumerate(pngs):
            out_txt = ocr_dir / f"page-{i:04d}.txt"
            cmd = [tesseract, str(img), str(out_txt.with_suffix("")), "--psm", "6"]
            _ = run_cmd(cmd)
            if out_txt.with_suffix(".txt").exists():
                pages.append(out_txt.with_suffix(".txt").read_text(encoding="utf-8", errors="ignore"))
        # Clean up images to save space
        for p in pngs:
            p.unlink(missing_ok=True)
        return pages if pages else None
    except Exception as e:
        log(f"OCR exception for {pdf_path.name}: {e}")
        return None

def robust_extract(pdf_path: Path):
    # 1) pypdf/PyPDF2
    pages = extract_with_pypdf(pdf_path)
    # Check quality
    def is_sparse(pages):
        if not pages: return True
        nonempty = [p for p in pages if p and len(p.strip()) >= MIN_CHARS_PER_PAGE]
        return len(nonempty) == 0
    if pages is None or is_sparse(pages):
        # 2) pdfminer.six
        pm = extract_with_pdfminer(pdf_path)
        if pm and not is_sparse(pm):
            pages = pm
        else:
            # 3) pdftotext
            p2t = extract_with_pdftotext(pdf_path)
            if p2t and not is_sparse(p2t):
                pages = p2t
            else:
                # 4) OCR
                ocr = extract_with_ocr(pdf_path)
                if ocr and not is_sparse(ocr):
                    pages = ocr
                else:
                    pages = pages if pages else []

    # Truncate if too long
    if MAX_PAGES_PER_DOC and len(pages) > MAX_PAGES_PER_DOC:
        pages = pages[:MAX_PAGES_PER_DOC]
    return pages

# -------------------- ENTITY & GRAPH ---------------------
# Simple domain-ish term harvesting (no external NLP deps)
TOOL_PAT = re.compile(r"\b(hammer|screwdriver|drill|level|saw|wrench|pliers|sander|trowel|chisel|stud\s?finder|tape\s?measure|shop\s?vac|utility\s?knife)\b", re.I)
MATERIAL_PAT = re.compile(r"\b(drywall|joint\s?compound|spackle|plywood|stud|2x4|adhesive|primer|paint|sealant|caulk|screw|nail|bolt|washer|bracket|shim)\b", re.I)
MEASURE_PAT = re.compile(r"\b(\d+(\.\d+)?\s?(mm|cm|m|in(?:ch(?:es)?)?|ft|feet|yd|g|kg|lb|lbs|oz))\b", re.I)

def extract_entities(text):
    ents = set()
    for m in TOOL_PAT.finditer(text): ents.add(("tool", m.group(0).lower()))
    for m in MATERIAL_PAT.finditer(text): ents.add(("material", m.group(0).lower()))
    for m in MEASURE_PAT.finditer(text): ents.add(("measure", m.group(0).lower()))
    # Heuristic extra: capture Title-case multi-words (likely components/proper nouns)
    for m in re.finditer(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,2})\b", text):
        if len(m.group(0)) >= 4:
            ents.add(("term", m.group(0)))
    return ents

def build_graph(doc_records):
    G = nx.Graph()
    # Add doc + page nodes; connect mentions; co-occurrence edges among entities per page
    for rec in doc_records:
        doc_id = f"doc::{rec['doc_name']}"
        if doc_id not in G: G.add_node(doc_id, kind="document", path=rec["doc_path"])
        for page in rec["pages"]:
            page_id = f"{doc_id}::p{page['page_index']}"
            if page_id not in G: G.add_node(page_id, kind="page", text_len=len(page['text']))
            G.add_edge(doc_id, page_id, kind="has_page")
            # entities
            ent_nodes = []
            for etype, eval_ in page["entities"]:
                ent_id = f"ent::{etype}::{eval_}"
                if ent_id not in G: G.add_node(ent_id, kind="entity", etype=etype, value=eval_)
                G.add_edge(page_id, ent_id, kind="mentions")
                ent_nodes.append(ent_id)
            # co-occurrence among entities on the same page
            for i in range(len(ent_nodes)):
                for j in range(i+1, len(ent_nodes)):
                    a, b = sorted([ent_nodes[i], ent_nodes[j]])
                    if G.has_edge(a, b):
                        G[a][b]["weight"] = G[a][b].get("weight", 1) + 1
                    else:
                        G.add_edge(a, b, kind="cooccur", weight=1)
    return G

# -------------------- MAIN ---------------------
reset_outputs()
ensure_dirs()

pdfs = []
for glob in FILE_GLOBS:
    pdfs.extend(sorted(DATA_RO.glob(glob)))
if MAX_PDFS:
    pdfs = pdfs[:MAX_PDFS]

doc_records = []
for pdf in pdfs:
    try:
        log(f"Extracting: {pdf}")
        pages = robust_extract(pdf)
        # write raw pages
        out_dir = RAW_TEXT / pdf.stem
        out_dir.mkdir(parents=True, exist_ok=True)
        kept = 0
        for i, t in enumerate(pages):
            if t and len(t.strip()) >= MIN_CHARS_PER_PAGE:
                (out_dir / f"page_{i:04d}.txt").write_text(t, encoding="utf-8", errors="ignore")
                kept += 1
        rec = {
            "doc_name": pdf.name,
            "doc_path": str(pdf),
            "pages": []
        }
        for i, t in enumerate(pages):
            if not t or len(t.strip()) < MIN_CHARS_PER_PAGE:
                continue
            ents = list(extract_entities(t))
            rec["pages"].append({
                "page_index": i,
                "text": t[:3000],  # keep snippet for provenance
                "entities": ents
            })
        doc_records.append(rec)
        log(f"Done: {pdf.name} pages_kept={kept} ents_total={sum(len(p['entities']) for p in rec['pages'])}")
    except Exception as e:
        log(f"FAIL: {pdf} -> {e}\n{traceback.format_exc()}")

# Build & save graph
G = build_graph(doc_records)

graph_json = GRAPH_DIR / "graph.json"
graph_gml  = GRAPH_DIR / "graph.gml"
GRAPH_DIR.mkdir(parents=True, exist_ok=True)
# JSON export
with open(graph_json, "w", encoding="utf-8") as f:
    nodes = [{"id": n, **G.nodes[n]} for n in G.nodes]
    edges = [{"u": u, "v": v, **G[u][v]} for u, v in G.edges]
    json.dump({"nodes": nodes, "edges": edges}, f, indent=2)
# GML export (useful for Gephi, yEd)
nx.write_gml(G, graph_gml)

# Small manifest for quick reference
manifest = {
    "created_at": datetime.now().isoformat(),
    "data_root": str(DATA_RO),
    "pdf_count_scanned": len(pdfs),
    "docs_indexed": len(doc_records),
    "nodes": G.number_of_nodes(),
    "edges": G.number_of_edges(),
    "outputs": {
        "raw_text_dir": str(RAW_TEXT),
        "graph_json": str(graph_json),
        "graph_gml": str(graph_gml),
        "log": str(LOG_FILE),
    }
}
(GRAPH_DIR / "manifest.json").write_text(json.dumps(manifest, indent=2))

# Console summary
print("✅ Graph-RAG bootstrap complete")
print(f"PDFs scanned     : {manifest['pdf_count_scanned']}")
print(f"Docs indexed     : {manifest['docs_indexed']}")
print(f"Graph nodes/edges: {manifest['nodes']}/{manifest['edges']}")
print("Outputs:")
for k,v in manifest["outputs"].items():
    print(f"  - {k}: {v}")

# Return a tiny dict for your notebook sidebar
{"pdfs_scanned": manifest["pdf_count_scanned"], "nodes": manifest["nodes"], "edges": manifest["edges"]}


EOF marker not found


✅ Graph-RAG bootstrap complete
PDFs scanned     : 17
Docs indexed     : 17
Graph nodes/edges: 4795/151113
Outputs:
  - raw_text_dir: /home/manny-buff/projects/capstone/week11-hw/artifacts/graph/raw_text
  - graph_json: /home/manny-buff/projects/capstone/week11-hw/artifacts/graph/graph/graph.json
  - graph_gml: /home/manny-buff/projects/capstone/week11-hw/artifacts/graph/graph/graph.gml
  - log: /home/manny-buff/projects/capstone/week11-hw/artifacts/graph/logs/ingest.log


{'pdfs_scanned': 17, 'nodes': 4795, 'edges': 151113}

In [3]:
# Step 1 · Cell 2 — Graph-RAG Query Layer (page TF-IDF + graph entities)
# - No new installs; pure-Python TF-IDF.
# - Reads: artifacts/graph/graph/graph.json and artifacts/graph/raw_text/<stem>/page_*.txt
# - Writes: artifacts/graph/index/tfidf_index.json
# - Provides: search_pages(query, top_k=5) -> list of dicts {doc, page, score, snippet, entities, path}

import json, math, re
from pathlib import Path
from collections import Counter, defaultdict
from datetime import datetime

# ----------------- CONFIG -----------------
W11 = Path("/home/manny-buff/projects/capstone/week11-hw")
GRAPH_JSON = W11 / "artifacts" / "graph" / "graph" / "graph.json"
RAW_TEXT   = W11 / "artifacts" / "graph" / "raw_text"
INDEX_DIR  = W11 / "artifacts" / "graph" / "index"
INDEX_DIR.mkdir(parents=True, exist_ok=True)
TFIDF_INDEX = INDEX_DIR / "tfidf_index.json"

# Optional quick demo (set a string to run a test search at the bottom)
TEST_QUERY = None  # e.g., "seal a drywall seam with joint compound"

# ----------------- HELPERS -----------------
WORD = re.compile(r"[a-zA-Z0-9]+(?:'[a-z0-9]+)?")
STOP = set("""
a an and are as at be by for from has have in is it its of on or that the to with your you we he she they them their our
""".split())

def tokenize(text: str):
    return [w.lower() for w in WORD.findall(text) if w.lower() not in STOP and len(w) > 1]

def read_text_safe(path: Path, max_chars=20000):
    try:
        t = path.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        t = ""
    return t[:max_chars]

def parse_page_id(page_node_id: str):
    # format from bootstrap: "doc::<DOCNAME>::p<index>"
    try:
        _, docname, p = page_node_id.split("::")
        if p.startswith("p"):
            page_idx = int(p[1:])
        else:
            page_idx = int(p)
        stem = docname.rsplit(".", 1)[0]
        return docname, stem, page_idx
    except Exception:
        return None, None, None

def snippet_with_hits(text, hits, width=240):
    if not text:
        return ""
    # show beginning by default; bold known hits (case-insensitive)
    s = text[:width]
    for h in sorted(set(hits), key=len, reverse=True):
        s = re.sub(rf"(?i)\b({re.escape(h)})\b", r"**\1**", s)
    return s

# ----------------- LOAD GRAPH -----------------
if not GRAPH_JSON.exists():
    raise FileNotFoundError(f"Missing graph json at {GRAPH_JSON}")

g = json.loads(GRAPH_JSON.read_text(encoding="utf-8"))
nodes = {n["id"]: n for n in g["nodes"]}
adj = defaultdict(list)
for e in g["edges"]:
    u, v = e["u"], e["v"]
    adj[u].append((v, e))
    adj[v].append((u, e))

# Collect page nodes and resolve their raw-text file paths
pages = []  # list of dicts with keys: id, doc, stem, page_idx, path, entities
for nid, nd in nodes.items():
    if nd.get("kind") != "page":
        continue
    doc, stem, pidx = parse_page_id(nid)
    if doc is None:
        continue
    # reconstruct expected text path
    text_path = RAW_TEXT / stem / f"page_{pidx:04d}.txt"
    # gather entity neighbors from graph
    ents = []
    for nbr, eprops in adj[nid]:
        nbn = nodes.get(nbr, {})
        if nbn.get("kind") == "entity":
            ents.append({"etype": nbn.get("etype"), "value": nbn.get("value")})
    pages.append({"id": nid, "doc": doc, "stem": stem, "page_idx": pidx, "path": text_path, "entities": ents})

# Filter pages to those that actually have text files
pages = [p for p in pages if p["path"].exists()]

# ----------------- BUILD / LOAD TF-IDF -----------------
def build_index(pages):
    docs_tokens = []
    df = Counter()
    for p in pages:
        t = read_text_safe(p["path"])
        toks = tokenize(t)
        docs_tokens.append(toks)
        for term in set(toks):
            df[term] += 1

    N = len(docs_tokens)
    idf = {term: math.log((1 + N) / (1 + dfv)) + 1.0 for term, dfv in df.items()}  # smoothed
    doc_tfs = []
    norms = []
    for toks in docs_tokens:
        tf = Counter(toks)
        # L2 norm of tf-idf vector
        sq = 0.0
        for term, c in tf.items():
            w = (c / max(1, len(toks))) * idf.get(term, 0.0)
            sq += w * w
        norms.append(math.sqrt(sq) or 1.0)
        doc_tfs.append(tf)

    # Persist compactly
    index = {
        "built_at": datetime.now().isoformat(),
        "N": N,
        "idf": idf,                 # {term: idf}
        "norms": norms,             # [float]
        "docs": [                   # align with pages[]
            {"doc": pages[i]["doc"], "stem": pages[i]["stem"], "page_idx": pages[i]["page_idx"]}
            for i in range(N)
        ],
    }
    # To keep size reasonable, store only the top 200 terms by tf for each doc
    topk = 200
    index["tf_top"] = []
    for i, tf in enumerate(doc_tfs):
        index["tf_top"].append(dict(tf.most_common(topk)))
    return index

def load_index():
    if TFIDF_INDEX.exists():
        try:
            return json.loads(TFIDF_INDEX.read_text(encoding="utf-8"))
        except Exception:
            pass
    idx = build_index(pages)
    TFIDF_INDEX.write_text(json.dumps(idx), encoding="utf-8")
    return idx

index = load_index()

# ----------------- SEARCH -----------------
def search_pages(query: str, top_k: int = 5):
    q_toks = tokenize(query)
    if not q_toks:
        return []
    # Query tf-idf
    q_tf = Counter(q_toks)
    q_vec = {}
    for term, c in q_tf.items():
        w = (c / max(1, len(q_toks))) * index["idf"].get(term, 0.0)
        if w > 0: q_vec[term] = w
    q_norm = math.sqrt(sum(v*v for v in q_vec.values())) or 1.0

    scores = []
    N = index["N"]
    for i in range(N):
        tf_top = index["tf_top"][i]
        # cosine similarity over intersection of query terms ∩ doc terms (top terms only)
        dot = 0.0
        for term, qw in q_vec.items():
            if term in tf_top:
                dw = (tf_top[term] / 200.0) * index["idf"].get(term, 0.0)  # normalize by top cutoff
                dot += qw * dw
        denom = (q_norm * (index["norms"][i] or 1.0))
        s = dot / denom if denom else 0.0
        if s > 0:
            scores.append((s, i))
    scores.sort(reverse=True)
    hits = scores[:top_k]

    results = []
    for score, i in hits:
        meta = index["docs"][i]
        # find the corresponding page entry
        # (find first match; our lists are aligned by build_index)
        pg = next((p for p in pages if p["stem"] == meta["stem"] and p["page_idx"] == meta["page_idx"]), None)
        if not pg:
            continue
        txt = read_text_safe(pg["path"], max_chars=1200)
        results.append({
            "score": round(float(score), 6),
            "doc": pg["doc"],
            "page": pg["page_idx"],
            "snippet": snippet_with_hits(txt, q_toks, width=260),
            "entities": pg["entities"][:12],  # first dozen entity mentions
            "path": str(pg["path"])
        })
    return results

# -------------- OPTIONAL QUICK DEMO --------------
if TEST_QUERY:
    print(f"\nQuery: {TEST_QUERY!r}")
    for r in search_pages(TEST_QUERY, top_k=5):
        print(f"- [{r['score']:.3f}] {r['doc']} p{r['page']} :: {r['path']}")
        print(f"  ents: {[ (e['etype'], e['value']) for e in r['entities'][:6] ]}")
        print(f"  {r['snippet']}\n")

print(f"✅ Query layer ready. Built/loaded index with {index['N']} pages. Index file -> {TFIDF_INDEX}")


✅ Query layer ready. Built/loaded index with 696 pages. Index file -> /home/manny-buff/projects/capstone/week11-hw/artifacts/graph/index/tfidf_index.json


In [4]:
# Step 1 · Cell 3 — Context packer for Qwen / Intern (Home Repair query)
# Uses search_pages() from Cell 2 to retrieve graph-aware page snippets.
# Saves a JSON + TXT pack and prints a preview.

import json, os
from pathlib import Path
from datetime import datetime

# ---------- CONFIG ----------
W11 = Path("/home/manny-buff/projects/capstone/week11-hw")
QUERY_DIR = W11 / "artifacts" / "graph" / "queries"
QUERY_DIR.mkdir(parents=True, exist_ok=True)

USER_QUERY = "How do I fix a faucet leak?"
TOP_K = 5

# ---------- Retrieve ----------
try:
    results = search_pages(USER_QUERY, top_k=TOP_K)
except NameError as e:
    raise RuntimeError("search_pages() not found. Run Step 1 · Cell 2 first.") from e

# ---------- Build context block ----------
def format_result(i, r):
    ents = ", ".join(sorted({f"{e['etype']}:{e['value']}" for e in r["entities"]}))[:160]
    snippet = r["snippet"].replace("\n", " ").strip()
    return (
        f"[{i}] doc={r['doc']} page={r['page']} score={r['score']}\n"
        f"    ents: {ents}\n"
        f"    path: {r['path']}\n"
        f"    text: {snippet}\n"
    )

context_lines = []
for i, r in enumerate(results, 1):
    context_lines.append(format_result(i, r))

context_block = (
    "You are a precise home-repair assistant. Use only the provided snippets as primary evidence.\n"
    "Cite the snippet index like [1], [2] when relevant, and give step-by-step, tool- and material-aware instructions.\n\n"
    "=== SNIPPETS ===\n" + "\n".join(context_lines)
)

# ---------- Qwen-style messages (text-only for now) ----------
messages = [
    {
        "role": "system",
        "content": (
            "You are a helpful home-repair assistant. "
            "Be concise, safe, and practical. Include tool lists, safety notes, and sequencing. "
            "When uncertain, state assumptions and alternatives. Cite snippet IDs like [1], [2]."
        )
    },
    {
        "role": "user",
        "content": (
            f"Question: {USER_QUERY}\n\n"
            f"{context_block}\n\n"
            "Answer with a short checklist of steps and a brief explanation."
        )
    }
]

# ---------- Save pack ----------
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
stem = f"{ts}_faucet_leak"

pack = {
    "created_at": ts,
    "query": USER_QUERY,
    "top_k": TOP_K,
    "results": results,          # full provenance
    "context_block": context_block,
    "messages": messages,
}

json_path = QUERY_DIR / f"{stem}.json"
txt_path  = QUERY_DIR / f"{stem}_context.txt"

json_path.write_text(json.dumps(pack, indent=2), encoding="utf-8")
txt_path.write_text(context_block, encoding="utf-8")

# ---------- Preview ----------
print("✅ Context pack created")
print("Query:", USER_QUERY)
print(f"Results: {len(results)} pages")
print("Saved:")
print("  -", json_path)
print("  -", txt_path)
print("\nPreview (first ~20 lines):")
print("\n".join(context_block.splitlines()[:20]))

# Compact dict for notebook display
{
    "saved_json": str(json_path),
    "saved_txt": str(txt_path),
    "hits": len(results)
}


✅ Context pack created
Query: How do I fix a faucet leak?
Results: 5 pages
Saved:
  - /home/manny-buff/projects/capstone/week11-hw/artifacts/graph/queries/20251103_112405_faucet_leak.json
  - /home/manny-buff/projects/capstone/week11-hw/artifacts/graph/queries/20251103_112405_faucet_leak_context.txt

Preview (first ~20 lines):
You are a precise home-repair assistant. Use only the provided snippets as primary evidence.
Cite the snippet index like [1], [2] when relevant, and give step-by-step, tool- and material-aware instructions.

=== SNIPPETS ===
[1] doc=1001 do-it-yourself hints & tips  tricks.pdf page=13 score=0.304864
    ents: term:Also, term:Fold, term:Gradually, term:High, term:Never, term:Pmpomt, term:Preventing, term:Shut, term:Tack, term:This, term:Turn, term:When
    path: /home/manny-buff/projects/capstone/week11-hw/artifacts/graph/raw_text/1001 do-it-yourself hints & tips  tricks/page_0013.txt
    text: SAFE  AND  SNART  If  you  have  a  flood,  mini-  mize the  damage  w

{'saved_json': '/home/manny-buff/projects/capstone/week11-hw/artifacts/graph/queries/20251103_112405_faucet_leak.json',
 'saved_txt': '/home/manny-buff/projects/capstone/week11-hw/artifacts/graph/queries/20251103_112405_faucet_leak_context.txt',
 'hits': 5}

In [5]:
# Step 1 · Cell X — Graph-RAG v2: PDF images + image embeddings + new graph
# Safe to re-run; creates artifacts/graph_v2/* without touching v1.
# Requires: networkx, PIL, torch (preferred), and optionally open_clip or torchvision for embeddings.
# External tools (auto-detected): pdfimages (preferred), or pdftoppm fallback.

import os, re, json, math, shutil, subprocess, traceback
from pathlib import Path
from datetime import datetime

import networkx as nx
from PIL import Image

# ---------- Paths ----------
W11 = Path("/home/manny-buff/projects/capstone/week11-hw")
V1_GRAPH_JSON = W11 / "artifacts" / "graph" / "graph" / "graph.json"

V2_ROOT = W11 / "artifacts" / "graph_v2"
V2_IMG  = V2_ROOT / "images"
V2_EMB  = V2_ROOT / "embeddings"
V2_LOGS = V2_ROOT / "logs"
V2_GRAPH_DIR = V2_ROOT / "graph"
V2_GRAPH_JSON = V2_GRAPH_DIR / "graph.json"
V2_GRAPH_GML  = V2_GRAPH_DIR / "graph.gml"
V2_MANIFEST   = V2_ROOT / "manifest.json"
V2_LOGS.mkdir(parents=True, exist_ok=True)

# ---------- Config ----------
MAX_PDFS = None          # None = all docs from v1. Set small int to throttle.
MAX_IMAGES_PER_DOC = 40  # hard cap per document
MIN_W, MIN_H = 64, 64    # skip tiny images
EMB_FLOATS = 6           # decimals in json fallback
USE_FRESH = True         # wipe V2 images/embeddings/graph on re-run

# ---------- Utils ----------
def log(msg):
    with open(V2_LOGS / "build_v2.log", "a", encoding="utf-8") as f:
        f.write(f"[{datetime.now().isoformat()}] {msg}\n")

def which(cmd):
    try:
        from shutil import which as _w
        return _w(cmd)
    except Exception:
        return None

def run_cmd(args):
    try:
        return subprocess.check_output(args, stderr=subprocess.STDOUT).decode("utf-8","ignore")
    except Exception as e:
        return f"ERROR: {e}"

def parse_page_id(page_node_id: str):
    # v1 used: "doc::<DOCNAME>::p<index>"
    try:
        _, docname, p = page_node_id.split("::")
        pidx = int(p[1:]) if p.startswith("p") else int(p)
        stem = docname.rsplit(".",1)[0]
        return docname, stem, pidx
    except Exception:
        return None, None, None

# ---------- Prep ----------
if USE_FRESH:
    for p in [V2_IMG, V2_EMB, V2_GRAPH_DIR]:
        if p.exists(): shutil.rmtree(p)
for p in [V2_IMG, V2_EMB, V2_GRAPH_DIR]:
    p.mkdir(parents=True, exist_ok=True)

if not V1_GRAPH_JSON.exists():
    raise FileNotFoundError(f"v1 graph not found at {V1_GRAPH_JSON}")

v1 = json.loads(V1_GRAPH_JSON.read_text(encoding="utf-8"))
nodes_v1 = {n["id"]: n for n in v1["nodes"]}
edges_v1 = v1["edges"]

# Build a quick map: document -> path, pages for that doc
doc_to_path = {}
doc_to_pages = {}
for nid, nd in nodes_v1.items():
    if nd.get("kind") == "document":
        # in v1 we stored 'path' on the document node
        doc_to_path[nid] = nd.get("path")
        doc_to_pages[nid] = []
for nid, nd in nodes_v1.items():
    if nd.get("kind") == "page":
        # find doc node by splitting id
        docname, stem, pidx = parse_page_id(nid)
        if docname is None: 
            continue
        doc_id = f"doc::{docname}"
        if doc_id in doc_to_pages:
            doc_to_pages[doc_id].append((nid, pidx))

# ---------- Image extraction backends ----------
HAVE_PDFIMAGES = bool(which("pdfimages"))
HAVE_PDFTOPPM  = bool(which("pdftoppm"))

def extract_images_pdfimages(pdf_path: Path, out_dir: Path):
    """
    Use poppler pdfimages to extract embedded images with page numbers.
    Returns: list of dicts: {path, page, width, height}
    """
    out_prefix = out_dir / "img"
    out_dir.mkdir(parents=True, exist_ok=True)

    # Map images -> pages from `pdfimages -list`
    listing = run_cmd(["pdfimages", "-list", str(pdf_path)])
    page_map = []  # [(page, idx)]
    for line in listing.splitlines():
        # Skip headers; look for lines starting with page number
        if not line.strip() or not line.strip()[0].isdigit():
            continue
        parts = re.split(r"\s+", line.strip())
        try:
            page = int(parts[0])
            num  = int(parts[1])
        except Exception:
            continue
        page_map.append((page, num))

    # Extract as PNG with page numbers in filenames (-p)
    _ = run_cmd(["pdfimages", "-png", "-p", str(pdf_path), str(out_prefix)])

    # Collect files; filenames usually include '-<page>-<num>.png'
    imgs = sorted(out_dir.glob("img-*-*.png"))
    results = []
    for p in imgs:
        m = re.search(r"img-(\d+)-(\d+)\.png$", p.name)
        if m:
            page = int(m.group(1))
        else:
            # fallback if filename pattern differs: try align with page_map order
            idx = len(results)
            page = page_map[idx][0] if idx < len(page_map) else -1
        try:
            with Image.open(p) as im:
                w, h = im.size
        except Exception:
            w, h = 0, 0
        results.append({"path": p, "page": page, "width": w, "height": h})
    return results

def extract_images_pdftoppm(pdf_path: Path, out_dir: Path, dpi=150):
    """
    Render each page as an image (PNG) when embedded image extraction isn't available.
    Returns: list of dicts: {path, page, width, height}
    """
    out_prefix = out_dir / "page"
    out_dir.mkdir(parents=True, exist_ok=True)
    _ = run_cmd(["pdftoppm", "-png", "-r", str(dpi), str(pdf_path), str(out_prefix)])
    results = []
    pages = sorted(out_dir.glob("page-*.png"))
    for i, p in enumerate(pages):
        try:
            with Image.open(p) as im:
                w, h = im.size
        except Exception:
            w, h = 0, 0
        results.append({"path": p, "page": i, "width": w, "height": h})
    return results

def extract_images_for_doc(pdf_path: Path, out_dir: Path):
    if HAVE_PDFIMAGES:
        imgs = extract_images_pdfimages(pdf_path, out_dir)
    elif HAVE_PDFTOPPM:
        imgs = extract_images_pdftoppm(pdf_path, out_dir)
    else:
        log(f"No pdfimages/pdftoppm available for {pdf_path}")
        return []
    # Sanity: filter tiny images
    good = [d for d in imgs if d["width"] >= MIN_W and d["height"] >= MIN_H]
    # Cap per doc
    return good[:MAX_IMAGES_PER_DOC]

# ---------- Embedding backends ----------
device = "cpu"
emb_backend = None
emb_dim = None

try:
    import torch
    device = "cuda" if torch.cuda.is_available() else "cpu"
except Exception:
    torch = None

# Try open_clip first
if torch is not None:
    try:
        import open_clip
        model, _, preprocess = open_clip.create_model_and_transforms("ViT-B-32", pretrained="laion2b_s34b_b79k", device=device)
        model.eval()
        emb_backend = "open_clip_ViT-B-32"
        emb_dim = model.visual.output_dim if hasattr(model, "visual") else 512
    except Exception:
        model = None
        preprocess = None
        # Try torchvision ViT
        try:
            import torchvision
            from torchvision.models import vit_b_16, ViT_B_16_Weights
            weights = ViT_B_16_Weights.IMAGENET1K_V1
            model = vit_b_16(weights=weights).to(device)
            model.eval()
            preprocess = weights.transforms()
            emb_backend = "torchvision_vit_b_16"
            emb_dim = 1000  # classifier logits dim; acceptable as a proxy
        except Exception:
            model = None
            preprocess = None

def embed_image(png_path: Path):
    if torch is None or model is None or preprocess is None:
        return None
    try:
        with Image.open(png_path).convert("RGB") as img:
            inp = preprocess(img)
        if hasattr(inp, "unsqueeze"):  # torchvision tensor
            x = inp.unsqueeze(0).to(device)
        else:
            import torch as _t
            x = _t.tensor(inp).unsqueeze(0).to(device)

        with torch.no_grad():
            if "open_clip" in (emb_backend or ""):
                feats = model.encode_image(x)
            elif "torchvision_vit" in (emb_backend or ""):
                feats = model(x)
            else:
                return None
            feats = feats.float()
            # L2 normalize
            feats = feats / (feats.norm(dim=-1, keepdim=True) + 1e-8)
            vec = feats.squeeze(0).detach().cpu().numpy()
            return vec
    except Exception as e:
        log(f"embed fail {png_path.name}: {e}")
        return None

# ---------- Build v2 graph ----------
G2 = nx.Graph()

# Seed with all v1 nodes/edges so we keep text structure
for n in v1["nodes"]:
    G2.add_node(n["id"], **{k:v for k,v in n.items() if k!="id"})
for e in v1["edges"]:
    G2.add_edge(e["u"], e["v"], **{k:v for k,v in e.items() if k not in ("u","v")})

docs = list(doc_to_pages.items())
if MAX_PDFS:
    docs = docs[:MAX_PDFS]

total_imgs = 0
embedded = 0
skipped = 0
per_doc_stats = []

for doc_id, page_list in docs:
    pdf_path = doc_to_path.get(doc_id)
    if not pdf_path or not Path(pdf_path).exists():
        log(f"missing pdf for {doc_id}")
        continue
    docname = doc_id.split("::",1)[1]
    doc_stem = docname.rsplit(".",1)[0]
    out_dir = V2_IMG / doc_stem
    imgs = extract_images_for_doc(Path(pdf_path), out_dir)

    # map images to v1 page node ids when possible
    page_by_idx = {pidx: nid for (nid, pidx) in page_list}
    count_doc = 0

    for i, d in enumerate(imgs):
        page = d["page"]
        img_path = d["path"]
        w, h = d["width"], d["height"]
        total_imgs += 1
        count_doc += 1

        # Build image node id
        img_id = f"img::{docname}::p{page}::i{i:03d}"
        G2.add_node(img_id, kind="image", page=int(page), width=w, height=h, path=str(img_path))

        # Link to page node if known
        page_node = page_by_idx.get(int(page))
        if page_node:
            G2.add_edge(page_node, img_id, kind="has_image")
        else:
            # link to doc if page unknown
            G2.add_edge(doc_id, img_id, kind="has_image")

        # Embeddings
        if emb_backend is not None:
            vec = embed_image(img_path)
            if vec is not None:
                try:
                    import numpy as np
                    emb_path = V2_EMB / f"{doc_stem}_p{page}_i{i:03d}.npy"
                    V2_EMB.mkdir(parents=True, exist_ok=True)
                    np.save(emb_path, vec.astype("float32"))
                    G2.nodes[img_id]["embedding"] = {"path": str(emb_path), "dim": int(vec.shape[-1]), "backend": emb_backend}
                    embedded += 1
                except Exception:
                    # fallback to JSON
                    emb_path = V2_EMB / f"{doc_stem}_p{page}_i{i:03d}.json"
                    V2_EMB.mkdir(parents=True, exist_ok=True)
                    emb_path.write_text(json.dumps({
                        "backend": emb_backend,
                        "vec": [round(float(x), EMB_FLOATS) for x in (vec.tolist() if hasattr(vec, "tolist") else list(vec))],
                    }))
                    G2.nodes[img_id]["embedding"] = {"path": str(emb_path), "dim": len(G2.nodes[img_id]["embedding"].get("vec", [])), "backend": emb_backend}
                    embedded += 1
            else:
                skipped += 1
        else:
            skipped += 1

    per_doc_stats.append({"doc": docname, "images": count_doc})

# Save v2
nodes_out = [{"id": n, **G2.nodes[n]} for n in G2.nodes]
edges_out = [{"u": u, "v": v, **G2[u][v]} for u, v in G2.edges]
V2_GRAPH_DIR.mkdir(parents=True, exist_ok=True)
V2_GRAPH_JSON.write_text(json.dumps({"nodes": nodes_out, "edges": edges_out}, indent=2), encoding="utf-8")
nx.write_gml(G2, V2_GRAPH_GML)

manifest = {
    "created_at": datetime.now().isoformat(),
    "v1_graph": str(V1_GRAPH_JSON),
    "v2_graph_json": str(V2_GRAPH_JSON),
    "v2_graph_gml": str(V2_GRAPH_GML),
    "images_root": str(V2_IMG),
    "embeddings_root": str(V2_EMB),
    "stats": {
        "documents": len(docs),
        "images_total": total_imgs,
        "embeddings_built": embedded,
        "embeddings_skipped": skipped,
    },
    "per_doc": per_doc_stats,
    "embedding_backend": emb_backend,
}
V2_MANIFEST.write_text(json.dumps(manifest, indent=2), encoding="utf-8")

print("✅ Graph-RAG v2 built")
print("Docs:", manifest["stats"]["documents"])
print("Images total:", total_imgs, "| embedded:", embedded, "| skipped:", skipped)
print("Embedding backend:", emb_backend)
print("Saved:")
print("  -", V2_GRAPH_JSON)
print("  -", V2_GRAPH_GML)
print("  -", V2_MANIFEST)


Downloading: "https://download.pytorch.org/models/vit_b_16-c867db91.pth" to /home/manny-buff/.cache/torch/hub/checkpoints/vit_b_16-c867db91.pth


100%|███████████████████████████████████████████████████████████████████████| 330M/330M [00:01<00:00, 275MB/s]


✅ Graph-RAG v2 built
Docs: 17
Images total: 388 | embedded: 388 | skipped: 0
Embedding backend: torchvision_vit_b_16
Saved:
  - /home/manny-buff/projects/capstone/week11-hw/artifacts/graph_v2/graph/graph.json
  - /home/manny-buff/projects/capstone/week11-hw/artifacts/graph_v2/graph/graph.gml
  - /home/manny-buff/projects/capstone/week11-hw/artifacts/graph_v2/manifest.json
