# Automated SITREP Generator from Incident Logs (RAG-LLM)

This notebook builds a **self-contained** Retrieval-Augmented Generation (RAG) pipeline that:
- Downloads or ingests a **public MOT Marine accident investigation PDF** (with fallbacks).
- Performs **robust PDF extraction** (page-anchored) and chunking.
- Indexes chunks via **FAISS** + **sentence-transformers**.
- Uses a **local, open-source LLM** for summarization with **guardrails** to avoid hallucination.
- Implements **V&V**: hallucination probing and **Groundedness Score**.
- Generates a **Streamlit app** (via `%%writefile app/app.py`) to interactively produce SITREPs with citations.
- Saves governance artifacts to `artifacts/` (SPEC-DRIVE, model card, metrics, plots).

> Blue Lane = low-code path (default). Red Lane = optional deep dives.


In [3]:
# --- Bootstrap & Environment --------------------------------------------------
import os, sys, random, json, time, pathlib, platform, shutil, subprocess, re
from pathlib import Path
import numpy as np

print("Python  :", platform.python_version())
print("OS      :", platform.platform())
print("CPU     :", platform.processor())
try:
    import torch
    has_torch = True
    print("Torch   :", torch.__version__)
    print("CUDA    :", torch.cuda.is_available())
except Exception as e:
    has_torch = False
    print("Torch   : not installed")

# Deterministic seeds
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)
if has_torch:
    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(SEED)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

# Create folders
for d in ["data", "models", "artifacts", "artifacts/plots", "app", "hf_cache"]:
    Path(d).mkdir(parents=True, exist_ok=True)

# Flags
BLUE_LANE = True            # low-code default
SKIP_HEAVY_TRAINING = True  # keep runtime short

# Offline detection
OFFLINE = False
try:
    import socket
    socket.create_connection(("www.google.com", 80), timeout=3)
except Exception:
    OFFLINE = True
print("OFFLINE :", OFFLINE)

# Hugging Face cache
os.environ["TRANSFORMERS_CACHE"] = str(Path("hf_cache").resolve())

print("Working dir:", os.getcwd())

# Initialize model card with a data-use note placeholder
model_card_path = Path("artifacts/model_card.md")
if not model_card_path.exists():
    model_card_path.write_text(
        "# Model Card (WIP)\n\n"
        "## Data Use & License\n"
        "- Source: Public MOT Marine accident investigation report (Singapore) or mirrored open sample.\n"
        "- No classified data. Educational/experimental use.\n\n"
        "## Notes\n"
        "- This file will be appended with details after pipeline runs.\n",
        encoding="utf-8"
    )
print("Created artifacts/model_card.md")

Python  : 3.11.13
OS      : Windows-10-10.0.26200-SP0
CPU     : Intel64 Family 6 Model 183 Stepping 1, GenuineIntel
Torch   : 2.9.0+cpu
CUDA    : False
OFFLINE : False
Working dir: c:\Users\tcmk_\Downloads\elice notebooks\RAGLLM_notebook
Created artifacts/model_card.md


In [2]:
# Create venv with Python 3.11 (no 'py' launcher needed)
import os, sys, subprocess, shutil, platform, pathlib

def find_py311():
    # 1) Prefer env var override
    for env_var in ["PYTHON311", "P311", "PY311"]:
        p = os.environ.get(env_var)
        if p and pathlib.Path(p).exists():
            return p
    # 2) If current interpreter is 3.111, use it
    if sys.version_info[:2] == (3, 11):  # fixed
        return sys.executable
    # 3) Try common Windows install paths
    candidates = [
        r"C:\Users\%USERNAME%\AppData\Local\Programs\Python\Python311\python.exe",
        r"C:\Program Files\Python311\python.exe",
        r"C:\Program Files (x86)\Python311\python.exe",
    ]
    candidates = [os.path.expandvars(p) for p in candidates]
    for p in candidates:
        if pathlib.Path(p).exists():
            return p
    # 4) Try anything named python3.11 on PATH
    p = shutil.which("python3.11") or shutil.which("python311")
    if p:
        return p
    return None

py311 = find_py311()
if not py311:
    print("[ERROR] Could not find Python 3.11 on this system.")
    print("Install it, then re-run this cell. From VS Code PowerShell:")
    print('  winget install -e --id Python.Python.3.11')
else:
    print("Using Python 3.11 at:", py311)
    # Create venv
    subprocess.check_call([py311, "-m", "venv", ".venv311"])
    # Upgrade pip and ensure ipykernel in the venv
    subprocess.check_call([r".\.venv311\Scripts\python", "-m", "pip", "install", "-U", "pip", "setuptools", "wheel"])
    subprocess.check_call([r".\.venv311\Scripts\python", "-m", "pip", "install", "ipykernel"])
    # Register Jupyter kernel
    subprocess.check_call([r".\.venv311\Scripts\python", "-m", "ipykernel", "install",
                           "--user", "--name", "sitrep311", "--display-name", "Python 3.11 (sitrep)"])
    print("Now switch the Notebook kernel to: Python 3.11 (sitrep)")

Using Python 3.11 at: c:\Users\tcmk_\Downloads\elice notebooks\.venv311\Scripts\python.exe
Now switch the Notebook kernel to: Python 3.11 (sitrep)


In [3]:
# --- Install Packages (graceful) ----------------------------------------------
# Run this AFTER switching kernel to "Python 3.11 (sitrep)"
import sys, subprocess

def pip_install(pkgs):
    for p in pkgs:
        try:
            print(f"Installing {p} ...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", p])
        except Exception as e:
            print(f"[WARN] Could not install {p}: {e}")

required = [
    "numpy",
    "pymupdf",               # PDF extraction
    "faiss-cpu",             # vector store
    "scikit-learn",
    "langchain",             # splitters/util
    "transformers",  # LLMs
    "accelerate",
    "sentence-transformers",
    "streamlit",             # app
    "spacy"                  # optional PII redaction
]
pip_install(required)

# Optional CPU torch
try:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
                           "torch", "--index-url", "https://download.pytorch.org/whl/cpu"])
except Exception as e:
    print("[WARN] torch CPU wheel install failed:", e)

# Optional: small spaCy model
try:
    import spacy
    try:
        spacy.load("en_core_web_sm")
    except Exception:
        subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
except Exception as e:
    print("[WARN] spaCy optional model unavailable:", e)

Installing numpy ...
Installing pymupdf ...
Installing faiss-cpu ...
Installing scikit-learn ...
Installing langchain ...
Installing transformers ...
Installing accelerate ...
Installing sentence-transformers ...
Installing streamlit ...
Installing spacy ...


## SPEC-DRIVE Card (Fill Me)

- **Mission:** *(What decision does the SITREP support?)*
- **Inputs:** *(One PDF incident report from public MOT Marine accident investigations)*
- **Outputs:** *(Concise SITREP with citations + groundedness score)*
- **Constraints:** *(No classified data; use only provided document; answer `NOT FOUND IN SOURCE` if missing)*
- **V&V Success Criteria:** *(Groundedness ≥ 0.90; 0 unsupported claims)*
- **Rollback Plan:** *(If groundedness < target, tighten retrieval, reduce generation temperature, or fall back to extractive)*


In [4]:
# --- Save SPEC-DRIVE to artifacts/spec_drive.md -------------------------------
# Edit the SPEC_CARD text and run this cell.
SPEC_CARD = """
Mission: <edit me>
Inputs: MOT Marine incident PDF
Outputs: SITREP with citations + groundedness score
Constraints: No classified data; use only provided document; answer NOT FOUND IN SOURCE if missing
V&V Success Criteria: Groundedness >= 0.90; 0 unsupported claims
Rollback Plan: Tighten retrieval; reduce temperature; extractive fallback if needed
"""

from pathlib import Path
Path("artifacts/spec_drive.md").write_text(SPEC_CARD.strip()+"\n", encoding="utf-8")
print("Saved artifacts/spec_drive.md")


Saved artifacts/spec_drive.md


In [5]:
# --- Data Ingestion & PDF Extraction ------------------------------------------
import os, json, io
from pathlib import Path

# Use this specific public report (preferred)
REPORT_URL = "https://www.mot.gov.sg/docs/default-source/about-mot/missing-of-fitter-from-rtm-zheng-he-at-sea-on-26-december-2024.pdf?sfvrsn=b4c661aa_1"
SAMPLE_PDF_URLS = [REPORT_URL]

pdf_path = Path("data/report.pdf")
pdf_path.parent.mkdir(parents=True, exist_ok=True)

def is_pdf_bytes(content: bytes, headers: dict) -> bool:
    ct = (headers or {}).get("content-type", "").lower()
    return (content.startswith(b"%PDF") or "application/pdf" in ct or ct.endswith("/pdf"))

def try_download_pdf(urls, out_path: Path) -> bool:
    import requests
    headers = {
        "User-Agent": "Mozilla/5.0",
        "Accept": "application/pdf,application/octet-stream,*/*;q=0.8",
    }
    for u in urls:
        try:
            print("Attempting:", u)
            r = requests.get(u, timeout=45, headers=headers)
            if r.status_code == 200 and r.content and is_pdf_bytes(r.content, r.headers):
                out_path.write_bytes(r.content)
                print("Downloaded:", out_path)
                return True
            else:
                print("[WARN] Not a PDF or empty response:", r.status_code, r.headers.get("content-type"))
        except Exception as e:
            print("[WARN] Download failed:", e)
    return False

def create_synthetic_pdf(out_path: Path) -> bool:
    """Create a tiny synthetic PDF as last fallback."""
    try:
        from reportlab.pdfgen import canvas
        from reportlab.lib.pagesizes import letter
        c = canvas.Canvas(str(out_path), pagesize=letter)
        text = c.beginText(40, 750); text.setFont("Helvetica", 11)
        for line in [
            "Marine Incident Report - Synthetic Sample",
            "Date: 2020-08-14",
            "Vessel: MV Example Star (Singapore-flagged)",
            "Location: Approaches to Singapore Strait",
            "Summary: At 03:41 LT, steering responded sluggishly while entering congested waters.",
            "Actions: Engine telegraph set to slow ahead; navigational warning issued.",
            "Damage: Minor scrape port bow, no injuries reported.",
            "Findings: Fatigue among bridge team; incomplete checklist use.",
        ]:
            text.textLine(line)
        c.drawText(text); c.showPage(); c.save()
        print("Created synthetic PDF:", out_path)
        return True
    except Exception as e:
        print("[WARN] Could not generate synthetic PDF:", e)
        return False

# 1) Reuse existing local report if present (skip download)
if pdf_path.exists() and pdf_path.stat().st_size > 0:
    print("Using existing report:", pdf_path)
else:
    ok = False
    if not OFFLINE:
        ok = try_download_pdf(SAMPLE_PDF_URLS, pdf_path)
    if not ok and not pdf_path.exists():
        ok = create_synthetic_pdf(pdf_path)

if not pdf_path.exists():
    raise FileNotFoundError("No PDF available. Please upload a file to data/report.pdf and re-run.")

# Extract text with page anchors via PyMuPDF
import fitz  # PyMuPDF
doc = fitz.open(str(pdf_path))
page_texts = []
for i, page in enumerate(doc):
    text = page.get_text("text")
    page_texts.append({"page": i+1, "text": text})
doc.close()

# Save page-anchored chunks for transparency
Path("data/report_chunks.json").write_text(json.dumps(page_texts, ensure_ascii=False, indent=2), encoding="utf-8")
full_text = "\n".join([p["text"] for p in page_texts]).strip()

print("PDF pages:", len(page_texts))
print("First 1000 chars of extracted text:\n", full_text[:1000])

Using existing report: data\report.pdf
PDF pages: 24
First 1000 chars of extracted text:
 Final Report 
 
 
 
 
 
Missing of Fitter  
From SRS RTM Zheng He 
At Sea 
On 26 December 2024 
 
 
 
 
TIB/MAI/CAS.187 
 
Transport Safety Investigation Bureau 
Ministry of Transport 
Singapore 
 
18 July 2025 

 
© 2025 Government of Singapore  
ii 
 
The Transport Safety Investigation Bureau of Singapore 
The Transport Safety Investigation Bureau of Singapore (TSIB) is the air, marine 
and rail accidents and incidents investigation authority in Singapore. Its mission is to 
promote transport safety through the conduct of independent investigations into air, 
marine and rail accidents and incidents. 
TSIB conducts marine safety investigations in accordance with the Singapore 
Transport Safety Investigations Act 2018, Transport Safety Investigations (Marine 
Occurrences) Regulations 2023 and the Casualty Investigation Code under SOLAS 
Regulation XI-1/6 adopted by the International Maritime Organ

In [6]:
# --- Retrieval Pipeline: Chunking, Embeddings, FAISS --------------------------
from pathlib import Path
import json, re
import numpy as np

# Try LangChain splitter; fallback to simple splitter if unavailable
try:
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    def chunk_text(txt, chunk_size=1000, overlap=200):
        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
        return splitter.split_text(txt)
except Exception:
    def chunk_text(txt, chunk_size=1000, overlap=200):
        chunks = []
        start = 0
        while start < len(txt):
            end = min(len(txt), start + chunk_size)
            chunks.append(txt[start:end])
            start = end - overlap if end - overlap > start else end
        return chunks

chunks = chunk_text(full_text, chunk_size=1000, overlap=200)

Path("data/chunks.json").write_text(json.dumps(chunks, ensure_ascii=False, indent=2), encoding="utf-8")


# Map chunk_id -> a representative page (best-effort by position)
chunk_map = {}
if page_texts:
    page_concat = [p["text"] for p in page_texts]
    cum = [0]
    for t in page_concat:
        cum.append(cum[-1] + len(t))
    for i, ch in enumerate(chunks):
        idx = full_text.find(ch[:50]) if ch else -1
        if idx < 0:
            idx = full_text.find(ch[:20]) if ch else -1
        if idx < 0:
            idx = 0
        for p in range(1, len(cum)):
            if idx < cum[p]:
                chunk_map[i] = {"page": p, "preview": ch[:120] if ch else ""}
                break
else:
    for i, ch in enumerate(chunks):
        chunk_map[i] = {"page": None, "preview": ch[:120] if ch else ""}

# Embeddings (safe import + fallback)
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = None
try:
    from sentence_transformers import SentenceTransformer  # import inside try to avoid hard crash
    embed_model = SentenceTransformer(EMBED_MODEL_NAME)
    embeddings = embed_model.encode(
        chunks, batch_size=32, show_progress_bar=False, convert_to_numpy=True, normalize_embeddings=True
    ).astype(np.float32)
except Exception as e:
    print("[WARN] SentenceTransformer unavailable; using TF-IDF instead. Reason:", e)
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.preprocessing import normalize as sk_normalize
    tfidf = TfidfVectorizer(max_features=4096)
    embeddings = tfidf.fit_transform(chunks).astype(np.float32).toarray()
    embeddings = sk_normalize(embeddings, norm="l2").astype(np.float32)  # cosine via IP

# FAISS index
index = None
try:
    if len(chunks) == 0:
        raise ValueError("No text chunks extracted; skipping FAISS index build.")
    import faiss
    dim = embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)  # cosine via normalized inner product
    index.add(embeddings.astype(np.float32))
    Path("models").mkdir(parents=True, exist_ok=True)
    faiss.write_index(index, "models/faiss.index")
    with open("data/chunk_map.json", "w", encoding="utf-8") as f:
        json.dump(chunk_map, f, ensure_ascii=False, indent=2)
    print("FAISS index built with", len(chunks), "chunks. Saved to models/faiss.index")
except Exception as e:
    print("[WARN] Could not build FAISS index:", e)
    index = None

  from .autonotebook import tqdm as notebook_tqdm


FAISS index built with 50 chunks. Saved to models/faiss.index


In [7]:
# --- Generation (Local-first with guardrails) ---------------------------------
from typing import List, Tuple, Dict
import re, math, json

GUARDRAIL_INSTRUCTION = (
    "You are generating a concise SITREP using ONLY the provided context. "
    "If the requested information is not present in the context, respond exactly: NOT FOUND IN SOURCE. "
    "Do not use outside knowledge."
)

# Safe import: transformers may be broken due to env mismatch
try:
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
except Exception as e:
    print("[WARN] transformers unavailable; will use extractive fallback. Reason:", e)
    AutoTokenizer = AutoModelForSeq2SeqLM = pipeline = None

LLM_NAME_CANDIDATES = [
    "google/flan-t5-small",
    "sshleifer/distilbart-cnn-12-6"
]

tokenizer = None
seq2seq = None
pipe = None
if pipeline is not None:
    for name in LLM_NAME_CANDIDATES:
        try:
            tokenizer = AutoTokenizer.from_pretrained(name)
            seq2seq = AutoModelForSeq2SeqLM.from_pretrained(name)
            pipe = pipeline("text2text-generation", model=seq2seq, tokenizer=tokenizer)
            print("Loaded LLM:", name)
            break
        except Exception as e:
            print("[WARN] Could not load", name, "->", e)

# Reuse retrieval for extractive baseline, with robust TF-IDF fallback
def retrieve_top_k(query: str, k: int = 5) -> List[int]:
    import numpy as np
    # If FAISS + embedding model available, use them
    if (index is not None) and (embed_model is not None):
        q_emb = embed_model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
        D, I = index.search(q_emb.astype(np.float32), k)
        return I[0].tolist()
    # TF-IDF fallback (does not need transformers/accelerate)
    try:
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.metrics.pairwise import cosine_similarity
        vec = TfidfVectorizer(max_features=4096)
        X = vec.fit_transform(chunks)
        q = vec.transform([query])
        sims = cosine_similarity(q, X).ravel()
        return np.argsort(-sims)[:k].tolist()
    except Exception as e:
        print("[WARN] TF-IDF retrieval failed:", e)
        return list(range(min(k, len(chunks))))

def _preview(text: str, n: int = 200) -> str:
    return text[:n].replace("\n", " ").replace("\r", " ")

def extractive_sitrep(query: str, top_k: int = 5) -> Tuple[str, List[int]]:
    top_idx = retrieve_top_k(query, k=top_k)
    selected = [chunks[i] for i in top_idx]
    lines = ["- " + _preview(s) + "..." for s in selected]
    sitrep = "SITREP (Extractive Baseline)\n" + "\n".join(lines)
    return sitrep, top_idx

def generate_baseline(prompt: str, max_tokens: int = 256) -> str:
    if pipe is None:
        s, _ = extractive_sitrep(prompt, top_k=5)
        return s
    out = pipe(prompt, max_new_tokens=max_tokens, do_sample=False)[0]["generated_text"]  # fixed
    return out

def generate_rag(prompt: str, top_k: int = 5, max_tokens: int = 256) -> Tuple[str, Dict[int, str]]:
    top_idx = retrieve_top_k(prompt, top_k)
    context_parts = []
    for i in top_idx:
        page = chunk_map.get(i, {}).get('page')
        preview = chunks[i]
        context_parts.append(f"[Chunk {i} | Page {page}]\n{preview}")
    context = "\n\n".join(context_parts)
    rag_prompt = (
        f"{GUARDRAIL_INSTRUCTION}\n"
        f"Context:\n{context}\n\n"
        f"User request: {prompt}\n"
        "Produce a concise, structured SITREP. Where info is missing, write: NOT FOUND IN SOURCE.\n"
    )
    if pipe is None:
        s, _ = extractive_sitrep(prompt, top_k=top_k)
        citations = {i: chunk_map.get(i, {'page': None, 'preview': chunks[i][:120]}) for i in top_idx}
        return s, citations

    out = pipe(rag_prompt, max_new_tokens=max_tokens, do_sample=False)[0]["generated_text"]
    citations = {i: chunk_map.get(i, {'page': None, 'preview': chunks[i][:120]}) for i in top_idx}
    return out, citations

Device set to use cpu


Loaded LLM: google/flan-t5-small


In [8]:
# --- Optional PII Redaction (Governance Demo) ---------------------------------
MASK_PII = False

def redact_pii(text: str, use_spacy: bool = True) -> str:
    if not MASK_PII:
        return text
    try:
        import spacy
        nlp = spacy.load("en_core_web_sm")
        doc = nlp(text)
        redacted = text
        # Replace from end to start to not offset spans
        for ent in sorted(doc.ents, key=lambda e: e.start_char, reverse=True):
            if ent.label_ in {"PERSON","ORG"}:
                redacted = redacted[:ent.start_char] + "[REDACTED]" + redacted[ent.end_char:]
        return redacted
    except Exception:
        # Simple fallback: naive proper noun masking (very rough)
        import re
        return re.sub(r"\b([A-Z][a-z]+\s[A-Z][a-z]+)\b", "[REDACTED]", text)

print("PII redaction ready (toggle MASK_PII=True to activate).")


PII redaction ready (toggle MASK_PII=True to activate).


In [9]:
# --- V&V: Hallucination Probes & Groundedness ---------------------------------
import json, re
from pathlib import Path

def lcs_length(a_tokens, b_tokens):
    # Longest Common Subsequence length (for rough ROUGE-L recall)
    dp = [[0]*(len(b_tokens)+1) for _ in range(len(a_tokens)+1)]
    for i in range(1, len(a_tokens)+1):
        for j in range(1, len(b_tokens)+1):
            if a_tokens[i-1] == b_tokens[j-1]:
                dp[i][j] = dp[i-1][j-1] + 1
            else:
                dp[i][j] = max(dp[i-1][j], dp[i][j-1])
    return dp[-1][-1]

def tokenize_simple(s):
    return re.findall(r"[A-Za-z0-9]+", s.lower())

def sentence_split(s):
    return [x.strip() for x in re.split(r"[\n\.;!?]+", s) if x.strip()]

def groundedness_score(sitrep: str, retrieved_indices: list, jaccard_tau=0.2, rougeL_tau=0.2):
    # Build candidate source tokens
    source_text = "\n\n".join([chunks[i] for i in retrieved_indices])
    source_tokens = tokenize_simple(source_text)
    sents = sentence_split(sitrep)
    supported = []
    for sent in sents:
        stoks = tokenize_simple(sent)
        if not stoks:
            supported.append(False); continue
        # Jaccard vs source
        inter = len(set(stoks) & set(source_tokens))
        union = len(set(stoks) | set(source_tokens)) or 1
        jacc = inter/union
        # ROUGE-L approx via LCS recall
        lcs = lcs_length(stoks, source_tokens)
        rougeL = lcs / (len(stoks) or 1)
        ok = (jacc >= jaccard_tau) or (rougeL >= rougeL_tau) or ("not found in source" in sent.lower())
        supported.append(ok)
    coverage = sum(supported) / (len(supported) or 1)
    return coverage, list(zip(sents, supported))

# --- Run probes ---
UNANSWERABLE = "What was the captain's middle name?"
ANSWERABLE   = "Generate a concise SITREP covering date/time, vessel, location, actions, damage and findings."

print("\n[Probe] Unanswerable question (Baseline vs RAG)")
b_resp = generate_baseline(UNANSWERABLE)
r_resp, r_cites = generate_rag(UNANSWERABLE, top_k=5)
print("\nBaseline response:\n", b_resp[:500])
print("\nRAG response:\n", r_resp[:500])

print("\n[Probe] Answerable SITREP (RAG)")
sitrep_text, cites = generate_rag(ANSWERABLE, top_k=5)
print(sitrep_text[:600])

# Compute groundedness vs the retrieved chunks used for SITREP
retrieved_idx = list(cites.keys())
g_score, details = groundedness_score(sitrep_text, retrieved_idx)

artifacts = {
    "groundedness": {
        "score": g_score,
        "details": [{"sentence": s, "supported": bool(ok)} for s, ok in details],
        "retrieved_indices": retrieved_idx,
        "citations": cites
    }
}
Path("artifacts/groundedness.json").write_text(json.dumps(artifacts, ensure_ascii=False, indent=2), encoding="utf-8")
print(f"Groundedness: {g_score:.2f}")

# Write a human-readable V&V report
with open("artifacts/vnv_report.md", "w", encoding="utf-8") as f:
    f.write("# V&V Report\n\n")
    f.write(f"- Groundedness Score: {g_score:.2f}\n")
    f.write("- Unsupported sentences listed below (if any):\n")
    for s, ok in details:
        if not ok:
            f.write(f"  - {s}\n")
    f.write("\n## Citations (retrieved chunks)\n")
    for i, meta in cites.items():
        f.write(f"- Chunk {i}, Page {meta.get('page')}: {meta.get('preview','')[:160]}\n")
print("Saved artifacts/vnv_report.md")



[Probe] Unanswerable question (Baseline vs RAG)


Token indices sequence length is longer than the specified maximum sequence length for this model (634 > 512). Running this sequence through the model will result in indexing errors



Baseline response:
 edward

RAG response:
 [Chunk 34 | Page 17] [Chunk 34 | Page 17] 1.6 The other crew qualifications and roles 1.6.1 RZH had a total of 25 crew9 including the Master at the time of the occurrence. 9 12 from India, 12 from the Philippines, one from Ukraine [Chunk 3 | Page 3]  2025 Government of Singapore iii Table of Contents ABBREVIATIONS iv SYNOPSIS 1 VIEW OF VESSEL 2 1 Factual information 3 1.1 Narrative 3 1.2 The vessel was in port in January 2025. 1.8.3 The Company shared that they were also implementing guidelines 

[Probe] Answerable SITREP (RAG)
[Chunk 3 | Page 2]  2025 Government of Singapore iii Table of Contents ABBREVIATIONS iv SYNOPSIS 1 VIEW OF VESSEL 2 1 Factual information 3 1.1 Narrative 3 1.2 The vessel 10 1.3 TSIB conducts marine safety investigations in accordance with the Singapore Transport Safety Investigations Act 2018, Transport Safety Investigations (Marine Occurrences) Regulations 2023 and the Casualty Investigation Code under SOLAS Regulati

In [10]:
# --- Append to Model Card -----------------------------------------------------
import datetime
from pathlib import Path
card_path = Path("artifacts/model_card.md")
appendix = f"""
## Pipeline Details (Appended {datetime.datetime.utcnow().isoformat()}Z)
- PDF: data/report.pdf
- Extraction: PyMuPDF (page-anchored)
- Retrieval: sentence-transformers/all-MiniLM-L6-v2 + FAISS (fallback: TF-IDF)
- Generator: google/flan-t5-small or distilbart-cnn (fallback: extractive)
- Guardrails: context-only; 'NOT FOUND IN SOURCE' for missing info
- Governance: Hallucination probe + Groundedness Score (artifacts/groundedness.json, artifacts/vnv_report.md)
- Streamlit app: app/app.py (cached loaders)

## Known Failure Modes
- Long or image-heavy PDFs may yield poor text extraction.
- LLM may truncate if prompt exceeds context length.
- Offline environments trigger extractive fallback (reduced quality).

## Next Steps
- Add OCR for scanned PDFs.
- Enrich retrieval with metadata (sections, headings).
- Human-in-the-loop verification workflow.
"""
card_path.write_text(card_path.read_text(encoding="utf-8") + "\n" + appendix, encoding="utf-8")
print("Appended details to artifacts/model_card.md")


Appended details to artifacts/model_card.md


In [11]:
# --- Streamlit App (file writer) ----------------------------------------------
from pathlib import Path

app_code = r"""
import os, json, re, time
from pathlib import Path
import streamlit as st

st.set_page_config(page_title="Automated SITREP (RAG-LLM)", layout="wide")

@st.cache_resource
def load_artifacts():
    t0 = time.perf_counter()

    # Load chunked text used by retrieval/FAISS (NOT pages)
    chunks = []
    try:
        chunks = json.loads(Path("data/chunks.json").read_text(encoding="utf-8"))
    except Exception:
        # Fallback to page texts if chunks.json is missing
        try:
            pages = json.loads(Path("data/report_chunks.json").read_text(encoding="utf-8"))
            chunks = [p["text"] for p in pages]
            st.warning("data/chunks.json missing; using page texts. Retrieval may be misaligned with FAISS.")
        except Exception:
            st.error("No chunks available. Please re-run the notebook pipeline.")
            chunks = []

    # Chunk map (for page numbers + previews)
    try:
        chunk_map = json.loads(Path("data/chunk_map.json").read_text(encoding="utf-8"))
    except Exception:
        chunk_map = {i: {"page": None, "preview": (chunks[i][:120] if i < len(chunks) else "")}
                     for i in range(len(chunks))}

    t_after_io = time.perf_counter()

    # Embeddings
    embed_model = None
    try:
        from sentence_transformers import SentenceTransformer
        embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    except Exception as e:
        st.warning(f"Embedding model load failed: {e}")

    t_after_embed = time.perf_counter()

    # FAISS
    index = None
    index_size = 0
    try:
        import faiss, numpy as np  # noqa: F401
        index = faiss.read_index("models/faiss.index")
        index_size = index.ntotal
    except Exception as e:
        st.warning(f"FAISS index not available: {e}")

    t_after_faiss = time.perf_counter()

    # LLM
    pipe = None
    try:
        from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
        for name in ["sshleifer/distilbart-cnn-12-6", "google/flan-t5-small"]:
            try:
                tok = AutoTokenizer.from_pretrained(name)
                mdl = AutoModelForSeq2SeqLM.from_pretrained(name)
                pipe = pipeline("text2text-generation", model=mdl, tokenizer=tok)
                st.info(f"Loaded LLM: {name}")
                break
            except Exception as e:
                st.warning(f"Could not load {name}: {e}")
    except Exception as e:
        st.warning(f"Transformers not available: {e}")
        pipe = None

    t_after_llm = time.perf_counter()

    # Timers
    st.info(
        f"Init timings (s) | IO: {t_after_io - t0:.2f} | "
        f"Embeds: {t_after_embed - t_after_io:.2f} | "
        f"FAISS: {t_after_faiss - t_after_embed:.2f} | "
        f"LLM: {t_after_llm - t_after_faiss:.2f} | "
        f"Total: {t_after_llm - t0:.2f}"
    )

    # Mismatch warning
    if index is not None and len(chunks) and index_size and index_size != len(chunks):
        st.warning(f"Mismatch: FAISS ntotal={index_size} but chunks={len(chunks)}. "
                   f"Re-run the retrieval cell to regenerate data/chunks.json and models/faiss.index together.")

    return chunks, chunk_map, embed_model, index, pipe

chunks, chunk_map, embed_model, index, pipe = load_artifacts()

GUARDRAIL_INSTRUCTION = (
    "You are generating a concise SITREP using ONLY the provided context. "
    "If the requested information is not present in the context, respond exactly: NOT FOUND IN SOURCE. "
    "Do not use outside knowledge."
)

def retrieve_top_k(query: str, k: int = 5):
    if not chunks:
        return []
    k = max(1, min(k, len(chunks)))
    if index is None or embed_model is None:
        return list(range(k))
    import numpy as np, faiss  # noqa: F401
    q_emb = embed_model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    D, I = index.search(q_emb.astype(np.float32), k)
    top = [int(i) for i in I[0].tolist() if isinstance(i, (int, np.integer))]
    top = [i for i in top if 0 <= i < len(chunks)]
    if not top:
        top = list(range(k))
    return top

def generate_rag(prompt: str, top_k: int = 5, max_tokens: int = 256):
    top_idx = retrieve_top_k(prompt, top_k)
    if not top_idx:
        return "No chunks available to generate a SITREP.", {}

    # Build context safely (chunk_map keys may be strings)
    ctx_parts, valid_idx = [], []
    for i in top_idx:
        meta = (chunk_map.get(str(i)) or chunk_map.get(i) or {})
        page = meta.get("page")
        ctx_parts.append(f"[Chunk {i} | Page {page}]\\n{chunks[i]}")
        valid_idx.append(i)

    ctx = "\\n\\n".join(ctx_parts)
    rag_prompt = (
        f"{GUARDRAIL_INSTRUCTION}\\n"
        f"Context:\\n{ctx}\\n\\n"
        f"User request: {prompt}\\n"
        "Produce a concise, structured SITREP. Where info is missing, write: NOT FOUND IN SOURCE.\\n"
    )
    if pipe is None:
        selected = [chunks[i][:200].replace("\\n", " ") for i in valid_idx]
        out = "SITREP (Extractive Fallback)\\n" + "\\n".join([f"- {s}..." for s in selected])
    else:
        out = pipe(rag_prompt, max_new_tokens=max_tokens, do_sample=False, truncation=True)[0]["generated_text"]

    citations = {i: (chunk_map.get(str(i)) or chunk_map.get(i) or {"page": None, "preview": chunks[i][:120]})
                 for i in valid_idx}
    return out, citations

st.title("Automated SITREP Generator (RAG-LLM)")

left, right = st.columns([1,1])
with left:
    st.subheader("Source Report Text")
    try:
        pages = json.loads(Path("data/report_chunks.json").read_text(encoding="utf-8"))
        for p in pages:
            with st.expander(f"Page {p.get('page')}"):
                st.write(p.get("text",""))
    except Exception:
        st.info("No pages available. Please run the notebook pipeline first.")

with right:
    st.subheader("Generate SITREP")
    prompt_choice = st.selectbox("Sample prompts", [
        "Generate a concise SITREP covering date/time, vessel, location, actions, damage and findings.",
        "List key findings as bullet points.",
        "Create an incident timeline."
    ])
    custom_prompt = st.text_area("Or write your own prompt", value="", placeholder="Type here to override the sample prompt...")
    TOP_K = st.slider("TOP_K (retrieved chunks)", min_value=3, max_value=10, value=5, step=1)
    mask_pii = st.checkbox("Mask PII (simple NER)", value=False)
    if st.button("Generate SITREP"):
        # Custom prompt takes precedence if non-empty after stripping
        prompt = custom_prompt.strip() or prompt_choice
        st.caption(f"Using prompt: {prompt!r}")
        out, cites = generate_rag(prompt, top_k=TOP_K)
        if mask_pii:
            try:
                import spacy
                nlp = spacy.load("en_core_web_sm")
                doc = nlp(out)
                out_red = out
                for ent in sorted(doc.ents, key=lambda e: e.start_char, reverse=True):
                    if ent.label_ in {"PERSON","ORG"}:
                        out_red = out_red[:ent.start_char] + "[REDACTED]" + out_red[ent.end_char:]
                out = out_red
            except Exception as e:
                st.warning(f"PII masking unavailable: {e}")
        st.markdown("### SITREP")
        st.write(out)

        st.markdown("### Citations")
        for i, meta in cites.items():
            with st.expander(f"Chunk {i} (Page {meta.get('page')})"):
                st.write(meta.get("preview",""))

# Save a small run summary
try:
    Path("artifacts").mkdir(exist_ok=True, parents=True)
    Path("artifacts/run_summary.json").write_text(json.dumps({
        "ts": __import__("datetime").datetime.utcnow().isoformat()+"Z",
        "chunks": len(chunks),
        "index_loaded": bool(index is not None),
        "llm_loaded": bool(pipe is not None)
    }, indent=2), encoding="utf-8")
except Exception as e:
    st.warning(f"Could not write run summary: {e}")
"""

Path("app").mkdir(parents=True, exist_ok=True)
Path("app/app.py").write_text(app_code, encoding="utf-8")
print("Wrote app/app.py")

Wrote app/app.py


In [None]:
import os, sys, subprocess

port = os.environ.get("PORT", "8501")
address = os.environ.get("BIND_ADDR", "localhost")  # ensure clickable link uses localhost
cmd = [
    sys.executable, "-m", "streamlit", "run", "app/app.py",
    "--server.address", address,
    "--server.port", str(port),
    "--browser.serverAddress", "localhost",
    "--browser.serverPort", str(port),
]
print("Launching:", " ".join(cmd))
print(f"Open: http://localhost:{port}/")
# This will hold the cell until you stop Streamlit (Ctrl+C in terminal/kernel).
subprocess.run(cmd, check=False)


Launching: c:\Users\tcmk_\Downloads\elice notebooks\.venv311\Scripts\python.exe -m streamlit run app/app.py --server.address localhost --server.port 8501 --browser.serverAddress localhost --browser.serverPort 8501
Open: http://localhost:8501/
