<img src="https://theaiengineer.dev/tae_logo_gw_flatter.png" width=35% align=right>

# AI Agents & Automation — Chapter 14
## Case Study — Research & Reporting Assistant (RAG + Planning)

&copy; Dr. Yves J. Hilpisch<br>
AI-Powered by GPT-5.

### Overview

This notebook accompanies Chapter 14 — Research & Reporting Assistant. It is self-contained and demonstrates the core ideas with small, readable code cells. Run the cells from top to bottom; each code cell is preceded by a short explanation of what it does.


This self-contained notebook shows a tiny retrieval→plan→draft→fact-check flow using a local (dependency-free) retriever. Swap parts later for framework equivalents while keeping the same interfaces.

In [None]:
import math  # import math utilities
import re  # import regex utilities

def tokenize(text: str) -> list[str]:
    """Return the lowercase alphanumeric/underscore tokens found in *text*."""
    lowered = text.lower()  # case-fold first so matching is case-insensitive
    return [match.group(0) for match in re.finditer(r'[A-Za-z0-9_]+', lowered)]

def bow(
    text: str,
    vocab: dict[str, int],
) -> list[float]:
    """Return a unit-length bag-of-words vector for *text*.

    *vocab* maps each token to its vector index and is extended in place
    whenever *text* contains a token that has not been seen before, so the
    returned vector always has ``len(vocab)`` entries.
    """
    terms = tokenize(text)
    for term in terms:
        if term not in vocab:  # register unseen terms at the next free index
            vocab[term] = len(vocab)
    counts = [0.0] * len(vocab)
    for term in terms:
        counts[vocab[term]] += 1.0
    length = math.sqrt(sum(count * count for count in counts))
    if not length:  # all-zero vector: divide by 1.0 instead of 0.0
        length = 1.0
    return [count / length for count in counts]

def pad(
    a: list[float],
    b: list[float],
) -> tuple[list[float], list[float]]:
    """Return zero-padded copies of *a* and *b* that share the same length."""
    width = max(len(a), len(b))

    def _extend(vec: list[float]) -> list[float]:
        # concatenation always builds a new list, so the inputs stay untouched
        return vec + [0.0] * (width - len(vec))

    return _extend(a), _extend(b)

class Document:  # minimal doc container
    """Hold one corpus entry: an identifier plus its raw text."""

    def __init__(self, doc_id: str, text: str) -> None:  # wire id/text fields
        self.id = doc_id  # document id
        self.text = text  # document text

    def __repr__(self) -> str:
        # Readable repr so printed retrieval hits show the id and text
        # instead of an opaque object address.
        return f"{type(self).__name__}(doc_id={self.id!r}, text={self.text!r})"

class LocalRetriever:  # deterministic in-memory retriever
    def __init__(self, docs: list[Document]) -> None:  # store corpus and vocab
        self.docs = docs  # corpus
        self.vocab: dict[str, int] = {}  # shared vocabulary

    def topk(
        self,
        query: str,
        k: int = 2,
    ) -> list[tuple[Document, float]]:
        # compute cosine similarity and return hits
        query_vec = bow(query, self.vocab)  # vectorize query
        results: list[tuple[Document, float]] = []  # matching list
        for doc in self.docs:  # scan each candidate document
            doc_vec = bow(doc.text, self.vocab)  # vectorize doc
            doc_vec, padded_query = pad(doc_vec, query_vec)  # ensure equal length
            score = sum(x * y for x, y in zip(doc_vec, padded_query))
            # cosine numerator
            results.append((doc, float(score)))  # collect score
        return sorted(results, key=lambda pair: pair[1], reverse=True)[:k]  # top-k

# Toy corpus: three one-sentence documents keyed by short ids.
documents = [
    Document(doc_id, text)
    for doc_id, text in (
        ('s1', 'Alpha launched in 2022 with a focus on simplicity.'),
        ('s2', 'Key benefit: transparency in logs and short audits.'),
        ('s3', 'Beta emphasized speed over explainability in 2021.'),
    )
]
retriever = LocalRetriever(documents)  # retriever over the toy corpus
# Query demo: print the single best hit for a sample question.
print(retriever.topk('Alpha launch year', k=1))


We define a tiny plan (constraints), draft bullets with citations, add a minimal fact-check pass, and compose the final brief.

In [None]:
def make_plan(topic: str) -> dict[str, object]:
    """Build the plan metadata (structure and constraints) for a briefing on *topic*."""
    plan: dict[str, object] = {'topic': topic}
    plan['bullets'] = 5  # number of bullets requested
    plan['cite'] = '[source:id]'  # citation format
    plan['summary_len'] = '<=120'  # summary length constraint
    return plan


def draft_bullets(
    topic: str,
    hits: list[tuple[object, float]],
) -> list[str]:
    """Draft up to five bullets with simple inline citations.

    Each hit contributes one bullet: the document text before its first
    '.', followed by a ``[source:<id>]`` tag. When fewer than five hits are
    available, generic fallback bullets citing the top hit pad the list to
    exactly five entries. With no hits there is nothing to cite, so the
    result is an empty list.

    Args:
        topic: Subject named in the fallback bullets.
        hits: ``(document, score)`` pairs; each document must expose
            ``.text`` and ``.id``. Scores are ignored here.

    Returns:
        At most five bullet strings, in hit order followed by any padding.
    """
    target = 5  # number of bullets the brief should contain
    bullets: list[str] = []  # accumulator
    for doc, _score in hits:  # consume retrieval hits
        clause = doc.text.split('.')[0]  # focus on first sentence
        bullets.append(f"- {clause} [source:{doc.id}]")
    if hits:
        # pad using top hit context
        fallback = f"- Summary point on {topic} [source:{hits[0][0].id}]"
        # Loop rather than test once: a single check adds only one fallback
        # and leaves the list short whenever fewer than four hits arrive.
        while len(bullets) < target:
            bullets.append(fallback)
    return bullets[:target]


def fact_check(
    bullets: list[str],
    docs_by_id: dict[str, object],
) -> list[str]:
    """Append a ``[needs_review]`` tag to bullets lacking textual support.

    A bullet counts as supported when at least one of its ``[source:<id>]``
    citations names a document in *docs_by_id* whose first sentence (the
    text before its first '.') is non-blank and appears verbatim in the
    bullet.

    Args:
        bullets: Drafted bullet strings, possibly carrying citations.
        docs_by_id: Map from document id to an object exposing ``.text``.

    Returns:
        A new list with the same bullets in order; unsupported ones have
        `` [needs_review]`` appended.
    """
    checked: list[str] = []  # results buffer
    for bullet in bullets:  # scan each generated bullet
        # extract cited ids
        cites = re.findall(r'\[source:([A-Za-z0-9_]+)\]', bullet)
        supported = False
        for cite in cites:
            if cite not in docs_by_id:
                continue  # citation points at an unknown document
            snippet = docs_by_id[cite].text.split('.')[0]
            # An empty string is a substring of every string, so a blank
            # first sentence must never count as supporting evidence.
            if snippet.strip() and snippet in bullet:
                supported = True
                break
        entry = bullet if supported else f"{bullet} [needs_review]"
        checked.append(entry)  # append flagged bullet
    return checked


def summary(bullets: list[str]) -> str:
    """Join the first two bullets into one line of at most 120 characters.

    Leading '-' and space characters are stripped from each bullet. A result
    longer than 120 characters is cut to 117 and suffixed with an ellipsis.
    """
    heads = [bullet.lstrip('- ') for bullet in bullets[:2]]
    joined = ' '.join(heads)
    if len(joined) > 120:
        return f"{joined[:117]}..."
    return joined


# End-to-end demo: retrieve -> plan -> draft -> fact-check -> summarize.
topic = 'Alpha vs Beta'  # sample query
hits = retriever.topk(topic, k=3)  # retrieve similar docs
plan_spec = make_plan(topic)  # demonstrate structured plan metadata
raw_bullets = draft_bullets(topic, hits)  # initial claims
docs_by_id = {doc.id: doc for doc in documents}  # id -> document map
checked_bullets = fact_check(raw_bullets, docs_by_id)  # label unsupported claims
# Join on newlines so every bullet is printed on its own line; an empty
# separator would run all bullets together into one unreadable line.
print('\n'.join(checked_bullets))  # display bullet list
print('Summary:', summary(checked_bullets))  # show clipped summary


<img src="https://theaiengineer.dev/tae_logo_gw_flatter.png" width=35% align=right>