# Notebook 04 LLM Answering Pipeline
This section is responsible for:

* Running a local, open-source LLM
* Constructing the final prompt (query + context)
* Generating an answer
* Attaching citations
* Returning a polished response

To keep the stack fully open source, we will use:

Ollama (best, simplest, FREE local LLM runner)

Model: llama3.1 by default (configurable through the `OLLAMA_MODEL` environment variable; it can be changed later to improve answer quality)

In [1]:
# Directories and variables were not staying consistent across notebooks,
# so the context is loaded from the shared CONTEXT_FILE path in `config` instead.

from config import CONTEXT_FILE
import json

if CONTEXT_FILE.exists():
    # The context file is on disk: read it as UTF-8 text and parse the JSON payload.
    with open(CONTEXT_FILE, mode="r", encoding="utf-8") as f:
        context_for_llm = json.loads(f.read())
else:
    # Fail fast and report the expected location so the missing prerequisite is obvious.
    raise FileNotFoundError(
        f"context_for_llm not found. Expected at: {CONTEXT_FILE}\n"
        "Run Notebook 03 first."
    )

print("Loaded context_for_llm OK.")

Loaded context_for_llm OK.


# 1 Import and Settings


In [2]:
# Notebook 04 — Cell 1
# Imports, configuration, and Ollama client setup (robust fallback)
# -------------------------------------------------------------------

import os
import json
import textwrap
from typing import Dict, Optional, Any

# Use the official ollama client (if installed).
# If not, fall back to a safe HTTP-based client using `requests`.
# `llm` ends up bound to exactly one of three objects, tried in this order:
#   1. the official `ollama.Client`,
#   2. the requests-based `SimpleOllamaClient` defined below,
#   3. `_StubLLM`, whose `chat()` always raises RuntimeError.
llm = None

try:
    # Preferred: official Python client
    from ollama import Client as OllamaClient
    # Bind the client to OLLAMA_HOST, defaulting to http://localhost:11434
    llm = OllamaClient(host=os.getenv("OLLAMA_HOST", "http://localhost:11434"))
    print("Official Ollama client loaded.")
except Exception:
    # Any exception above (not only ImportError) selects the HTTP fallback.
    # Fallback: HTTP requests-based client
    try:
        import requests
        from requests.exceptions import HTTPError, RequestException

        class SimpleOllamaClient:
            """
            Lightweight HTTP Ollama client that normalizes responses.

            Exposes a `chat(model, messages, timeout)` method that POSTs to
            `<host>/api/generate` and always returns
            {"message": {"content": "<text>"}}.
            """

            def __init__(self, host: str = "http://localhost:11434"):
                # Drop trailing slashes so endpoint paths can be appended directly.
                self.host = host.rstrip("/")

            def _normalize(self, data: Any) -> Dict:
                # Normalize a variety of response shapes to:
                # {"message": {"content": "<text>"}}
                # Keys are checked in a fixed order and the first match wins:
                # "response", "text", "message.content", then "choices[0]".
                if isinstance(data, dict):
                    if "response" in data and isinstance(data["response"], str):
                        return {"message": {"content": data["response"]}}
                    if "text" in data and isinstance(data["text"], str):
                        return {"message": {"content": data["text"]}}
                    if "message" in data and isinstance(data["message"], dict) and "content" in data["message"]:
                        return {"message": {"content": data["message"]["content"]}}
                    if "choices" in data and isinstance(data["choices"], list) and len(data["choices"]) > 0:
                        first = data["choices"][0]
                        # handle openai-style or other shapes
                        if isinstance(first, dict):
                            if "text" in first:
                                return {"message": {"content": first["text"]}}
                            if "message" in first and isinstance(first["message"], dict) and "content" in first["message"]:
                                return {"message": {"content": first["message"]["content"]}}
                # Fallback: string conversion (also reached for dicts matching no shape above)
                return {"message": {"content": str(data)}}

            def chat(self, model: str, messages: list, timeout: int = 60) -> Dict:
                """
                Send a consolidated prompt to the Ollama HTTP /api/generate endpoint.

                Args:
                    model: Model name placed in the request payload.
                    messages: List of {"role": "...", "content": "..."} dicts. Only
                        entries whose role is "user" are used; their contents are
                        joined with newlines. Other roles are ignored.
                    timeout: Passed to `requests.post` as the request timeout.

                Returns:
                    {"message": {"content": "<text>"}}; when the body is not JSON,
                    the raw response text is used as the content.

                Raises:
                    RuntimeError: on an HTTP 404, or when the request fails with a
                        `RequestException` that is not an `HTTPError`.
                    HTTPError: re-raised unchanged for any other HTTP error status.
                """
                prompt_text = "\n".join([m.get("content", "") for m in messages if m.get("role") == "user"]).strip()
                url = f"{self.host}/api/generate"
                # stream=False requests a single response body instead of a stream.
                payload = {"model": model, "prompt": prompt_text, "stream": False}
                try:
                    resp = requests.post(url, json=payload, timeout=timeout)
                    resp.raise_for_status()
                except HTTPError as he:
                    # HTTPError must be handled before RequestException (its base class).
                    status = getattr(he.response, "status_code", None)
                    if status == 404:
                        # NOTE(review): Ollama may also answer 404 for an unknown model
                        # name - confirm; this message only mentions the daemon.
                        raise RuntimeError(f"Ollama HTTP endpoint not found (404). Is Ollama running at {self.host}?") from he
                    raise
                except RequestException as re:
                    # Covers connection errors and timeouts raised by requests.
                    raise RuntimeError(f"Failed to reach Ollama at {self.host}: {re}") from re

                # Parse JSON when possible
                try:
                    data = resp.json()
                except ValueError:
                    # Non-json, return as text
                    return {"message": {"content": resp.text}}
                return self._normalize(data)

        llm = SimpleOllamaClient(host=os.getenv("OLLAMA_HOST", "http://localhost:11434"))
        print("Using fallback HTTP Ollama client (requests).")
    except Exception as e:
        # Neither client could be set up; the caught exception `e` is not reported.
        print("Could not initialize Ollama client or HTTP fallback.")
        print("Install `ollama` python package or `requests`, and ensure Ollama daemon is running.")
        # Placeholder so `llm.chat(...)` fails with a clear message instead of an
        # AttributeError on None.
        class _StubLLM:
            def chat(self, *args, **kwargs):
                raise RuntimeError("No Ollama client available. Install `ollama` or start Ollama daemon.")
        llm = _StubLLM()


Using fallback HTTP Ollama client (requests).


# 2 Prompt Template and Formatting Function

In [3]:
# Notebook 04 — Cell 2
# Prompt template + builder for RAG. Keep the prompt strict to avoid hallucination.
# -------------------------------------------------------------------------------

def build_rag_prompt(query: str, context_data: Dict, instructions: Optional[str] = None) -> str:
    """
    Assemble the final RAG prompt that is sent to the LLM.

    Args:
        query: The user's question, placed under "USER QUESTION:".
        context_data: Mapping read for "merged_context" (a string, stripped
            before use) and "citations" (a list of mappings read for
            "source" and "chunk_preview").
        instructions: Optional extra instructions appended after the base rules.

    Returns:
        The prompt text with leading and trailing whitespace removed.
    """
    merged = context_data.get("merged_context", "").strip()

    # One "- source: preview" bullet per citation; previews are cut to 120 characters.
    # The list is informational only and is not strictly required by the LLM.
    bullets = "\n".join(
        f"- {entry.get('source', 'unknown')}: {entry.get('chunk_preview', '')[:120]}"
        for entry in context_data.get("citations", [])
    )

    # Strict base rules: answer from the context only and cite sources inline.
    rules = """
You are an enterprise-grade retrieval assistant. Answer using ONLY the provided context.
Do NOT hallucinate. If the required information is not present in the context, respond:
"The answer is not available in the provided context."

REQUIREMENTS:
- Cite sources inline for any factual claim using the bracket format: [source]
  Example: "6G targets ultra-low latency [./data/6g-paper.pdf]"
- Keep answers concise and reference the citation location when possible.
"""

    if instructions:
        rules += "\n" + instructions

    # Each entry is one line of the prompt; empty strings are the blank lines.
    sections = [
        "",
        rules,
        "",
        "CONTEXT:",
        "--------",
        merged,
        "--------",
        "",
        "CITATIONS:",
        bullets,
        "",
        "USER QUESTION:",
        f"{query}",
        "",
        "FINAL ANSWER (include inline citations where relevant):",
        "",
    ]
    return "\n".join(sections).strip()


# 3 Answer Generation Using Local LLM (Ollama + Llama3.1)

In [4]:
# Notebook 04 — Cell 3
# Core LLM invocation wrapper. Uses llm.chat() and normalizes outputs.
# -------------------------------------------------------------------

def _extract_answer_text(raw_resp: Any) -> Optional[str]:
    """Return the answer text from a known response shape, or None if none matches."""
    if isinstance(raw_resp, dict):
        # common pattern: {"message": {"content": "..."}}
        message = raw_resp.get("message")
        if isinstance(message, dict) and "content" in message:
            return message["content"]
        for key in ("response", "text"):
            if isinstance(raw_resp.get(key), str):
                return raw_resp[key]
        choices = raw_resp.get("choices")
        if isinstance(choices, list) and choices and isinstance(choices[0], dict):
            first = choices[0]
            nested = first.get("message")
            return first.get("text") or (nested.get("content") if isinstance(nested, dict) else None)
        return None

    # Object-style responses: some clients return typed objects rather than dicts,
    # exposing the text as `.message.content` (chat) or `.response` (generate).
    message = getattr(raw_resp, "message", None)
    if isinstance(message, dict):
        content = message.get("content")
    else:
        content = getattr(message, "content", None)
    if isinstance(content, str):
        return content
    response_text = getattr(raw_resp, "response", None)
    if isinstance(response_text, str):
        return response_text
    return None


def generate_llm_answer(query: str, context_for_llm: Dict, model: Optional[str] = None, timeout: int = 60) -> Dict:
    """
    Build prompt from context_for_llm and call the configured local LLM.

    Args:
        query: The user's question.
        context_for_llm: Context mapping handed to `build_rag_prompt`.
        model: Model name; when None, the OLLAMA_MODEL environment variable is
            used, defaulting to "llama3.1:latest".
        timeout: Forwarded to `llm.chat()`. If the client rejects the keyword
            with a TypeError, the call is retried without it, so the value is
            not applied in that case.

    Returns a dictionary with fields:
      - 'answer' (str) : model textual output
      - 'meta' (dict)  : underlying raw response (if available)

    Raises:
        RuntimeError: if the LLM call fails, including a failure of the retry
            made without `timeout`.
    """
    if model is None:
        model = os.getenv("OLLAMA_MODEL", "llama3.1:latest")

    prompt = build_rag_prompt(query, context_for_llm)
    messages = [{"role": "user", "content": prompt}]

    # The retry sits inside the outer `try` so that its failures are wrapped in
    # the same RuntimeError as failures of the first attempt.
    try:
        try:
            raw_resp = llm.chat(model=model, messages=messages, timeout=timeout)
        except TypeError:
            # Some clients may not accept timeout param; try again without it
            raw_resp = llm.chat(model=model, messages=messages)
    except Exception as e:
        raise RuntimeError(f"LLM call failed: {e}") from e

    # Normalize the output to extract a string answer.
    answer_text = _extract_answer_text(raw_resp)

    # Last fallback: stringify whatever was returned
    if answer_text is None:
        try:
            answer_text = str(raw_resp)
        except Exception:
            answer_text = "<no textual output>"

    return {"answer": answer_text, "meta": raw_resp}


# 4 Final Output Formatting - Answer + Citations

In [5]:
# Notebook 04 — Cell 4
# Formatting final output (print + structured return)
# -------------------------------------------------------------------

def format_final_output(answer_obj: Dict, context_for_llm: Dict) -> Dict:
    """
    Print a human-friendly final answer + citations and return a structured dict.

    Args:
        answer_obj: The dict returned by generate_llm_answer(); read for
            "answer" and "meta". A missing or None answer is treated as "",
            and a non-string answer is converted with str().
        context_for_llm: Context mapping read for "citations" (list of
            mappings with "source", "chunk_preview", "text", "rerank_score")
            and "merged_context".

    Returns:
        { "answer": str, "citations": [...], "raw_context": str, "llm_meta": ... }
        where each citation is {"source", "preview", "rerank_score"}.
    """
    answer_text = answer_obj.get("answer", "")
    raw_meta = answer_obj.get("meta", {})

    # Guard against None / non-string answers, which textwrap cannot handle.
    if answer_text is None:
        answer_text = ""
    elif not isinstance(answer_text, str):
        answer_text = str(answer_text)

    print("\n" + "=" * 40)
    print("FINAL ANSWER")
    print("=" * 40 + "\n")
    # Wrap line by line: textwrap.fill() on the whole text would replace the
    # newlines with spaces and merge paragraphs and bullet lists into one blob.
    print("\n".join(textwrap.fill(line, width=100) for line in answer_text.splitlines()))

    print("\n" + "=" * 40)
    print("CITATIONS")
    print("=" * 40 + "\n")
    citations = []
    for c in context_for_llm.get("citations") or []:
        # Defensive read with defaults; None values are treated like missing ones.
        src = c.get("source", "unknown")
        # chunk_preview is used as-is; only the raw-text fallback is cut to 120
        # characters (same as before).
        preview = c.get("chunk_preview") or (c.get("text") or "")[:120]
        rerank_score = c.get("rerank_score")
        score_str = f"{rerank_score:.4f}" if isinstance(rerank_score, (int, float)) else "n/a"
        print(f"- Source: {src}")
        print(f"  Preview: {preview}")
        print(f"  Rerank score: {score_str}\n")
        citations.append({"source": src, "preview": preview, "rerank_score": rerank_score})

    # Return structured object for programmatic use (APIs, UI)
    return {
        "answer": answer_text,
        "citations": citations,
        "raw_context": context_for_llm.get("merged_context", ""),
        "llm_meta": raw_meta
    }


In [6]:
# Notebook 04 — Cell 5
# Test cell — run end-to-end assuming `context_for_llm` exists (from Notebook 03).
# -------------------------------------------------------------------------------

# Replace this sample query with any user question
test_query = "What does 6G offer?"

if "context_for_llm" in globals():
    # The context is present in the kernel: generate an answer, then print and
    # collect it together with its citations.
    print("Running end-to-end test with current context_for_llm...")
    resp_obj = generate_llm_answer(test_query, context_for_llm, model=None)
    # `final` is the structured dict intended for downstream use.
    final = format_final_output(resp_obj, context_for_llm)
else:
    print("Context_for_llm not found in kernel. Run Notebook 03 first to produce it.")


Running end-to-end test with current context_for_llm...


RuntimeError: LLM call failed: Failed to reach Ollama at http://localhost:11434: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)

In [7]:
# Sanity check: show the top-level keys of the loaded context dict.
print("Context keys:", context_for_llm.keys())

Context keys: dict_keys(['merged_context', 'citations', 'raw_chunks'])


# ---- Ollama HTTP diagnostic (run this cell to test your local Ollama docker)
# Talks to the Ollama REST API directly with `requests`:
#   1. GET checks against the server root and two read-only API endpoints,
#   2. one small non-streaming POST to /api/generate,
#   3. a look at the JSON shape of the generate response.
import os
print("--- Ollama HTTP Diagnostic ---")
host = os.getenv('OLLAMA_HOST', 'http://localhost:11434')
model = os.getenv('OLLAMA_MODEL', 'llama3.1:latest')
print(f'Using host={host} model={model}')

try:
    import requests
    print('requests version:', requests.__version__)
except Exception as e:
    # Remember the failure so the HTTP checks are skipped cleanly instead of
    # each one failing with a NameError.
    requests = None
    print('requests not available in this kernel:', e)

if requests is None:
    print('Skipping HTTP checks: install `requests` in this kernel to run them.')
else:
    base_url = host.rstrip('/')

    # Quick GET checks. '' is the server root, /api/version reports the server
    # version and /api/tags lists the locally available models. /api/generate is
    # POST-only, so it is exercised separately below rather than with GET.
    for path in ['', '/api/version', '/api/tags']:
        url = base_url + path
        try:
            r = requests.get(url, timeout=4)
            print(f'GET {path} ->', r.status_code)
        except Exception as e:
            print(f'GET {path} error:', e)

    # Try a small generate POST to /api/generate
    payload = {'model': model, 'prompt': 'Say hello and identify the model used.', 'stream': False}
    gen_url = base_url + '/api/generate'
    try:
        r = requests.post(gen_url, json=payload, timeout=20)
    except Exception as e:
        print('POST /api/generate error:', e)
    else:
        print('POST /api/generate ->', r.status_code)
        text = r.text
        print('Response (truncated):', text[:800])
        try:
            j = r.json()
        except ValueError as e:
            # Report the parse failure instead of silently ignoring it.
            print('Response body is not valid JSON:', e)
        else:
            if isinstance(j, dict):
                print('JSON keys:', list(j.keys()))
            else:
                print('JSON payload is not an object:', type(j).__name__)
            # If llm object exists with _normalize, try to normalize and show result
            if 'llm' in globals() and hasattr(llm, '_normalize'):
                try:
                    print('Normalized sample:', llm._normalize(j))
                except Exception as e:
                    print('Normalization error:', e)
            elif isinstance(j, dict):
                # Try simple normalization heuristics; 'response' is the key that
                # /api/generate uses for the generated text.
                if 'response' in j:
                    print('response:', j.get('response'))
                elif 'message' in j:
                    print('message:', j.get('message'))
                elif 'text' in j:
                    print('text:', j.get('text'))
                elif 'choices' in j and isinstance(j['choices'], list) and len(j['choices']) > 0:
                    print('choices[0]:', j['choices'][0])

print('Diagnostic complete. If you see 404 from POST, verify the Ollama daemon and model name (try OLLAMA_MODEL=llama3.1:latest).')
