In [91]:
# imports
from __future__ import annotations
import re
from typing import List
import spacy
import json, os, uuid, textwrap, pathlib
import json, numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from openai import OpenAI

# Fact Extractor

In [13]:
nlp = spacy.load("en_core_web_sm")

In [15]:
# Sentences with these words aren't presented as facts in the essay
HEDGES = {
    "maybe","perhaps","probably","possibly","likely","unlikely","apparently","reportedly",
    "seems","seem","appears","appear","suggests","suggest","claims","claim","believe","think",
    "could","might","may","should","would","argue","argues","allege","alleges","estimate","estimates"
}

FACT_ENT_LABELS = {"DATE","TIME","MONEY","QUANTITY","PERCENT","CARDINAL","ORDINAL","GPE","LOC","ORG","PERSON","NORP"}
COPULAS = {"is","are","was","were","'s","'re"}

In [17]:
def is_imperative(doc) -> bool:
    # no subject, root is a verb/begins with a verb
    root = [t for t in doc if t.dep_ == "ROOT"]
    root = root[0] if root else None
    has_subject = any(t.dep_ in {"nsubj","nsubjpass","csubj"} for t in doc)
    starts_with_verb = doc[0].pos_ == "VERB"
    return (root is not None and root.pos_ == "VERB" and not has_subject and starts_with_verb)

In [21]:
is_imperative(nlp("Go to your room"))
is_imperative(nlp("He was very fast"))

False

In [51]:
def factual_score(doc) -> float:
    text = doc.text.strip()
    tokens = [t for t in doc if not t.is_space]

    # Grammer signals
    has_verb = any(t.pos_ in {"VERB","AUX"} for t in tokens)
    has_subject = any(t.dep_ in {"nsubj","nsubjpass","csubj"} for t in tokens)
    is_question = text.endswith("?")
    imperative = is_imperative(doc)
    too_short = sum(1 for t in tokens if not t.is_punct) < 7  # very short fragments

    # if the sentence is hedging
    lower_tokens = {t.lemma_.lower() for t in tokens}
    hedge_hits = len(HEDGES & lower_tokens)

    # Factual cues: numbers, named entities, or copula statements
    has_number = any(t.like_num for t in tokens)
    has_entities = any(ent.label_ in FACT_ENT_LABELS for ent in doc.ents)
    has_copula = any(t.lemma_.lower() in COPULAS for t in tokens)

    # Scoring system
    score = 0.0
    score += 1.5 if has_verb else 0
    score += 1.2 if has_subject else 0
    score += 0.8 if has_copula else 0
    score += 0.8 if (has_number or has_entities) else 0
    score -= 1.5 if is_question else 0
    score -= 1.0 if imperative else 0
    score -= 0.9 * min(3, hedge_hits)  # cap penalty
    score -= 0.9 if too_short else 0

    return score

In [52]:
print(factual_score(nlp("Go to your room")), factual_score(nlp("He was very fast")))

-0.4 1.8000000000000003


In [53]:
def is_factual_sentence(sent, threshold: float = 1.5) -> bool:
    """Return True if the sentence likely states a fact."""
    # Strip obvious non-factual patterns (standalone citations, figure captions, etc.)
    if re.fullmatch(r"\(?\d+(\s*-\s*\d+)?\)?", sent.text.strip()):
        return False
    doc = nlp(sent.text)
    return factual_score(doc) >= threshold

In [65]:
print(is_factual_sentence(nlp("He was very fast")),  is_factual_sentence(nlp("Go to your room")))

True False


In [66]:
def extract_factual_sentences(text: str, threshold: float = 1.5) -> List[str]:
    """Split into sentences and keep ones that look factual."""
    doc = nlp(text)
    factual = []
    for s in doc.sents:
        st = s.text.strip()
        if not st:
            continue
        if is_factual_sentence(s, threshold = threshold):
            factual.append(st)

    return factual

In [70]:
sample = """
c
"""
facts = extract_factual_sentences(sample, threshold=1.5)
for i, f in enumerate(facts, 1):
        print(f"{i:02d}. {f}")

01. Albert Einstein was a physicist, and he developed the theory of relativity.
02. He was born in Germany but later moved to the United States.
03. In 1921, Einstein won the Nobel Prize in Physics.


# Create Passage Store

In [78]:
pathlib.Path("data").mkdir(exist_ok=True)

passages = [
    {
        "passage_id": str(uuid.uuid4()),
        "title": "Eiffel Tower - Wikipedia",
        "url": "https://en.wikipedia.org/wiki/Eiffel_Tower",
        "source": "wikipedia",
        "published_date": None,
        "text": "The Eiffel Tower is a wrought-iron lattice tower in Paris, France. It was constructed from 1887 to 1889."
    },
    {
        "passage_id": str(uuid.uuid4()),
        "title": "Barack Obama - Wikipedia",
        "url": "https://en.wikipedia.org/wiki/Barack_Obama",
        "source": "wikipedia",
        "published_date": None,
        "text": "Barack Obama served as the 44th president of the United States from 2009 to 2017."
    },
    {
        "passage_id": str(uuid.uuid4()),
        "title": "Great Wall of China - Wikipedia",
        "url": "https://en.wikipedia.org/wiki/Great_Wall_of_China",
        "source": "wikipedia",
        "published_date": None,
        "text": "The Great Wall of China is located in northern China, stretching thousands of kilometers."
    },
]

with open("data/passages.jsonl", "w") as f:
    for p in passages:
        f.write(json.dumps(p) + "\n")
len(passages)


3

# Embed

In [80]:
# Load passages
rows = [json.loads(l) for l in open("data/passages.jsonl")]
texts = [r["text"] for r in rows]

# Load bge-m3 (CPU is fine for this tiny demo)
encoder = SentenceTransformer("BAAI/bge-m3")
emb = encoder.encode(texts, normalize_embeddings=True, convert_to_numpy=True)

# Build FAISS index (inner product == cosine when vectors are normalized)
index = faiss.IndexFlatIP(emb.shape[1])
index.add(emb)

# Save artifacts so you can reuse
faiss.write_index(index, "data/faiss.index")
np.save("data/passage_ids.npy", np.array([r["passage_id"] for r in rows], dtype="U"))
with open("data/passage_meta.json", "w") as f:
    json.dump(rows, f)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

  return forward_call(*args, **kwargs)


# Retrieval

In [82]:
# reload
index = faiss.read_index("data/faiss.index")
pid_map = np.load("data/passage_ids.npy", allow_pickle=True)
meta = {r["passage_id"]: r for r in json.load(open("data/passage_meta.json"))}
enc = encoder  # reuse already-loaded model

In [84]:
def retrieve_topk(query: str, k=3):
    q = enc.encode([query], normalize_embeddings=True, convert_to_numpy=True)
    scores, idxs = index.search(q, k)
    out = []
    for score, i in zip(scores[0], idxs[0]):
        pid = pid_map[i]
        m = meta.item().get(pid) if hasattr(meta, "item") else meta.get(pid)
        out.append({
            "id": pid,
            "title": m["title"],
            "url": m["url"],
            "source": m["source"],
            "published_date": m["published_date"],
            "text": m["text"],
            "score": float(score),
        })

    return out

In [86]:
retrieve_topk("Obama was never a president", k=2)

[{'id': np.str_('ccaf88fa-92cd-44c1-bd3b-5301a664e6fc'),
  'title': 'Barack Obama - Wikipedia',
  'url': 'https://en.wikipedia.org/wiki/Barack_Obama',
  'source': 'wikipedia',
  'published_date': None,
  'text': 'Barack Obama served as the 44th president of the United States from 2009 to 2017.',
  'score': 0.5229790210723877},
 {'id': np.str_('c0ce14bf-4e7d-4540-825e-787635a09dc2'),
  'title': 'Eiffel Tower - Wikipedia',
  'url': 'https://en.wikipedia.org/wiki/Eiffel_Tower',
  'source': 'wikipedia',
  'published_date': None,
  'text': 'The Eiffel Tower is a wrought-iron lattice tower in Paris, France. It was constructed from 1887 to 1889.',
  'score': 0.2773168981075287}]

# Prompt Builder

In [88]:
SCHEMA = {
  "label": "supports|contradicts|unclear",
  "confidence": 0.0,
  "quotes": ["..."],
  "notes": "short explanation",
  "source_urls": ["..."]   # must come from provided evidence only
}

In [126]:
def build_prompt(claim: str, evidences: list[dict]):
    ev_for_model = [
        {"title": e["title"], "url": e["url"], "text": e["text"][:400]}  # keep snippets short
        for e in evidences
    ]

    return f"""You are a claim-checking assistant.

Return STRICT JSON only (no prose), matching this schema:
{json.dumps(SCHEMA, ensure_ascii=False)}

Claim: {json.dumps(claim, ensure_ascii=False)}

Evidence (only cite from these URLs):
{json.dumps(ev_for_model, ensure_ascii=False)}

Rules:
- "label" is one of: supports, contradicts, unclear.
- "source_urls" must be a subset of the evidence urls.
- Include short "quotes" copied from evidence for rationale.
- Keep "notes" concise."""

In [111]:
client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")

In [112]:
def call_ollama(prompt: str, model="gpt-oss:20b", temperature=0.2, stream=True) -> str:
    resp = client.chat.completions.create(
        model=model,
        messages=[{"role":"user","content": prompt}],
        temperature=temperature,
        stream=stream,
    )
    if stream:
        out = ""
        for chunk in resp:
            delta = chunk.choices[0].delta
            if delta and delta.content:
                out += delta.content
        return out
    else:
        return resp.choices[0].message.content

In [113]:
def validate_or_retry(raw: str, prompt: str):
    try:
        data = json.loads(raw)
    except Exception:
        retry_prompt = prompt + "\n\nThe previous output was not valid JSON. Return valid JSON only."
        raw2 = call_ollama(retry_prompt)
        try:
            data = json.loads(raw2)
        except Exception:
            data = {"label":"unclear","confidence":0.0,"quotes":[],"notes":"invalid json","source_urls":[]}
    # guardrails
    data["confidence"] = max(0.0, min(float(data.get("confidence", 0.0)), 1.0))
    # enforce URL subset
    return data

In [123]:
def judge_claim(claim: str, k):
    evidence = retrieve_topk(claim, k=k)
    prompt = build_prompt(claim, evidence)
    raw = call_ollama(prompt)
    out = validate_or_retry(raw, prompt)
    # enforce URL subset
    allowed = {e["url"] for e in evidence}
    out["source_urls"] = [u for u in out.get("source_urls", []) if u in allowed]
    return out, evidence

In [122]:
sample = """Albert Einstein was a physicist, and he developed the theory of relativity. 
Maybe his ideas are kind of the best, right? What about quantum stuff?
He was born in Germany but later moved to the United States.
Please consider the following points. In 1921, Einstein won the Nobel Prize in Physics.
It seems people think he was the greatest."""
facts = extract_factual_sentences(sample, threshold=1.5)

In [124]:
results = []
for c in facts:
    v, ev = judge_claim(c, k=1)
    results.append({"claim": c, "verdict": v, "evidence": ev})

In [125]:
def bar(x): 
    filled = int(round(x*10))
    return "█"*filled + "░"*(10-filled)

for r in results:
    v = r["verdict"]
    print(f"\n— Claim: {r['claim']}")
    print(f"  Label: {v['label']}   Confidence: {bar(v['confidence'])} {v['confidence']:.2f}")
    if v.get("quotes"):
        print(f"  Quote: “{v['quotes'][0][:120]}”")
    if v.get("source_urls"):
        print("  Sources:")
        for u in v["source_urls"]:
            print("   •", u)



— Claim: Albert Einstein was a physicist, and he developed the theory of relativity.
  Label: unclear   Confidence: █████░░░░░ 0.50

— Claim: He was born in Germany but later moved to the United States.
  Label: unclear   Confidence: █████░░░░░ 0.50
  Quote: “Barack Obama served as the 44th president of the United States from 2009 to 2017.”
  Sources:
   • https://en.wikipedia.org/wiki/Barack_Obama

— Claim: In 1921, Einstein won the Nobel Prize in Physics.
  Label: unclear   Confidence: █████░░░░░ 0.50
