<a href="https://colab.research.google.com/github/DarianSawali/News-Based-RAG/blob/main/News_RAG_GPT2_cleaned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install feedparser newspaper3k sentence-transformers faiss-cpu




In [None]:
!pip install lxml_html_clean




In [None]:
import feedparser

# RSS feeds to ingest, keyed by a display name that is stored as each
# document's "source" field by fetch_articles_from_rss.
NEWS_SOURCES = {
    # "CBC BC": "https://www.cbc.ca/cmlink/rss-canada-britishcolumbia",
    "Global News BC": "https://globalnews.ca/bc/feed/",
    # NOTE(review): the recorded run below retrieved 0 entries from this feed —
    # verify that the URL is still valid.
    "CTV Vancouver": "https://bc.ctvnews.ca/rss/ctv-news-vancouver-1.822295",
}

def fetch_articles_from_rss(source_name, feed_url, max_articles=5):
    """Fetch up to ``max_articles`` usable entries from an RSS/Atom feed.

    An entry is usable when it has a non-empty summary or, failing that, a
    non-empty title. Entries with neither are skipped and do not count
    towards ``max_articles``, so a feed with a few blank entries near the top
    still yields the requested number of documents when enough usable
    entries exist.

    Args:
        source_name: Label stored in each document's ``"source"`` field.
        feed_url: URL (or anything ``feedparser.parse`` accepts) of the feed.
        max_articles: Maximum number of documents to return. ``None`` means
            no limit.

    Returns:
        A list of dicts with the keys ``"source"``, ``"title"``, ``"text"``,
        ``"url"`` and ``"published"``. Empty when the feed has no usable
        entries.
    """
    import warnings  # local import so the cell does not rely on another cell

    feed = feedparser.parse(feed_url)

    # feedparser reports fetch/parse problems through the `bozo` flag instead
    # of raising, so an unreachable feed would otherwise look like an empty one.
    if not feed.entries and getattr(feed, "bozo", False):
        reason = getattr(feed, "bozo_exception", "unknown error")
        warnings.warn(
            f"{source_name}: feed {feed_url!r} returned no entries ({reason!r})"
        )

    docs = []

    for entry in feed.entries:
        # Cap on *kept* documents, checked before doing any work on the entry.
        if max_articles is not None and len(docs) >= max_articles:
            break

        title = entry.get("title", "").strip()
        summary = entry.get("summary", "").strip()

        # Prefer the summary; fall back to the title when there is none.
        text = summary or title
        if not text:
            continue

        docs.append({
            "source": source_name,
            "title": title,
            "text": text,
            "url": entry.get("link", ""),
            "published": entry.get("published", ""),
        })

    return docs


In [None]:
import feedparser

# Sanity check: parse one feed directly and show how many entries it exposes.
feed = feedparser.parse("https://globalnews.ca/bc/feed/")
print(len(feed.entries))

10


In [None]:
# Pull articles from every configured feed into one flat list of document
# dicts; the position of a document in this list is what the retrieval cells
# later use to look it up.
all_documents = []

for name, url in NEWS_SOURCES.items():
    print("Fetching", name)
    docs = fetch_articles_from_rss(name, url)
    print(" → Retrieved:", len(docs))
    all_documents.extend(docs)

print("Total articles:", len(all_documents))

Fetching Global News BC
 → Retrieved: 5
Fetching CTV Vancouver
 → Retrieved: 0
Total articles: 5


In [None]:
# Show the corpus size and one sample record (index 1 needs at least two documents).
len(all_documents), all_documents[1]

(5,
 {'source': 'Global News BC',
  'title': 'Oil tanker ban just one obstacle to northern B.C. pipeline as MOU expected Thursday',
  'text': "Ottawa and Alberta are poised to announce a memorandum of understanding, affirming support for a pipeline to the B.C. coast that's being described as 'Northern Gateway 2.0.'",
  'url': 'https://globalnews.ca/news/11545298/alberta-bc-oil-pipeline-mou/',
  'published': 'Wed, 26 Nov 2025 22:30:54 +0000'})

In [None]:
!pip install sentence-transformers faiss-cpu

from sentence_transformers import SentenceTransformer
import numpy as np
import faiss




In [None]:
# Sentence-embedding model shared by corpus indexing and query encoding.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# One string per document: title, blank line, then the article text.
# Order matches all_documents, so row i of the embeddings is document i.
corpus_texts = [
    doc["title"] + "\n\n" + doc["text"]
    for doc in all_documents
]

# Encode the whole corpus into a 2-D NumPy array (one row per document) and
# cast to float32 — presumably because the FAISS index expects float32 input.
corpus_embeddings = embed_model.encode(corpus_texts, convert_to_numpy=True)
corpus_embeddings = corpus_embeddings.astype("float32")

print("Number of docs:", len(corpus_texts))
print("Embedding shape:", corpus_embeddings.shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Number of docs: 5
Embedding shape: (5, 384)


In [None]:
# Width of one embedding vector (second axis of the corpus matrix).
embedding_dim = corpus_embeddings.shape[1]

# Build a FAISS IndexFlatL2 of that width and add every corpus vector; vectors
# are added in corpus order, so search results index into all_documents.
index = faiss.IndexFlatL2(embedding_dim)
index.add(corpus_embeddings)

print("FAISS index size:", index.ntotal)


FAISS index size: 5


In [None]:
def retrieve_docs(query, k=3):
    """Return the documents nearest to ``query`` in the FAISS index.

    The query is encoded with ``embed_model`` and searched against the
    module-level ``index``; each hit is looked up in ``all_documents`` by
    position.

    Args:
        query: Free-text search string.
        k: Maximum number of documents to return. Values larger than the
            number of indexed vectors are clamped.

    Returns:
        A list of at most ``k`` new dicts, nearest first, each a copy of the
        stored document plus a ``"distance"`` key holding the distance
        reported by ``index.search`` as a Python float. Returns ``[]`` when
        ``k`` is not positive or the index is empty.
    """
    total = index.ntotal
    if k <= 0 or total == 0:
        return []

    # Asking FAISS for more neighbours than it holds pads the result with
    # index -1, which would silently resolve to all_documents[-1].
    k = min(k, total)

    q_emb = embed_model.encode([query], convert_to_numpy=True).astype("float32")
    distances, indices = index.search(q_emb, k)

    results = []
    for dist, idx in zip(distances[0], indices[0]):
        if idx < 0:
            # Defensive: -1 marks "no neighbour found" and is not a real row.
            continue
        # Copy so callers cannot mutate the stored corpus entry.
        results.append({**all_documents[int(idx)], "distance": float(dist)})
    return results


In [None]:
# Smoke-test retrieval: print source, title, distance and URL of the top hits
# so the ranking can be checked by eye.
test_query = "traffic delays in Metro Vancouver"
retrieved = retrieve_docs(test_query, k=3)

for r in retrieved:
    print(r["source"], "-", r["title"])
    print("distance:", r["distance"])
    print("url:", r["url"])
    print()


Global News BC - No criminal offence by police in failed search for B.C. woman found dead: watchdog
distance: 1.5033574104309082
url: https://globalnews.ca/news/11545029/vanderhoof-bc-rcmp/

Global News BC - Oil tanker ban just one obstacle to northern B.C. pipeline as MOU expected Thursday
distance: 1.6910425424575806
url: https://globalnews.ca/news/11545298/alberta-bc-oil-pipeline-mou/

Global News BC - Lower oil and gas industry spending in 2026 predicted by energy service group
distance: 1.7757796049118042
url: https://globalnews.ca/news/11545074/canada-energy-industry-spending-2026-prediction-enserva/



In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the GPT-2 tokenizer and causal LM from the Hugging Face Hub.
# NOTE(review): in the cells visible here, `tokenizer` and `gpt2` are only
# referenced by the commented-out generative summarizer; the active
# summarize_query_with_rag is extractive and does not use them.
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
gpt2 = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
# Reuse EOS as the pad token — presumably so padded tokenization
# (padding=True in the commented-out generator) has a pad token to use.
tokenizer.pad_token = tokenizer.eos_token


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# def summarize_query_with_rag(query, k=3, max_new_tokens=80):

#     retrieved = retrieve_docs(query, k=k)

#     if len(retrieved) == 0:
#         return "No relevant news found for this query."

#     all_summaries = []

#     for doc in retrieved:
#         source = doc["source"]
#         title = doc["title"]
#         text = doc["text"]

#         prompt = f"""
# You are a STRICT summarization system.

# RULES:
# - Summarize ONLY THIS ONE article.
# - Do NOT add any new facts.
# - Do NOT mix information from any other article.
# - If something is not mentioned in the text, you MUST NOT mention it.
# - Write 1–2 sentences.
# - End the summary with "(Source: {source})".

# ARTICLE:
# [{source}] {title}
# {text}

# SUMMARY:
# """.strip()

#         inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

#         output_ids = gpt2.generate(
#             **inputs,
#             max_new_tokens=min(max_new_tokens, 60),
#             do_sample=False,
#             repetition_penalty=1.2,
#             pad_token_id=tokenizer.eos_token_id,
#         )

#         generated_only = output_ids[0][inputs["input_ids"].shape[1]:]
#         summary = tokenizer.decode(generated_only, skip_special_tokens=True).strip()
#         all_summaries.append(summary)

#     return "\n\n".join(all_summaries)


In [None]:
# import re

# def extract_first_sentences(text, n=2):
#     """Return the first n sentences from a block of text."""
#     sentences = re.split(r'(?<=[.!?])\s+', text.strip())
#     sentences = [s for s in sentences if s]
#     return " ".join(sentences[:n])


# def summarize_query_with_rag(query, k=3):
#     retrieved = retrieve_docs(query, k=k)

#     if len(retrieved) == 0:
#         return "No relevant news found for this query."

#     summaries = []

#     for doc in retrieved:
#         source = doc["source"]
#         title = doc["title"]
#         text = doc["text"]

#         short = extract_first_sentences(text, n=2)

#         summary = f"{short} (Source: {source} — {title})"
#         summaries.append(summary)

#     return "\n\n".join(summaries)

In [None]:
import re

def extract_first_sentences(text, n=2):
    """Return the first ``n`` sentences of ``text`` joined by single spaces.

    Sentences are split after any ``.``, ``!`` or ``?`` that is followed by
    whitespace. This is a simple heuristic: abbreviations such as
    ``"B.C. coast"`` are treated as sentence boundaries too.

    Args:
        text: Text to shorten. ``None`` or an empty/blank string yields ``""``.
        n: Number of leading sentences to keep. ``None`` keeps all sentences;
            zero or a negative value yields ``""``.

    Returns:
        The selected sentences as one string, or ``""`` when there is nothing
        to return.
    """
    # Guard inputs that would otherwise crash (None has no .strip()) or be
    # misread by slicing (a negative n means "all but the last |n|").
    if not text or (n is not None and n <= 0):
        return ""

    pieces = re.split(r'(?<=[.!?])\s+', text.strip())
    sentences = [piece for piece in pieces if piece]
    return " ".join(sentences[:n])


def summarize_query_with_rag(query, k=3, distance_threshold=2.0):
    """
    Build an extractive digest of the news items closest to ``query``.

    The top-k documents come from ``retrieve_docs``. Any hit whose
    ``"distance"`` exceeds ``distance_threshold`` is dropped, and each
    remaining article is reduced to its first two sentences followed by a
    source/title attribution and, when a URL is present, a link line.
    No text is generated: every sentence is copied from the article.

    Returns the entries separated by blank lines, or a fixed message when
    nothing was retrieved or nothing survived the filtering.
    """
    hits = retrieve_docs(query, k=k)
    if not hits:
        return "No relevant news found for this query."

    blocks = []
    for hit in hits:
        # Hits without a distance are never filtered out.
        score = hit.get("distance", None)
        too_far = score is not None and score > distance_threshold
        if too_far:
            continue

        source, title, text = hit["source"], hit["title"], hit["text"]
        link = hit.get("url", "")

        snippet = extract_first_sentences(text, n=2)
        if snippet:
            entry = f"{snippet} (Source: {source} — {title})"
            blocks.append(f"{entry}\nLink: {link}" if link else entry)

    return "\n\n".join(blocks) if blocks else "No strong matches found for this query."


In [None]:
print(summarize_query_with_rag("northern BC oil pipeline", k=2))



Ottawa and Alberta are poised to announce a memorandum of understanding, affirming support for a pipeline to the B.C. coast that's being described as 'Northern Gateway 2.0.' (Source: Global News BC — Oil tanker ban just one obstacle to northern B.C. pipeline as MOU expected Thursday)
Link: https://globalnews.ca/news/11545298/alberta-bc-oil-pipeline-mou/

The West Coast Oil Tanker Ban came into effect in 2019 and prohibits tankers from carrying more than 12,500 metric tons of crude oil along the northern coast of B.C. (Source: Global News BC — B.C. Coastal First Nations dismiss any pipeline MOU, vow it will ‘never be built’)
Link: https://globalnews.ca/news/11544657/bc-first-nations-dismiss-pipeline-mou/
