In [None]:
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
import pickle

# --- Step 1. Load and Precompute Embeddings ---
# The CSV is expected to provide 'title' and 'text' columns.
df = pd.read_csv("/content/cleaned_true.csv")

# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load the sentence encoder directly onto the selected device.
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

print("Embedding true news corpus...")
corpus = df["text"].tolist()
# The device is passed again here so encoding runs where the model was placed.
corpus_embeddings = model.encode(
    corpus, batch_size=64, show_progress_bar=True, device=device
)

# Persist the embeddings and the source frame so a later cell can reload them.
with open("corpus_embeddings.pkl", "wb") as embeddings_file:
    pickle.dump(corpus_embeddings, embeddings_file)
df.to_pickle("isot_true_news_df.pkl")

# Build a cosine-distance nearest-neighbour index over the embeddings
# (5 neighbours by default) and save the fitted index next to the embeddings.
nn = NearestNeighbors(n_neighbors=5, metric="cosine")
nn.fit(corpus_embeddings)
with open("isot_nn.pkl", "wb") as index_file:
    pickle.dump(nn, index_file)

print("ISOT retriever trained and saved.")


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding true news corpus...


Batches:   0%|          | 0/326 [00:00<?, ?it/s]

ISOT retriever trained and saved.


In [None]:
import requests
import pickle
import pandas as pd
from sentence_transformers import SentenceTransformer

try:
    from tabulate import tabulate
except ImportError:
    print("Installing tabulate for nice formatting...")
    import sys
    import subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "tabulate"])
    from tabulate import tabulate

# Restore retriever and data from the files saved by the indexing step.
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: pickle.load executes arbitrary code; only load files you created.
    with open(path, "rb") as handle:
        return pickle.load(handle)


df = pd.read_pickle("isot_true_news_df.pkl")
corpus_embeddings = _load_pickle("corpus_embeddings.pkl")
nn = _load_pickle("isot_nn.pkl")
# No device is given here, so the library picks its default.
model = SentenceTransformer("all-MiniLM-L6-v2")

def retrieve_isot(query, top_k=5):
    """Return the ISOT articles whose embeddings are closest to `query`.

    Relies on the module-level `model` (sentence encoder), `nn` (fitted
    nearest-neighbour index) and `df` (article frame with 'title' and 'text'
    columns). Row positions returned by `nn` are used to index `df`, so the
    two are assumed to describe the same corpus in the same order.

    Args:
        query: Free-text search string.
        top_k: Maximum number of articles to return. Values larger than the
            number of rows in `df` are clamped to that number.

    Returns:
        DataFrame with columns 'title', 'text', 'source' (always 'ISOT') and
        'url' (always None, present only for uniformity with API results),
        re-indexed from 0 in the order returned by `nn.kneighbors`.
    """
    # Requesting more neighbours than the index holds raises in scikit-learn,
    # so cap the request at the corpus size.
    k = min(top_k, len(df))
    q_emb = model.encode([query])
    _, indices = nn.kneighbors(q_emb, n_neighbors=k)
    nearest_rows = df.iloc[indices[0]][['title', 'text']]
    # `assign` builds a new frame instead of writing into a slice of `df`,
    # which avoids pandas' SettingWithCopyWarning.
    return nearest_rows.assign(source='ISOT', url=None).reset_index(drop=True)

def retrieve_gnews(query, api_key, top_k=3):
    """Fetch up to `top_k` articles matching `query` from the NewsAPI
    `/v2/everything` endpoint, sorted by popularity.

    The call is best-effort: a transport failure or a non-200 response yields
    an empty frame instead of an exception.

    Args:
        query: Free-text search string; sent as a query parameter so that
            characters such as '&', '#' or spaces are encoded correctly.
        api_key: NewsAPI key.
        top_k: Maximum number of articles to keep from the response.

    Returns:
        DataFrame with columns 'title', 'text' (the article description),
        'source' (publisher name, 'GNews' when the name key is absent) and
        'url'. The columns are present even when there are no rows.
    """
    columns = ["title", "text", "source", "url"]
    try:
        res = requests.get(
            "https://newsapi.org/v2/everything",
            # Let requests build the query string; interpolating the raw
            # query into the URL breaks on reserved characters.
            params={"q": query, "sortBy": "popularity", "apiKey": api_key},
            # Without a timeout a stalled connection blocks the notebook forever.
            timeout=10,
        )
    except requests.RequestException:
        # Treat network failures the same way as a non-200 response.
        return pd.DataFrame(columns=columns)
    if res.status_code != 200:
        return pd.DataFrame(columns=columns)
    articles = (res.json().get("articles") or [])[:top_k]
    records = [
        {
            "title": a.get("title"),
            "text": a.get("description"),
            # "source" may be present but null in the payload.
            "source": (a.get("source") or {}).get("name", "GNews"),
            "url": a.get("url"),
        }
        for a in articles
    ]
    return pd.DataFrame(records, columns=columns)

def rag_plus_api(query, api_key, rag_k=3, api_k=3):
    """Run both retrievers for `query` and stack their results.

    Args:
        query: Free-text search string passed to both retrievers.
        api_key: Key forwarded to `retrieve_gnews`.
        rag_k: Number of results requested from `retrieve_isot`.
        api_k: Number of results requested from `retrieve_gnews`.

    Returns:
        A 3-tuple `(isot_frame, api_frame, combined_frame)`; the combined
        frame holds the ISOT rows followed by the API rows with a fresh
        0-based index.
    """
    local_hits = retrieve_isot(query, top_k=rag_k)
    live_hits = retrieve_gnews(query, api_key, top_k=api_k)
    merged = pd.concat((local_hits, live_hits), ignore_index=True)
    return local_hits, live_hits, merged

# Usage Example
# Credentials must not live in the notebook: read the NewsAPI key from the
# NEWSAPI_KEY environment variable and fall back to an interactive prompt.
# The key that was previously hardcoded here has been exposed and should be
# revoked and replaced.
import os
from getpass import getpass

API_KEY = os.environ.get("NEWSAPI_KEY") or getpass("NewsAPI key: ")
query = "bad news for trump"

rag_only, gnews_only, combined_results = rag_plus_api(query, api_key=API_KEY, rag_k=3, api_k=3)

def print_results(df, title):
    """Print a titled, grid-formatted table of retrieval results.

    A separator line and `title` are always printed. An empty frame prints
    "No results found." and nothing else. Otherwise the 'source', 'title',
    'text' and 'url' columns are rendered with `tabulate`, with any text
    longer than 512 characters cut to its first 512 characters plus "...".
    The frame passed in is not modified.

    Args:
        df: Results frame with 'source', 'title', 'text' and 'url' columns.
        title: Heading printed above the table.
    """
    separator = "=" * 30
    print("\n" + separator)
    print(f"{title}\n")
    if df.empty:
        print("No results found.")
        return

    def _shorten(value):
        # Missing values and short texts pass through unchanged.
        if pd.isnull(value) or len(value) <= 512:
            return value
        return value[:512] + "..."

    # Work on a copy so truncation is for display only.
    preview = df.copy()
    preview["text"] = preview["text"].apply(_shorten)
    rendered = tabulate(
        preview[["source", "title", "text", "url"]],
        headers="keys",
        tablefmt="fancy_grid",
        showindex=False,
    )
    print(rendered)

# Show each result set in turn: local retriever, live API, then both combined.
for frame, heading in (
    (rag_only, "1. Only RAG Results (ISOT True News)"),
    (gnews_only, "2. Only GNews API Results"),
    (combined_results, "3. Combined Results (RAG + GNews API)"),
):
    print_results(frame, heading)



1. Only RAG Results (ISOT True News)

╒══════════╤════════════════════════════════════════════════════════════════════════════════╤═════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╤═══════╕
│ source   │ title                                                                          │ text                                                                                                                                                                                                                                                   