# Step 1 | Platform Setup

## Step 1.1 | Check Environment
1. Open Anaconda Prompt
2. conda activate tf-gpu
3. jupyter notebook

In [1]:
import sys
import torch
import transformers

!where python
print(sys.executable)
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))
print(transformers.__version__)
print(transformers.__file__)

C:\Anaconda\envs\tf-gpu\python.exe
C:\Anaconda\python.exe
C:\Anaconda\envs\tf-gpu\python.exe
2.1.0+cu118
True
NVIDIA GeForce RTX 3050 4GB Laptop GPU
4.35.2
C:\Anaconda\envs\tf-gpu\lib\site-packages\transformers\__init__.py


## Step 1.2 | Import Libraries

In [82]:
import requests
import time
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering
from openai import OpenAI
import unicodedata
import os
from openpyxl import load_workbook
from openpyxl.styles import Font

# Step 2 | Fetch News Articles (GNews API)

## Step 2.1 | Define GNews API Key

In [1]:
# GNews API key. Prefer the GNEWS_API_KEY environment variable so the real key
# never has to be saved inside the notebook; the placeholder is only a fallback.
GNEWS_API_KEY = os.environ.get("GNEWS_API_KEY") or "INSERT_KEY_HERE"

## Step 2.2 | Define Fetch Function

In [12]:
def fetch_top_headlines(api_key, max_results=8, timeout=10):
    """
    Fetches top headlines from GNews using the /top-headlines endpoint.

    Parameters:
        api_key (str): Your GNews API key
        max_results (int): Number of articles requested via the "max" query
            parameter. The service may return fewer than requested.
        timeout (float): Seconds to wait for the HTTP response before giving up

    Returns:
        list of dicts: Each dict contains title, description, content,
        source_name, publishedAt, and url. An empty list is returned when the
        request fails or the response body is not valid JSON.
    """
    base_url = "https://gnews.io/api/v4/top-headlines"
    params = {
        "lang": "en",
        "country": "us",
        "max": max_results,
        "apikey": api_key
    }

    try:
        # Without a timeout a stalled connection would hang the notebook forever.
        response = requests.get(base_url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that cannot be decoded as JSON.
        print("Error fetching top headlines:", e)
        return []

    articles = []
    for article in data.get("articles") or []:
        # "source" may be missing or explicitly null; treat both as "no source".
        source = article.get("source") or {}
        articles.append({
            "title": article.get("title"),
            "description": article.get("description"),
            "content": article.get("content"),
            "source_name": source.get("name"),
            "publishedAt": article.get("publishedAt"),
            "url": article.get("url")
        })

    return articles

## Step 2.3 | Fetch Articles

In [113]:
# Fetch today's top headlines
all_articles = fetch_top_headlines(GNEWS_API_KEY, max_results=25)

# Check the result
print(f"Fetched {len(all_articles)} articles.")
# Passing `columns=` selects the preview fields and still yields an (empty)
# frame with those columns when the fetch failed and returned no articles,
# instead of raising a KeyError on the column selection.
pd.DataFrame(all_articles, columns=["title", "source_name", "publishedAt"])

Fetched 10 articles.


Unnamed: 0,title,source_name,publishedAt
0,Jake Paul vs. Julio Cesar Chavez Jr. live resu...,Yahoo Sports,2025-06-28T23:20:43Z
1,Final party set to end Bezos-Sánchez wedding e...,The Guardian,2025-06-28T22:45:00Z
2,Hurricane season is here and meteorologists ar...,"ABC News - Breaking News, Latest News and Videos",2025-06-28T21:13:26Z
3,Warren Buffett announces $6 billion in donatio...,AP News,2025-06-28T21:04:00Z
4,"Dave Parker, 2-time World Series champ, 7-time...",ESPN,2025-06-28T20:29:00Z
5,Ancient city possibly ruled by females living ...,CBS News,2025-06-28T20:26:47Z
6,Protesters line highway in Florida Everglades ...,AP News,2025-06-28T20:11:00Z
7,"Celebrities emerge day after Jeff Bezos, Laure...",USA Today,2025-06-28T19:42:04Z
8,Kneecap hit back at Starmer in highly-charged ...,BBC,2025-06-28T18:41:09Z
9,"After Supreme Court term, Chief Justice Robert...",CNN,2025-06-28T17:54:00Z


# Step 3 | Data Preprocessing

## Step 3.1 | Convert Data to DataFrame

In [114]:
# Convert raw list to DataFrame.
# The column list mirrors the keys produced by fetch_top_headlines; naming it
# explicitly keeps every expected column present even when no articles were
# fetched, so later cells can still address them.
# Missing text fields are filled with an empty string (no in-place mutation).
df_articles = pd.DataFrame(
    all_articles,
    columns=["title", "description", "content", "source_name", "publishedAt", "url"],
).fillna("")

In [115]:
# Define cleaning function for messy text encoding
def clean_text(text):
    """
    Repair common UTF-8-read-as-Windows-1252 mojibake, then normalize the text.

    Parameters:
        text: Value to clean. Anything that is not a str yields "".

    Returns:
        str: The repaired, NFKC-normalized text with surrounding whitespace removed.
    """
    if not isinstance(text, str):
        return ""

    # Each pair is (garbled sequence, intended character). Escapes are used so
    # the exact code points survive copy/paste of the notebook.
    mojibake_fixes = (
        ("\u00e2\u20ac\u2122", "'"),       # right single quote -> straight apostrophe
        ("\u00e2\u20ac\u0153", '"'),       # left double quote -> straight quote
        ("\u00e2\u20ac\ufffd", '"'),       # right double quote (last byte lost to U+FFFD)
        ("\u00e2\u20ac\u009d", '"'),       # right double quote (last byte kept as U+009D)
        ("\u00e2\u20ac\u201d", "\u2014"),  # em dash
        ("\u00e2\u20ac\u201c", "-"),       # en dash -> hyphen
        ("\u00c3\u00a9", "\u00e9"),        # e with acute accent
        ("\u00c3\u00a0", "\u00e0"),        # a with grave accent (A-tilde + no-break space)
    )  # Add more mappings as needed

    # Repairs must run BEFORE normalization: NFKC rewrites the trademark sign
    # as "TM" and the no-break space as a plain space, which would stop the
    # garbled sequences above from ever matching. A lone A-tilde is left alone
    # because it is a legitimate letter (e.g. in Portuguese words).
    for garbled, intended in mojibake_fixes:
        text = text.replace(garbled, intended)

    text = unicodedata.normalize("NFKC", text)  # Normalize character width & accents
    return text.strip()

In [116]:
# Run clean_text over each core text column that the frame actually contains
text_columns = ["title", "description", "content"]
for col in [name for name in text_columns if name in df_articles.columns]:
    df_articles[col] = df_articles[col].apply(clean_text)

In [117]:
# Build the single text field used for clustering / embedding:
# "<title>. <description>. <content>", each piece trimmed first.
df_articles["full_text"] = (
    df_articles["title"].str.strip()
    .str.cat(
        [df_articles["description"].str.strip(), df_articles["content"].str.strip()],
        sep=". ",
    )
    .str.strip()
)

print("✅ Cleaned and combined text fields")

✅ Cleaned and combined text fields


## Step 3.2 | Remove Irrelevant Articles

In [118]:
# Keep only articles whose combined text is longer than 50 characters,
# then renumber the rows from zero
df_articles = df_articles.loc[
    lambda frame: frame["full_text"].str.len() > 50
].reset_index(drop=True)

In [119]:
# Drop near-duplicates using cosine similarity
def drop_near_duplicates(df, text_column="full_text", similarity_threshold=0.85):
    """
    Drops near-duplicate rows based on cosine similarity between their text contents.

    Parameters:
        df (pd.DataFrame): Articles to deduplicate
        text_column (str): Column holding the text that is compared
        similarity_threshold (float): A row is dropped when its TF-IDF cosine
            similarity to any earlier row is strictly greater than this value

    Returns:
        pd.DataFrame: A new frame without the later member of each
        near-duplicate pair, with the index reset to 0..n-1.
    """
    # With fewer than two rows there is nothing to compare, and fitting the
    # vectorizer on an empty corpus would raise; return the rows unchanged.
    if len(df) < 2:
        return df.reset_index(drop=True)

    texts = df[text_column].tolist()
    tfidf = TfidfVectorizer(stop_words='english').fit_transform(texts)
    sim_matrix = cosine_similarity(tfidf)

    # Only the upper triangle is scanned, so the earlier row of a pair is
    # always the one that is kept.
    to_drop = set()
    for i in range(len(texts)):
        for j in range(i + 1, len(texts)):
            if sim_matrix[i, j] > similarity_threshold:
                to_drop.add(j)

    # to_drop holds positions, so translate them to index labels before dropping.
    return df.drop(df.index[sorted(to_drop)]).reset_index(drop=True)

In [120]:
# Remove near-duplicate articles (defaults spelled out for the reader)
df_articles_clean = drop_near_duplicates(
    df_articles,
    text_column="full_text",
    similarity_threshold=0.85,
)

print(f"Cleaned down to {len(df_articles_clean)} unique articles.")
df_articles_clean.loc[:, ["title", "source_name", "publishedAt"]]

Cleaned down to 10 unique articles.


Unnamed: 0,title,source_name,publishedAt
0,Jake Paul vs. Julio Cesar Chavez Jr. live resu...,Yahoo Sports,2025-06-28T23:20:43Z
1,Final party set to end Bezos-Sánchez wedding e...,The Guardian,2025-06-28T22:45:00Z
2,Hurricane season is here and meteorologists ar...,"ABC News - Breaking News, Latest News and Videos",2025-06-28T21:13:26Z
3,Warren Buffett announces $6 billion in donatio...,AP News,2025-06-28T21:04:00Z
4,"Dave Parker, 2-time World Series champ, 7-time...",ESPN,2025-06-28T20:29:00Z
5,Ancient city possibly ruled by females living ...,CBS News,2025-06-28T20:26:47Z
6,Protesters line highway in Florida Everglades ...,AP News,2025-06-28T20:11:00Z
7,"Celebrities emerge day after Jeff Bezos, Laure...",USA Today,2025-06-28T19:42:04Z
8,Kneecap hit back at Starmer in highly-charged ...,BBC,2025-06-28T18:41:09Z
9,"After Supreme Court term, Chief Justice Robert...",CNN,2025-06-28T17:54:00Z


# Step 4 | Topic Clustering (TF-IDF)

## Step 4.1 | Vectorize Text

In [121]:
# Turn each article's full_text into a TF-IDF vector (English stop words removed)
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df_articles_clean.loc[:, "full_text"])

## Step 4.2 | Cluster Text

In [124]:
# Cluster into shared narratives
num_clusters = 4  # Desired number of narratives (upper bound, see below)

# The number of usable articles varies from day to day. Asking for more
# clusters than there are articles makes the fit fail, so cap the request.
n_articles = tfidf_matrix.shape[0]

if n_articles < 2:
    # Nothing to group: put whatever is there (zero or one article) in cluster 0.
    clustering_model = None
    cluster_labels = [0] * n_articles
else:
    clustering_model = AgglomerativeClustering(
        n_clusters=min(num_clusters, n_articles),
        metric='cosine',
        linkage='average',
        compute_full_tree=True
    )

    # The model is fitted on a dense array, hence the .toarray() conversion
    # of the sparse TF-IDF matrix.
    cluster_labels = clustering_model.fit_predict(tfidf_matrix.toarray())

## Step 4.3 | Add Cluster Labels to DataFrame

In [125]:
# Store each article's cluster id alongside it in the cleaned DataFrame
df_articles_clean["cluster"] = cluster_labels

# Preview, grouped by cluster
df_articles_clean.sort_values("cluster")[["cluster", "title", "source_name"]]

Unnamed: 0,cluster,title,source_name
0,0,Jake Paul vs. Julio Cesar Chavez Jr. live resu...,Yahoo Sports
1,0,Final party set to end Bezos-Sánchez wedding e...,The Guardian
4,0,"Dave Parker, 2-time World Series champ, 7-time...",ESPN
5,0,Ancient city possibly ruled by females living ...,CBS News
7,0,"Celebrities emerge day after Jeff Bezos, Laure...",USA Today
8,0,Kneecap hit back at Starmer in highly-charged ...,BBC
9,0,"After Supreme Court term, Chief Justice Robert...",CNN
3,1,Warren Buffett announces $6 billion in donatio...,AP News
6,2,Protesters line highway in Florida Everglades ...,AP News
2,3,Hurricane season is here and meteorologists ar...,"ABC News - Breaking News, Latest News and Videos"


# Step 5 | Prompt Generator (OpenRouter)

## Step 5.1 | Define OpenRouter API Key

In [None]:
# Create the OpenAI-compatible client using OpenRouter settings.
# The key is taken from the OPENROUTER_API_KEY environment variable when set,
# so the real secret does not have to be saved in the notebook; the
# placeholder is only a fallback.
client = OpenAI(
    api_key=os.environ.get("OPENROUTER_API_KEY") or "INSERT_KEY_HERE",
    base_url="https://openrouter.ai/api/v1"  # required for OpenRouter
)

## Step 5.2 | Define LLM Parameters

In [128]:
# List of OpenRouter LLMs
# models = client.models.list()
# for m in models.data:
#     print(m.id)
# """
# "openai/gpt-4o"
# "microsoft/mai-ds-r1:free"
# "google/gemini-2.0-flash-exp:free"
# "minimax/minimax-m1:extended"
# """

In [129]:
# --- Model configuration ---
# Swap in any free OpenRouter model here
LLM_MODEL = "microsoft/mai-ds-r1:free"

# --- Content chunking ---
# Characters of content to include per article in the prompt
max_content_chars = 1000

# --- Generation parameters ---
# Maximum tokens returned by the LLM per response
max_tokens = 2000
# Higher = more creative, lower = more focused and safe
temperature = 0.7
# Optional: nucleus sampling control (keep at 1.0 unless experimenting)
top_p = 1.0

# --- Rate limit control ---
# Seconds to pause between API calls to avoid rate limits
rate_limit_pause = 2.0

## Step 5.3 | Define Prompt Generator

Generates a reflective writing prompt for a cluster of articles.

Includes title + description + trimmed content for each article to guide the LLM.

**Parameters:**
- cluster_df (pd.DataFrame): DataFrame of articles in one cluster
- cluster_id (int): Cluster ID (for logging)
- model (str): LLM model ID from OpenRouter
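- max_content_chars (int): Max characters to keep from article content (default: the notebook-level `max_content_chars` setting from Step 5.2, currently 1000)
- top_p (float): Nucleus sampling value passed to the LLM (default: the notebook-level `top_p` setting from Step 5.2)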

**Returns:**
- str: The generated prompt output from the LLM

In [130]:
def generate_prompt_for_cluster(cluster_df, cluster_id, model=LLM_MODEL, max_content_chars=max_content_chars, top_p=top_p):
    """
    Generate a reflective writing prompt for one cluster of articles.

    Each article contributes its title, description and a trimmed slice of its
    content to the text sent to the LLM.

    Parameters:
        cluster_df (pd.DataFrame): Articles belonging to one cluster
        cluster_id (int): Cluster ID (only used in the error message)
        model (str): LLM model ID from OpenRouter
        max_content_chars (int): Max characters to keep from each article's content
        top_p (float): Nucleus sampling value passed to the LLM

    Returns:
        str: The LLM's reply, or a message starting with "ERROR generating
        prompt" when the API call fails (the exception is not re-raised).

    Notes:
        Uses the notebook-level `client`, `temperature` and `max_tokens`
        variables, looked up when the function is called.
    """

    def _as_text(value):
        # Cells that are not strings (e.g. None or NaN) count as empty text
        # instead of crashing on .strip().
        return value.strip() if isinstance(value, str) else ""

    # --- Extract text from each article in the cluster ---
    entries = []
    for _, row in cluster_df.iterrows():
        title = _as_text(row.get("title", ""))
        description = _as_text(row.get("description", ""))
        content = _as_text(row.get("content", ""))[:max_content_chars]

        entry_text = f"- {title}: {description} {content}".strip()
        entries.append(entry_text)

    headlines_text = "\n".join(entries)

    # --- Input prompt (edit the wording here) ---
    prompt = f"""
Here are 2–3 articles or headlines about a current event:

{headlines_text}

I’m writing a reflective piece in the “Politics for the Tired” style. Please:

- Summarize what happened in emotionally neutral terms
- Identify 2–3 emotions a thoughtful, tired reader might feel
- Name 1–2 moral or identity-based tensions
- Suggest a soft narrative arc for an essay
- Propose a working title and subtitle that evoke the emotional truth, not just the facts
"""

    # --- Call the LLM API ---
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
            max_tokens=max_tokens,  # Keep outputs token-efficient
            top_p=top_p
        )
        return response.choices[0].message.content

    except Exception as e:
        # Deliberate best-effort: one failed cluster must not stop the whole
        # run, so the failure is reported in place of the prompt text.
        return f"ERROR generating prompt for cluster {cluster_id}: {e}"

## Step 5.4 | Run LLM Prompt

In [131]:
# Step 1: Order the cluster IDs from the largest cluster to the smallest
cluster_counts = df_articles_clean["cluster"].value_counts()
cluster_sizes = cluster_counts.sort_values(ascending=False).index.tolist()

In [132]:
# Step 2: Ask the LLM for one prompt per cluster, largest cluster first
clustered_prompts = []

for cluster_id in cluster_sizes:
    in_this_cluster = df_articles_clean["cluster"] == cluster_id
    cluster_df = df_articles_clean.loc[in_this_cluster]
    output = generate_prompt_for_cluster(cluster_df, cluster_id)

    headline_block = "\n".join(cluster_df["title"].tolist())
    clustered_prompts.append(
        dict(cluster=cluster_id, headlines=headline_block, prompt_output=output)
    )

    # Wait between calls so the API rate limit is not hit
    time.sleep(rate_limit_pause)

# Step 3: Collect the per-cluster records into the final DataFrame
df_prompts = pd.DataFrame(clustered_prompts)

## Step 5.5 | Export to Excel

In [133]:
# === Settings recorded alongside the exported prompts ===
parameters_used = dict(
    LLM_Model=LLM_MODEL,
    max_content_chars=max_content_chars,
    max_tokens=max_tokens,
    temperature=temperature,
    top_p=top_p,
    rate_limit_pause=rate_limit_pause,
)

In [134]:
# === Step 1: Pick an output filename that does not exist yet ===
# Try "narrative_prompts.xlsx" first, then "_1", "_2", ... until a free name is found.
base_filename = "narrative_prompts"
existing_files = [
    name
    for name in os.listdir()
    if name.startswith(base_filename) and name.endswith(".xlsx")
]

filename = f"{base_filename}.xlsx"
version = 0
while filename in existing_files:
    version += 1
    filename = f"{base_filename}_{version}.xlsx"

In [135]:
# === Step 2: Two-column table of the settings (one row per parameter) ===
df_params = pd.DataFrame(
    {
        "Parameter": list(parameters_used.keys()),
        "Value": list(parameters_used.values()),
    }
)

In [136]:
# === Step 3: Save prompts and parameters as two sheets of one workbook ===
with pd.ExcelWriter(filename, engine="openpyxl") as writer:
    for sheet_name, frame in (("Prompts", df_prompts), ("Parameters", df_params)):
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

In [137]:
# === Step 4 (optional): Apply one font to every cell of every sheet ===
wb = load_workbook(filename)
font = Font(name="Aptos Narrow", size=11)

for sheet in wb.sheetnames:
    ws = wb[sheet]
    # Walk the sheet row by row and restyle each cell in turn
    for cell in (c for row in ws.iter_rows() for c in row):
        cell.font = font

# Overwrite the workbook with the restyled version
wb.save(filename)

print(f"✅ Prompts and parameters saved to {filename}")

✅ Prompts and parameters saved to narrative_prompts_3.xlsx
