In [None]:
"""
Module to fetch, sample, and save article titles from the Perigon API.

Defines:
- fetch_titles: retrieve titles with pagination and rate-limit handling.
- print_sample_titles: display a subset of fetched titles.
- save_titles_to_csv: write titles to a CSV file.
"""

import time
import requests
import pandas as pd

# Perigon API key, sent by fetch_titles as the `apiKey` query parameter.
# Replace the placeholder before running; avoid committing a real key.
API_KEY = "<YOUR_API_KEY_HERE>"
# Endpoint that fetch_titles queries for articles.
BASE_URL = "https://api.goperigon.com/v1/all"

def fetch_titles(from_date: str, to_date: str, source: str = "cnn.com",
                 max_pages: int = 30, retry_after: int = 5, *,
                 page_size: int = 100, max_retries: int = 5,
                 timeout: float = 30.0) -> list:
    """
    Fetch article titles from a news source over a date range.

    Pages are requested one at a time, sorted by date, until ``max_pages`` is
    reached, a short or empty page signals the end of the results, or an
    error occurs. Errors are reported with ``print`` and end the fetch; the
    titles collected so far are still returned.

    Parameters
    ----------
    from_date : str
        Start date (YYYY-MM-DD) for fetching articles.
    to_date : str
        End date (YYYY-MM-DD) for fetching articles.
    source : str, default "cnn.com"
        Domain of the news source to query.
    max_pages : int, default 30
        Maximum number of result pages to request.
    retry_after : int, default 5
        Seconds to wait after a 429 (rate limit) response before retrying.
    page_size : int, default 100
        Number of articles requested per page (sent as ``pageSize``). A page
        containing fewer articles than this is treated as the last page.
    max_retries : int, default 5
        Maximum number of consecutive 429 retries for a single page before
        giving up, so a persistent rate limit cannot loop forever.
    timeout : float, default 30.0
        Seconds to wait for the server before aborting a request.

    Returns
    -------
    list of str
        Titles of articles retrieved from the API. Articles whose title is
        missing, null or empty are recorded as "No Title".
    """
    titles = []
    page = 1
    retries = 0  # consecutive 429 responses received for the current page

    while page <= max_pages:
        # Let requests build and URL-encode the query string.
        params = {
            "source": source,
            "from": from_date,
            "to": to_date,
            "sortBy": "date",
            "page": page,
            "apiKey": API_KEY,
            "pageSize": page_size,
        }
        try:
            response = requests.get(BASE_URL, params=params, timeout=timeout)

            if response.status_code == 429:
                if retries >= max_retries:
                    print(f"Rate limit still active after {max_retries} retries; "
                          f"stopping at page {page}.")
                    break
                retries += 1
                print(f"Rate limit reached. Retrying in {retry_after} seconds...")
                time.sleep(retry_after)
                continue
            if response.status_code == 400:
                print(f"Error fetching page {page}: {response.status_code}, {response.json()}")
                break
            if response.status_code != 200:
                print(f"Error fetching page {page}: {response.status_code}, {response.text}")
                break

            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # Network failures, timeouts and undecodable JSON bodies end the
            # fetch but keep whatever was collected so far.
            print(f"Error: {e}")
            break

        retries = 0  # the page was served, so the retry budget starts over

        articles = data.get("articles") if isinstance(data, dict) else None
        if not articles:
            print(f"No articles on page {page}")
            break

        # `or` also covers titles present in the payload as null or "".
        titles.extend(article.get("title") or "No Title" for article in articles)

        # A short page means the result set is exhausted; skip the extra request.
        if len(articles) < page_size:
            print(f"Last page reached: {page}")
            break
        page += 1

    return titles

# Four consecutive quarterly (from, to) windows covering calendar year 2023.
date_ranges = [
    ("2023-01-01", "2023-03-31"),
    ("2023-04-01", "2023-06-30"),
    ("2023-07-01", "2023-09-30"),
    ("2023-10-01", "2023-12-31"),
]

# Fetch each window separately (at most 30 pages per window) and pool the titles.
all_titles = []
for start_date, end_date in date_ranges:
    print(f"Fetching articles from {start_date} to {end_date}")
    titles = fetch_titles(from_date=start_date, to_date=end_date,
                          max_pages=30, retry_after=5)
    all_titles.extend(titles)

print(f"Total fetched titles: {len(all_titles)}")
# Single-column frame; 'Title' is the column name the helpers below expect.
df = pd.DataFrame({"Title": all_titles})

def print_sample_titles(df: pd.DataFrame, sample_size: int = 5) -> None:
    """
    Print the leading rows of a DataFrame of article titles.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing a 'Title' column.
    sample_size : int, default 5
        How many rows, counted from the top, to print.
    """
    preview = df.head(sample_size)
    print(preview)

# Preview the first few fetched titles (default sample size).
print_sample_titles(df)

def save_titles_to_csv(df: pd.DataFrame, filename: str = "article_titles.csv") -> None:
    """
    Write a DataFrame of article titles to a CSV file and report the path.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing a 'Title' column.
    filename : str, default "article_titles.csv"
        Path of the CSV file to write; the row index is not written.
    """
    df.to_csv(path_or_buf=filename, index=False)
    confirmation = f"Titles saved to {filename}"
    print(confirmation)

# Persist the titles to the default output file, article_titles.csv.
save_titles_to_csv(df)


In [None]:
"""
Batch analyze article titles for sensitive topics using the Groq API, then summarize results.

- Reads titles from a CSV file.
- `process_batch`: identifies the top 10 sensitive topics in each batch of titles.
- `summarize_results_batch`: condenses intermediate batch summaries into a final list.
- Processes titles in batches and prints the final summary.
"""

import os
import pandas as pd
from groq import Groq

# NOTE: this assignment overwrites any GROQ_API_KEY already set in the
# environment; replace the placeholder with a real key before running.
os.environ["GROQ_API_KEY"] = "<YOUR_GROQ_API_KEY_HERE>"

# Shared Groq client used by process_batch and summarize_results_batch.
client = Groq(api_key=os.environ["GROQ_API_KEY"])

# Read titles from CSV
csv_file = "article_titles.csv"
df = pd.read_csv(csv_file)
# Blank CSV cells are parsed as NaN; drop them and coerce the rest to str so
# no "nan" pseudo-title is sent to the model.
all_titles = df["Title"].dropna().astype(str).tolist()

def process_batch(titles_batch, model="llama-3.2-90b-vision-preview"):
    """
    Analyze a batch of titles for the top 10 most sensitive topics.

    Parameters
    ----------
    titles_batch : list of str
        A batch of article titles.
    model : str, default "llama-3.2-90b-vision-preview"
        Identifier of the Groq chat model to query.

    Returns
    -------
    str or None
        The model's ranked list of sensitive topics, or None when the batch
        is empty or the API call fails (the error is printed).
    """
    # With no titles the model would have nothing to rank; skip the API call.
    if not titles_batch:
        return None

    articles_content = "\n".join(f"Title: {title}" for title in titles_batch)
    prompt = f"""Analyze the following article titles and identify the top 10 most sensitive
topics discussed. Sensitivity includes political, social, economic, or cultural topics that may
provoke significant discussion or controversy.

Titles:
{articles_content}

Provide a ranked list of the top 10 sensitive topics."""
    try:
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=model,
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        # Best-effort: one failed batch must not abort the whole run.
        print(f"Error processing batch: {e}")
        return None

def summarize_results_batch(results_batch, model="mixtral-8x7b-32768"):
    """
    Summarize multiple batch results into a concise list of sensitive topics.

    Parameters
    ----------
    results_batch : list of str
        Summaries from previous batches. Empty or None entries are ignored.
    model : str, default "mixtral-8x7b-32768"
        Identifier of the Groq chat model to query.

    Returns
    -------
    str or None
        The consolidated summary of sensitive topics, or None when there is
        nothing to summarize or the API call fails (the error is printed).
    """
    # Drop failed (None) and empty summaries so join() cannot raise.
    summaries = [summary for summary in results_batch if summary]
    # With no input the model could only invent a summary; skip the API call.
    if not summaries:
        return None

    results_content = "\n\n".join(summaries)
    prompt = f"""Summarize the following topics from multiple batches into a concise summary of
the most sensitive topics.

Batch Summaries:
{results_content}

Provide a summarized list of sensitive topics."""
    try:
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=model,
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        # Best-effort: report the failure and let the caller decide what to do.
        print(f"Error summarizing results: {e}")
        return None

# Stage 1: analyze the titles 30 at a time; each successful batch yields one
# model-written topic list.
batch_size = 30
intermediate_summaries = []
for i in range(0, len(all_titles), batch_size):
    print(f"Processing batch {i // batch_size + 1}")
    batch = all_titles[i : i + batch_size]
    result = process_batch(batch)
    # A falsy result (None on error, or an empty reply) is skipped.
    if result:
        intermediate_summaries.append(result)

# Stage 2: condense the stage-1 topic lists five at a time.
summary_batch_size = 5
final_intermediate_summaries = []
for i in range(0, len(intermediate_summaries), summary_batch_size):
    print(f"Summarizing intermediate batch {i // summary_batch_size + 1}")
    summary_batch = intermediate_summaries[i : i + summary_batch_size]
    summarized_result = summarize_results_batch(summary_batch)
    # Failed or empty summaries are skipped here as well.
    if summarized_result:
        final_intermediate_summaries.append(summarized_result)

# Final summarization
print("Final summarization step ...")
# Only call the model when there is something to summarize; with no input it
# could only produce an analysis unrelated to the fetched titles.
if final_intermediate_summaries:
    final_summary = summarize_results_batch(final_intermediate_summaries)
else:
    final_summary = None

if final_summary:
    print("Final Analysis:")
    print(final_summary)
else:
    print("No final analysis available.")
