# Stage 1: Ingestion
This notebook covers the data ingestion process for the automated video pipeline.
**Goal**: Fetch trending news from the Google News RSS feed, resolve each redirect link to the publisher's article URL, and extract the full article text.

In [1]:
import json
import os
import time
from urllib.parse import urlparse

import feedparser
import requests
import trafilatura

def resolve_google_news_url(url):
    """Follow HTTP redirects starting at ``url`` and return the final URL.

    A HEAD request is tried first because it avoids downloading a body. When
    the server answers HEAD with an error status (some hosts and redirectors
    reject HEAD), the lookup is retried once with a streamed GET so the
    redirect chain can still be followed without reading the body.

    Args:
        url: The link to resolve, e.g. the ``link`` of an RSS entry.

    Returns:
        The URL reported by the final response, or ``url`` unchanged when it
        is empty or the request raises.

    NOTE(review): only HTTP-level redirects are followed. Google News article
    links appear to stay on news.google.com under this approach -- confirm,
    and consider a dedicated decoder if publisher URLs are required.
    """
    if not url:
        # Nothing to resolve; skip the request (and the error message it would print).
        return url

    try:
        response = requests.head(url, allow_redirects=True, timeout=10)
        if response.status_code >= 400:
            # HEAD was refused somewhere along the chain; GET is more widely
            # accepted. stream=True defers the body download, which is not needed.
            response.close()
            response = requests.get(
                url, allow_redirects=True, timeout=10, stream=True
            )
        final_url = response.url
        response.close()
        return final_url
    except Exception as e:
        # Best effort: a failed lookup must not abort the ingestion loop.
        print(f"Failed to resolve {url}: {e}")
        return url

def _is_google_url(url):
    """Return True when the host of ``url`` is google.com or one of its subdomains."""
    try:
        host = (urlparse(url).hostname or "").lower()
    except ValueError:
        # Malformed network location (e.g. a broken IPv6 literal): not recognisably Google.
        return False
    return host == "google.com" or host.endswith(".google.com")


def fetch_trending_news(max_entries=5, min_content_length=200):
    """Fetch top Google News RSS entries and scrape the full text of each article.

    For every entry the link is passed through ``resolve_google_news_url``;
    links that still point at a google.com host are skipped, the remaining
    pages are downloaded and extracted with trafilatura.

    Args:
        max_entries: How many feed entries to try, counted from the top of
            the feed. More than one is attempted because some scrapes fail.
        min_content_length: Extracted text must be longer than this many
            characters to be kept.

    Returns:
        A list of dicts with the keys ``title``, ``link`` (resolved URL),
        ``published`` (empty string when the feed omits it) and ``content``.
        The list is empty when nothing could be scraped.
    """
    rss_url = "https://news.google.com/rss?hl=en-US&gl=US&ceid=US:en"
    print(f"Fetching RSS from {rss_url}...")
    feed = feedparser.parse(rss_url)

    if not feed.entries:
        # feedparser reports fetch/parse problems through ``bozo_exception``
        # on the result; surface it so an empty run can be diagnosed.
        problem = getattr(feed, "bozo_exception", "feed was empty")
        print(f"No RSS entries to process: {problem}")
        return []

    news_data = []

    for entry in feed.entries[:max_entries]:
        # Feed fields are optional, so read them defensively.
        title = entry.get("title", "")
        print(f"Processing: {title}")

        original_url = entry.get("link")
        if not original_url:
            print("Skipping entry without a link")
            continue

        resolved_url = resolve_google_news_url(original_url)
        print(f"Resolved to: {resolved_url}")

        # Compare the hostname rather than searching the whole URL, so that
        # publisher links mentioning google.com in a path or query are kept.
        if _is_google_url(resolved_url):
            print("Skipping loopback URL")
            continue

        try:
            downloaded = trafilatura.fetch_url(resolved_url)
            if downloaded:
                text = trafilatura.extract(downloaded)
                if text and len(text) > min_content_length:
                    news_data.append({
                        "title": title,
                        "link": resolved_url,
                        "published": entry.get("published", ""),
                        "content": text
                    })
                    print("Success!")
                else:
                    print("Trafilatura returned empty/short text.")
            else:
                print("Trafilatura fetch failed.")
        except Exception as e:
            # Best effort: one bad page must not abort the remaining entries.
            print(f"Scrape error: {e}")

        # Be polite: pause between scrape attempts.
        time.sleep(1)

    return news_data

if __name__ == "__main__":
    # Scrape first; whatever was collected (possibly nothing) is persisted below.
    data = fetch_trending_news()

    # Make sure the target directory exists before opening the output file.
    os.makedirs("../data", exist_ok=True)
    output_path = "../data/raw_news.json"

    with open(output_path, "w", encoding="utf-8") as out_file:
        # ensure_ascii=False keeps non-ASCII characters readable in the JSON file.
        serialized = json.dumps(data, indent=4, ensure_ascii=False)
        out_file.write(serialized)

    print(f"Saved {len(data)} items to {output_path}")

Fetching RSS from https://news.google.com/rss?hl=en-US&gl=US&ceid=US:en...


Processing: Anti-ICE protests to be held across US as organizers urge national strike - The Guardian


Resolved to: https://news.google.com/rss/articles/CBMif0FVX3lxTE9mVGZTRG1JRmtyRXkwckk0NTl4SXZGTGFnWnN2dmk3WFFzUWREUjFLSzI2RDItSUZaX04tdl9naDJLelNvbkF2elIxemtOdDl0SWRpcVI4ekJvOGFiTUxuUjN0N1cyaFpUQy1obHptUjY0al82NmRwWWY2NEdQOW8?oc=5&hl=en-IN&gl=IN&ceid=IN:en
Skipping loopback URL
Processing: Trump says Putin will not attack Ukraine cities during cold week - BBC


Resolved to: https://news.google.com/rss/articles/CBMiWkFVX3lxTE1GZVB4bGtfRGZwZDVmSkxGU0Z0NW05Z3dJbUt2LUQwNGtVRUIwcFRwRGlEMjVuTVN6cHlHV2hkNU9hdzNXYVRTSVhNWWZjWUMtQ0lKMUx1Rktrdw?oc=5&hl=en-IN&gl=IN&ceid=IN:en
Skipping loopback URL
Processing: US lifts some Venezuela sanctions to ease oil sales - Reuters


Resolved to: https://news.google.com/rss/articles/CBMimgFBVV95cUxQRFhRQUlMNEZZb0E2cXdtaVh3TVcwUzYxZTlFNDgyYkhXVHViWmRndkRwZzdMeVc5cGlucE1jeEFacTNDOEw1ejh6NW9zdWFJdHRfaEJMbk52ZlRXb2pkV3ZnQ3FZSktkYzVBS0lQUlBoMTY2T1FkYUVsN19vWmVPSUJJN1FmRXVNb0c5a2RBbUZ6amJqVXlPS1dn?oc=5&hl=en-IN&gl=IN&ceid=IN:en
Skipping loopback URL
Processing: Hong Kong company’s concession to operate Panama Canal ports is ruled unconstitutional - AP News


Resolved to: https://news.google.com/rss/articles/CBMikAFBVV95cUxNLUxlVTRMRUd6UFQ3MHZNRVVaVGRtTmFOcFRlbTlzWE1USmthVjBHOG9SeEhDbDRsNHNCQi1lVk9nNHBkRjlZYWtwMGtJa1NCdXJQMmZYU2hLN0tCb0k0elNqV1E1MHhxZ2ZMRlhZU0pnM0ZkM3FYc0hhLW1CQWV3aExHVVE1UFU1b0RmSnhiMVE?oc=5&hl=en-IN&gl=IN&ceid=IN:en
Skipping loopback URL
Processing: Massachusetts on "razor's edge" with weekend storm. Maps show best chance for snow, high winds. - CBS News


Resolved to: https://news.google.com/rss/articles/CBMijgFBVV95cUxNREs1Vkt2ZnZGRTJfdTVtZWJ6ajhpRnlsVUpmaGtZVEJmcGZ6SVkwLWNBYldFazdQTkZienVoWm5IMl9mVVdFcVpmbHNCODgzS1RlcGNITGF5ZUxfTWFPRy1LdzhzQUZRbkk0MnhVejRqZEpodE9hTHVMbTR0MHdQWUpGZVB2XzViVGNINTR3?oc=5&hl=en-IN&gl=IN&ceid=IN:en
Skipping loopback URL


Saved 0 items to ../data/raw_news.json
