In [1]:
import pandas as pd
import requests
import os
from datetime import datetime
from dotenv import load_dotenv
import sys
# hack to get around notebook being not in main directory
sys.path.append("..")
from src.utils.config import GUARDIAN_API_KEY
from src.utils.config import NEWSDATA_API_KEY


## News sources

There are many programmatic options for getting news. Here are some of them:
- Guardian API - https://open-platform.theguardian.com/documentation/
- Newsdata.io API - https://newsdata.io/documentation

There are many ways to extend this, such as adding sources for AI-related news, spaceflight news, or country-specific news that the two sources above might not cover.

### Guardian API

In [6]:
# Guardian API testing

def get_guardian_news_today(api_key, timeout=10):
    """Fetch articles published today from The Guardian content API.

    Args:
        api_key: Guardian API key, sent as the ``api-key`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block the notebook indefinitely.

    Returns:
        pandas.DataFrame with one row per article and the columns ``title``,
        ``url``, ``publication_date``, ``headline`` and ``short_url``.
        The frame has the same columns but zero rows when the API returns no
        results or when an error occurs (errors are printed, not raised).
    """
    columns = ['title', 'url', 'publication_date', 'headline', 'short_url']
    # "Today" is the local date of the machine running the notebook.
    today_str = datetime.now().strftime('%Y-%m-%d')
    api_url = "https://content.guardianapis.com/search"
    params = {
        'api-key': api_key,
        'from-date': today_str,
        'to-date': today_str,
        'show-fields': 'headline,shortUrl,firstPublicationDate',  # Specify desired fields
        'page-size': 50,  # Adjust as needed; only this single page is requested
    }

    try:
        response = requests.get(api_url, params=params, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
        data = response.json()

        articles = []
        if 'response' in data and 'results' in data['response']:
            for item in data['response']['results']:
                # 'fields' may be missing or explicitly null; treat both as empty
                # so one odd item does not discard the whole result set.
                fields = item.get('fields') or {}
                articles.append({
                    'title': item.get('webTitle'),
                    'url': item.get('webUrl'),
                    'publication_date': item.get('webPublicationDate'),
                    'headline': fields.get('headline'),
                    'short_url': fields.get('shortUrl'),
                })

        # Passing the column list keeps the schema stable even with zero rows.
        return pd.DataFrame(articles, columns=columns)

    except requests.exceptions.RequestException as e:
        # HTTP errors raised by raise_for_status() include the full request URL,
        # whose query string carries the API key. Mask it so the key does not
        # end up in saved notebook output.
        message = str(e)
        if api_key:
            message = message.replace(str(api_key), '[REDACTED]')
        print(f"Error fetching data from Guardian API: {message}")
        return pd.DataFrame(columns=columns)  # Empty frame on error
    except Exception as e:
        # Deliberate best-effort: report the problem and keep the notebook running.
        print(f"An unexpected error occurred: {e}")
        return pd.DataFrame(columns=columns)

# Example usage:
# GUARDIAN_API_KEY is imported from src.utils.config in the first cell, so the
# name normally exists; what matters is whether it holds a value. Looking it up
# with globals().get() covers both "never defined" and "defined but empty/None"
# (presumably possible when the environment variable is unset - confirm in config).
if globals().get('GUARDIAN_API_KEY'):
    news_df = get_guardian_news_today(GUARDIAN_API_KEY)
    print(f"Fetched {len(news_df)} articles from today.")
    display(news_df.head())  # Rich preview of the first rows
else:
    print("GUARDIAN_API_KEY not found. Please ensure it's loaded.")

Fetched 50 articles from today.


Unnamed: 0,title,url,publication_date,headline,short_url
0,Tony Blair’s climate crisis views ‘absolutely ...,https://www.theguardian.com/politics/live/2025...,2025-04-30T16:40:14Z,Tony Blair’s climate crisis views ‘absolutely ...,https://www.theguardian.com/p/x26ax9
1,Trump hits out at journalist in tense TV inter...,https://www.theguardian.com/us-news/live/2025/...,2025-04-30T16:35:09Z,Trump hits out at journalist in tense TV inter...,https://www.theguardian.com/p/x26aet
2,Europe live: Kyiv ready to sign US minerals de...,https://www.theguardian.com/world/live/2025/ap...,2025-04-30T16:33:02Z,Europe live: Kyiv ready to sign US minerals de...,https://www.theguardian.com/p/x26av5
3,US economy shrinks in first quarter of Trump 2...,https://www.theguardian.com/business/2025/apr/...,2025-04-30T16:24:27Z,US economy shrinks in first quarter of Trump 2...,https://www.theguardian.com/p/x2674k
4,Trump warns ‘nothing will stop me’ at rally to...,https://www.theguardian.com/us-news/2025/apr/2...,2025-04-30T16:13:45Z,Trump warns ‘nothing will stop me’ at rally to...,https://www.theguardian.com/p/x268ft


In [7]:
import json

def get_guardian_news_today_json(api_key, timeout=10):
    """Fetch articles published today from The Guardian content API as plain dicts.

    Args:
        api_key: Guardian API key, sent as the ``api-key`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block the notebook indefinitely.

    Returns:
        A JSON-serialisable list of dicts, one per article, with the keys
        ``title``, ``url``, ``publication_date``, ``headline`` and
        ``short_url``. The list is empty when the API returns no results or
        when an error occurs (errors are printed, not raised).
    """
    # "Today" is the local date of the machine running the notebook.
    today_str = datetime.now().strftime('%Y-%m-%d')
    api_url = "https://content.guardianapis.com/search"
    params = {
        'api-key': api_key,
        'from-date': today_str,
        'to-date': today_str,
        'show-fields': 'headline,shortUrl,firstPublicationDate',  # Specify desired fields
        'page-size': 50,  # Adjust as needed; only this single page is requested
    }

    try:
        response = requests.get(api_url, params=params, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
        data = response.json()

        articles = []
        if 'response' in data and 'results' in data['response']:
            for item in data['response']['results']:
                # 'fields' may be missing or explicitly null; treat both as empty
                # so one odd item does not discard the whole result set.
                fields = item.get('fields') or {}
                articles.append({
                    'title': item.get('webTitle'),
                    'url': item.get('webUrl'),
                    'publication_date': item.get('webPublicationDate'),
                    'headline': fields.get('headline'),
                    'short_url': fields.get('shortUrl'),
                })

        # Return the list of dictionaries directly
        return articles

    except requests.exceptions.RequestException as e:
        # HTTP errors raised by raise_for_status() include the full request URL,
        # whose query string carries the API key. Mask it so the key does not
        # end up in saved notebook output.
        message = str(e)
        if api_key:
            message = message.replace(str(api_key), '[REDACTED]')
        print(f"Error fetching data from Guardian API: {message}")
        return []  # Return empty list on error
    except Exception as e:
        # Deliberate best-effort: report the problem and keep the notebook running.
        print(f"An unexpected error occurred: {e}")
        return []

# Example usage:
# GUARDIAN_API_KEY is imported from src.utils.config in the first cell, so the
# name normally exists; what matters is whether it holds a value. Looking it up
# with globals().get() covers both "never defined" and "defined but empty/None"
# (presumably possible when the environment variable is unset - confirm in config).
if globals().get('GUARDIAN_API_KEY'):
    news_list = get_guardian_news_today_json(GUARDIAN_API_KEY)
    print(f"Fetched {len(news_list)} articles as a list.")
    # Optional: Print the first item as a JSON string for inspection
    if news_list:
        print("\nFirst article (JSON format):")
        print(json.dumps(news_list[0], indent=2))
else:
    print("GUARDIAN_API_KEY not found. Please ensure it's loaded.")

Fetched 50 articles as a list.

First article (JSON format):
{
  "title": "Tony Blair\u2019s climate crisis views \u2018absolutely aligned\u2019 with government policy, Starmer says \u2013 UK politics live",
  "url": "https://www.theguardian.com/politics/live/2025/apr/30/labour-tories-local-elections-2025-pmqs-tony-blair-keir-starmer-kemi-badenoch-uk-politics-live-news-updates",
  "publication_date": "2025-04-30T16:40:14Z",
  "headline": "Tony Blair\u2019s climate crisis views \u2018absolutely aligned\u2019 with government policy, Starmer says \u2013 UK politics live",
  "short_url": "https://www.theguardian.com/p/x26ax9"
}


### Newsdata.io API

Newsdata.io covers a wider range of news stories.

See documentation: https://newsdata.io/documentation

The current function below is very basic.

In [2]:
# Newsdata.io API call function
import requests
import json
from datetime import datetime, timedelta

def get_newsdata_news_today_json(api_key, language='en', country='us,gb', timeout=10):
    """Fetch the latest articles from the Newsdata.io ``/latest`` endpoint.

    The endpoint returns recent news; results are NOT filtered to today's date
    here. ``pubDate`` (seen as 'YYYY-MM-DD HH:MM:SS' in sample responses) can be
    used by the caller to post-filter if an exact "today" cut is needed.

    Args:
        api_key: Newsdata.io API key, sent as the ``apikey`` query parameter.
        language: Language code sent as the ``language`` query parameter.
        country: Currently NOT sent to the API (the request parameter is
            disabled below); kept in the signature for callers that pass it.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block the notebook indefinitely.

    Returns:
        A JSON-serialisable list of dicts, one per article, with the keys
        ``title``, ``url``, ``publication_date``, ``description``, ``source``,
        ``keywords`` and ``image_url``. The list is empty when the API does not
        report success or when an error occurs (problems are printed, not raised).
    """
    api_url = "https://newsdata.io/api/1/latest"
    params = {
        'apikey': api_key,
        'language': language,
        # 'country': country, # Free plan might restrict this or number of countries
        # 'timeframe': 24 # Get news from the last 24 hours (check if supported on free plan)
    }

    # Explicit sentinel so the error handler can tell whether a response was
    # received at all, without introspecting locals().
    response = None
    try:
        response = requests.get(api_url, params=params, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
        data = response.json()

        if data.get('status') != 'success' or 'results' not in data:
            print(f"Newsdata API did not return success status. Status: {data.get('status')}")
            print(f"Response: {data}")  # Print full response for debugging
            return []

        # Return the list of dictionaries directly
        return [
            {
                'title': item.get('title'),
                'url': item.get('link'),
                'publication_date': item.get('pubDate'),
                'description': item.get('description'),
                'source': item.get('source_id'),
                'keywords': item.get('keywords'),  # Often null on free tier
                'image_url': item.get('image_url'),
            }
            for item in data['results']
        ]

    except requests.exceptions.RequestException as e:
        # HTTP errors raised by raise_for_status() include the full request URL,
        # whose query string carries the API key. Mask it so the key does not
        # end up in saved notebook output.
        message = str(e)
        if api_key:
            message = message.replace(str(api_key), '[REDACTED]')
        print(f"Error fetching data from Newsdata API: {message}")
        # If a response was received, show what the server said.
        # Compare against None explicitly: a Response is falsy for error statuses.
        if response is not None:
            print(f"Response status code: {response.status_code}")
            print(f"Response text: {response.text}")
        return []  # Return empty list on error
    except Exception as e:
        # Deliberate best-effort: report the problem and keep the notebook running.
        print(f"An unexpected error occurred: {e}")
        return []

# Example usage:
# NEWSDATA_API_KEY is imported from src.utils.config in the first cell, so the
# name normally exists; what matters is whether it holds a value. Looking it up
# with globals().get() covers both "never defined" and "defined but empty/None"
# (presumably possible when the environment variable is unset - confirm in config).
if globals().get('NEWSDATA_API_KEY'):
    newsdata_list = get_newsdata_news_today_json(NEWSDATA_API_KEY)
    print(f"Fetched {len(newsdata_list)} articles from Newsdata.io.")
    # Optional: Print the first item as a JSON string for inspection
    if newsdata_list:
        print("\nFirst Newsdata.io article (JSON format):")
        print(json.dumps(newsdata_list[0], indent=2))
else:
    print("NEWSDATA_API_KEY not found. Please ensure it's loaded.")

Fetched 10 articles from Newsdata.io.

First Newsdata.io article (JSON format):
{
  "title": "Maradona Ate Burgers After Brain Surgery: ICU Chief Slams Lax Post-Op Care - 'Anything Was Allowed'",
  "url": "https://www.outlookindia.com/sports/football/maradona-ate-burgers-after-brain-surgery-icu-chief-slams-lax-post-op-care-anything-was-allowed",
  "publication_date": "2025-04-30 04:57:00",
  "description": null,
  "source": "outlookindia",
  "keywords": [
    "maradona ,maradona ate burger,maradona ate burger after braiin surgery,maradona death trial,maradona final days,bdiego maradona ,bdiego maradona family,fernando villarejo,olivos clinic,maradona brain surgery,what happened to maradona,maradona death,how did maradona die,argentina football legend,football news,sports news"
  ],
  "image_url": "https://media.assettype.com/outlookindia/2025-04-30/ausz2wjd/maradona-death-trial-ap-photo?ar=40%3A21&auto=format%2Ccompress&enlarge=true&mode=crop&ogImage=true&overlay=false&overlay_position