In [2]:
import datetime
import os
import logging
from typing import List, Dict, Any

import pandas as pd
from newsapi import NewsApiClient
from dotenv import load_dotenv

# Logging setup: INFO-and-above records are written both to a log file
# in the working directory and to the console/notebook stream.
logging.basicConfig(
    handlers=[logging.FileHandler("news_pipeline.log"), logging.StreamHandler()],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Logger named after the current module; used by the functions defined below.
logger = logging.getLogger(__name__)

class NewsAPIError(Exception):
    """Raised when no articles could be retrieved from NewsAPI for any topic."""


def _dedupe_by_url(articles: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Return the articles with repeated URLs removed, keeping first occurrences in order."""
    seen_urls = set()
    unique_articles = []

    for article in articles:
        url = article.get('url')
        if url is None:
            # No URL to compare on, so the record cannot be shown to be a
            # duplicate; keep it rather than dropping or crashing on it.
            unique_articles.append(article)
            continue
        if url not in seen_urls:
            seen_urls.add(url)
            unique_articles.append(article)

    return unique_articles


def extract_data() -> List[Dict[str, Any]]:
    """
    Extract news articles from NewsAPI for multiple topics and combine them.

    For each topic, English-language articles dated within the last 7 days are
    requested, sorted by relevancy. A topic whose request fails is logged and
    skipped so the remaining topics are still fetched. The combined result is
    de-duplicated by article URL, keeping the first occurrence.

    Returns:
        List[Dict[str, Any]]: Unique articles gathered across all topics

    Raises:
        NewsAPIError: If no articles could be fetched for any topic
        EnvironmentError: If the NEWS_API environment variable is missing
    """
    try:
        # Read the clock once so both ends of the date window come from the
        # same instant (two separate reads could straddle midnight).
        now = datetime.datetime.now()

        current_date = now.strftime('%Y-%m-%d')
        logger.info(f"Current date: {current_date}")

        seven_days_ago = (now - datetime.timedelta(days=7)).strftime('%Y-%m-%d')
        logger.info(f"7 days ago: {seven_days_ago}")

        # Load API key from .env file
        load_dotenv()
        news_api = os.getenv("NEWS_API")

        if not news_api:
            raise EnvironmentError("NEWS_API environment variable not found. Please check your .env file.")

        # Initialize NewsAPI client
        newsapi = NewsApiClient(news_api)

        # Define topics to search for
        topics = ['GenAI', 'AI', 'Technology']
        combined_articles = []

        # Fetch articles for each topic
        for topic in topics:
            logger.info(f"Fetching articles for topic: {topic}...")
            try:
                # NOTE(review): page=2 combined with sort_by='relevancy' requests
                # the second page of results, so the first page is never
                # fetched -- confirm this is intentional.
                response = newsapi.get_everything(
                    q=topic,
                    from_param=seven_days_ago,
                    to=current_date,
                    language='en',
                    sort_by='relevancy',
                    page=2
                )

                # Covers a missing key as well as an empty or None value.
                fetched = response.get('articles')
                if not fetched:
                    logger.warning(f"No articles found for topic: {topic}")
                    continue

                logger.info(f"Fetched {len(fetched)} articles for topic: {topic}")
                combined_articles.extend(fetched)

            except Exception as e:
                # Deliberate best-effort: log and continue with the other
                # topics rather than failing completely.
                logger.error(f"Error fetching articles for topic {topic}: {str(e)}")

        if not combined_articles:
            raise NewsAPIError("No articles were fetched from any topic")

        # Remove duplicate articles (same URL)
        unique_articles = _dedupe_by_url(combined_articles)

        logger.info(f"Total unique articles fetched: {len(unique_articles)}")
        return unique_articles

    except Exception as e:
        # Log for the pipeline log file, then let the caller see the original error.
        logger.error(f"Error in extract_data: {str(e)}")
        raise



In [3]:
# Keep the result in a named variable so later cells can reuse it without
# re-querying the API, and preview a few records instead of echoing the full list.
articles_raw = extract_data()
articles_raw[:3]

2025-04-22 17:21:47,360 - __main__ - INFO - Current date: 2025-04-22
2025-04-22 17:21:47,362 - __main__ - INFO - 7 days ago: 2025-04-15
2025-04-22 17:21:47,365 - __main__ - INFO - Fetching articles for topic: GenAI...
2025-04-22 17:21:47,666 - __main__ - INFO - Fetched 100 articles for topic: GenAI
2025-04-22 17:21:47,666 - __main__ - INFO - Fetching articles for topic: AI...
2025-04-22 17:21:47,961 - __main__ - INFO - Fetched 98 articles for topic: AI
2025-04-22 17:21:47,962 - __main__ - INFO - Fetching articles for topic: Technology...
2025-04-22 17:21:48,363 - __main__ - INFO - Fetched 99 articles for topic: Technology
2025-04-22 17:21:48,364 - __main__ - INFO - Total unique articles fetched: 291


[{'source': {'id': None, 'name': 'Pypi.org'},
  'author': 'michelangelo.bucci@gmail.com',
  'title': 'gaspare 0.3.2',
  'description': "A rewriting of Answer.AI's Gaspard to the new Genai SDK",
  'url': 'https://pypi.org/project/gaspare/0.3.2/',
  'urlToImage': None,
  'publishedAt': '2025-04-18T22:46:42Z',
  'content': 'A required part of this site couldnt load. This may be due to a browser\r\n extension, network issues, or browser settings. Please check your\r\n connection, disable any ad blockers, or try using a diffe… [+12 chars]'},
 {'source': {'id': None, 'name': 'Pypi.org'},
  'author': 'michelangelo.bucci@gmail.com',
  'title': 'gaspare 0.3.1',
  'description': "A rewriting of Answer.AI's Gaspard to the new Genai SDK",
  'url': 'https://pypi.org/project/gaspare/0.3.1/',
  'urlToImage': None,
  'publishedAt': '2025-04-18T22:28:22Z',
  'content': 'A required part of this site couldnt load. This may be due to a browser\r\n extension, network issues, or browser settings. Please che