In [None]:
pip install newspaper3k feedparser

In [1]:
pip install newspaper3k feedparser

Note: you may need to restart the kernel to use updated packages.


In [10]:
import feedparser
from newspaper import Article
import csv
from datetime import datetime

In [12]:
# Function to parse RSS feed and get article URLs
def get_article_urls_from_rss(feed_url):
    """Return the article links listed in an RSS/Atom feed.

    Args:
        feed_url: Location of the feed; passed unchanged to
            ``feedparser.parse``.

    Returns:
        list: The ``link`` of every feed entry, in feed order. Entries that
        have no ``link`` attribute, or an empty one, are skipped.
    """
    feed = feedparser.parse(feed_url)
    # Read ``link`` defensively: a single entry without one must not abort
    # the whole feed with an AttributeError.
    candidate_links = (getattr(entry, "link", None) for entry in feed.entries)
    return [link for link in candidate_links if link]
# Function to extract article information
def extract_article_info(url, max_content_chars=500):
    """Download one article and summarise it as a flat dict.

    Args:
        url: Address of the article; handed to ``Article``.
        max_content_chars: Maximum length of the ``content`` excerpt.
            Defaults to 500. Pass ``None`` to keep the full article text.
            Expected to be non-negative.

    Returns:
        A dict with the keys ``title``, ``author``, ``publish_date`` and
        ``content``, or ``None`` if anything went wrong (the error is
        printed). ``author`` is the author names joined with ", ".
        ``publish_date`` is whatever ``article.publish_date`` holds. Both
        fall back to the string "Unknown" when the parsed value is falsy.
    """
    try:
        article = Article(url)
        article.download()
        article.parse()

        # Handling cases where publish date or authors might not be available
        publish_date = article.publish_date if article.publish_date else "Unknown"
        authors = ", ".join(article.authors) if article.authors else "Unknown"

        return {
            'title': article.title,
            'author': authors,
            'publish_date': publish_date,
            # Only an excerpt is kept; slicing with None keeps everything.
            'content': article.text[:max_content_chars],
        }
    except Exception as e:
        # Best-effort: report the failure and return None so that the caller
        # can skip this article instead of aborting the whole run.
        print(f"Failed to extract info from {url}: {e}")
        return None

In [17]:
# Main function to scrape articles from a list of RSS feeds
def scrape_articles_from_rss_feeds(rss_feeds):
    """Collect article summaries from every feed in ``rss_feeds``.

    Each feed is expanded into article URLs with
    ``get_article_urls_from_rss`` and every URL is passed to
    ``extract_article_info``. A progress line is printed per feed and per
    article.

    Args:
        rss_feeds: Iterable of feed URLs.

    Returns:
        list: The truthy extraction results, ordered by feed and then by
        position within the feed. Falsy results are left out.
    """
    collected = []
    for feed_url in rss_feeds:
        print(f"Parsing RSS feed: {feed_url}")
        for link in get_article_urls_from_rss(feed_url):
            print(f"Extracting information from article: {link}")
            info = extract_article_info(link)
            if not info:
                # Nothing usable came back for this article; move on.
                continue
            collected.append(info)
    return collected
# Example feeds to crawl.
# NOTE(review): publishers retire feed endpoints over time; confirm each URL
# still serves entries before relying on the results.
rss_feeds = [
    "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml",
    "https://www.theguardian.com/world/rss",
    "http://feeds.reuters.com/reuters/topNews",
    "https://abcnews.go.com/abcnews/topstories",
    "https://feeds.npr.org/1001/rss.xml",
]

# Run the scrape over every feed (performs network requests).
articles = scrape_articles_from_rss_feeds(rss_feeds)

# Print a numbered, human-readable report of what was extracted.
for number, article in enumerate(articles, start=1):
    print(f"\nArticle {number}")
    for label, key in (
        ("Title", 'title'),
        ("Author", 'author'),
        ("Publish Date", 'publish_date'),
    ):
        print(f"{label}: {article[key]}")
    excerpt = article['content'][:500]
    print(f"Content: {excerpt}...")

Parsing RSS feed: https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml
Extracting information from article: https://www.nytimes.com/2024/08/30/world/middleeast/gaza-polio-vaccinations.html
Extracting information from article: https://www.nytimes.com/2024/08/30/health/polio-gaza-vaccines.html
Extracting information from article: https://www.nytimes.com/2024/08/30/business/economy/gaza-labor-unions.html
Extracting information from article: https://www.nytimes.com/2024/08/30/us/trump-moms-for-liberty-school-boards.html
Extracting information from article: https://www.nytimes.com/2024/08/30/business/economy/interest-rate-cuts-housing-affordability-crisis.html
Extracting information from article: https://www.nytimes.com/2024/08/30/business/economy/pce-inflation-fed-interest-rate.html
Extracting information from article: https://www.nytimes.com/2024/08/30/health/wegovy-covid-deaths.html
Extracting information from article: https://www.nytimes.com/2024/08/30/upshot/wegovy-obesity-drugs-n

In [14]:
# Export the scraped articles to a CSV file, one row per article.
csv_file = "extracted_articles.csv"
csv_columns = ['title', 'author', 'publish_date', 'content']

try:
    # newline='' leaves line-ending handling to the csv module.
    with open(csv_file, mode='w', newline='', encoding='utf-8') as out_handle:
        table = csv.DictWriter(out_handle, fieldnames=csv_columns)
        table.writeheader()
        # Each article dict becomes one row, matched to csv_columns by key.
        table.writerows(articles)
    print(f"Data successfully written to {csv_file}")
except OSError:  # IOError is an alias of OSError in Python 3
    print("I/O error when writing to the CSV file")

Data successfully written to extracted_articles.csv
