In [1]:
# Install libraries needed for RSS parsing, NLP, and text processing
!pip install feedparser nltk pandas numpy scikit-learn beautifulsoup4

Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6046 sha256=23bf6eebcbef53abf9a108f7ecef699598cf173bdbf9d054b4d36dface7d37eb
  Stored in directory: /root/.cache/pip/wheels/3b/25/2a/105d6a15df6914f4d15047691c6c28f9052cc1173e40285d03
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser
Successfully installed feedparser-6.0.11 sgmllib3k-1.0.0


In [2]:
# Import libraries for RSS parsing, NLP, and text processing
import feedparser
import nltk
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup
import requests
import re
from datetime import datetime
import os

# Download the NLTK resources this notebook relies on.
# - 'punkt_tab': sentence-tokenizer data; presumably what nltk.sent_tokenize
#   (called in NewsletterGenerator.generate_summary) loads at runtime.
# - 'stopwords': NLTK's stopword corpus. NOTE(review): no use of this corpus is
#   visible in the cells shown here (TfidfVectorizer is configured with
#   scikit-learn's own stop_words='english'), so this download may be
#   unnecessary -- confirm before removing.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# RSS sources grouped by newsletter category. Each category maps to the list of
# feed URLs that NewsletterGenerator.fetch_articles() polls, in the order given.
# NOTE(review): the feeds.reuters.com endpoints may no longer be served --
# verify that they still return entries.
RSS_FEEDS = {
    'general': [
        'http://feeds.bbci.co.uk/news/world/rss.xml',
        'https://rss.nytimes.com/services/xml/rss/nyt/World.xml',
        'http://feeds.reuters.com/reuters/topNews',
    ],
    'technology': [
        'https://techcrunch.com/feed/',
        'https://www.wired.com/feed/rss',
        'https://www.technologyreview.com/feed/',
    ],
    'finance': [
        'http://feeds.reuters.com/reuters/businessNews',  # Reuters Finance
        'https://www.ft.com/?format=rss',  # Financial Times
        'https://finance.yahoo.com/news/rss',  # Yahoo Finance
    ],
    'sports': [
        'https://www.espn.com/espn/rss/news',
        'http://feeds.bbci.co.uk/sport/rss.xml',
        'https://www.skysports.com/rss',
    ],
    'entertainment': [
        'https://variety.com/feed/',
        'https://www.hollywoodreporter.com/feed/',
        'https://www.billboard.com/feed/',
    ],
    'science': [
        'https://www.nasa.gov/rss/dyn/breaking_news.rss',
        'https://www.sciencedaily.com/rss/all.xml',
        'https://arstechnica.com/feed/',
    ],
}

# Reader personas keyed by display name. For each persona:
#   'interests'  -- free-text topics joined into the TF-IDF query used for ranking
#   'categories' -- RSS_FEEDS categories whose articles are eligible for that reader
USER_PERSONAS = {
    'Alex Parker': {
        'interests': ['AI', 'cybersecurity', 'blockchain', 'startups', 'programming'],
        'categories': ['technology'],
    },
    'Priya Sharma': {
        'interests': ['global markets', 'startups', 'fintech', 'cryptocurrency', 'economics'],
        'categories': ['finance'],
    },
    'Marco Rossi': {
        'interests': ['football', 'F1', 'NBA', 'olympic sports', 'esports'],
        'categories': ['sports'],
    },
    'Lisa Thompson': {
        'interests': ['movies', 'celebrity news', 'TV shows', 'music', 'books'],
        'categories': ['entertainment'],
    },
    'David Martinez': {
        'interests': ['space exploration', 'AI', 'biotech', 'physics', 'renewable energy'],
        'categories': ['science', 'technology'],
    },
}

In [13]:
class NewsletterGenerator:
    """Collect RSS articles and render a personalized Markdown newsletter per persona.

    Typical flow: call ``fetch_articles()`` once, then for each key of
    ``USER_PERSONAS`` call ``generate_newsletter(user)`` followed by
    ``save_newsletter(user, markdown)``.
    """

    def __init__(self):
        # Flat list of article dicts with keys:
        # 'title', 'link', 'summary', 'published', 'category'.
        self.articles = []
        # Refit on every personalize_articles() call; 'english' selects
        # scikit-learn's built-in stop-word list.
        self.vectorizer = TfidfVectorizer(stop_words='english')

    def fetch_articles(self, max_per_feed=10):
        """Fetch entries from every URL in ``RSS_FEEDS`` and append them to ``self.articles``.

        Args:
            max_per_feed: Maximum number of entries taken from each feed
                (default 10).

        Articles accumulate: calling this method again appends to the
        existing list rather than replacing it. A feed that fails is reported
        on stdout and skipped; the remaining feeds are still processed.
        """
        for category, feeds in RSS_FEEDS.items():
            for feed_url in feeds:
                try:
                    feed = feedparser.parse(feed_url)
                    entries = feed.entries[:max_per_feed]

                    # feedparser reports fetch/parse problems through the
                    # 'bozo' flag instead of raising, so without this check a
                    # dead feed is skipped silently. A bozo feed can still
                    # yield usable entries, hence only report when it is empty.
                    if not entries and feed.get('bozo'):
                        reason = feed.get('bozo_exception', 'feed could not be parsed')
                        print(f"Error fetching {feed_url}: {reason}")
                        continue

                    for entry in entries:
                        self.articles.append({
                            'title': entry.get('title', ''),
                            'link': entry.get('link', ''),
                            'summary': entry.get('summary', ''),
                            'published': entry.get('published', ''),
                            'category': category,
                        })
                except Exception as e:
                    # Best-effort: one broken feed must not abort the whole run.
                    print(f"Error fetching {feed_url}: {e}")

    def clean_text(self, text):
        """Strip HTML markup from ``text`` and collapse whitespace runs to single spaces.

        ``None`` or an empty string yields ``""``.
        """
        text = BeautifulSoup(text or '', 'html.parser').get_text()
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def generate_summary(self, text, max_length=100):
        """Return the first sentence of ``text``, truncated to ``max_length`` characters.

        ``"..."`` is appended when the first sentence had to be cut. Returns
        ``""`` when no sentence is found. The text is used as given; callers
        holding raw feed HTML should pass it through ``clean_text`` first.
        """
        sentences = nltk.sent_tokenize(text)
        if not sentences:
            return ""
        first_sentence = sentences[0]
        if len(first_sentence) > max_length:
            return first_sentence[:max_length] + "..."
        return first_sentence

    def personalize_articles(self, user, top_n=5):
        """Return up to ``top_n`` articles ranked by similarity to the persona's interests.

        Only articles whose category is listed for the persona are
        considered. Ranking is the cosine similarity between the TF-IDF
        vector of each article (cleaned title + summary) and the TF-IDF
        vector of the persona's interests joined into one string.

        Args:
            user: Key of ``USER_PERSONAS``.
            top_n: Maximum number of articles to return (default 5).

        Returns:
            List of article dicts, most relevant first; empty when no fetched
            article matches the persona's categories.

        Raises:
            KeyError: If ``user`` is not a key of ``USER_PERSONAS``.
        """
        persona = USER_PERSONAS[user]
        user_interests = persona['interests']
        user_categories = persona['categories']

        # Filter articles by category
        relevant_articles = [a for a in self.articles if a['category'] in user_categories]
        if not relevant_articles:
            return []

        # The interest string is appended as the last "document" so it shares
        # one vocabulary with the articles.
        documents = [self.clean_text(a['title'] + ' ' + a['summary']) for a in relevant_articles]
        interest_text = ' '.join(user_interests)

        try:
            tfidf_matrix = self.vectorizer.fit_transform(documents + [interest_text])
        except ValueError:
            # TfidfVectorizer raises ValueError when the corpus has no usable
            # tokens (empty vocabulary). Nothing can be ranked, so fall back
            # to feed order instead of crashing the newsletter run.
            return relevant_articles[:top_n]

        # Last row is the interest query; all preceding rows are the articles.
        similarities = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1])[0]

        # sorted() is stable, so equally scored articles keep feed order.
        ranked_articles = sorted(
            zip(relevant_articles, similarities),
            key=lambda pair: pair[1],
            reverse=True
        )

        return [article for article, _ in ranked_articles[:top_n]]

    def _summarize(self, article):
        """Return a one-sentence plain-text summary for ``article``."""
        # Feed summaries are frequently HTML; strip the markup before sentence
        # splitting so tags neither leak into the Markdown nor get cut in half
        # by the length limit.
        return self.generate_summary(self.clean_text(article['summary']))

    def generate_newsletter(self, user):
        """Build the Markdown newsletter for ``user`` and return it as a string.

        Layout: title and generation date, a "Today's Highlights" list with
        the two top-ranked articles, then one "<Category> News" section per
        category present in the selection. Sections appear in the order in
        which their category first occurs in the ranking. When nothing was
        selected, the body is a single "No relevant articles" line.
        """
        selected_articles = self.personalize_articles(user)
        date_str = datetime.now().strftime("%Y-%m-%d")
        markdown_content = f"# Personalized Newsletter for {user}\n\n"
        markdown_content += f"*Generated on {date_str}*\n\n"

        if not selected_articles:
            markdown_content += "No relevant articles found for your interests today.\n"
            return markdown_content

        # Summarize each article once; highlights and sections reuse the text.
        summarized = [(article, self._summarize(article)) for article in selected_articles]

        # Add summary section
        markdown_content += "## Today's Highlights\n\n"
        for article, summary in summarized[:2]:  # Top 2 for highlights
            markdown_content += f"- **{article['title']}**: {summary} [Read more]({article['link']})\n"

        # dict.fromkeys de-duplicates while keeping first-seen (ranking) order;
        # a set would make the section order vary between runs.
        categories = dict.fromkeys(article['category'] for article in selected_articles)
        for category in categories:
            markdown_content += f"\n## {category.capitalize()} News\n\n"
            for article, summary in summarized:
                if article['category'] != category:
                    continue
                markdown_content += f"### {article['title']}\n\n"
                markdown_content += f"{summary}\n\n"
                markdown_content += f"[Read Full Article]({article['link']})\n\n"

        return markdown_content

    def save_newsletter(self, user, markdown_content):
        """Write ``markdown_content`` to a dated ``.md`` file and return its name.

        The file is created in the current working directory as
        ``newsletter_<user with spaces replaced by underscores>_<YYYYMMDD>.md``
        and overwrites any existing file of that name.
        """
        filename = f"newsletter_{user.replace(' ', '_')}_{datetime.now().strftime('%Y%m%d')}.md"
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(markdown_content)
        return filename

In [14]:
# Example usage
def main():
    """Fetch every configured feed once, then write one newsletter file per persona."""
    newsletter_builder = NewsletterGenerator()

    print("Fetching articles...")
    newsletter_builder.fetch_articles()

    # Iterating the dict yields the persona names in definition order.
    for persona_name in USER_PERSONAS:
        print(f"Generating newsletter for {persona_name}...")
        markdown = newsletter_builder.generate_newsletter(persona_name)
        saved_as = newsletter_builder.save_newsletter(persona_name, markdown)
        print(f"Newsletter saved as {saved_as}")

if __name__ == "__main__":
    main()

Fetching articles...
Generating newsletter for Alex Parker...
Newsletter saved as newsletter_Alex_Parker_20250412.md
Generating newsletter for Priya Sharma...
Newsletter saved as newsletter_Priya_Sharma_20250412.md
Generating newsletter for Marco Rossi...
Newsletter saved as newsletter_Marco_Rossi_20250412.md
Generating newsletter for Lisa Thompson...
Newsletter saved as newsletter_Lisa_Thompson_20250412.md
Generating newsletter for David Martinez...
Newsletter saved as newsletter_David_Martinez_20250412.md
