In [5]:
pip install feedparser requests newspaper3k pandas lxml[html_clean]

Collecting lxml_html_clean (from lxml[html_clean])
  Downloading lxml_html_clean-0.4.3-py3-none-any.whl.metadata (2.3 kB)
Downloading lxml_html_clean-0.4.3-py3-none-any.whl (14 kB)
Installing collected packages: lxml_html_clean
Successfully installed lxml_html_clean-0.4.3


In [6]:
import feedparser
import requests
from newspaper import Article
import pandas as pd
from datetime import datetime, timedelta
import time

class NewsScraper:
    """Collect business news from Reuters RSS feeds, plus canned demo articles."""

    def __init__(self):
        # RSS endpoints polled by fetch_news_from_feeds().
        self.feeds = [
            'https://feeds.reuters.com/reuters/businessNews',
            'https://feeds.reuters.com/reuters/mergersNews',
            'https://feeds.reuters.com/reuters/companyNews'
        ]

    def fetch_news_from_feeds(self):
        """Fetch up to 10 articles from each feed in ``self.feeds``.

        Returns:
            list[dict]: one dict per article with the keys ``title``, ``url``,
            ``published``, ``source``, ``content``, ``authors`` and
            ``publish_date``. ``authors``/``publish_date`` keep their defaults
            (``[]`` / ``None``) when the full article could not be downloaded,
            in which case ``content`` falls back to the feed summary.
        """
        articles = []

        for feed_url in self.feeds:
            try:
                feed = feedparser.parse(feed_url)
            except Exception as e:
                print(f"Error fetching from {feed_url}: {e}")
                continue

            # feedparser flags fetch/parse problems through ``bozo`` rather than
            # raising, so report them here instead of silently yielding nothing.
            if getattr(feed, 'bozo', False) and not feed.entries:
                reason = getattr(feed, 'bozo_exception', 'feed could not be parsed')
                print(f"Error fetching from {feed_url}: {reason}")
                continue

            for entry in feed.entries[:10]:  # Get latest 10 articles per feed
                link = entry.get('link')
                if not link:
                    # Nothing to download or reference; skip this entry only,
                    # rather than abandoning the remainder of the feed.
                    continue

                article_data = {
                    'title': entry.get('title', ''),
                    'url': link,
                    'published': entry.get('published', ''),
                    'source': 'Reuters',
                    'content': '',
                    # Defaults keep every record on the same schema.
                    'authors': [],
                    'publish_date': None,
                }

                # Try to get full content
                try:
                    article = Article(link)
                    article.download()
                    article.parse()
                    article_data['content'] = article.text
                    article_data['authors'] = article.authors
                    article_data['publish_date'] = article.publish_date
                except Exception:
                    # Best effort: keep the entry, using the feed summary instead.
                    # (Exception, not a bare except, so Ctrl-C still interrupts.)
                    article_data['content'] = entry.get('summary', '')

                articles.append(article_data)
                time.sleep(0.5)  # Be respectful to servers

        return articles

    def get_sample_financial_news(self):
        """Return four hard-coded sample articles for demonstration.

        Each dict has ``title``, ``content``, ``url``, ``published`` (the
        current time in ISO format) and ``source``.
        """
        sample_news = [
            {
                'title': 'Microsoft acquires AI startup Lumier for $500 million',
                'content': 'Microsoft Corporation announced today that it has acquired Lumier, an artificial intelligence startup, for approximately $500 million. The acquisition will strengthen Microsoft\'s AI capabilities in the enterprise sector.',
                'url': 'https://example.com/microsoft-lumier',
                'published': datetime.now().isoformat(),
                'source': 'Sample Financial News'
            },
            {
                'title': 'JPMorgan Chase invests in blockchain company ChainTech',
                'content': 'JPMorgan Chase & Co. has made a strategic investment in ChainTech, a blockchain technology company. The investment is part of JPMorgan\'s ongoing digital transformation initiatives.',
                'url': 'https://example.com/jpmorgan-chaintech',
                'published': datetime.now().isoformat(),
                'source': 'Sample Financial News'
            },
            {
                'title': 'Google DeepMind partners with pharmaceutical giant',
                'content': 'Google DeepMind has entered into a partnership with a major pharmaceutical company to develop AI-driven drug discovery platforms.',
                'url': 'https://example.com/deepmind-pharma',
                'published': datetime.now().isoformat(),
                'source': 'Sample Financial News'
            },
            {
                'title': 'Apple reports strong quarterly earnings',
                'content': 'Apple Inc. reported better-than-expected quarterly earnings, driven by strong iPhone sales and services growth.',
                'url': 'https://example.com/apple-earnings',
                'published': datetime.now().isoformat(),
                'source': 'Sample Financial News'
            }
        ]
        return sample_news

# if __name__ == "__main__":
# (The guard above is left disabled, so the statements below run directly at cell level.)
# Build a scraper, pull articles from its configured feeds, and report how many came back.
scraper = NewsScraper()
articles = scraper.fetch_news_from_feeds()
print(f"Fetched {len(articles)} articles")

Fetched 0 articles


In [7]:
import os

# Read the NewsAPI key from the environment instead of storing it in the notebook.
# Falls back to an empty string when NEWSAPI_KEY is not set.
# NOTE(review): a key was previously hardcoded in this cell in plain text;
# treat it as compromised and rotate it.
newsapi_key = os.environ.get("NEWSAPI_KEY", "")

In [8]:
import feedparser
import requests
from newspaper import Article
import pandas as pd
from datetime import datetime, timedelta
import time
import random

class NewsScraper:
    """Collect financial news from RSS feeds, with NewsAPI and sample-data fallbacks.

    ``get_news()`` is the main entry point: it tries the RSS feeds first, then
    NewsAPI, and finally (optionally) a set of built-in sample articles.
    """

    def __init__(self):
        # Updated valid RSS feeds for financial news
        self.feeds = [
            # Yahoo Finance RSS feeds
            'https://finance.yahoo.com/news/rssindex',
            'https://feeds.finance.yahoo.com/rss/2.0/headline?s=msft,aapl,goog,jpm,gs&region=US&lang=en-US',

            # MarketWatch RSS
            'http://feeds.marketwatch.com/marketwatch/topstories/',
            'http://feeds.marketwatch.com/marketwatch/marketpulse/',

            # Investing.com RSS
            'https://www.investing.com/rss/news_301.rss',
            'https://www.investing.com/rss/market_overview.rss',

            # CNBC RSS feeds
            'https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=10000664',  # Top Business News
            'https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15839135',  # M&A News

            # Bloomberg RSS (markets)
            'https://feeds.bloomberg.com/markets/news.rss',

            # Financial Times (business news)
            'https://www.ft.com/business?format=rss',

            # The Wall Street Journal (business)
            'https://feeds.a.dj.com/rss/RSSWSJD.xml',

            # Business Insider
            'https://markets.businessinsider.com/rss/news',
        ]

        # Set headers to mimic a real browser
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def fetch_news_from_feeds(self):
        """Fetch up to 6 articles from each feed in ``self.feeds``.

        A feed is skipped when it reports an HTTP error status (4xx/5xx) or has
        no entries. An entry is kept only if its content (full article text, or
        the feed summary as a fallback) is longer than 50 characters.

        Returns:
            list[dict]: dicts with the keys ``title``, ``url``, ``published``,
            ``source``, ``content`` and ``summary``.
        """
        articles = []
        successful_feeds = 0

        for feed_url in self.feeds:
            try:
                print(f"Attempting to fetch from: {feed_url}")

                # Send the browser-like User-Agent configured in __init__.
                feed = feedparser.parse(feed_url, agent=self.headers.get('User-Agent'))

                # feedparser reports the redirect code (301/302) for a feed that
                # was redirected but still fetched, so only 4xx/5xx are failures.
                status = getattr(feed, 'status', None)
                if status is not None and status >= 400:
                    print(f"Feed returned status {status} for {feed_url}")
                    continue

                if not feed.entries:
                    print(f"No entries found in feed: {feed_url}")
                    continue

                print(f"✓ Found {len(feed.entries)} entries in {feed_url}")
                successful_feeds += 1

                for entry in feed.entries[:6]:  # Get latest 6 articles per feed
                    try:
                        # Skip if no title
                        if not hasattr(entry, 'title') or not entry.title:
                            continue

                        article_data = {
                            'title': entry.title,
                            'url': entry.link,
                            'published': entry.published if 'published' in entry else datetime.now().isoformat(),
                            'source': self._get_source_name(feed_url),
                            'content': '',
                            'summary': entry.summary if hasattr(entry, 'summary') else ''
                        }

                        # Try to get full content with newspaper3k
                        full_content = self._get_article_content(entry.link)
                        if full_content:
                            article_data['content'] = full_content
                        else:
                            # Fallback to summary
                            article_data['content'] = article_data['summary']

                        # Only add if we have reasonable content
                        if article_data['content'] and len(article_data['content']) > 50:
                            articles.append(article_data)
                            print(f"  ✓ Added: {entry.title[:60]}...")
                        else:
                            print(f"  ⚠ Skipped (insufficient content): {entry.title[:60]}...")

                    except Exception as e:
                        print(f"  Error processing article: {e}")
                        continue

                    time.sleep(0.5)  # Be respectful to servers

            except Exception as e:
                print(f"✗ Error fetching from {feed_url}: {str(e)[:100]}...")
                continue

        print(f"Successfully processed {successful_feeds}/{len(self.feeds)} feeds")
        return articles

    def _get_source_name(self, feed_url):
        """Map a feed URL to a display name via substring match.

        Returns ``'Financial News'`` when no known key occurs in ``feed_url``.
        """
        source_map = {
            'yahoo': 'Yahoo Finance',
            'marketwatch': 'MarketWatch',
            'investing': 'Investing.com',
            'cnbc': 'CNBC',
            'bloomberg': 'Bloomberg',
            'ft.com': 'Financial Times',
            'dj.com': 'Wall Street Journal',
            'businessinsider': 'Business Insider'
        }

        for key, name in source_map.items():
            if key in feed_url:
                return name
        return 'Financial News'

    def _get_article_content(self, url):
        """Download and parse ``url`` with newspaper3k.

        Returns:
            str | None: the article text when it is longer than 100 characters,
            otherwise ``None`` (also ``None`` on any download/parse error).
        """
        try:
            # Create Article object with timeout
            article = Article(url, request_timeout=10)
            article.download()
            article.parse()

            # Return text if we got meaningful content
            if article.text and len(article.text) > 100:
                return article.text
            return None

        except Exception:
            # Deliberately quiet: the caller falls back to the feed summary.
            return None

    def fetch_financial_news_api(self, api_key=None):
        """Fetch up to 10 business headlines from NewsAPI.org.

        Args:
            api_key: NewsAPI key. When omitted, the ``NEWSAPI_KEY`` environment
                variable is used. With no usable key the method returns ``[]``
                without making a request.

        Returns:
            list[dict]: dicts with the keys ``title``, ``url``, ``published``,
            ``source`` and ``content``; empty on a non-200 response or error.
        """
        import os  # local import: only this method needs it

        articles = []

        newsapi_key = api_key or os.environ.get('NEWSAPI_KEY', '')
        if not newsapi_key or newsapi_key == 'your_api_key_here':
            return articles

        try:
            # The key travels in a header rather than the URL so it cannot end
            # up in the exception text printed below.
            response = requests.get(
                'https://newsapi.org/v2/top-headlines',
                params={'category': 'business', 'language': 'en'},
                headers={'X-Api-Key': newsapi_key},
                timeout=10,
            )
            if response.status_code == 200:
                data = response.json()
                for article in (data.get('articles') or [])[:10]:
                    # Fields can be JSON null; `or` covers both missing and None
                    # so one incomplete article cannot abort the whole batch.
                    description = article.get('description') or ''
                    body = article.get('content') or ''
                    source = article.get('source') or {}
                    articles.append({
                        'title': article.get('title') or '',
                        'url': article.get('url') or '',
                        'published': article.get('publishedAt') or datetime.now().isoformat(),
                        'source': source.get('name') or 'NewsAPI',
                        'content': description + ' ' + body,
                    })
        except Exception as e:
            print(f"NewsAPI error: {e}")

        return articles

    def get_sample_financial_news(self):
        """Return six hard-coded sample articles for demonstration.

        Each dict has ``title``, ``content``, ``url``, ``published`` (the
        current time in ISO format) and ``source``.
        """
        sample_news = [
            {
                'title': 'Microsoft acquires AI startup Lumier for $500 million',
                'content': 'Microsoft Corporation announced today that it has acquired Lumier, an artificial intelligence startup, for approximately $500 million. The acquisition will strengthen Microsoft\'s AI capabilities in the enterprise sector. The deal is expected to close in the next quarter and will see Lumier\'s team joining Microsoft\'s AI research division. Lumier was founded in 2020 and has developed advanced machine learning algorithms for business automation.',
                'url': 'https://example.com/microsoft-lumier',
                'published': datetime.now().isoformat(),
                'source': 'Sample Financial News'
            },
            {
                'title': 'JPMorgan Chase invests in blockchain company ChainTech',
                'content': 'JPMorgan Chase & Co. has made a strategic investment in ChainTech, a blockchain technology company. The investment is part of JPMorgan\'s ongoing digital transformation initiatives and aims to enhance their blockchain capabilities for cross-border payments and settlement systems. The amount of the investment was not disclosed, but sources close to the deal estimate it to be around $50 million.',
                'url': 'https://example.com/jpmorgan-chaintech',
                'published': datetime.now().isoformat(),
                'source': 'Sample Financial News'
            },
            {
                'title': 'Google DeepMind partners with pharmaceutical giant',
                'content': 'Google DeepMind has entered into a partnership with Pfizer to develop AI-driven drug discovery platforms. The collaboration will leverage DeepMind\'s AlphaFold technology to accelerate drug development and identify new treatment options for various diseases. This partnership represents a significant step in applying artificial intelligence to pharmaceutical research and development.',
                'url': 'https://example.com/deepmind-pharma',
                'published': datetime.now().isoformat(),
                'source': 'Sample Financial News'
            },
            {
                'title': 'Apple reports strong quarterly earnings',
                'content': 'Apple Inc. reported better-than-expected quarterly earnings, driven by strong iPhone sales and services growth. The company exceeded analyst expectations with revenue of $89.5 billion, representing 8% year-over-year growth. Services revenue reached an all-time high of $19.5 billion. CEO Tim Cook highlighted the company\'s continued innovation and strong customer loyalty as key drivers of performance.',
                'url': 'https://example.com/apple-earnings',
                'published': datetime.now().isoformat(),
                'source': 'Sample Financial News'
            },
            {
                'title': 'Amazon launches new AI tools for cloud customers',
                'content': 'Amazon Web Services announced new artificial intelligence tools designed to help businesses build and deploy AI applications more easily. The new services include enhanced machine learning capabilities and improved natural language processing tools for enterprise customers. The launch positions AWS to compete more aggressively in the rapidly growing AI services market against Microsoft Azure and Google Cloud.',
                'url': 'https://example.com/amazon-ai-tools',
                'published': datetime.now().isoformat(),
                'source': 'Sample Financial News'
            },
            {
                'title': 'Tesla announces new battery technology breakthrough',
                'content': 'Tesla has unveiled a new battery technology that promises to extend electric vehicle range by 30% while reducing costs. The breakthrough in solid-state battery design could significantly impact the electric vehicle market and accelerate the transition to sustainable transportation. Tesla CEO Elon Musk stated that the new batteries will enter production next year.',
                'url': 'https://example.com/tesla-battery',
                'published': datetime.now().isoformat(),
                'source': 'Sample Financial News'
            }
        ]
        return sample_news

    def get_news(self, use_sample_if_empty=True):
        """Collect news, trying each source in turn until one yields articles.

        Order: RSS feeds, then NewsAPI, then (only when ``use_sample_if_empty``
        is true) the built-in sample articles.

        Returns:
            list[dict]: the collected articles; may be empty when every source
            fails and ``use_sample_if_empty`` is false.
        """
        print("Starting news collection from financial RSS feeds...")

        # Try RSS feeds first
        articles = self.fetch_news_from_feeds()

        # If no articles from RSS, try API method
        if not articles:
            print("No articles from RSS feeds, trying API sources...")
            articles = self.fetch_financial_news_api()

        # If still no articles and sample is allowed, use sample data
        if not articles and use_sample_if_empty:
            print("Using comprehensive sample financial news data...")
            articles = self.get_sample_financial_news()

        print(f"🎯 Total articles collected: {len(articles)}")
        return articles

# Test the improved scraper
# if __name__ == "__main__":
# (The guard above is left disabled, so the statements below run directly at cell level.)
# Collect articles, allowing the built-in sample data as a last resort.
scraper = NewsScraper()
articles = scraper.get_news(use_sample_if_empty=True)

print(f"\n=== FINAL RESULTS ===")
print(f"Total articles: {len(articles)}")

# List every article, numbered from 1, with its source and the first 120 characters of content.
for i, article in enumerate(articles, 1):
    print(f"\n{i}. {article['title']}")
    print(f"   Source: {article['source']}")
    print(f"   Content: {article['content'][:120]}...")

Starting news collection from financial RSS feeds...
Attempting to fetch from: https://finance.yahoo.com/news/rssindex
✓ Found 48 entries in https://finance.yahoo.com/news/rssindex
  ✓ Added: 1 Top Growth Stock to Buy and Hold for the Next 10 Years...
  ✓ Added: Best high-yield savings interest rates today, October 19, 20...
  ✓ Added: Best money market account rates today, October 19, 2025 (bes...
  ✓ Added: Best CD rates today, October 19, 2025 (lock in up to 4.25% A...
  ✓ Added: Mortgage and refinance interest rates today, October 19, 202...
  ✓ Added: Analyst Report: State Street Corp....
Attempting to fetch from: https://feeds.finance.yahoo.com/rss/2.0/headline?s=msft,aapl,goog,jpm,gs&region=US&lang=en-US
✓ Found 20 entries in https://feeds.finance.yahoo.com/rss/2.0/headline?s=msft,aapl,goog,jpm,gs&region=US&lang=en-US
  ✓ Added: Apple stock hits all-time high as iPhone 17 sales outpace iP...
  ✓ Added: Stock market today: Dow, S&P 500, Nasdaq jump amid trade-war...
  ✓ Added: Wi

In [9]:
# Build a DataFrame from the collected article dicts and preview the first rows.
df = pd.DataFrame(articles)
df.head()

Unnamed: 0,title,url,published,source,content,summary
0,1 Top Growth Stock to Buy and Hold for the Nex...,https://finance.yahoo.com/news/1-top-growth-st...,2025-10-19T10:15:00Z,Yahoo Finance,Key Points\n\nASML's latest quarterly results ...,
1,"Best high-yield savings interest rates today, ...",https://finance.yahoo.com/personal-finance/art...,2025-10-19T10:00:34Z,Yahoo Finance,Find out how much you could earn with today’s ...,
2,"Best money market account rates today, October...",https://finance.yahoo.com/personal-finance/ban...,2025-10-19T10:00:40Z,Yahoo Finance,Find out how much you could earn with today’s ...,
3,"Best CD rates today, October 19, 2025 (lock in...",https://finance.yahoo.com/personal-finance/ban...,2025-10-19T10:00:51Z,Yahoo Finance,Find out how much you could earn by locking in...,
4,"Mortgage and refinance interest rates today, O...",https://finance.yahoo.com/personal-finance/mor...,2025-10-19T10:00:23Z,Yahoo Finance,The current 30-year fixed mortgage rate has fa...,


In [11]:
# Show the (rows, columns) dimensions of the article table.
df.shape

(48, 6)

In [12]:
# Count missing values in each column.
df.isna().sum()

Unnamed: 0,0
title,0
url,0
published,0
source,0
content,0
summary,0


In [13]:
# Count rows that exactly repeat an earlier row across all columns.
df.duplicated().sum()

np.int64(0)

In [10]:
# Tally how many articles each source contributed.
df['source'].value_counts()

Unnamed: 0_level_0,count
source,Unnamed: 1_level_1
Yahoo Finance,12
CNBC,12
Investing.com,6
Bloomberg,6
Wall Street Journal,6
Business Insider,6


In [14]:
# Mount Google Drive at /content/drive so later cells can write to it.
# NOTE(review): google.colab is presumably only importable inside Colab — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
from pathlib import Path

# Destination for the scraped articles on the mounted Google Drive.
NEWS_CSV_PATH = Path('/content/drive/My Drive/MLOps/news_alert/news.csv')

# Create the target folder first: to_csv raises when the directory does not exist.
NEWS_CSV_PATH.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(NEWS_CSV_PATH, index=False)