In [2]:
import re
import time
import warnings
from collections import defaultdict
from datetime import datetime, timedelta
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests
import streamlit as st
import tweepy
import yfinance as yf
from bs4 import BeautifulSoup
from transformers import pipeline
warnings.filterwarnings('ignore')



class IndianNewsCollector:
    """Collect market headlines from Indian financial news websites.

    Each ``scrape_*`` method downloads one listing page through a shared
    ``requests.Session``, extracts headline/link pairs from the HTML and
    returns them as dictionaries with the keys ``title``, ``url``,
    ``timestamp``, ``source`` and ``category``.  A failure while fetching or
    parsing a page is reported with ``st.warning`` and yields an empty list
    instead of raising.
    """

    # Seconds to wait for a listing page before giving up.
    REQUEST_TIMEOUT = 10
    # Only the first N candidate nodes of each page are inspected.
    MAX_ARTICLES_PER_SOURCE = 15
    # A headline must be strictly longer than this many characters to be kept.
    MIN_TITLE_LENGTH = 10
    # Word-overlap score above which two headlines count as the same story.
    DUPLICATE_THRESHOLD = 0.8

    ECONOMIC_TIMES_URL = "https://economictimes.indiatimes.com/markets/stocks/news"
    MONEYCONTROL_URL = "https://www.moneycontrol.com/news/business/markets/"
    LIVEMINT_URL = "https://www.livemint.com/market"

    def __init__(self):
        """Create the HTTP session and give it a browser-like User-Agent."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _absolute_url(base_url, href):
        """Resolve *href* against *base_url* and return an http(s) URL.

        Args:
            base_url: URL of the page the link was found on.
            href: Raw ``href`` attribute value (may be relative).

        Returns:
            str: The absolute URL, or ``''`` when *href* is missing, is a
            bare fragment, or does not resolve to an http(s) address
            (e.g. ``javascript:`` / ``mailto:`` links).
        """
        if not isinstance(href, str):
            return ''
        href = href.strip()
        # A bare fragment only points back at the listing page itself.
        if not href or href.startswith('#'):
            return ''
        # urljoin handles root-relative, path-relative and protocol-relative
        # ("//host/path") references, which plain string concatenation with
        # the site root would turn into malformed URLs.
        resolved = urljoin(base_url, href)
        return resolved if resolved.startswith(('http://', 'https://')) else ''

    @staticmethod
    def _make_item(title, url, source):
        """Build one news record; the timestamp is the time of scraping."""
        return {
            'title': title,
            'url': url,
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M'),
            'source': source,
            'category': 'Markets'
        }

    def _fetch_soup(self, url):
        """Download *url* and return it parsed with ``html.parser``.

        Raises whatever ``requests`` raises for connection problems,
        timeouts or non-2xx responses (via ``raise_for_status``).
        """
        response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')

    def _scrape(self, page_url, source, select_articles, extract):
        """Fetch one listing page and turn its article nodes into records.

        Args:
            page_url: Listing page to download; also the base for resolving
                relative links.
            source: Value stored under the ``source`` key of every record.
            select_articles: Callable ``soup -> list`` of candidate nodes.
            extract: Callable ``node -> (title, href)``.

        Returns:
            list: Records for the candidates that have a long-enough title
            and a usable link.
        """
        soup = self._fetch_soup(page_url)
        news_items = []

        for article in select_articles(soup)[:self.MAX_ARTICLES_PER_SOURCE]:
            try:
                title, href = extract(article)
                link = self._absolute_url(page_url, href)
                if title and len(title) > self.MIN_TITLE_LENGTH and link:
                    news_items.append(self._make_item(title, link, source))
            except Exception:
                # Best effort: one malformed node must not discard the
                # remaining headlines of the page.
                continue

        return news_items

    @staticmethod
    def _extract_first_anchor(article):
        """Return ``(title, href)`` from the node's first ``<a>``, else the node."""
        title_elem = article.find('a') or article
        return title_elem.get_text(strip=True), title_elem.get('href', '')

    # ------------------------------------------------------------------
    # Economic Times
    # ------------------------------------------------------------------
    @staticmethod
    def _select_economic_times(soup):
        """Story containers, falling back to anchors whose class mentions 'news'."""
        return (soup.find_all('div', {'class': re.compile('eachStory')})
                or soup.find_all('a', {'class': re.compile('news')}))

    @staticmethod
    def _extract_economic_times(article):
        """Return ``(title, href)`` for an Economic Times candidate node."""
        title_elem = article.find('h3') or article.find('h4') or article
        title = title_elem.get_text(strip=True)
        # The node is either the <a> itself (fallback selector) or a
        # container whose first <a> with an href carries the link.
        href = article.get('href', '')
        if not href:
            anchor = article.find('a', href=True)
            href = anchor['href'] if anchor else ''
        return title, href

    def scrape_economic_times_news(self):
        """
        Scrape latest market news from Economic Times
        Returns:
            list: List of news articles with metadata
        """
        try:
            return self._scrape(
                self.ECONOMIC_TIMES_URL,
                'Economic Times',
                self._select_economic_times,
                self._extract_economic_times,
            )
        except Exception as e:
            st.warning(f"Could not fetch Economic Times news: {str(e)}")
            return []

    # ------------------------------------------------------------------
    # Moneycontrol
    # ------------------------------------------------------------------
    @staticmethod
    def _select_moneycontrol(soup):
        """``li.clearfix`` nodes, then ``<h2>``, then anchors with a 'news' class."""
        return (soup.find_all('li', {'class': re.compile('clearfix')})
                or soup.find_all('h2')
                or soup.find_all('a', {'class': re.compile('news')}))

    def scrape_moneycontrol_news(self):
        """
        Scrape latest market news from Moneycontrol
        Returns:
            list: List of news articles with metadata
        """
        try:
            return self._scrape(
                self.MONEYCONTROL_URL,
                'MoneyControl',
                self._select_moneycontrol,
                self._extract_first_anchor,
            )
        except Exception as e:
            st.warning(f"Could not fetch MoneyControl news: {str(e)}")
            return []

    # ------------------------------------------------------------------
    # LiveMint
    # ------------------------------------------------------------------
    @staticmethod
    def _select_mint(soup):
        """All ``<h3>`` headings, falling back to ``<h2>`` when there are none."""
        return soup.find_all('h3') or soup.find_all('h2')

    def scrape_mint_news(self):
        """
        Scrape latest market news from LiveMint
        Returns:
            list: List of news articles with metadata
        """
        try:
            return self._scrape(
                self.LIVEMINT_URL,
                'LiveMint',
                self._select_mint,
                self._extract_first_anchor,
            )
        except Exception as e:
            st.warning(f"Could not fetch LiveMint news: {str(e)}")
            return []

    # ------------------------------------------------------------------
    # Aggregation
    # ------------------------------------------------------------------
    def get_all_indian_market_news(self):
        """
        Aggregate news from all Indian sources
        Returns:
            list: Combined list of news articles, near-duplicate headlines
            removed (first occurrence wins), newest timestamp first
        """
        all_news = []

        sources = (
            self.scrape_economic_times_news,
            self.scrape_moneycontrol_news,
            self.scrape_mint_news,
        )

        for source_func in sources:
            try:
                all_news.extend(source_func())
            except Exception as e:
                st.warning(f"Error fetching from {source_func.__name__}: {str(e)}")

        unique_news = []
        seen_titles = []

        for news_item in all_news:
            title_lower = news_item['title'].lower()
            is_duplicate = any(
                self._calculate_similarity(title_lower, seen_title) > self.DUPLICATE_THRESHOLD
                for seen_title in seen_titles
            )
            if is_duplicate:
                continue
            unique_news.append(news_item)
            seen_titles.append(title_lower)

        # The sort is stable, so records with equal timestamps keep their
        # collection order.
        return sorted(unique_news, key=lambda x: x['timestamp'], reverse=True)

    def _calculate_similarity(self, text1, text2):
        """Return the Jaccard index of the whitespace-separated word sets.

        The result is ``|A ∩ B| / |A ∪ B|`` in the range 0..1, and ``0`` when
        both texts contain no words.
        """
        words1 = set(text1.split())
        words2 = set(text2.split())
        union = words1 | words2
        return len(words1 & words2) / len(union) if union else 0


In [3]:
if __name__ == "__main__":
    # Demo entry point: collect headlines from every source and print each
    # record as a small labelled block on stdout.
    from pprint import pprint  # not used below; import kept as in the original cell

    collector = IndianNewsCollector()
    news_list = collector.get_all_indian_market_news()

    for news in news_list:
        # One print call per record; sep="\n" puts each field on its own
        # line, and the leading "\n" leaves a blank line between records.
        print(
            f"\n📰 {news['title']}",
            f"🔗 URL: {news['url']}",
            f"⏱  Timestamp: {news['timestamp']}",
            f"📍 Source: {news['source']}",
            f"📂 Category: {news['category']}",
            sep="\n",
        )



📰 Will consumer stocks see a comeback this festive season? 12 stocks to keep an eye on even when analysts are not bullish
🔗 URL: https://economictimes.indiatimes.com/markets/stocks/news/will-consumer-stocks-see-a-comeback-this-festive-season-12-stocks-to-keep-an-eye-on-even-when-analysts-are-not-bullish/articleshow/122797405.cms
⏱  Timestamp: 2025-07-20 17:58
📍 Source: Economic Times
📂 Category: Markets

📰 How global forces are reshaping base metal prices
🔗 URL: https://economictimes.indiatimes.com/markets/stocks/news/how-global-forces-are-reshaping-base-metal-prices/articleshow/122797289.cms
⏱  Timestamp: 2025-07-20 17:58
📍 Source: Economic Times
📂 Category: Markets

📰 Apollo Tyres, Brigade Enterprises among 10 small-cap stocks trading below industry PE; may rally up to 43%
🔗 URL: https://economictimes.indiatimes.com/markets/stocks/news/apollo-tyres-brigade-enterprises-among-10-small-cap-stocks-trading-below-industry-pe-may-rally-up-to-43/slideshow/122796421.cms
⏱  Timestamp: 2025-07

In [None]:
import csv

# Persist the collected records to data.csv (overwritten on every run).
# `news_list` is the list of article dictionaries built in the previous cell.
fieldnames = ['title', 'url', 'timestamp', 'source', 'category']

# newline='' lets the csv module control line endings itself, as the csv
# documentation recommends for files passed to a writer.
with open('data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(news_list)