In [2]:
import requests
from bs4 import BeautifulSoup
import json
import csv
from datetime import datetime
import re
import time
from urllib.parse import urljoin, urlparse
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class NewsExtractor:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        
    def extract_from_url(self, url):
        """Extract news article data from a single URL"""
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Extract article data
            article_data = {
                'url': url,
                'title': self._extract_title(soup),
                'description': self._extract_description(soup),
                'full_content': self._extract_full_content(soup),
                'source': self._extract_source(soup, url),
                'date': self._extract_date(soup),
                'time': self._extract_time(soup),
                'extracted_at': datetime.now().isoformat()
            }
            
            return article_data
            
        except Exception as e:
            logger.error(f"Error extracting from {url}: {str(e)}")
            return None
    
    def _extract_title(self, soup):
        """Extract article title using multiple selectors"""
        selectors = [
            'h1',
            '.headline',
            '.title',
            '[class*="title"]',
            '[class*="headline"]',
            'meta[property="og:title"]',
            'meta[name="twitter:title"]',
            'title'
        ]
        
        for selector in selectors:
            if selector.startswith('meta'):
                element = soup.find('meta', attrs={'property': selector.split('[')[1].split('=')[1].strip('"')}) or \
                         soup.find('meta', attrs={'name': selector.split('[')[1].split('=')[1].strip('"')})
                if element:
                    return element.get('content', '').strip()
            else:
                element = soup.select_one(selector)
                if element:
                    return element.get_text().strip()
        
        return "Title not found"
    
    def _extract_description(self, soup):
        """Extract article description/summary"""
        selectors = [
            'meta[name="description"]',
            'meta[property="og:description"]',
            'meta[name="twitter:description"]',
            '.summary',
            '.description',
            '.excerpt',
            '.lead',
            'p'
        ]
        
        for selector in selectors:
            if selector.startswith('meta'):
                element = soup.find('meta', attrs={'name': selector.split('[')[1].split('=')[1].strip('"')}) or \
                         soup.find('meta', attrs={'property': selector.split('[')[1].split('=')[1].strip('"')})
                if element:
                    return element.get('content', '').strip()
            else:
                element = soup.select_one(selector)
                if element:
                    text = element.get_text().strip()
                    if len(text) > 50:  # Ensure it's substantial content
                        return text[:500] + "..." if len(text) > 500 else text
        
        return "Description not found"
    
    def _extract_full_content(self, soup):
        """Extract the full article content"""
        # Common selectors for article content
        content_selectors = [
            'article',
            '.article-content',
            '.story-content',
            '.post-content',
            '.entry-content',
            '.content',
            '.article-body',
            '.story-body',
            '.post-body',
            '.main-content',
            '[class*="article-content"]',
            '[class*="story-content"]',
            '[class*="post-content"]',
            '[id*="article-content"]',
            '[id*="story-content"]'
        ]
        
        # Try to find the main content container
        content_container = None
        for selector in content_selectors:
            content_container = soup.select_one(selector)
            if content_container:
                break
        
        if content_container:
            # Clean up the content
            content_text = self._clean_article_content(content_container)
            return content_text
        
        # Fallback: try to extract from multiple paragraph tags
        paragraphs = soup.find_all('p')
        if paragraphs:
            # Filter out navigation, footer, and other non-content paragraphs
            content_paragraphs = []
            for p in paragraphs:
                text = p.get_text().strip()
                parent_classes = ' '.join(p.parent.get('class', []))
                
                # Skip if paragraph is in navigation, footer, or sidebar
                if any(skip_class in parent_classes.lower() for skip_class in 
                       ['nav', 'footer', 'sidebar', 'menu', 'header', 'ad', 'comment']):
                    continue
                
                # Skip very short paragraphs (likely not main content)
                if len(text) > 20:
                    content_paragraphs.append(text)
            
            if content_paragraphs:
                return '\n\n'.join(content_paragraphs)
        
        return "Full content not found"
    
    def _clean_article_content(self, content_container):
        """Clean and format the extracted article content"""
        # Remove unwanted elements
        unwanted_tags = ['script', 'style', 'nav', 'footer', 'aside', 'header', 'form']
        for tag in unwanted_tags:
            for element in content_container.find_all(tag):
                element.decompose()
        
        # Remove elements with common non-content classes
        unwanted_classes = ['ad', 'advertisement', 'social', 'share', 'comment', 'related', 'sidebar', 'navigation']
        for class_name in unwanted_classes:
            for element in content_container.find_all(class_=lambda x: x and any(unwanted in ' '.join(x).lower() for unwanted in unwanted_classes)):
                element.decompose()
        
        # Extract text from paragraphs, maintaining structure
        content_parts = []
        for element in content_container.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            text = element.get_text().strip()
            if text and len(text) > 10:  # Filter out very short text
                # Add extra spacing for headings
                if element.name.startswith('h'):
                    content_parts.append(f"\n{text}\n")
                else:
                    content_parts.append(text)
        
        # Join content and clean up
        full_content = '\n\n'.join(content_parts)
        
        # Clean up extra whitespace
        full_content = re.sub(r'\n\s*\n', '\n\n', full_content)
        full_content = re.sub(r'[ \t]+', ' ', full_content)
        
        return full_content.strip()
    
    def _extract_source(self, soup, url):
        """Extract news source/publication name"""
        # Try to get from meta tags first
        meta_selectors = [
            'meta[property="og:site_name"]',
            'meta[name="application-name"]',
            'meta[name="author"]'
        ]
        
        for selector in meta_selectors:
            element = soup.find('meta', attrs={'property': selector.split('[')[1].split('=')[1].strip('"')}) or \
                     soup.find('meta', attrs={'name': selector.split('[')[1].split('=')[1].strip('"')})
            if element:
                return element.get('content', '').strip()
        
        # Try to get from common selectors
        selectors = [
            '.source',
            '.publication',
            '.site-name',
            '.logo',
            'header .brand'
        ]
        
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text().strip()
        
        # Fallback to domain name
        domain = urlparse(url).netloc
        return domain.replace('www.', '')
    
    def _extract_date(self, soup):
        """Extract publication date"""
        # Try meta tags first
        meta_selectors = [
            'meta[property="article:published_time"]',
            'meta[name="publish_date"]',
            'meta[name="date"]'
        ]
        
        for selector in meta_selectors:
            element = soup.find('meta', attrs={'property': selector.split('[')[1].split('=')[1].strip('"')}) or \
                     soup.find('meta', attrs={'name': selector.split('[')[1].split('=')[1].strip('"')})
            if element:
                return self._parse_datetime(element.get('content', ''))
        
        # Try time elements
        time_element = soup.find('time')
        if time_element:
            datetime_attr = time_element.get('datetime')
            if datetime_attr:
                return self._parse_datetime(datetime_attr)
        
        # Try common date selectors
        selectors = [
            '.date',
            '.published',
            '.timestamp',
            '[class*="date"]',
            '[class*="time"]'
        ]
        
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                return self._parse_datetime(element.get_text())
        
        return datetime.now().strftime('%Y-%m-%d')
    
    def _extract_time(self, soup):
        """Extract publication time"""
        # Similar to date extraction but focusing on time
        time_element = soup.find('time')
        if time_element:
            datetime_attr = time_element.get('datetime')
            if datetime_attr:
                parsed_time = self._parse_datetime(datetime_attr, return_time=True)
                if parsed_time:
                    return parsed_time
        
        # Try to extract from text content
        selectors = [
            '.time',
            '.timestamp',
            '[class*="time"]'
        ]
        
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                time_text = element.get_text()
                parsed_time = self._parse_datetime(time_text, return_time=True)
                if parsed_time:
                    return parsed_time
        
        return datetime.now().strftime('%H:%M:%S')
    
    def _parse_datetime(self, datetime_str, return_time=False):
        """Parse datetime string to extract date or time"""
        if not datetime_str:
            return None
        
        # Common datetime formats
        formats = [
            '%Y-%m-%dT%H:%M:%S',
            '%Y-%m-%dT%H:%M:%SZ',
            '%Y-%m-%d %H:%M:%S',
            '%Y-%m-%d',
            '%d/%m/%Y',
            '%m/%d/%Y',
            '%B %d, %Y',
            '%d %B %Y'
        ]
        
        datetime_str = datetime_str.strip()
        
        for fmt in formats:
            try:
                parsed_dt = datetime.strptime(datetime_str, fmt)
                if return_time:
                    return parsed_dt.strftime('%H:%M:%S')
                else:
                    return parsed_dt.strftime('%Y-%m-%d')
            except ValueError:
                continue
        
        # Try to extract date with regex
        date_pattern = r'(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})|(\d{4}[/-]\d{1,2}[/-]\d{1,2})'
        match = re.search(date_pattern, datetime_str)
        if match:
            date_str = match.group()
            try:
                if '/' in date_str:
                    parsed_dt = datetime.strptime(date_str, '%m/%d/%Y' if date_str.split('/')[2] > '31' else '%d/%m/%Y')
                else:
                    parsed_dt = datetime.strptime(date_str, '%Y-%m-%d')
                
                if return_time:
                    return parsed_dt.strftime('%H:%M:%S')
                else:
                    return parsed_dt.strftime('%Y-%m-%d')
            except ValueError:
                pass
        
        return None

class NewsAggregator:
    def __init__(self):
        self.extractor = NewsExtractor()
        self.news_sources = {
            'BBC': 'https://www.bbc.com/news',
            'CNN': 'https://www.cnn.com',
            'Reuters': 'https://www.reuters.com',
            'AP News': 'https://apnews.com',
            'The Guardian': 'https://www.theguardian.com'
        }
    
    def get_article_links(self, source_url, max_links=10):
        """Get article links from a news source homepage"""
        try:
            response = self.extractor.session.get(source_url, timeout=10)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find all article links
            links = []
            for link in soup.find_all('a', href=True):
                href = link['href']
                
                # Convert relative URLs to absolute
                if href.startswith('/'):
                    href = urljoin(source_url, href)
                
                # Filter for article URLs (basic heuristic)
                if (href.startswith('http') and 
                    any(keyword in href.lower() for keyword in ['article', 'news', 'story', '202']) and
                    href not in links):
                    links.append(href)
                    
                    if len(links) >= max_links:
                        break
            
            return links
            
        except Exception as e:
            logger.error(f"Error getting links from {source_url}: {str(e)}")
            return []
    
    def scrape_latest_news(self, max_articles_per_source=5):
        """Scrape latest news from multiple sources"""
        all_articles = []
        
        for source_name, source_url in self.news_sources.items():
            logger.info(f"Scraping {source_name}...")
            
            # Get article links
            article_links = self.get_article_links(source_url, max_articles_per_source)
            
            # Extract data from each article
            for link in article_links:
                article_data = self.extractor.extract_from_url(link)
                if article_data:
                    article_data['source'] = source_name  # Override with known source
                    all_articles.append(article_data)
                
                time.sleep(1)  # Be respectful with requests
        
        return all_articles
    
    def save_to_json(self, articles, filename='news_articles.json'):
        """Save articles to JSON file"""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=2, ensure_ascii=False)
        logger.info(f"Saved {len(articles)} articles to {filename}")
    
    def save_to_csv(self, articles, filename='news_articles.csv'):
        """Save articles to CSV file"""
        if not articles:
            return
        
        fieldnames = ['title', 'description', 'full_content', 'source', 'date', 'time', 'url', 'extracted_at']
        
        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(articles)
        
        logger.info(f"Saved {len(articles)} articles to {filename}")

# Example usage
if __name__ == "__main__":
    # Initialize the aggregator
    aggregator = NewsAggregator()
    
    # Method 1: Scrape from multiple sources
    print("Scraping latest news from multiple sources...")
    articles = aggregator.scrape_latest_news(max_articles_per_source=3)
    
    # Method 2: Extract from specific URLs
    specific_urls = [
        "https://www.bbc.com/news",
        "https://www.cnn.com/2024/01/15/example-article"  # Replace with actual URLs
    ]
    
    print("\nExtracting from specific URLs...")
    for url in specific_urls:
        article = aggregator.extractor.extract_from_url(url)
        if article:
            articles.append(article)
    
    # Display results
    print(f"\nExtracted {len(articles)} articles:")
    for i, article in enumerate(articles[:5], 1):  # Show first 5
        print(f"\n{i}. {article['title']}")
        print(f"   Source: {article['source']}")
        print(f"   Date: {article['date']} | Time: {article['time']}")
        print(f"   URL: {article['url']}")
        print(f"   Description: {article['description'][:100]}...")
        print(f"   Full Content Length: {len(article['full_content'])} characters")
        print(f"   Full Content Preview: {article['full_content'][:200]}...")
    
    # Save to files
    aggregator.save_to_json(articles)
    aggregator.save_to_csv(articles)
    
    print(f"\nTotal articles extracted: {len(articles)}")
    print("Data saved to news_articles.json and news_articles.csv")

2025-07-16 01:41:14,334 - INFO - Scraping BBC...


Scraping latest news from multiple sources...


2025-07-16 01:41:19,386 - INFO - Scraping CNN...
2025-07-16 01:41:26,891 - INFO - Scraping Reuters...
2025-07-16 01:41:27,016 - ERROR - Error getting links from https://www.reuters.com: 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/
2025-07-16 01:41:27,019 - INFO - Scraping AP News...
2025-07-16 01:41:46,438 - INFO - Scraping The Guardian...



Extracting from specific URLs...


2025-07-16 01:41:54,242 - ERROR - Error extracting from https://www.cnn.com/2024/01/15/example-article: 404 Client Error: Not Found for url: https://edition.cnn.com/2024/01/15/example-article
2025-07-16 01:41:54,249 - INFO - Saved 13 articles to news_articles.json
2025-07-16 01:41:54,256 - INFO - Saved 13 articles to news_articles.csv



Extracted 13 articles:

1. NewsNews
   Source: BBC
   Date: 2025-07-16 | Time: 01:41:15
   URL: https://www.bbc.com/news
   Description: The existence of the scheme could not be reported until now because of a court injunction....
   Full Content Length: 7130 characters
   Full Content Preview: Thousands of Afghans were moved to UK in secret scheme after data breach

The existence of the scheme could not be reported until now because of a court injunction.

I'm 'disappointed but not done' wi...

2. NewsNews
   Source: BBC
   Date: 2025-07-16 | Time: 01:41:16
   URL: https://www.bbc.com/news/topics/c2vdnvdg6xxt
   Description: Mahmoud Abdul Rahman's son, Abdullah, was among six children who died at a water distribution point ...
   Full Content Length: 5802 characters
   Full Content Preview: Israel-Gaza war

Gaza father's outrage after Israeli strike kills son 'searching for sip' at water point

Mahmoud Abdul Rahman's son, Abdullah, was among six children who died at a water distribut

## Title Change code

In [3]:
import requests
from bs4 import BeautifulSoup
import json
import csv
from datetime import datetime
import re
import time
from urllib.parse import urljoin, urlparse
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class NewsExtractor:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        
    def extract_from_url(self, url):
        """Extract news article data from a single URL"""
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Extract article data
            article_data = {
                'url': url,
                'title': self._extract_title(soup),
                'description': self._extract_description(soup),
                'full_content': self._extract_full_content(soup),
                'source': self._extract_source(soup, url),
                'date': self._extract_date(soup),
                'time': self._extract_time(soup),
                'extracted_at': datetime.now().isoformat()
            }
            
            return article_data
            
        except Exception as e:
            logger.error(f"Error extracting from {url}: {str(e)}")
            return None
    
    def _extract_title(self, soup):
        """Extract article title using multiple selectors with better filtering"""
        
        # Priority 1: Meta tags (usually most reliable for article titles)
        meta_selectors = [
            ('meta[property="og:title"]', 'property', 'og:title'),
            ('meta[name="twitter:title"]', 'name', 'twitter:title'),
            ('meta[property="article:title"]', 'property', 'article:title'),
            ('meta[name="title"]', 'name', 'title')
        ]
        
        for selector, attr_type, attr_value in meta_selectors:
            element = soup.find('meta', attrs={attr_type: attr_value})
            if element:
                title = element.get('content', '').strip()
                if title and self._is_valid_title(title):
                    return title
        
        # Priority 2: Article-specific title selectors
        article_title_selectors = [
            'article h1',
            '.article-title',
            '.entry-title',
            '.post-title',
            '.story-title',
            '.headline',
            '.article-headline',
            '.story-headline',
            '[class*="article-title"]',
            '[class*="story-title"]',
            '[class*="post-title"]',
            '[class*="headline"]',
            '[id*="title"]',
            '[id*="headline"]'
        ]
        
        for selector in article_title_selectors:
            element = soup.select_one(selector)
            if element:
                title = element.get_text().strip()
                if title and self._is_valid_title(title):
                    return title
        
        # Priority 3: Main H1 tag (but filter out site titles)
        h1_elements = soup.find_all('h1')
        for h1 in h1_elements:
            title = h1.get_text().strip()
            if title and self._is_valid_title(title):
                # Additional check: ensure it's not in header/nav/footer
                parent_classes = ' '.join(h1.parent.get('class', []))
                if not any(skip_class in parent_classes.lower() for skip_class in 
                          ['header', 'nav', 'footer', 'menu', 'sidebar']):
                    return title
        
        # Priority 4: Any element with title-like classes
        general_title_selectors = [
            '.title',
            '[class*="title"]'
        ]
        
        for selector in general_title_selectors:
            elements = soup.select(selector)
            for element in elements:
                title = element.get_text().strip()
                if title and self._is_valid_title(title):
                    # Check if it's likely an article title (not navigation, etc.)
                    parent_classes = ' '.join(element.parent.get('class', []))
                    if not any(skip_class in parent_classes.lower() for skip_class in 
                              ['nav', 'footer', 'sidebar', 'menu', 'header']):
                        return title
        
        # Last resort: HTML title tag (but clean it up)
        title_element = soup.find('title')
        if title_element:
            title = title_element.get_text().strip()
            # Clean up site name from title
            title = self._clean_title(title)
            if title and self._is_valid_title(title):
                return title
        
        return "Title not found"
    
    def _is_valid_title(self, title):
        """Check if extracted title is valid (not generic site content)"""
        if not title or len(title) < 10:
            return False
        
        # Filter out common non-title content
        invalid_patterns = [
            'home',
            'news',
            'breaking news',
            'latest news',
            'top stories',
            'headlines',
            'menu',
            'navigation',
            'search',
            'subscribe',
            'login',
            'sign in',
            'contact',
            'about',
            'privacy',
            'terms',
            'cookie'
        ]
        
        title_lower = title.lower()
        
        # Check if title is just a generic term
        if title_lower in invalid_patterns:
            return False
        
        # Check for repetitive patterns like "newsnews" or "bbcbbc"
        if len(set(title_lower.split())) < len(title_lower.split()) * 0.7:
            return False
        
        # Check for overly repetitive characters
        if any(char * 3 in title_lower for char in 'abcdefghijklmnopqrstuvwxyz'):
            return False
        
        return True
    
    def _clean_title(self, title):
        """Clean title by removing site name and other clutter"""
        if not title:
            return title
        
        # Common separators used in HTML titles
        separators = [' - ', ' | ', ' :: ', ' › ', ' > ', ' • ', ' · ']
        
        # Try to split by separators and take the first (usually article title)
        for separator in separators:
            if separator in title:
                parts = title.split(separator)
                # Take the longest part (likely the article title)
                longest_part = max(parts, key=len).strip()
                if len(longest_part) > 10:
                    return longest_part
        
        # Remove common site suffixes
        site_suffixes = [
            'CNN',
            'BBC News',
            'Reuters',
            'AP News',
            'The Guardian',
            'News',
            'Breaking News',
            'Latest News'
        ]
        
        cleaned_title = title
        for suffix in site_suffixes:
            if cleaned_title.endswith(suffix):
                cleaned_title = cleaned_title[:-len(suffix)].strip()
                break
        
        return cleaned_title
    
    def _extract_description(self, soup):
        """Extract article description/summary"""
        selectors = [
            'meta[name="description"]',
            'meta[property="og:description"]',
            'meta[name="twitter:description"]',
            '.summary',
            '.description',
            '.excerpt',
            '.lead',
            'p'
        ]
        
        for selector in selectors:
            if selector.startswith('meta'):
                element = soup.find('meta', attrs={'name': selector.split('[')[1].split('=')[1].strip('"')}) or \
                         soup.find('meta', attrs={'property': selector.split('[')[1].split('=')[1].strip('"')})
                if element:
                    return element.get('content', '').strip()
            else:
                element = soup.select_one(selector)
                if element:
                    text = element.get_text().strip()
                    if len(text) > 50:  # Ensure it's substantial content
                        return text[:500] + "..." if len(text) > 500 else text
        
        return "Description not found"
    
    def _extract_full_content(self, soup):
        """Extract the full article content"""
        # Common selectors for article content
        content_selectors = [
            'article',
            '.article-content',
            '.story-content',
            '.post-content',
            '.entry-content',
            '.content',
            '.article-body',
            '.story-body',
            '.post-body',
            '.main-content',
            '[class*="article-content"]',
            '[class*="story-content"]',
            '[class*="post-content"]',
            '[id*="article-content"]',
            '[id*="story-content"]'
        ]
        
        # Try to find the main content container
        content_container = None
        for selector in content_selectors:
            content_container = soup.select_one(selector)
            if content_container:
                break
        
        if content_container:
            # Clean up the content
            content_text = self._clean_article_content(content_container)
            return content_text
        
        # Fallback: try to extract from multiple paragraph tags
        paragraphs = soup.find_all('p')
        if paragraphs:
            # Filter out navigation, footer, and other non-content paragraphs
            content_paragraphs = []
            for p in paragraphs:
                text = p.get_text().strip()
                parent_classes = ' '.join(p.parent.get('class', []))
                
                # Skip if paragraph is in navigation, footer, or sidebar
                if any(skip_class in parent_classes.lower() for skip_class in 
                       ['nav', 'footer', 'sidebar', 'menu', 'header', 'ad', 'comment']):
                    continue
                
                # Skip very short paragraphs (likely not main content)
                if len(text) > 20:
                    content_paragraphs.append(text)
            
            if content_paragraphs:
                return '\n\n'.join(content_paragraphs)
        
        return "Full content not found"
    
    def _clean_article_content(self, content_container):
        """Clean and format the extracted article content"""
        # Remove unwanted elements
        unwanted_tags = ['script', 'style', 'nav', 'footer', 'aside', 'header', 'form']
        for tag in unwanted_tags:
            for element in content_container.find_all(tag):
                element.decompose()
        
        # Remove elements with common non-content classes
        unwanted_classes = ['ad', 'advertisement', 'social', 'share', 'comment', 'related', 'sidebar', 'navigation']
        for class_name in unwanted_classes:
            for element in content_container.find_all(class_=lambda x: x and any(unwanted in ' '.join(x).lower() for unwanted in unwanted_classes)):
                element.decompose()
        
        # Extract text from paragraphs, maintaining structure
        content_parts = []
        for element in content_container.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            text = element.get_text().strip()
            if text and len(text) > 10:  # Filter out very short text
                # Add extra spacing for headings
                if element.name.startswith('h'):
                    content_parts.append(f"\n{text}\n")
                else:
                    content_parts.append(text)
        
        # Join content and clean up
        full_content = '\n\n'.join(content_parts)
        
        # Clean up extra whitespace
        full_content = re.sub(r'\n\s*\n', '\n\n', full_content)
        full_content = re.sub(r'[ \t]+', ' ', full_content)
        
        return full_content.strip()
    
    def _extract_source(self, soup, url):
        """Extract news source/publication name"""
        # Try to get from meta tags first
        meta_selectors = [
            'meta[property="og:site_name"]',
            'meta[name="application-name"]',
            'meta[name="author"]'
        ]
        
        for selector in meta_selectors:
            element = soup.find('meta', attrs={'property': selector.split('[')[1].split('=')[1].strip('"')}) or \
                     soup.find('meta', attrs={'name': selector.split('[')[1].split('=')[1].strip('"')})
            if element:
                return element.get('content', '').strip()
        
        # Try to get from common selectors
        selectors = [
            '.source',
            '.publication',
            '.site-name',
            '.logo',
            'header .brand'
        ]
        
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text().strip()
        
        # Fallback to domain name
        domain = urlparse(url).netloc
        return domain.replace('www.', '')
    
    def _extract_date(self, soup):
        """Extract publication date"""
        # Try meta tags first
        meta_selectors = [
            'meta[property="article:published_time"]',
            'meta[name="publish_date"]',
            'meta[name="date"]'
        ]
        
        for selector in meta_selectors:
            element = soup.find('meta', attrs={'property': selector.split('[')[1].split('=')[1].strip('"')}) or \
                     soup.find('meta', attrs={'name': selector.split('[')[1].split('=')[1].strip('"')})
            if element:
                return self._parse_datetime(element.get('content', ''))
        
        # Try time elements
        time_element = soup.find('time')
        if time_element:
            datetime_attr = time_element.get('datetime')
            if datetime_attr:
                return self._parse_datetime(datetime_attr)
        
        # Try common date selectors
        selectors = [
            '.date',
            '.published',
            '.timestamp',
            '[class*="date"]',
            '[class*="time"]'
        ]
        
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                return self._parse_datetime(element.get_text())
        
        return datetime.now().strftime('%Y-%m-%d')
    
    def _extract_time(self, soup):
        """Extract publication time"""
        # Similar to date extraction but focusing on time
        time_element = soup.find('time')
        if time_element:
            datetime_attr = time_element.get('datetime')
            if datetime_attr:
                parsed_time = self._parse_datetime(datetime_attr, return_time=True)
                if parsed_time:
                    return parsed_time
        
        # Try to extract from text content
        selectors = [
            '.time',
            '.timestamp',
            '[class*="time"]'
        ]
        
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                time_text = element.get_text()
                parsed_time = self._parse_datetime(time_text, return_time=True)
                if parsed_time:
                    return parsed_time
        
        return datetime.now().strftime('%H:%M:%S')
    
    def _parse_datetime(self, datetime_str, return_time=False):
        """Parse datetime string to extract date or time"""
        if not datetime_str:
            return None
        
        # Common datetime formats
        formats = [
            '%Y-%m-%dT%H:%M:%S',
            '%Y-%m-%dT%H:%M:%SZ',
            '%Y-%m-%d %H:%M:%S',
            '%Y-%m-%d',
            '%d/%m/%Y',
            '%m/%d/%Y',
            '%B %d, %Y',
            '%d %B %Y'
        ]
        
        datetime_str = datetime_str.strip()
        
        for fmt in formats:
            try:
                parsed_dt = datetime.strptime(datetime_str, fmt)
                if return_time:
                    return parsed_dt.strftime('%H:%M:%S')
                else:
                    return parsed_dt.strftime('%Y-%m-%d')
            except ValueError:
                continue
        
        # Try to extract date with regex
        date_pattern = r'(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})|(\d{4}[/-]\d{1,2}[/-]\d{1,2})'
        match = re.search(date_pattern, datetime_str)
        if match:
            date_str = match.group()
            try:
                if '/' in date_str:
                    parsed_dt = datetime.strptime(date_str, '%m/%d/%Y' if date_str.split('/')[2] > '31' else '%d/%m/%Y')
                else:
                    parsed_dt = datetime.strptime(date_str, '%Y-%m-%d')
                
                if return_time:
                    return parsed_dt.strftime('%H:%M:%S')
                else:
                    return parsed_dt.strftime('%Y-%m-%d')
            except ValueError:
                pass
        
        return None

class NewsAggregator:
    def __init__(self):
        self.extractor = NewsExtractor()
        self.news_sources = {
            'BBC': 'https://www.bbc.com/news',
            'CNN': 'https://www.cnn.com',
            'Reuters': 'https://www.reuters.com',
            'AP News': 'https://apnews.com',
            'The Guardian': 'https://www.theguardian.com'
        }
    
    def get_article_links(self, source_url, max_links=10):
        """Get article links from a news source homepage"""
        try:
            response = self.extractor.session.get(source_url, timeout=10)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find all article links
            links = []
            for link in soup.find_all('a', href=True):
                href = link['href']
                
                # Convert relative URLs to absolute
                if href.startswith('/'):
                    href = urljoin(source_url, href)
                
                # Filter for article URLs (basic heuristic)
                if (href.startswith('http') and 
                    any(keyword in href.lower() for keyword in ['article', 'news', 'story', '202']) and
                    href not in links):
                    links.append(href)
                    
                    if len(links) >= max_links:
                        break
            
            return links
            
        except Exception as e:
            logger.error(f"Error getting links from {source_url}: {str(e)}")
            return []
    
    def scrape_latest_news(self, max_articles_per_source=5):
        """Scrape latest news from multiple sources"""
        all_articles = []
        
        for source_name, source_url in self.news_sources.items():
            logger.info(f"Scraping {source_name}...")
            
            # Get article links
            article_links = self.get_article_links(source_url, max_articles_per_source)
            
            # Extract data from each article
            for link in article_links:
                article_data = self.extractor.extract_from_url(link)
                if article_data:
                    article_data['source'] = source_name  # Override with known source
                    all_articles.append(article_data)
                
                time.sleep(1)  # Be respectful with requests
        
        return all_articles
    
    def save_to_json(self, articles, filename='news_articles.json'):
        """Save articles to JSON file"""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=2, ensure_ascii=False)
        logger.info(f"Saved {len(articles)} articles to {filename}")
    
    def save_to_csv(self, articles, filename='news_articles.csv'):
        """Save articles to CSV file"""
        if not articles:
            return
        
        fieldnames = ['title', 'description', 'full_content', 'source', 'date', 'time', 'url', 'extracted_at']
        
        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(articles)
        
        logger.info(f"Saved {len(articles)} articles to {filename}")

# Example usage
if __name__ == "__main__":
    # Initialize the aggregator
    aggregator = NewsAggregator()
    
    # Method 1: Scrape from multiple sources
    print("Scraping latest news from multiple sources...")
    articles = aggregator.scrape_latest_news(max_articles_per_source=3)
    
    # Method 2: Extract from specific URLs
    specific_urls = [
        "https://www.bbc.com/news"
        # "https://www.cnn.com/2024/01/15/example-article"  # Replace with actual URLs
    ]
    
    print("\nExtracting from specific URLs...")
    for url in specific_urls:
        article = aggregator.extractor.extract_from_url(url)
        if article:
            articles.append(article)
    
    # Display results
    print(f"\nExtracted {len(articles)} articles:")
    for i, article in enumerate(articles[:5], 1):  # Show first 5
        print(f"\n{i}. {article['title']}")
        print(f"   Source: {article['source']}")
        print(f"   Date: {article['date']} | Time: {article['time']}")
        print(f"   URL: {article['url']}")
        print(f"   Description: {article['description'][:100]}...")
        print(f"   Full Content Length: {len(article['full_content'])} characters")
        print(f"   Full Content Preview: {article['full_content'][:200]}...")
    
    # Save to files
    aggregator.save_to_json(articles)
    aggregator.save_to_csv(articles)
    
    print(f"\nTotal articles extracted: {len(articles)}")
    print("Data saved to news_articles.json and news_articles.csv")

2025-07-16 01:45:56,823 - INFO - Scraping BBC...


Scraping latest news from multiple sources...


2025-07-16 01:46:03,714 - INFO - Scraping CNN...
2025-07-16 01:46:11,141 - INFO - Scraping Reuters...
2025-07-16 01:46:11,283 - ERROR - Error getting links from https://www.reuters.com: 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/
2025-07-16 01:46:11,284 - INFO - Scraping AP News...
2025-07-16 01:46:25,308 - INFO - Scraping The Guardian...



Extracting from specific URLs...


2025-07-16 01:46:31,559 - INFO - Saved 13 articles to news_articles.json
2025-07-16 01:46:31,567 - INFO - Saved 13 articles to news_articles.csv



Extracted 13 articles:

1. BBC News - Breaking news, video and the latest top stories from the U.S. and around the world
   Source: BBC
   Date: 2025-07-16 | Time: 01:45:58
   URL: https://www.bbc.com/news
   Description: The existence of the scheme could not be reported until now because of a court injunction....
   Full Content Length: 7130 characters
   Full Content Preview: Thousands of Afghans were moved to UK in secret scheme after data breach

The existence of the scheme could not be reported until now because of a court injunction.

I'm 'disappointed but not done' wi...

2. Israel Gaza war | Latest News & Updates | BBC News
   Source: BBC
   Date: 2025-07-16 | Time: 01:46:01
   URL: https://www.bbc.com/news/topics/c2vdnvdg6xxt
   Description: Mahmoud Abdul Rahman's son, Abdullah, was among six children who died at a water distribution point ...
   Full Content Length: 5802 characters
   Full Content Preview: Israel-Gaza war

Gaza father's outrage after Israeli strike kills son

In [4]:
import requests
from bs4 import BeautifulSoup
import json
import csv
from datetime import datetime
import re
import time
from urllib.parse import urljoin, urlparse
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class NewsExtractor:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        
    def extract_from_url(self, url):
        """Extract news article data from a single URL"""
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Extract article data
            article_data = {
                'url': url,
                'title': self._extract_title(soup),
                'description': self._extract_description(soup),
                'full_content': self._extract_full_content(soup),
                'source': self._extract_source(soup, url),
                'date': self._extract_date(soup),
                'time': self._extract_time(soup),
                'extracted_at': datetime.now().isoformat()
            }
            
            return article_data
            
        except Exception as e:
            logger.error(f"Error extracting from {url}: {str(e)}")
            return None
    
    def _extract_title(self, soup):
        """Extract article title using multiple selectors with better filtering"""
        
        # Priority 1: Meta tags (usually most reliable for article titles)
        meta_selectors = [
            ('meta[property="og:title"]', 'property', 'og:title'),
            ('meta[name="twitter:title"]', 'name', 'twitter:title'),
            ('meta[property="article:title"]', 'property', 'article:title'),
            ('meta[name="title"]', 'name', 'title')
        ]
        
        for selector, attr_type, attr_value in meta_selectors:
            element = soup.find('meta', attrs={attr_type: attr_value})
            if element:
                title = element.get('content', '').strip()
                if title and self._is_valid_title(title):
                    return title
        
        # Priority 2: Article-specific title selectors
        article_title_selectors = [
            'article h1',
            '.article-title',
            '.entry-title',
            '.post-title',
            '.story-title',
            '.headline',
            '.article-headline',
            '.story-headline',
            '[class*="article-title"]',
            '[class*="story-title"]',
            '[class*="post-title"]',
            '[class*="headline"]',
            '[id*="title"]',
            '[id*="headline"]'
        ]
        
        for selector in article_title_selectors:
            element = soup.select_one(selector)
            if element:
                title = element.get_text().strip()
                if title and self._is_valid_title(title):
                    return title
        
        # Priority 3: Main H1 tag (but filter out site titles)
        h1_elements = soup.find_all('h1')
        for h1 in h1_elements:
            title = h1.get_text().strip()
            if title and self._is_valid_title(title):
                # Additional check: ensure it's not in header/nav/footer
                parent_classes = ' '.join(h1.parent.get('class', []))
                if not any(skip_class in parent_classes.lower() for skip_class in 
                          ['header', 'nav', 'footer', 'menu', 'sidebar']):
                    return title
        
        # Priority 4: Any element with title-like classes
        general_title_selectors = [
            '.title',
            '[class*="title"]'
        ]
        
        for selector in general_title_selectors:
            elements = soup.select(selector)
            for element in elements:
                title = element.get_text().strip()
                if title and self._is_valid_title(title):
                    # Check if it's likely an article title (not navigation, etc.)
                    parent_classes = ' '.join(element.parent.get('class', []))
                    if not any(skip_class in parent_classes.lower() for skip_class in 
                              ['nav', 'footer', 'sidebar', 'menu', 'header']):
                        return title
        
        # Last resort: HTML title tag (but clean it up)
        title_element = soup.find('title')
        if title_element:
            title = title_element.get_text().strip()
            # Clean up site name from title
            title = self._clean_title(title)
            if title and self._is_valid_title(title):
                return title
        
        return "Title not found"
    
    def _is_valid_title(self, title):
        """Check if extracted title is valid (not generic site content)"""
        if not title or len(title) < 10:
            return False
        
        # Filter out common non-title content
        invalid_patterns = [
            'home',
            'news',
            'breaking news',
            'latest news',
            'top stories',
            'headlines',
            'menu',
            'navigation',
            'search',
            'subscribe',
            'login',
            'sign in',
            'contact',
            'about',
            'privacy',
            'terms',
            'cookie'
        ]
        
        title_lower = title.lower()
        
        # Check if title is just a generic term
        if title_lower in invalid_patterns:
            return False
        
        # Check for repetitive patterns like "newsnews" or "bbcbbc"
        if len(set(title_lower.split())) < len(title_lower.split()) * 0.7:
            return False
        
        # Check for overly repetitive characters
        if any(char * 3 in title_lower for char in 'abcdefghijklmnopqrstuvwxyz'):
            return False
        
        return True
    
    def _clean_title(self, title):
        """Clean title by removing site name and other clutter"""
        if not title:
            return title
        
        # Common separators used in HTML titles
        separators = [' - ', ' | ', ' :: ', ' › ', ' > ', ' • ', ' · ']
        
        # Try to split by separators and take the first (usually article title)
        for separator in separators:
            if separator in title:
                parts = title.split(separator)
                # Take the longest part (likely the article title)
                longest_part = max(parts, key=len).strip()
                if len(longest_part) > 10:
                    return longest_part
        
        # Remove common site suffixes
        site_suffixes = [
            'CNN',
            'BBC News',
            'Reuters',
            'AP News',
            'The Guardian',
            'News',
            'Breaking News',
            'Latest News'
        ]
        
        cleaned_title = title
        for suffix in site_suffixes:
            if cleaned_title.endswith(suffix):
                cleaned_title = cleaned_title[:-len(suffix)].strip()
                break
        
        return cleaned_title
    
    def _extract_description(self, soup):
        """Extract article description/summary"""
        selectors = [
            'meta[name="description"]',
            'meta[property="og:description"]',
            'meta[name="twitter:description"]',
            '.summary',
            '.description',
            '.excerpt',
            '.lead',
            'p'
        ]
        
        for selector in selectors:
            if selector.startswith('meta'):
                element = soup.find('meta', attrs={'name': selector.split('[')[1].split('=')[1].strip('"')}) or \
                         soup.find('meta', attrs={'property': selector.split('[')[1].split('=')[1].strip('"')})
                if element:
                    return element.get('content', '').strip()
            else:
                element = soup.select_one(selector)
                if element:
                    text = element.get_text().strip()
                    if len(text) > 50:  # Ensure it's substantial content
                        return text[:500] + "..." if len(text) > 500 else text
        
        return "Description not found"
    
    def _extract_full_content(self, soup):
        """Extract the full article content"""
        # Common selectors for article content
        content_selectors = [
            'article',
            '.article-content',
            '.story-content',
            '.post-content',
            '.entry-content',
            '.content',
            '.article-body',
            '.story-body',
            '.post-body',
            '.main-content',
            '[class*="article-content"]',
            '[class*="story-content"]',
            '[class*="post-content"]',
            '[id*="article-content"]',
            '[id*="story-content"]'
        ]
        
        # Try to find the main content container
        content_container = None
        for selector in content_selectors:
            content_container = soup.select_one(selector)
            if content_container:
                break
        
        if content_container:
            # Clean up the content
            content_text = self._clean_article_content(content_container)
            return content_text
        
        # Fallback: try to extract from multiple paragraph tags
        paragraphs = soup.find_all('p')
        if paragraphs:
            # Filter out navigation, footer, and other non-content paragraphs
            content_paragraphs = []
            for p in paragraphs:
                text = p.get_text().strip()
                parent_classes = ' '.join(p.parent.get('class', []))
                
                # Skip if paragraph is in navigation, footer, or sidebar
                if any(skip_class in parent_classes.lower() for skip_class in 
                       ['nav', 'footer', 'sidebar', 'menu', 'header', 'ad', 'comment']):
                    continue
                
                # Skip very short paragraphs (likely not main content)
                if len(text) > 20:
                    content_paragraphs.append(text)
            
            if content_paragraphs:
                return '\n\n'.join(content_paragraphs)
        
        return "Full content not found"
    
    def _clean_article_content(self, content_container):
        """Clean and format the extracted article content"""
        # Remove unwanted elements
        unwanted_tags = ['script', 'style', 'nav', 'footer', 'aside', 'header', 'form']
        for tag in unwanted_tags:
            for element in content_container.find_all(tag):
                element.decompose()
        
        # Remove elements with common non-content classes
        unwanted_classes = ['ad', 'advertisement', 'social', 'share', 'comment', 'related', 'sidebar', 'navigation']
        for class_name in unwanted_classes:
            for element in content_container.find_all(class_=lambda x: x and any(unwanted in ' '.join(x).lower() for unwanted in unwanted_classes)):
                element.decompose()
        
        # Extract text from paragraphs, maintaining structure
        content_parts = []
        for element in content_container.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            text = element.get_text().strip()
            if text and len(text) > 10:  # Filter out very short text
                # Add extra spacing for headings
                if element.name.startswith('h'):
                    content_parts.append(f"\n{text}\n")
                else:
                    content_parts.append(text)
        
        # Join content and clean up
        full_content = '\n\n'.join(content_parts)
        
        # Clean up extra whitespace
        full_content = re.sub(r'\n\s*\n', '\n\n', full_content)
        full_content = re.sub(r'[ \t]+', ' ', full_content)
        
        return full_content.strip()
    
    def _extract_source(self, soup, url):
        """Extract news source/publication name"""
        # Try to get from meta tags first
        meta_selectors = [
            'meta[property="og:site_name"]',
            'meta[name="application-name"]',
            'meta[name="author"]'
        ]
        
        for selector in meta_selectors:
            element = soup.find('meta', attrs={'property': selector.split('[')[1].split('=')[1].strip('"')}) or \
                     soup.find('meta', attrs={'name': selector.split('[')[1].split('=')[1].strip('"')})
            if element:
                return element.get('content', '').strip()
        
        # Try to get from common selectors
        selectors = [
            '.source',
            '.publication',
            '.site-name',
            '.logo',
            'header .brand'
        ]
        
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text().strip()
        
        # Fallback to domain name
        domain = urlparse(url).netloc
        return domain.replace('www.', '')
    
    def _extract_date(self, soup):
        """Extract publication date"""
        # Try meta tags first
        meta_selectors = [
            'meta[property="article:published_time"]',
            'meta[name="publish_date"]',
            'meta[name="date"]'
        ]
        
        for selector in meta_selectors:
            element = soup.find('meta', attrs={'property': selector.split('[')[1].split('=')[1].strip('"')}) or \
                     soup.find('meta', attrs={'name': selector.split('[')[1].split('=')[1].strip('"')})
            if element:
                return self._parse_datetime(element.get('content', ''))
        
        # Try time elements
        time_element = soup.find('time')
        if time_element:
            datetime_attr = time_element.get('datetime')
            if datetime_attr:
                return self._parse_datetime(datetime_attr)
        
        # Try common date selectors
        selectors = [
            '.date',
            '.published',
            '.timestamp',
            '[class*="date"]',
            '[class*="time"]'
        ]
        
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                return self._parse_datetime(element.get_text())
        
        return datetime.now().strftime('%Y-%m-%d')
    
    def _extract_time(self, soup):
        """Extract publication time"""
        # Similar to date extraction but focusing on time
        time_element = soup.find('time')
        if time_element:
            datetime_attr = time_element.get('datetime')
            if datetime_attr:
                parsed_time = self._parse_datetime(datetime_attr, return_time=True)
                if parsed_time:
                    return parsed_time
        
        # Try to extract from text content
        selectors = [
            '.time',
            '.timestamp',
            '[class*="time"]'
        ]
        
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                time_text = element.get_text()
                parsed_time = self._parse_datetime(time_text, return_time=True)
                if parsed_time:
                    return parsed_time
        
        return datetime.now().strftime('%H:%M:%S')
    
    def _parse_datetime(self, datetime_str, return_time=False):
        """Parse datetime string to extract date or time"""
        if not datetime_str:
            return None
        
        # Common datetime formats
        formats = [
            '%Y-%m-%dT%H:%M:%S',
            '%Y-%m-%dT%H:%M:%SZ',
            '%Y-%m-%d %H:%M:%S',
            '%Y-%m-%d',
            '%d/%m/%Y',
            '%m/%d/%Y',
            '%B %d, %Y',
            '%d %B %Y'
        ]
        
        datetime_str = datetime_str.strip()
        
        for fmt in formats:
            try:
                parsed_dt = datetime.strptime(datetime_str, fmt)
                if return_time:
                    return parsed_dt.strftime('%H:%M:%S')
                else:
                    return parsed_dt.strftime('%Y-%m-%d')
            except ValueError:
                continue
        
        # Try to extract date with regex
        date_pattern = r'(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})|(\d{4}[/-]\d{1,2}[/-]\d{1,2})'
        match = re.search(date_pattern, datetime_str)
        if match:
            date_str = match.group()
            try:
                if '/' in date_str:
                    parsed_dt = datetime.strptime(date_str, '%m/%d/%Y' if date_str.split('/')[2] > '31' else '%d/%m/%Y')
                else:
                    parsed_dt = datetime.strptime(date_str, '%Y-%m-%d')
                
                if return_time:
                    return parsed_dt.strftime('%H:%M:%S')
                else:
                    return parsed_dt.strftime('%Y-%m-%d')
            except ValueError:
                pass
        
        return None

class NewsAggregator:
    def __init__(self):
        self.extractor = NewsExtractor()
        self.news_sources = {
            'BBC': 'https://www.bbc.com/news',
            'CNN': 'https://www.cnn.com',
            'Reuters': 'https://www.reuters.com',
            'AP News': 'https://apnews.com',
            'The Guardian': 'https://www.theguardian.com',
            'Fox News': 'https://www.foxnews.com',
            'NBC News': 'https://www.nbcnews.com',
            'CBS News': 'https://www.cbsnews.com',
            'ABC News': 'https://abcnews.go.com',
            'USA Today': 'https://www.usatoday.com',
            'The Indian Express': 'https://indianexpress.com',
            'Times of India': 'https://timesofindia.indiatimes.com'
        }
    
    def get_article_links(self, source_url, max_links=10):
        """Get article links from a news source homepage"""
        try:
            response = self.extractor.session.get(source_url, timeout=10)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find all article links
            links = []
            for link in soup.find_all('a', href=True):
                href = link['href']
                
                # Convert relative URLs to absolute
                if href.startswith('/'):
                    href = urljoin(source_url, href)
                
                # Filter for article URLs (basic heuristic)
                if (href.startswith('http') and 
                    any(keyword in href.lower() for keyword in ['article', 'news', 'story', '202']) and
                    href not in links):
                    links.append(href)
                    
                    if len(links) >= max_links:
                        break
            
            return links
            
        except Exception as e:
            logger.error(f"Error getting links from {source_url}: {str(e)}")
            return []
    
    def scrape_latest_news(self, max_articles_per_source=5):
        """Scrape latest news from multiple sources"""
        all_articles = []
        
        for source_name, source_url in self.news_sources.items():
            logger.info(f"Scraping {source_name}...")
            
            # Get article links
            article_links = self.get_article_links(source_url, max_articles_per_source)
            
            # Extract data from each article
            for link in article_links:
                article_data = self.extractor.extract_from_url(link)
                if article_data:
                    article_data['source'] = source_name  # Override with known source
                    all_articles.append(article_data)
                
                time.sleep(1)  # Be respectful with requests
        
        return all_articles
    
    def save_to_json(self, articles, filename='news_articles.json'):
        """Save articles to JSON file"""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=2, ensure_ascii=False)
        logger.info(f"Saved {len(articles)} articles to {filename}")
    
    def save_to_csv(self, articles, filename='news_articles.csv'):
        """Save articles to CSV file"""
        if not articles:
            return
        
        fieldnames = ['title', 'description', 'full_content', 'source', 'date', 'time', 'url', 'extracted_at']
        
        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(articles)
        
        logger.info(f"Saved {len(articles)} articles to {filename}")

# Example usage
if __name__ == "__main__":
    # Initialize the aggregator
    aggregator = NewsAggregator()
    
    # Method 1: Scrape from multiple sources
    print("Scraping latest news from multiple sources...")
    articles = aggregator.scrape_latest_news(max_articles_per_source=3)
    
    # Method 2: Extract from specific URLs
    specific_urls = [
        "https://www.bbc.com/news",
        "https://www.cnn.com/2024/01/15/example-article"  # Replace with actual URLs
    ]
    
    print("\nExtracting from specific URLs...")
    for url in specific_urls:
        article = aggregator.extractor.extract_from_url(url)
        if article:
            articles.append(article)
    
    # Display results
    print(f"\nExtracted {len(articles)} articles:")
    for i, article in enumerate(articles[:5], 1):  # Show first 5
        print(f"\n{i}. {article['title']}")
        print(f"   Source: {article['source']}")
        print(f"   Date: {article['date']} | Time: {article['time']}")
        print(f"   URL: {article['url']}")
        print(f"   Description: {article['description'][:100]}...")
        print(f"   Full Content Length: {len(article['full_content'])} characters")
        print(f"   Full Content Preview: {article['full_content'][:200]}...")
    
    # Save to files
    aggregator.save_to_json(articles)
    aggregator.save_to_csv(articles)
    
    print(f"\nTotal articles extracted: {len(articles)}")
    print("Data saved to news_articles.json and news_articles.csv")

2025-07-16 02:03:10,165 - INFO - Scraping BBC...


Scraping latest news from multiple sources...


2025-07-16 02:03:16,839 - INFO - Scraping CNN...
2025-07-16 02:03:29,884 - INFO - Scraping Reuters...
2025-07-16 02:03:30,045 - ERROR - Error getting links from https://www.reuters.com: 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/
2025-07-16 02:03:30,047 - INFO - Scraping AP News...
2025-07-16 02:03:50,470 - INFO - Scraping The Guardian...
2025-07-16 02:03:57,197 - INFO - Scraping Fox News...
2025-07-16 02:04:06,711 - INFO - Scraping NBC News...
2025-07-16 02:04:15,820 - INFO - Scraping CBS News...
2025-07-16 02:04:23,258 - INFO - Scraping ABC News...
2025-07-16 02:04:32,035 - INFO - Scraping USA Today...
2025-07-16 02:04:38,075 - INFO - Scraping The Indian Express...
2025-07-16 02:04:43,655 - INFO - Scraping Times of India...



Extracting from specific URLs...


2025-07-16 02:04:51,792 - ERROR - Error extracting from https://www.cnn.com/2024/01/15/example-article: 404 Client Error: Not Found for url: https://edition.cnn.com/2024/01/15/example-article
2025-07-16 02:04:51,799 - INFO - Saved 34 articles to news_articles.json
2025-07-16 02:04:51,805 - INFO - Saved 34 articles to news_articles.csv



Extracted 34 articles:

1. BBC News - Breaking news, video and the latest top stories from the U.S. and around the world
   Source: BBC
   Date: 2025-07-16 | Time: 02:03:11
   URL: https://www.bbc.com/news
   Description: The existence of the scheme could not be reported until now because of a court injunction....
   Full Content Length: 7130 characters
   Full Content Preview: Thousands of Afghans were moved to UK in secret scheme after data breach

The existence of the scheme could not be reported until now because of a court injunction.

I'm 'disappointed but not done' wi...

2. Israel Gaza war | Latest News & Updates | BBC News
   Source: BBC
   Date: 2025-07-16 | Time: 02:03:13
   URL: https://www.bbc.com/news/topics/c2vdnvdg6xxt
   Description: Mahmoud Abdul Rahman's son, Abdullah, was among six children who died at a water distribution point ...
   Full Content Length: 5802 characters
   Full Content Preview: Israel-Gaza war

Gaza father's outrage after Israeli strike kills son

In [3]:
import os
from scrapegraph_py import Client
client = Client(api_key="sgai-3e5b40b7-ae60-490b-83f2-629a8c2a2891")

print(dir(client))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__enter__', '__eq__', '__exit__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_make_request', 'api_key', 'close', 'crawl', 'from_env', 'get_crawl', 'get_credits', 'get_markdownify', 'get_searchscraper', 'get_smartscraper', 'headers', 'markdownify', 'max_retries', 'retry_delay', 'searchscraper', 'session', 'smartscraper', 'submit_feedback', 'timeout']
