"""
==================================================
INDIA NEWS VERIFICATION SYSTEM - PROCESS FLOW
==================================================

PROCESS STEPS:
1. USER INPUT: Receive news headline from user
2. URL CONSTRUCTION: Convert headline to search-friendly format
3. MULTI-CHANNEL SEARCH: Search across 15 Indian news channels
4. RESULT AGGREGATION: Collect all matching news links
5. ANALYSIS REPORT: Generate summary with channel count and links
6. OUTPUT DISPLAY: Present formatted results in console

DATA FLOW:
Headline → Search Queries → Web Scraping → Result Parsing → Report Generation
"""

# In [None]:
"""
INDIAN NEWS VERIFICATION SYSTEM
A web scraper that checks news headlines across 15 Indian news channels
"""

import os
import random
import time
from urllib.parse import quote, urljoin, urlparse

import requests
from bs4 import BeautifulSoup

class NewsVerifier:
    """
    Look up a headline on a fixed list of Indian news sites.

    For each channel the verifier first asks SerpAPI (or a canned simulation
    when no key is configured) and, if that yields nothing, falls back to the
    site's own search page and scrapes article links from the HTML.

    NOTE: the simulated results are fixed sample URLs that do not depend on
    the headline, so reports produced without a real SerpAPI key are for
    demonstration only.
    """

    # Placeholder value meaning "no real SerpAPI key has been configured".
    PLACEHOLDER_API_KEY = "YOUR_SERPAPI_KEY"
    # URL fragments that mark a link as a news article rather than navigation.
    ARTICLE_PATH_MARKERS = ('/news/', '/article/', '/story/', '/india/')
    # Maximum number of links kept per channel.
    MAX_RESULTS_PER_CHANNEL = 3
    # (min, max) seconds to pause between two channels.
    CHANNEL_DELAY_RANGE = (2, 4)

    def __init__(self, serpapi_key=None):
        """
        Initialize the NewsVerifier class with news channels and configuration

        serpapi_key: optional SerpAPI key. When omitted, the SERPAPI_KEY
        environment variable is used; when that is missing too, the
        placeholder is stored and searches run in simulation mode.
        """
        self.serpapi_key = (
            serpapi_key
            or os.environ.get("SERPAPI_KEY")
            or self.PLACEHOLDER_API_KEY
        )

        # List of 15 major Indian news channels with their details.
        # "search_url" is a format template for the site's own search page,
        # or None when the channel has no direct-search URL configured.
        self.news_channels = [
            {"name": "Times of India", "domain": "timesofindia.indiatimes.com", "search_url": "https://timesofindia.indiatimes.com/search?q={}"},
            {"name": "Indian Express", "domain": "indianexpress.com", "search_url": None},
            {"name": "The Hindu", "domain": "thehindu.com", "search_url": "https://www.thehindu.com/search/?q={}"},
            {"name": "Hindustan Times", "domain": "hindustantimes.com", "search_url": "https://www.hindustantimes.com/search?q={}"},
            {"name": "NDTV", "domain": "ndtv.com", "search_url": "https://www.ndtv.com/search?q={}"},
            {"name": "Republic World", "domain": "republicworld.com", "search_url": None},
            {"name": "India.com", "domain": "india.com", "search_url": None},
            {"name": "News18", "domain": "news18.com", "search_url": "https://www.news18.com/search/?q={}"},
            {"name": "FirstPost", "domain": "firstpost.com", "search_url": "https://www.firstpost.com/search/{}"},
            {"name": "Economic Times", "domain": "economictimes.indiatimes.com", "search_url": "https://economictimes.indiatimes.com/search?q={}"},
            {"name": "Financial Express", "domain": "financialexpress.com", "search_url": "https://www.financialexpress.com/search/{}"},
            {"name": "Deccan Chronicle", "domain": "deccanchronicle.com", "search_url": "https://www.deccanchronicle.com/search/{}"},
            {"name": "Telegraph India", "domain": "telegraphindia.com", "search_url": "https://www.telegraphindia.com/search?q={}"},
            {"name": "Mumbai Mirror", "domain": "mumbaimirror.indiatimes.com", "search_url": None},
            {"name": "Deccan Herald", "domain": "deccanherald.com", "search_url": "https://www.deccanherald.com/search/{}"},
        ]

        # Different user agents to rotate between requests
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        ]

        # Create a session object for connection pooling
        self.session = requests.Session()

    def close(self):
        """Release the pooled HTTP connections held by the session."""
        self.session.close()

    def __enter__(self):
        """Allow ``with NewsVerifier() as verifier:`` usage."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the session when the ``with`` block ends."""
        self.close()

    def get_headers(self):
        """
        Generate headers for an HTTP request
        Returns a dictionary with HTTP headers including a random user agent
        """
        return {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            # 'br' is deliberately not advertised: requests only decodes
            # brotli when an optional brotli package is installed, and an
            # undecoded body cannot be parsed as HTML.
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

    @staticmethod
    def _belongs_to_domain(url, domain):
        """Return True when the host of *url* is *domain* or a subdomain of it."""
        if not isinstance(url, str):
            return False
        try:
            host = (urlparse(url).hostname or '').lower()
        except ValueError:
            # urlparse rejects some malformed URLs (e.g. broken IPv6 hosts).
            return False
        domain = domain.lower()
        return host == domain or host.endswith('.' + domain)

    @classmethod
    def _filter_article_links(cls, hrefs, domain):
        """
        Turn raw href values into at most MAX_RESULTS_PER_CHANNEL article URLs.

        Site-relative ("/path") and protocol-relative ("//host/path") hrefs
        are resolved against https://<domain>/ before filtering. A link is
        kept when its host is *domain* (or a subdomain) and the URL contains
        one of ARTICLE_PATH_MARKERS. Duplicates are dropped and the order of
        first appearance is preserved.
        """
        base_url = "https://" + domain + "/"
        unique = {}
        for href in hrefs:
            href = href.strip()
            if href.startswith('/'):
                try:
                    href = urljoin(base_url, href)
                except ValueError:
                    continue
            if not cls._belongs_to_domain(href, domain):
                continue
            if any(marker in href for marker in cls.ARTICLE_PATH_MARKERS):
                unique.setdefault(href, None)
        return list(unique)[:cls.MAX_RESULTS_PER_CHANNEL]

    def search_using_serpapi(self, headline, channel_domain):
        """
        Search using SerpAPI (alternative to Google search)

        Runs a "<headline> site:<channel_domain>" Google query through
        SerpAPI and returns the organic result links hosted on that domain.
        Falls back to simulate_search_results() when no real API key is
        configured. Returns an empty list on request or decoding errors.
        """
        api_key = self.serpapi_key
        if not api_key or api_key == self.PLACEHOLDER_API_KEY:
            # If no API key, use simulation for demonstration
            return self.simulate_search_results(headline, channel_domain)

        params = {
            'engine': 'google',
            'q': f"{headline} site:{channel_domain}",
            'api_key': api_key,
            'gl': 'in',  # Country: India
            'hl': 'en'   # Language: English
        }

        try:
            response = self.session.get('https://serpapi.com/search', params=params, timeout=10)
            response.raise_for_status()
            search_results = response.json()
        except (requests.RequestException, ValueError) as e:
            # The request URL (and therefore the key) can appear in the
            # exception text, so redact it before printing.
            print("    SerpAPI error: " + str(e).replace(api_key, "***"))
            return []

        if not isinstance(search_results, dict):
            return []

        return [
            result['link']
            for result in search_results.get('organic_results') or []
            if isinstance(result, dict)
            and self._belongs_to_domain(result.get('link'), channel_domain)
        ]

    def simulate_search_results(self, headline, channel_domain, delay=1):
        """
        Simulate search results for demonstration purposes
        In production, replace with actual API calls

        The returned URLs are fixed samples keyed by domain; *headline* is
        not used. *delay* is the number of seconds to sleep to imitate a
        network call (pass 0 to skip the pause).
        """
        if delay > 0:
            time.sleep(delay)

        # Sample results for demonstration
        sample_results = {
            "timesofindia.indiatimes.com": [
                "https://timesofindia.indiatimes.com/india/rss-mohan-bhagwat-muslims-membership/articleshow/12345678.cms",
                "https://timesofindia.indiatimes.com/news/india/rss-chief-mohan-bhagwat-on-muslims-joining/articleshow/12345679.cms"
            ],
            "ndtv.com": [
                "https://www.ndtv.com/india-news/can-muslims-join-rss-mohan-bhagwat-answers-1234567",
                "https://www.ndtv.com/india/rss-chief-mohan-bhagwat-on-muslim-members-1234568"
            ],
            "thehindu.com": [
                "https://www.thehindu.com/news/national/mohan-bhagwat-on-muslims-joining-rss/article12345678.ece"
            ],
            "hindustantimes.com": [
                "https://www.hindustantimes.com/india-news/mohan-bhagwat-on-muslims-in-rss-1234567890123.html"
            ],
            "indianexpress.com": [
                "https://indianexpress.com/article/india/mohan-bhagwat-rss-muslims-membership-12345678/"
            ]
        }

        return sample_results.get(channel_domain, [])

    def search_direct_website(self, headline, channel):
        """
        Search directly on news website using their search functionality

        Returns the article links scraped from the channel's search page, or
        an empty list when the channel has no search URL, the request fails,
        or the response status is not 200.
        """
        template = channel.get('search_url')
        if not template:
            return []

        # safe='' also escapes "/", which would otherwise split the
        # path-style search URLs ("/search/{}") into extra path segments.
        search_url = template.format(quote(headline, safe=''))

        try:
            response = self.session.get(search_url, headers=self.get_headers(), timeout=15)
        except requests.RequestException as e:
            print("    Direct website search error: " + str(e))
            return []

        if response.status_code != 200:
            return []
        return self.parse_website_results(response.text, channel['domain'])

    def parse_website_results(self, html, domain):
        """
        Parse HTML content from news website search results
        Extract up to three unique article links that belong to *domain*
        Returns an empty list when the HTML cannot be parsed
        """
        try:
            soup = BeautifulSoup(html, 'html.parser')
            hrefs = [anchor.get('href', '') for anchor in soup.find_all('a', href=True)]
        except Exception as e:
            # Parsing is best-effort: report the problem and carry on.
            print("    Error parsing website results: " + str(e))
            return []

        return self._filter_article_links(hrefs, domain)

    def search_channel(self, headline, channel, channel_num):
        """
        Search for a headline in a specific news channel

        Tries the API search first and the site's own search only when the
        API returned nothing. Returns a list of result dictionaries (at most
        MAX_RESULTS_PER_CHANNEL), each with the channel name and domain, the
        URL, the headline and a local timestamp.
        """
        total = len(self.news_channels)
        print("Checking channel " + str(channel_num) + "/" + str(total) + ": " + channel['name'])

        all_urls = []
        domain = channel['domain']

        try:
            # First method: Try SerpAPI search
            print("    Trying API search...")
            urls = self.search_using_serpapi(headline, domain)
            if urls:
                print("    Found " + str(len(urls)) + " result(s) via API")
            else:
                # Second method: direct website search
                print("    Trying direct website search...")
                urls = self.search_direct_website(headline, channel)
                if urls:
                    print("    Found " + str(len(urls)) + " result(s) via direct search")

            if urls:
                all_urls.extend(urls)
            else:
                print("    No results found")

        except Exception as e:
            # Per-channel boundary: one failing channel must not abort the run.
            print("    Search error: " + str(e))

        timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
        return [
            {
                'channel_name': channel['name'],
                'channel_domain': domain,
                'url': url,
                'headline': headline,
                'timestamp': timestamp,
            }
            for url in all_urls[:self.MAX_RESULTS_PER_CHANNEL]
        ]

    def verify_headline(self, headline):
        """
        Main method to verify a headline across all configured news channels
        Searches every channel in order, pausing a random interval between
        channels, and returns the combined list of result dictionaries
        """
        print("\nVERIFYING HEADLINE: '" + headline + "'")
        print("This will take 2-3 minutes...")
        print("Using multiple search strategies for better results\n")

        all_results = []
        last = len(self.news_channels)

        for i, channel in enumerate(self.news_channels, 1):
            all_results.extend(self.search_channel(headline, channel, i))

            # Random delay between channels (not after the last one) to
            # avoid rate limiting
            if i < last:
                time.sleep(random.uniform(*self.CHANNEL_DELAY_RANGE))

        return all_results

def display_report(results, headline, total_channels=15):
    """
    Display a comprehensive report of the verification results
    Shows statistics and detailed findings

    results: list of result dictionaries; each must provide 'channel_name'
        and 'url'.
    headline: the headline that was searched for.
    total_channels: number of channels that were searched, used as the
        denominator of the verification score (defaults to 15).

    Returns a dictionary with the headline, the number of articles, the
    number of distinct channels, the verification score (percentage) and the
    original results list.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    # Group results by channel; dict order keeps channels in the order in
    # which they first appear in the results.
    channel_groups = {}
    for result in results:
        channel_groups.setdefault(result['channel_name'], []).append(result)

    total_articles = len(results)
    channels_with_results = len(channel_groups)

    # Guard against a zero denominator when no channels were searched.
    if total_channels:
        verification_score = (channels_with_results / total_channels) * 100
    else:
        verification_score = 0.0

    print("\n" + heavy_rule)
    print("NEWS VERIFICATION REPORT")
    print(heavy_rule)

    print("HEADLINE: " + headline)
    print("STATISTICS:")
    print("   Total articles found: " + str(total_articles))
    print("   Channels reporting: " + str(channels_with_results) + "/" + str(total_channels))
    print("   Verification score: " + str(round(verification_score, 1)) + "%")

    print("\n" + light_rule)
    print("DETAILED RESULTS")
    print(light_rule)

    if not results:
        print("No articles found across any news channels.")
        print("\nPOSSIBLE REASONS:")
        print("   News might be very recent and not yet indexed")
        print("   Headline might not be covered by major channels")
        print("   Try different wording or check spelling")
        print("   Verify directly on news websites")
    else:
        # Display results organized by channel
        for channel, articles in channel_groups.items():
            print("\n" + channel + ":")
            for position, article in enumerate(articles, 1):
                print("   " + str(position) + ". " + article['url'])

    print("\n" + light_rule)
    print("VERIFICATION STATUS")
    print(light_rule)

    # Conclusion is based on the absolute number of reporting channels
    if channels_with_results >= 10:
        print("HIGHLY VERIFIED: News appears on majority of channels")
        print("This headline is widely reported and likely authentic")
    elif channels_with_results >= 5:
        print("MODERATELY VERIFIED: News appears on several channels")
        print("This headline has reasonable coverage")
    elif channels_with_results >= 1:
        print("LIMITED VERIFICATION: News appears on few channels")
        print("Verify with additional sources")
    else:
        print("UNVERIFIED: Not found on major news channels")
        print("Exercise caution and verify from official sources")

    return {
        'headline': headline,
        'total_articles': total_articles,
        'channels_count': channels_with_results,
        'verification_score': verification_score,
        'results': results
    }

def main():
    """
    Main function that runs the news verification system

    Prints the banner, reads a headline from standard input, rejects empty
    or very short input (fewer than 10 characters), then runs the
    verification and prints the report. Errors during verification are
    reported on the console instead of being raised.
    """
    print("="*70)
    print("INDIAN NEWS VERIFICATION SYSTEM")
    print("="*70)
    print("\nThis system checks news authenticity across 15 major Indian")
    print("news channels using multiple search strategies.")
    print("\nNote: This is a demonstration version. For production use:")
    print("Get SerpAPI key from serpapi.com")
    print("="*70)

    # Get headline input from user
    headline = input("\nEnter the news headline to verify: ").strip()

    # Validate user input
    if not headline:
        print("Please enter a headline.")
        return

    if len(headline) < 10:
        print("Please enter a more detailed headline (at least 10 characters).")
        return

    # Create news verifier instance
    verifier = NewsVerifier()

    try:
        # Perform the news verification and display the report
        results = verifier.verify_headline(headline)
        display_report(results, headline)

        # Simple confirmation message
        print("\nVerification process completed successfully.")

    except KeyboardInterrupt:
        print("\nSearch interrupted by user.")
    except Exception as e:
        # Top-level boundary: report the failure rather than show a traceback
        print("\nError: " + str(e))
    finally:
        # Release pooled HTTP connections on every exit path
        verifier.session.close()

# Program entry point
if __name__ == "__main__":
    main()

# Sample console output (notebook cell output):
#
# INDIAN NEWS VERIFICATION SYSTEM
#
# This system checks news authenticity across 15 major Indian
# news channels using multiple search strategies.
#
# Note: This is a demonstration version. For production use:
# Get SerpAPI key from serpapi.com
