In [2]:
"""
CNN Headline Scraper with Selenium
Handles JavaScript-rendered content

Installation:
pip install selenium webdriver-manager beautifulsoup4

For Ubuntu/Linux, install Firefox:
sudo apt-get update
sudo apt-get install firefox
"""

from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.firefox import GeckoDriverManager
from bs4 import BeautifulSoup
import time

def scrape_cnn_headlines(query="finance", use_firefox=True):
    """Scrape headlines from the CNN search page using headless Firefox.

    Args:
        query: Search term. It is URL-encoded before being inserted into the
            search URL, so spaces and reserved characters such as '&' or '#'
            no longer corrupt the request.
        use_firefox: Not read by this function (Firefox is always used); kept
            so callers that pass it keep working.

    Returns:
        list[dict]: One ``{'headline': ..., 'url': ...}`` entry per
        ``<span class="container__headline-text">`` found in the rendered
        page. ``url`` is ``'N/A'`` when the span has no ``data-zjs-href``
        attribute. An empty list is returned if any step raises.

    Side effects:
        Prints progress messages and overwrites 'debug_selenium_page.html'
        in the current working directory with the rendered HTML.
    """
    # Function-scope import keeps this definition self-contained.
    from urllib.parse import quote_plus

    url = (
        "https://edition.cnn.com/search"
        f"?q={quote_plus(str(query))}&size=10&from=0&page=1&sort=newest&types=all&section="
    )

    print(f"üåê Fetching: {url}\n")
    print("‚è≥ Opening browser and loading page...\n")

    # Stays None until the browser actually starts, so `finally` knows
    # whether there is anything to shut down.
    driver = None

    try:
        # Setup Firefox (more likely to be installed on Linux)
        firefox_options = FirefoxOptions()
        firefox_options.add_argument("--headless")  # Run without GUI
        firefox_options.add_argument("--no-sandbox")
        firefox_options.add_argument("--disable-dev-shm-usage")

        service = FirefoxService(GeckoDriverManager().install())
        driver = webdriver.Firefox(service=service, options=firefox_options)

        print("‚úì Browser started\n")

        # Load the page
        driver.get(url)

        # Fixed pause first, then an explicit wait for the first headline.
        print("‚è≥ Waiting for headlines to load...\n")
        time.sleep(5)  # Give it time to load

        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "container__headline-text"))
            )
        except Exception:
            # Best effort: parse whatever was rendered. `Exception` (not a
            # bare except) so Ctrl-C / kernel interrupt still propagates.
            print("‚ö†Ô∏è  Timeout waiting for headlines, continuing anyway...\n")

        # Get page source after JavaScript execution
        page_source = driver.page_source

        # Save for debugging
        with open('debug_selenium_page.html', 'w', encoding='utf-8') as f:
            f.write(page_source)
        print("‚úì Saved rendered HTML to 'debug_selenium_page.html'\n")

        # Parse with BeautifulSoup
        soup = BeautifulSoup(page_source, 'html.parser')

        # Find all span tags with class 'container__headline-text'
        headline_spans = soup.find_all('span', class_='container__headline-text')

        print(f"‚úì Found {len(headline_spans)} headlines:\n")
        print("="*100)

        articles = []

        # Extract text and URL from each span
        for i, span in enumerate(headline_spans, 1):
            headline_text = span.get_text(strip=True)
            article_url = span.get('data-zjs-href', 'N/A')

            articles.append({
                'headline': headline_text,
                'url': article_url
            })

            print(f"{i}. {headline_text}")
            print(f"   üîó {article_url}\n")

        print("="*100)

        return articles

    except Exception as e:
        print(f"‚ùå Error: {e}")
        return []

    finally:
        # Always release the browser process, even on failure.
        if driver:
            driver.quit()
            print("\n‚úì Browser closed")


# Run the scraper
if __name__ == "__main__":
    # Print the run banner.
    for banner_line in ("="*100, "üîç SCRAPING CNN FINANCE HEADLINES", "="*100 + "\n"):
        print(banner_line)

    # Scrape with the fixed "finance" query and report how many came back.
    articles = scrape_cnn_headlines("finance")
    print(f"\nüìä Total headlines scraped: {len(articles)}")

    # Persist the results only when at least one headline was found.
    if articles:
        print("\nüíæ Saving to file...")
        with open('cnn_headlines.txt', 'w', encoding='utf-8') as out_file:
            for position, entry in enumerate(articles, 1):
                out_file.write(f"{position}. {entry['headline']}\n")
                out_file.write(f"   {entry['url']}\n\n")
        print("‚úì Saved to 'cnn_headlines.txt'")

üîç SCRAPING CNN FINANCE HEADLINES

üåê Fetching: https://edition.cnn.com/search?q=finance&size=10&from=0&page=1&sort=newest&types=all&section=

‚è≥ Opening browser and loading page...

‚úì Browser started

‚è≥ Waiting for headlines to load...

‚úì Saved rendered HTML to 'debug_selenium_page.html'

‚úì Found 9 headlines:

1. The Fed just cut rates. Here’s how to make that move work for your money
   üîó https://www.cnn.com/2025/09/17/business/your-money-federal-reserve-interest-rate-cut

2. Stocks have literally never been this expensive
   üîó https://www.cnn.com/2025/09/02/economy/us-stock-market

3. Here are 4 ways lower interest rates could affect your personal finances
   üîó https://www.cnn.com/2025/08/25/business/personal-finances-how-fed-interest-cut-affects-you

4. Make your money work for you by ‘laddering’ bonds or CDs
   üîó https://www.cnn.com/2025/08/16/business/building-a-cd-or-bond-ladder

5. Worried about money? Experts share how to prepare for hard times
 

In [3]:
"""
CNN Headline Scraper with Selenium
Handles JavaScript-rendered content

Installation:
pip install selenium webdriver-manager beautifulsoup4

For Ubuntu/Linux, install Firefox:
sudo apt-get update
sudo apt-get install firefox
"""

from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.firefox import GeckoDriverManager
from bs4 import BeautifulSoup
import time

def scrape_cnn_headlines(query="finance", use_firefox=True):
    """Scrape headlines from the CNN search page, scrolling before parsing.

    Args:
        query: Search term. It is URL-encoded before being inserted into the
            search URL, so spaces and reserved characters such as '&' or '#'
            no longer corrupt the request.
        use_firefox: Not read by this function (Firefox is always used); kept
            so callers that pass it keep working.

    Returns:
        list[dict]: One ``{'headline': ..., 'url': ...}`` entry per
        ``<span class="container__headline-text">`` found in the rendered
        page. ``url`` is ``'N/A'`` when the span has no ``data-zjs-href``
        attribute. An empty list is returned if any step raises.

    Side effects:
        Prints progress messages and overwrites 'debug_selenium_page.html'
        in the current working directory with the rendered HTML.
    """
    # Function-scope import keeps this definition self-contained.
    from urllib.parse import quote_plus

    url = (
        "https://edition.cnn.com/search"
        f"?q={quote_plus(str(query))}&size=10&from=0&page=1&sort=newest&types=all&section="
    )

    print(f"üåê Fetching: {url}\n")
    print("‚è≥ Opening browser and loading page...\n")

    # Stays None until the browser actually starts, so `finally` knows
    # whether there is anything to shut down.
    driver = None

    try:
        # Setup Firefox (more likely to be installed on Linux)
        firefox_options = FirefoxOptions()
        firefox_options.add_argument("--headless")  # Run without GUI
        firefox_options.add_argument("--no-sandbox")
        firefox_options.add_argument("--disable-dev-shm-usage")

        service = FirefoxService(GeckoDriverManager().install())
        driver = webdriver.Firefox(service=service, options=firefox_options)

        print("‚úì Browser started\n")

        # Load the page
        driver.get(url)

        # Wait for content to load
        print("‚è≥ Waiting for headlines to load...\n")
        time.sleep(5)  # Give it time to load

        # Scroll to the bottom five times, pausing after each jump, then
        # return to the top before reading the page.
        print("üìú Scrolling to load all content...\n")
        for _ in range(5):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

        driver.execute_script("window.scrollTo(0, 0);")
        time.sleep(2)

        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "container__headline-text"))
            )
        except Exception:
            # Best effort: parse whatever was rendered. `Exception` (not a
            # bare except) so Ctrl-C / kernel interrupt still propagates.
            print("‚ö†Ô∏è  Timeout waiting for headlines, continuing anyway...\n")

        # Get page source after JavaScript execution
        page_source = driver.page_source

        # Save for debugging
        with open('debug_selenium_page.html', 'w', encoding='utf-8') as f:
            f.write(page_source)
        print("‚úì Saved rendered HTML to 'debug_selenium_page.html'\n")

        # Parse with BeautifulSoup
        soup = BeautifulSoup(page_source, 'html.parser')

        # Find all span tags with class 'container__headline-text'
        headline_spans = soup.find_all('span', class_='container__headline-text')

        print(f"‚úì Found {len(headline_spans)} headlines:\n")
        print("="*100)

        articles = []

        # Extract text and URL from each span
        for i, span in enumerate(headline_spans, 1):
            headline_text = span.get_text(strip=True)
            article_url = span.get('data-zjs-href', 'N/A')

            articles.append({
                'headline': headline_text,
                'url': article_url
            })

            print(f"{i}. {headline_text}")
            print(f"   üîó {article_url}\n")

        print("="*100)

        return articles

    except Exception as e:
        print(f"‚ùå Error: {e}")
        return []

    finally:
        # Always release the browser process, even on failure.
        if driver:
            driver.quit()
            print("\n‚úì Browser closed")


# Run the scraper
if __name__ == "__main__":
    # Run banner.
    print("="*100)
    print("üîç SCRAPING CNN FINANCE HEADLINES")
    print("="*100 + "\n")

    # Fixed "finance" query; the scraper returns [] on failure.
    articles = scrape_cnn_headlines("finance")
    print(f"\nüìä Total headlines scraped: {len(articles)}")

    if articles:
        # Two writes per article: numbered headline, then indented URL
        # followed by a blank line.
        print("\nüíæ Saving to file...")
        with open('cnn_headlines.txt', 'w', encoding='utf-8') as out_file:
            out_file.writelines(
                line
                for position, entry in enumerate(articles, 1)
                for line in (
                    f"{position}. {entry['headline']}\n",
                    f"   {entry['url']}\n\n",
                )
            )
        print("‚úì Saved to 'cnn_headlines.txt'")

üîç SCRAPING CNN FINANCE HEADLINES

üåê Fetching: https://edition.cnn.com/search?q=finance&size=10&from=0&page=1&sort=newest&types=all&section=

‚è≥ Opening browser and loading page...

‚úì Browser started

‚è≥ Waiting for headlines to load...

üìú Scrolling to load all content...

‚úì Saved rendered HTML to 'debug_selenium_page.html'

‚úì Found 9 headlines:

1. The Fed just cut rates. Here’s how to make that move work for your money
   üîó https://www.cnn.com/2025/09/17/business/your-money-federal-reserve-interest-rate-cut

2. Stocks have literally never been this expensive
   üîó https://www.cnn.com/2025/09/02/economy/us-stock-market

3. Here are 4 ways lower interest rates could affect your personal finances
   üîó https://www.cnn.com/2025/08/25/business/personal-finances-how-fed-interest-cut-affects-you

4. Make your money work for you by ‘laddering’ bonds or CDs
   üîó https://www.cnn.com/2025/08/16/business/building-a-cd-or-bond-ladder

5. Worried about money? Expert

In [2]:
"""
CNN Headline Scraper with Selenium
Handles JavaScript-rendered content with manual query input

Installation:
pip install selenium webdriver-manager beautifulsoup4

For Ubuntu/Linux, install Firefox:
sudo apt-get update
sudo apt-get install firefox
"""

from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.firefox import GeckoDriverManager
from bs4 import BeautifulSoup
import time

def scrape_cnn_headlines(query, use_firefox=True):
    """Scrape headlines from the CNN search page for a caller-supplied query.

    Args:
        query: Search term (typically typed by the user). It is URL-encoded
            before being inserted into the search URL, so spaces and reserved
            characters such as '&' or '#' no longer corrupt the request.
        use_firefox: Not read by this function (Firefox is always used); kept
            so callers that pass it keep working.

    Returns:
        list[dict]: One ``{'headline': ..., 'url': ...}`` entry per
        ``<span class="container__headline-text">`` found in the rendered
        page. ``url`` is ``'N/A'`` when the span has no ``data-zjs-href``
        attribute. An empty list is returned if any step raises.

    Side effects:
        Prints progress messages and overwrites 'debug_selenium_page.html'
        in the current working directory with the rendered HTML.
    """
    # Function-scope import keeps this definition self-contained.
    from urllib.parse import quote_plus

    url = (
        "https://edition.cnn.com/search"
        f"?q={quote_plus(str(query))}&size=10&from=0&page=1&sort=newest&types=all&section="
    )

    print(f"\nüåê Fetching: {url}\n")
    print("‚è≥ Opening browser and loading page...\n")

    # Stays None until the browser actually starts, so `finally` knows
    # whether there is anything to shut down.
    driver = None

    try:
        # Setup Firefox (more likely to be installed on Linux)
        firefox_options = FirefoxOptions()
        firefox_options.add_argument("--headless")  # Run without GUI
        firefox_options.add_argument("--no-sandbox")
        firefox_options.add_argument("--disable-dev-shm-usage")

        service = FirefoxService(GeckoDriverManager().install())
        driver = webdriver.Firefox(service=service, options=firefox_options)

        print("‚úì Browser started\n")

        # Load the page
        driver.get(url)

        # Wait for content to load
        print("‚è≥ Waiting for headlines to load...\n")
        time.sleep(5)  # Give it time to load

        # Scroll to the bottom five times, pausing after each jump, then
        # return to the top before reading the page.
        print("üìú Scrolling to load all content...\n")
        for _ in range(5):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

        driver.execute_script("window.scrollTo(0, 0);")
        time.sleep(2)

        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "container__headline-text"))
            )
        except Exception:
            # Best effort: parse whatever was rendered. `Exception` (not a
            # bare except) so Ctrl-C / kernel interrupt still propagates.
            print("‚ö†Ô∏è  Timeout waiting for headlines, continuing anyway...\n")

        # Get page source after JavaScript execution
        page_source = driver.page_source

        # Save for debugging
        with open('debug_selenium_page.html', 'w', encoding='utf-8') as f:
            f.write(page_source)
        print("‚úì Saved rendered HTML to 'debug_selenium_page.html'\n")

        # Parse with BeautifulSoup
        soup = BeautifulSoup(page_source, 'html.parser')

        # Find all span tags with class 'container__headline-text'
        headline_spans = soup.find_all('span', class_='container__headline-text')

        print(f"‚úì Found {len(headline_spans)} headlines:\n")
        print("="*100)

        articles = []

        # Extract text and URL from each span
        for i, span in enumerate(headline_spans, 1):
            headline_text = span.get_text(strip=True)
            article_url = span.get('data-zjs-href', 'N/A')

            articles.append({
                'headline': headline_text,
                'url': article_url
            })

            print(f"{i}. {headline_text}")
            print(f"   üîó {article_url}\n")

        print("="*100)

        return articles

    except Exception as e:
        print(f"‚ùå Error: {e}")
        return []

    finally:
        # Always release the browser process, even on failure.
        if driver:
            driver.quit()
            print("\n‚úì Browser closed")


# Run the scraper
if __name__ == "__main__":
    # Local import: only needed to build a filesystem-safe output name.
    import re

    print("="*100)
    print("üîç CNN HEADLINE SCRAPER")
    print("="*100 + "\n")

    # Get search query from user
    query = input("Enter your search query (e.g., finance, technology, sports): ").strip()

    # Fall back to a default so the scraper never runs with an empty query.
    if not query:
        print("‚ùå No query entered. Using default: 'finance'")
        query = "finance"

    print(f"\nüîé Searching for: '{query}'")

    articles = scrape_cnn_headlines(query)

    print(f"\nüìä Total headlines scraped: {len(articles)}")

    if articles:
        print("\nüíæ Saving to file...")
        # The query is free-form user input. Replace every character that is
        # not a word character or '-' so separators ('/', '\\'), '..' and
        # other reserved characters cannot break the path or leave the
        # working directory. Plain words and spaces map exactly as before.
        safe_query = re.sub(r"[^\w\-]", "_", query)
        filename = f'cnn_headlines_{safe_query}.txt'
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(f"CNN Headlines for: {query}\n")
            f.write("="*100 + "\n\n")
            for i, article in enumerate(articles, 1):
                f.write(f"{i}. {article['headline']}\n")
                f.write(f"   {article['url']}\n\n")
        print(f"‚úì Saved to '{filename}'")

üîç CNN HEADLINE SCRAPER


üîé Searching for: 'technology'

üåê Fetching: https://edition.cnn.com/search?q=technology&size=10&from=0&page=1&sort=newest&types=all&section=

‚è≥ Opening browser and loading page...

‚úì Browser started

‚è≥ Waiting for headlines to load...

üìú Scrolling to load all content...

‚úì Saved rendered HTML to 'debug_selenium_page.html'

‚úì Found 8 headlines:

1. Exhausted? The reason may be how you’re using technology
   üîó https://www.cnn.com/2025/10/07/health/digital-exhaustion-solutions-book-wellness

2. Fareed’s take: America needs to get serious about China’s tech rise
   üîó https://www.cnn.com/2025/10/05/world/video/gps-1005-take-china-tech-rise-dominance

3. Google says 90% of tech workers are now using AI at work
   üîó https://www.cnn.com/2025/09/23/tech/google-study-90-percent-tech-jobs-ai

4. Angolan startup Anda is building a platform for human mobility
   üîó https://www.cnn.com/2025/09/19/world/video/angola-business-startup-techn

In [7]:
"""
The National News Headline Scraper with Selenium
Handles JavaScript-rendered content with manual query input

Installation:
pip install selenium webdriver-manager beautifulsoup4

For Ubuntu/Linux, install Firefox:
sudo apt-get update
sudo apt-get install firefox
"""

from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.firefox import GeckoDriverManager
from bs4 import BeautifulSoup
import time

def scrape_thenational_headlines(query, headless=True):
    """Scrape headlines from The National News search using Selenium.

    Args:
        query: Search term. It is URL-encoded (spaces become '+', reserved
            characters are percent-escaped) before being placed in the URL.
        headless: If False, browser window will be visible (useful for debugging).

    Returns:
        list[dict]: One ``{'headline', 'url', 'description'}`` entry per
        ``div.queryly_item_row`` that has both a ``div.queryly_item_title``
        and an ``<a>`` tag. ``url`` is ``'N/A'`` when the link has no href;
        ``description`` is ``'N/A'`` when no
        ``div.queryly_item_description`` exists. Returns ``[]`` if any step
        raises (the traceback is printed).

    Side effects:
        Prints progress messages and overwrites 'debug_thenational_page.html'
        in the current working directory with the rendered HTML.
    """
    # Function-scope import keeps this definition self-contained.
    from urllib.parse import quote_plus, urljoin

    site_root = "https://www.thenationalnews.com"

    # The National News search URL
    url = f"{site_root}/search/?query={quote_plus(str(query))}"

    print(f"\nüåê Fetching: {url}\n")
    print("‚è≥ Opening browser and loading page...\n")

    # Stays None until the browser actually starts, so `finally` knows
    # whether there is anything to shut down.
    driver = None

    try:
        # Setup Firefox
        firefox_options = FirefoxOptions()
        if headless:
            firefox_options.add_argument("--headless")  # Run without GUI
        firefox_options.add_argument("--no-sandbox")
        firefox_options.add_argument("--disable-dev-shm-usage")
        # Firefox takes its window size as --width/--height; the Chrome-style
        # "--window-size=W,H" is not a Firefox window-size option.
        firefox_options.add_argument("--width=1920")
        firefox_options.add_argument("--height=1080")

        # Firefox has no "user-agent=" command-line argument; the user agent
        # is overridden through this preference instead.
        firefox_options.set_preference(
            "general.useragent.override",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        )

        service = FirefoxService(GeckoDriverManager().install())
        driver = webdriver.Firefox(service=service, options=firefox_options)

        print("‚úì Browser started\n")

        # Load the page
        driver.get(url)

        # Wait for content to load
        print("‚è≥ Waiting for search results to load...\n")
        time.sleep(8)  # Give it time to load

        # Scroll to the bottom five times, pausing after each jump, then
        # return to the top before reading the page.
        print("üìú Scrolling to load all content...\n")
        for _ in range(5):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)

        driver.execute_script("window.scrollTo(0, 0);")
        time.sleep(2)

        # Try to wait for queryly items
        try:
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CLASS_NAME, "queryly_item_row"))
            )
            print("‚úì Search results found\n")
        except Exception:
            # Best effort: parse whatever was rendered. `Exception` (not a
            # bare except) so Ctrl-C / kernel interrupt still propagates.
            print("‚ö†Ô∏è  Timeout waiting for results, continuing anyway...\n")

        # Get page source after JavaScript execution
        page_source = driver.page_source

        # Save for debugging
        with open('debug_thenational_page.html', 'w', encoding='utf-8') as f:
            f.write(page_source)
        print("‚úì Saved rendered HTML to 'debug_thenational_page.html'\n")

        # Parse with BeautifulSoup
        soup = BeautifulSoup(page_source, 'html.parser')

        print("üîç Analyzing page structure...\n")

        # Find all queryly_item_row divs
        item_rows = soup.find_all('div', class_='queryly_item_row')

        print(f"‚úì Found {len(item_rows)} article containers\n")
        print("="*100)

        articles = []

        # Extract data from each item
        for row in item_rows:
            title_div = row.find('div', class_='queryly_item_title')
            link_tag = row.find('a')  # first anchor inside the row

            # Rows missing either piece are skipped entirely.
            if not (title_div and link_tag):
                continue

            headline_text = title_div.get_text(strip=True)

            # Resolve against the site root so root-relative, relative and
            # protocol-relative hrefs all become absolute; absolute URLs
            # pass through unchanged.
            href = link_tag.get('href')
            article_url = urljoin(site_root, href) if href else 'N/A'

            # Extract description if available
            desc_div = row.find('div', class_='queryly_item_description')
            description = desc_div.get_text(strip=True) if desc_div else 'N/A'

            articles.append({
                'headline': headline_text,
                'url': article_url,
                'description': description
            })

            # Number by kept articles (not by row) so the console listing
            # matches the numbering later written to the output file.
            print(f"{len(articles)}. {headline_text}")
            print(f"   üîó {article_url}")
            if description != 'N/A':
                # Truncate long descriptions
                short_desc = description[:100] + '...' if len(description) > 100 else description
                print(f"   üìù {short_desc}")
            print()

        print("="*100)

        return articles

    except Exception as e:
        print(f"‚ùå Error: {e}")
        import traceback
        traceback.print_exc()
        return []

    finally:
        # Always release the browser process, even on failure.
        if driver:
            driver.quit()
            print("\n‚úì Browser closed")


# Run the scraper
if __name__ == "__main__":
    # Local import: only needed to build a filesystem-safe output name.
    import re

    print("="*100)
    print("üîç THE NATIONAL NEWS HEADLINE SCRAPER")
    print("="*100 + "\n")

    # Get search query from user
    query = input("Enter your search query (e.g., finance, technology, sports): ").strip()

    # Fall back to a default so the scraper never runs with an empty query.
    if not query:
        print("‚ùå No query entered. Using default: 'finance'")
        query = "finance"

    print(f"\nüîé Searching for: '{query}'")

    # Ask if user wants to see the browser (for debugging); only an exact
    # 'y' answer turns headless mode off.
    debug_mode = input("\nRun in DEBUG mode (see browser window)? (y/n): ").strip().lower()
    headless = debug_mode != 'y'

    if not headless:
        print("üîç DEBUG MODE: Browser window will be visible\n")

    articles = scrape_thenational_headlines(query, headless)

    print(f"\nüìä Total headlines scraped: {len(articles)}")

    if articles:
        print("\nüíæ Saving to file...")
        # The query is free-form user input. Replace every character that is
        # not a word character or '-' so separators ('/', '\\'), '..' and
        # other reserved characters cannot break the path or leave the
        # working directory. Plain words and spaces map exactly as before.
        safe_query = re.sub(r"[^\w\-]", "_", query)
        filename = f'thenational_headlines_{safe_query}.txt'
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(f"The National News Headlines for: {query}\n")
            f.write("="*100 + "\n\n")
            for i, article in enumerate(articles, 1):
                f.write(f"{i}. {article['headline']}\n")
                f.write(f"   URL: {article['url']}\n")
                # 'N/A' is the scraper's placeholder for "no description".
                if article['description'] != 'N/A':
                    f.write(f"   Description: {article['description']}\n")
                f.write("\n")
        print(f"‚úì Saved to '{filename}'")

üîç THE NATIONAL NEWS HEADLINE SCRAPER




üîé Searching for: 'finance'

üåê Fetching: https://www.thenationalnews.com/search/?query=finance

‚è≥ Opening browser and loading page...

‚úì Browser started

‚è≥ Waiting for search results to load...

üìú Scrolling to load all content...

‚úì Search results found

‚úì Saved rendered HTML to 'debug_thenational_page.html'

üîç Analyzing page structure...

‚úì Found 20 article containers

1. It's never too late to put your finances in shape
   üîó https://www.thenationalnews.com/business/money/2025/09/24/its-never-too-late-to-put-your-finances-in-shape/
   üìù Sep 24, 2025 -Have you looked at your financial situation and felt a knot in your stomach? Maybe you...

2. New York's Met Opera in Saudi deal to boost shaky finances
   üîó https://www.thenationalnews.com/news/us/2025/09/08/new-yorks-met-opera-in-saudi-deal-to-boost-shaky-finances/
   üìù Sep 08, 2025 -The Metropolitan Opera, a stalwart of New York ‚Äôs cultural life, has struck a deal wit...

3. Terror financing invest

In [1]:
import requests
from bs4 import BeautifulSoup
import PyPDF2
import io
import os
import json
from urllib.parse import urljoin
import time

class GSTCouncilPDFScraper:
    """Collect PDFs linked from a paginated listing page and extract their text.

    The workflow (see ``run``) is: fetch each ``?page=N`` listing, pull PDF
    links out of the table rows, download every PDF, optionally save the PDF
    and its extracted text, and finally dump a metadata summary as JSON.

    Attributes are documented in ``__init__``.
    """

    def __init__(self, base_url):
        """Create a scraper for the listing at ``base_url``.

        Args:
            base_url: URL of the listing page; ``?page=N`` is appended to it
                for pagination and relative PDF links are resolved against it.
        """
        self.base_url = base_url
        # One shared session so every request carries the same User-Agent.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        # Metadata records appended by process_pdfs, written by save_metadata.
        self.pdf_data = []

    def get_page_content(self, page_num=0):
        """Fetch one listing page.

        Args:
            page_num: Zero-based page index sent as the ``page`` query value.

        Returns:
            The response body as text, or None if the request failed
            (the error is printed, not raised).
        """
        url = f"{self.base_url}?page={page_num}"
        print(f"Fetching page {page_num + 1}...")

        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            return response.text
        except Exception as e:
            print(f"Error fetching page {page_num}: {e}")
            return None

    def extract_pdf_links(self, html_content):
        """Extract PDF links from the table rows of a listing page.

        Only the first PDF link in each ``<tr>`` is taken. The extension
        check is case-insensitive so ``.PDF`` links are not missed.

        Args:
            html_content: HTML text of a listing page.

        Returns:
            list[dict]: ``{'url', 'title', 'date'}`` per row with a PDF link.
            ``url`` is the raw href (possibly relative); ``date`` is
            ``'Unknown'`` when the row has no
            ``td.views-field-field-date-of-uploading`` cell.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        pdf_links = []

        for row in soup.find_all('tr'):
            link_tag = row.find('a', href=lambda x: x and x.lower().endswith('.pdf'))
            if not link_tag:
                continue

            date_cell = row.find('td', class_='views-field-field-date-of-uploading')
            pdf_links.append({
                'url': link_tag.get('href'),
                'title': link_tag.get_text(strip=True),
                'date': date_cell.get_text(strip=True) if date_cell else 'Unknown'
            })

        return pdf_links

    def download_pdf(self, pdf_url):
        """Download a PDF and return its content as bytes.

        Args:
            pdf_url: Absolute URL, or a URL relative to ``self.base_url``
                (root-relative like ``/sites/a.pdf`` or path-relative).

        Returns:
            The response body as bytes, or None if the request failed
            (the error is printed, not raised).
        """
        # urljoin leaves absolute URLs untouched and resolves every kind of
        # relative link against the listing page URL.
        full_url = urljoin(self.base_url, pdf_url)

        print(f"Downloading: {full_url}")

        try:
            response = self.session.get(full_url, timeout=60)
            response.raise_for_status()
            return response.content
        except Exception as e:
            print(f"Error downloading PDF {full_url}: {e}")
            return None

    def extract_text_from_pdf(self, pdf_content):
        """Extract text from PDF bytes.

        Args:
            pdf_content: Raw bytes of a PDF file.

        Returns:
            The text of all pages joined by blank lines and stripped, or
            None if the PDF could not be parsed (the error is printed).
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))
            # `or ""` guards against a page yielding None, which would
            # otherwise discard the text of every other page as well.
            page_texts = [(page.extract_text() or "") for page in pdf_reader.pages]
            return "\n\n".join(page_texts).strip()
        except Exception as e:
            print(f"Error extracting text from PDF: {e}")
            return None

    def scrape_all_pages(self, total_pages=9):
        """Collect PDF links from listing pages ``0 .. total_pages - 1``.

        Pages that fail to download are skipped. Sleeps 2 seconds after
        each page request.

        Returns:
            list[dict]: Concatenated results of ``extract_pdf_links``.
        """
        all_pdf_links = []

        for page_num in range(total_pages):
            html_content = self.get_page_content(page_num)

            if html_content:
                pdf_links = self.extract_pdf_links(html_content)
                all_pdf_links.extend(pdf_links)
                print(f"Found {len(pdf_links)} PDFs on page {page_num + 1}")

            # Be respectful with requests
            time.sleep(2)

        print(f"\nTotal PDFs found: {len(all_pdf_links)}")
        return all_pdf_links

    def process_pdfs(self, pdf_links, save_pdfs=True, save_text=True):
        """Download each PDF and optionally save it and its extracted text.

        Files are written to ``./pdfs`` and ``./pdf_texts`` (created on
        demand) and named ``<index>_<date>``. A metadata record is appended
        to ``self.pdf_data`` for every PDF that downloaded successfully.
        Sleeps 1 second after each item.

        Args:
            pdf_links: Records as produced by ``extract_pdf_links``.
            save_pdfs: Write the downloaded bytes to ``./pdfs``.
            save_text: Extract text and write it to ``./pdf_texts``. When
                False no extraction happens and ``text_length`` is 0.
        """
        if save_pdfs:
            os.makedirs('pdfs', exist_ok=True)
        if save_text:
            os.makedirs('pdf_texts', exist_ok=True)

        for idx, pdf_info in enumerate(pdf_links, 1):
            print(f"\nProcessing {idx}/{len(pdf_links)}: {pdf_info['title'][:50]}...")

            pdf_content = self.download_pdf(pdf_info['url'])

            if pdf_content:
                # '/' in the date would otherwise be read as a path separator.
                safe_filename = f"{idx:03d}_{pdf_info['date'].replace('/', '-')}"

                # Reset per PDF: without this, save_text=False left `text`
                # undefined and the metadata step below raised.
                text = None

                if save_pdfs:
                    pdf_path = os.path.join('pdfs', f"{safe_filename}.pdf")
                    with open(pdf_path, 'wb') as f:
                        f.write(pdf_content)
                    print(f"Saved PDF: {pdf_path}")

                if save_text:
                    text = self.extract_text_from_pdf(pdf_content)
                    # Empty or failed extraction: no text file is written.
                    if text:
                        text_path = os.path.join('pdf_texts', f"{safe_filename}.txt")
                        with open(text_path, 'w', encoding='utf-8') as f:
                            f.write(f"Title: {pdf_info['title']}\n")
                            f.write(f"Date: {pdf_info['date']}\n")
                            f.write(f"URL: {pdf_info['url']}\n")
                            f.write("=" * 80 + "\n\n")
                            f.write(text)
                        print(f"Saved text: {text_path}")

                self.pdf_data.append({
                    'index': idx,
                    'title': pdf_info['title'],
                    'date': pdf_info['date'],
                    'url': pdf_info['url'],
                    'text_length': len(text) if text else 0
                })

            # Rate limiting
            time.sleep(1)

    def save_metadata(self, filename='pdf_metadata.json'):
        """Write ``self.pdf_data`` to ``filename`` as indented UTF-8 JSON."""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.pdf_data, f, indent=2, ensure_ascii=False)
        print(f"\nMetadata saved to {filename}")

    def run(self, total_pages=9, save_pdfs=True, save_text=True):
        """Run the full pipeline: collect links, process PDFs, save metadata.

        Args:
            total_pages: Number of listing pages to crawl.
            save_pdfs: Forwarded to ``process_pdfs``.
            save_text: Forwarded to ``process_pdfs``.
        """
        print("Starting GST Council PDF Scraper...\n")

        # Step 1: Scrape all pages for PDF links
        pdf_links = self.scrape_all_pages(total_pages)

        if not pdf_links:
            print("No PDFs found!")
            return

        # Step 2: Process all PDFs
        self.process_pdfs(pdf_links, save_pdfs, save_text)

        # Step 3: Save metadata
        self.save_metadata()

        print(f"\n{'='*80}")
        print("Scraping complete!")
        print(f"Total PDFs processed: {len(self.pdf_data)}")
        # Only report output locations that were actually written.
        if save_pdfs:
            print("PDFs saved in: ./pdfs/")
        if save_text:
            print("Text files saved in: ./pdf_texts/")
        print("Metadata saved in: ./pdf_metadata.json")


# Usage
if __name__ == "__main__":
    # Listing page of the GST Council press releases; replace it if the
    # press-release page lives at a different URL.
    BASE_URL = "https://gstcouncil.gov.in/press-release"

    scraper = GSTCouncilPDFScraper(BASE_URL)

    # Run the full pipeline:
    #   total_pages - how many pagination pages to crawl
    #   save_pdfs   - keep the downloaded PDF files
    #   save_text   - keep the extracted text files
    scraper.run(
        total_pages=9,
        save_pdfs=True,
        save_text=True,
    )

Starting GST Council PDF Scraper...

Fetching page 1...
Found 3 PDFs on page 1
Fetching page 2...
Found 7 PDFs on page 2
Fetching page 3...
Found 10 PDFs on page 3
Fetching page 4...
Found 9 PDFs on page 4
Fetching page 5...
Found 10 PDFs on page 5
Fetching page 6...
Found 10 PDFs on page 6
Fetching page 7...
Found 10 PDFs on page 7
Fetching page 8...
Found 10 PDFs on page 8
Fetching page 9...
Found 6 PDFs on page 9

Total PDFs found: 75

Processing 1/75: Frequently Asked Questions (FAQs) on the decisions...
Downloading: https://gstcouncil.gov.in/sites/default/files/2025-09/faq.pdf
Saved PDF: pdfs\001_03-09-2025.pdf
Saved text: pdf_texts\001_03-09-2025.txt

Processing 2/75: Recommendations of the 56th Meeting of the GST Cou...
Downloading: https://gstcouncil.gov.in/sites/default/files/2025-09/press_release_press_information_bureau.pdf
Saved PDF: pdfs\002_03-09-2025.pdf
Saved text: pdf_texts\002_03-09-2025.txt

Processing 3/75: Recommendations during 54th meeting of the GST Cou...
Downl

Multiple definitions in dictionary at byte 0xb4d0a for key /Info
Multiple definitions in dictionary at byte 0xb4d17 for key /Info
Multiple definitions in dictionary at byte 0xb4d24 for key /Info


Saved PDF: pdfs\033_01-07-2020.pdf

Processing 34/75: CBIC introduces machine release of goods...
Downloading: https://gstcouncil.gov.in/sites/default/files/2024-02/pressrelease_0602.pdf
Saved PDF: pdfs\034_06-02-2020.pdf
Saved text: pdf_texts\034_06-02-2020.txt

Processing 35/75: GST rate on all Electric Vehicles reduced from 12%...
Downloading: https://gstcouncil.gov.in/sites/default/files/2024-10/36_gst_press_information_bureau.pdf
Saved PDF: pdfs\035_27-07-2019.pdf
Saved text: pdf_texts\035_27-07-2019.txt

Processing 36/75: FM chairs the 35th GST Council Meeting held today ...
Downloading: https://gstcouncil.gov.in/sites/default/files/2024-10/35_gst_press_information_bureau.pdf
Saved PDF: pdfs\036_21-06-2019.pdf
Saved text: pdf_texts\036_21-06-2019.txt

Processing 37/75: FM chairs 35th GST Council meeting in Delhi; decis...
Downloading: https://gstcouncil.gov.in/sites/default/files/2024-10/8.35th_gst_press_information_bureau.pdf
Saved PDF: pdfs\037_21-06-2019.pdf
Saved text: pdf_te

In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup
import PyPDF2
import requests
import io
import os
import time
import re
import json

class IncomeTaxPDFScraperComplete:
    def __init__(self):
        self.base_url = "https://incometaxindia.gov.in/Pages/tps/latest-updates.aspx"
        self.output_dir = "income_tax_pdfs"
        
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
            print(f"Created directory: {self.output_dir}")
        
        self.chrome_options = webdriver.ChromeOptions()
        self.chrome_options.add_argument('--headless')
        self.chrome_options.add_argument('--no-sandbox')
        self.chrome_options.add_argument('--disable-dev-shm-usage')
        self.chrome_options.add_argument('--disable-gpu')
        self.chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
        
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        
        self.pdf_data = []
        
    def init_driver(self):
        """Initialize Selenium WebDriver"""
        try:
            self.driver = webdriver.Chrome(options=self.chrome_options)
            self.driver.maximize_window()
            return True
        except Exception as e:
            print(f"Error initializing driver: {e}")
            print("Make sure ChromeDriver is installed and in PATH")
            return False
    
    def extract_pdf_urls_from_html(self, html_content):
        """Extract all PDF URLs from the HTML content"""
        soup = BeautifulSoup(html_content, 'html.parser')
        pdf_info = []
        
        news_rows = soup.find_all('div', class_='news-rows')
        
        for row in news_rows:
            try:
                title_elem = row.find('h1')
                if not title_elem:
                    continue
                
                title_link = title_elem.find('a')
                if not title_link:
                    continue
                    
                title = title_link.get_text(strip=True)
                
                date_elem = row.find('span', id=re.compile('publishDt'))
                date = date_elem.get_text(strip=True) if date_elem else "Unknown Date"
                
                onclick = title_link.get('onclick', '')
                url_match = re.search(r"'(https://[^']+\.pdf)", onclick)
                
                if url_match:
                    pdf_url = url_match.group(1)
                    pdf_info.append({
                        'title': title,
                        'date': date,
                        'url': pdf_url
                    })
                    
            except Exception as e:
                print(f"Error extracting PDF info from row: {e}")
                continue
        
        return pdf_info
    
    def download_and_extract_pdf(self, pdf_info, page_num, pdf_index):
        """Download PDF and extract text content"""
        try:
            print(f"  Downloading: {pdf_info['title'][:60]}...")
            
            max_retries = 3
            pdf_content = None
            
            for attempt in range(max_retries):
                try:
                    response = self.session.get(pdf_info['url'], timeout=60)
                    response.raise_for_status()
                    pdf_content = response.content
                    break
                except Exception as e:
                    if attempt < max_retries - 1:
                        print(f"  Retry {attempt + 1}/{max_retries}...")
                        time.sleep(2)
                    else:
                        raise e
            
            if not pdf_content:
                print(f"  ‚úó Failed to download PDF")
                return False
            
            safe_date = pdf_info['date'].replace('/', '-').replace(' ', '_')
            safe_filename = f"page{page_num}_pdf{pdf_index:03d}_{safe_date}"
            
            pdf_path = os.path.join(self.output_dir, f"{safe_filename}.pdf")
            with open(pdf_path, 'wb') as f:
                f.write(pdf_content)
            print(f"  ‚úì Saved PDF: {safe_filename}.pdf")
            
            text = ""
            try:
                pdf_file = io.BytesIO(pdf_content)
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                
                text_content = []
                for page_idx in range(len(pdf_reader.pages)):
                    page = pdf_reader.pages[page_idx]
                    page_text = page.extract_text()
                    text_content.append(f"--- Page {page_idx + 1} ---\n{page_text}\n")
                
                text = "\n".join(text_content)
                
                text_path = os.path.join(self.output_dir, f"{safe_filename}.txt")
                with open(text_path, 'w', encoding='utf-8') as f:
                    f.write(f"Source Page: {page_num}\n")
                    f.write(f"PDF Index: {pdf_index}\n")
                    f.write(f"Title: {pdf_info['title']}\n")
                    f.write(f"Date: {pdf_info['date']}\n")
                    f.write(f"URL: {pdf_info['url']}\n")
                    f.write("="*80 + "\n\n")
                    f.write(text)
                
                print(f"  ‚úì Extracted text: {safe_filename}.txt")
                
            except Exception as e:
                print(f"  ‚ö† Warning: Could not extract text: {e}")
                text = "[Text extraction failed]"
            
            self.pdf_data.append({
                'page': page_num,
                'index': pdf_index,
                'title': pdf_info['title'],
                'date': pdf_info['date'],
                'url': pdf_info['url'],
                'filename': safe_filename,
                'text_length': len(text)
            })
            
            return True
                
        except Exception as e:
            print(f"  ‚úó Error processing PDF: {e}")
            return False
    
    def get_total_pages(self):
        """Extract total number of pages from the page info text"""
        try:
            # Look for text like "410 Record(s) | Page [6 of 41]"
            page_source = self.driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            
            # Try to find the page info text
            page_info = soup.find(text=re.compile(r'Page\s*\[\s*\d+\s+of\s+\d+\s*\]'))
            
            if page_info:
                match = re.search(r'of\s+(\d+)', page_info)
                if match:
                    total = int(match.group(1))
                    print(f"‚úì Detected total pages: {total}")
                    return total
            
            # Fallback: look in any element
            all_text = soup.get_text()
            match = re.search(r'Page\s*\[\s*\d+\s+of\s+(\d+)\s*\]', all_text)
            if match:
                total = int(match.group(1))
                print(f"‚úì Detected total pages: {total}")
                return total
            
            print("‚ö† Could not detect total pages, defaulting to 1")
            return 1
            
        except Exception as e:
            print(f"Error detecting total pages: {e}")
            return 1
    
    def click_next_page(self):
        """Click the Next button to go to the next page"""
        try:
            # Find the Next button
            next_button = self.driver.find_element(
                By.CSS_SELECTOR,
                "input[id*='imgbtnNext']"
            )
            
            # Check if Next button is enabled
            if next_button.is_enabled():
                # Scroll to the button
                self.driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
                time.sleep(1)
                
                # Click the Next button
                next_button.click()
                
                # Wait for page to load
                time.sleep(3)
                
                # Wait for news rows to be present
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "news-rows"))
                )
                
                return True
            else:
                print("Next button is disabled (last page reached)")
                return False
                
        except Exception as e:
            print(f"Error clicking Next button: {e}")
            return False
    
    def save_metadata(self):
        """Save metadata to JSON file"""
        metadata_path = os.path.join(self.output_dir, 'metadata.json')
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(self.pdf_data, f, indent=2, ensure_ascii=False)
        print(f"\nMetadata saved to: {metadata_path}")
    
    def scrape_all_pages(self):
        """Main method to scrape all pages"""
        print("="*80)
        print("Income Tax India PDF Scraper - Complete All Pages")
        print("="*80)
        
        if not self.init_driver():
            print("Failed to initialize browser driver. Exiting.")
            return []
        
        try:
            # Load the main page
            print(f"\nLoading website: {self.base_url}")
            self.driver.get(self.base_url)
            time.sleep(5)
            
            # Get total number of pages
            total_pages = self.get_total_pages()
            
            print(f"\n‚úì Found {total_pages} pages to scrape")
            print("="*80)
            
            pdf_counter = 0
            
            # Process each page
            for page_num in range(1, total_pages + 1):
                print(f"\n{'='*80}")
                print(f"PROCESSING PAGE {page_num} of {total_pages}")
                print(f"{'='*80}")
                
                # Get current page HTML
                page_html = self.driver.page_source
                
                # Extract PDF information
                pdf_info_list = self.extract_pdf_urls_from_html(page_html)
                print(f"\n‚úì Found {len(pdf_info_list)} PDFs on page {page_num}")
                
                # Download and extract each PDF
                for i, pdf_info in enumerate(pdf_info_list, 1):
                    pdf_counter += 1
                    print(f"\n[PDF {i}/{len(pdf_info_list)} on Page {page_num}] (Overall: {pdf_counter})")
                    self.download_and_extract_pdf(pdf_info, page_num, pdf_counter)
                    time.sleep(1)
                
                print(f"\n‚úì Completed page {page_num}")
                
                # Navigate to next page (if not the last page)
                if page_num < total_pages:
                    print(f"Navigating to page {page_num + 1}...")
                    if not self.click_next_page():
                        print(f"Failed to navigate to next page. Stopping at page {page_num}.")
                        break
                    print(f"‚úì Successfully navigated to page {page_num + 1}")
                
                time.sleep(2)
            
            # Save metadata
            self.save_metadata()
            
        except Exception as e:
            print(f"\nError during scraping: {e}")
            import traceback
            traceback.print_exc()
            
        finally:
            # Close the browser
            print("\nClosing browser...")
            self.driver.quit()
        
        print("\n" + "="*80)
        print("SCRAPING COMPLETED!")
        print("="*80)
        print(f"Total PDFs processed: {len(self.pdf_data)}")
        print(f"Output directory: {os.path.abspath(self.output_dir)}")
        print("="*80)
        
        return self.pdf_data


def main():
    """Run the Income Tax scraper and print a per-page listing of the results.

    Nothing beyond the scraper's own output is printed when no PDF was
    processed.
    """
    scraper = IncomeTaxPDFScraperComplete()
    results = scraper.scrape_all_pages()

    if not results:
        return

    banner = "="*80
    print("\n" + banner)
    print("DETAILED SUMMARY")
    print(banner)

    # Bucket the records by listing page; records without a page go to page 1.
    by_page = {}
    for record in results:
        by_page.setdefault(record.get('page', 1), []).append(record)

    for page_num in sorted(by_page):
        entries = by_page[page_num]
        print(f"\n--- Page {page_num} ({len(entries)} PDFs) ---")
        for record in entries:
            print(f"  {record['index']:03d}. {record['title'][:65]}... ({record['date']})")

    print("\n" + banner)
    print(f"All files saved in: {os.path.abspath(scraper.output_dir)}")
    print(banner)


# Entry point: start the scraper when this code is executed directly.
# In a notebook cell __name__ is "__main__", so running the cell runs main().
if __name__ == "__main__":
    main()

Created directory: income_tax_pdfs
Income Tax India PDF Scraper - Complete All Pages

Loading website: https://incometaxindia.gov.in/Pages/tps/latest-updates.aspx


  page_info = soup.find(text=re.compile(r'Page\s*\[\s*\d+\s+of\s+\d+\s*\]'))


‚úì Detected total pages: 41

‚úì Found 41 pages to scrape

PROCESSING PAGE 1 of 41

‚úì Found 9 PDFs on page 1

[PDF 1/9 on Page 1] (Overall: 1)
  Downloading: ‚Äã Corrigendum- Board's letter dated 21.10.2025 on the subjec...
  ‚úì Saved PDF: page1_pdf001_28_October_2025.pdf
  ‚úì Extracted text: page1_pdf001_28_October_2025.txt

[PDF 2/9 on Page 1] (Overall: 2)
  Downloading: C&AG's performance Audit report No.1 of 2019 ON "Assessment ...
  ‚úì Saved PDF: page1_pdf002_21_October_2025.pdf
  ‚úì Extracted text: page1_pdf002_21_October_2025.txt

[PDF 3/9 on Page 1] (Overall: 3)
  Downloading: Direct Tax Collections for F.Y. 2025-26 as on 12.10.2025...
  ‚úì Saved PDF: page1_pdf003_13_October_2025.pdf
  ‚úì Extracted text: page1_pdf003_13_October_2025.txt

[PDF 4/9 on Page 1] (Overall: 4)
  Downloading: Direct Tax Collections for F.Y. 2025-26 (as on 17.09.2025)...
  ‚úì Saved PDF: page1_pdf004_18_September_2025.pdf
  ‚úì Extracted text: page1_pdf004_18_September_2025.txt

[PDF 5/9 on Pag

Multiple definitions in dictionary at byte 0x7a6b9 for key /Info
Multiple definitions in dictionary at byte 0x7a6c6 for key /Info
Multiple definitions in dictionary at byte 0x7a6d3 for key /Info


  ‚úì Saved PDF: page16_pdf132_17_March_2022.pdf
  ‚úì Extracted text: page16_pdf132_17_March_2022.txt

[PDF 3/7 on Page 16] (Overall: 133)
  Downloading: Order under sub-section (2) of Section 144B of the Income-ta...
  ‚úì Saved PDF: page16_pdf133_17_March_2022.pdf
  ‚úì Extracted text: page16_pdf133_17_March_2022.txt

[PDF 4/7 on Page 16] (Overall: 134)
  Downloading: Condonation of delay under section 119(2)(b) of the Income-t...
  ‚úì Saved PDF: page16_pdf134_17_March_2022.pdf
  ‚úì Extracted text: page16_pdf134_17_March_2022.txt

[PDF 5/7 on Page 16] (Overall: 135)
  Downloading: Setting up of office for operationalising Interim Boards for...
  ‚úì Saved PDF: page16_pdf135_31_January_2022.pdf
  ‚úì Extracted text: page16_pdf135_31_January_2022.txt

[PDF 6/7 on Page 16] (Overall: 136)
  Downloading: Corrigendum to Order dated 20.01.2022 issued in pursuant to ...
  ‚úì Saved PDF: page16_pdf136_28_January_2022.pdf
  ‚úì Extracted text: page16_pdf136_28_January_2022.txt

[PDF 7/7 on 

KeyboardInterrupt: 

In [9]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup
import PyPDF2
import pdfplumber
import requests
import io
import os
import time
import re
import json

class RBIPDFScraper:
    def __init__(self):
        self.base_url = "https://rbi.org.in/Scripts/BS_PressreleaseDisplay.aspx"
        self.output_dir = "rbi_pdfs"
        
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
            print(f"Created directory: {self.output_dir}")
        
        # Setup Chrome options
        self.chrome_options = webdriver.ChromeOptions()
        self.chrome_options.add_argument('--headless')
        self.chrome_options.add_argument('--no-sandbox')
        self.chrome_options.add_argument('--disable-dev-shm-usage')
        self.chrome_options.add_argument('--disable-gpu')
        self.chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
        
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        
        self.pdf_data = []
        
    def init_driver(self):
        """Initialize Selenium WebDriver"""
        try:
            self.driver = webdriver.Chrome(options=self.chrome_options)
            self.driver.maximize_window()
            return True
        except Exception as e:
            print(f"Error initializing driver: {e}")
            print("Make sure ChromeDriver is installed and in PATH")
            return False
    
    def extract_pdf_info_from_html(self, html_content):
        """Extract all PDF information from the HTML content"""
        soup = BeautifulSoup(html_content, 'html.parser')
        pdf_info_list = []
        
        # Find all table rows
        table = soup.find('table', class_='tablebg')
        if not table:
            print("Warning: Could not find table with class 'tablebg'")
            return pdf_info_list
        
        rows = table.find_all('tr')
        current_date = "Unknown Date"
        
        for row in rows:
            try:
                # Check if this row is a date header
                date_header = row.find('h2', class_='dop_header')
                if date_header:
                    current_date = date_header.get_text(strip=True)
                    continue
                
                # Extract title and PDF link
                title_link = row.find('a', class_='link2')
                if not title_link:
                    continue
                
                title = title_link.get_text(strip=True)
                
                # Find PDF link
                pdf_link = row.find('a', href=re.compile(r'\.PDF$', re.IGNORECASE))
                if not pdf_link:
                    continue
                
                pdf_url = pdf_link.get('href')
                
                # Make sure URL is absolute
                if not pdf_url.startswith('http'):
                    pdf_url = 'https://rbidocs.rbi.org.in' + pdf_url if pdf_url.startswith('/') else 'https://rbidocs.rbi.org.in/' + pdf_url
                
                # Extract file size
                size_span = pdf_link.find_next_sibling('span')
                file_size = size_span.get_text(strip=True) if size_span else "Unknown size"
                
                pdf_info_list.append({
                    'title': title,
                    'date': current_date,
                    'url': pdf_url,
                    'size': file_size
                })
                
            except Exception as e:
                print(f"Error extracting PDF info from row: {e}")
                continue
        
        return pdf_info_list
    
    def extract_text_with_pypdf2(self, pdf_content):
        """Extract text using PyPDF2"""
        try:
            pdf_file = io.BytesIO(pdf_content)
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            
            text_content = []
            num_pages = len(pdf_reader.pages)
            
            for page_idx in range(num_pages):
                page = pdf_reader.pages[page_idx]
                page_text = page.extract_text()
                if page_text.strip():
                    text_content.append(f"--- Page {page_idx + 1} of {num_pages} ---\n{page_text}\n")
            
            text = "\n".join(text_content)
            return text, num_pages, "PyPDF2"
        except Exception as e:
            raise Exception(f"PyPDF2 failed: {e}")
    
    def extract_text_with_pdfplumber(self, pdf_content):
        """Extract text using pdfplumber (more robust)"""
        try:
            pdf_file = io.BytesIO(pdf_content)
            text_content = []
            
            with pdfplumber.open(pdf_file) as pdf:
                num_pages = len(pdf.pages)
                
                for page_idx, page in enumerate(pdf.pages):
                    page_text = page.extract_text()
                    if page_text and page_text.strip():
                        text_content.append(f"--- Page {page_idx + 1} of {num_pages} ---\n{page_text}\n")
            
            text = "\n".join(text_content)
            return text, num_pages, "pdfplumber"
        except Exception as e:
            raise Exception(f"pdfplumber failed: {e}")
    
    def extract_text_from_pdf(self, pdf_content):
        """Try multiple methods to extract text from PDF"""
        extraction_methods = [
            ("pdfplumber", self.extract_text_with_pdfplumber),
            ("PyPDF2", self.extract_text_with_pypdf2)
        ]
        
        for method_name, method_func in extraction_methods:
            try:
                text, num_pages, used_method = method_func(pdf_content)
                if text and text.strip():
                    return text, num_pages, used_method
            except Exception as e:
                print(f"  ‚ö† {method_name} extraction failed: {e}")
                continue
        
        # If all methods fail, return failure indicator
        return "[Text extraction failed - PDF may be image-based or corrupted]", 0, "Failed"
    
    def download_and_extract_pdf(self, pdf_info, page_num, pdf_index):
        """Download PDF and extract text content"""
        try:
            print(f"  Downloading: {pdf_info['title'][:60]}...")
            print(f"  URL: {pdf_info['url']}")
            
            # Download PDF with timeout and retries
            max_retries = 3
            pdf_content = None
            
            for attempt in range(max_retries):
                try:
                    response = self.session.get(pdf_info['url'], timeout=60)
                    response.raise_for_status()
                    pdf_content = response.content
                    break
                except Exception as e:
                    if attempt < max_retries - 1:
                        print(f"  Retry {attempt + 1}/{max_retries}...")
                        time.sleep(2)
                    else:
                        raise e
            
            if not pdf_content:
                print(f"  ‚úó Failed to download PDF")
                return False
            
            # Create safe filename
            safe_date = pdf_info['date'].replace('/', '-').replace(' ', '_').replace(',', '')
            safe_title = re.sub(r'[^\w\s-]', '', pdf_info['title'])[:50]
            safe_title = re.sub(r'\s+', '_', safe_title)
            safe_filename = f"page{page_num}_pdf{pdf_index:03d}_{safe_date}_{safe_title}"
            
            # Save PDF file
            pdf_path = os.path.join(self.output_dir, f"{safe_filename}.pdf")
            with open(pdf_path, 'wb') as f:
                f.write(pdf_content)
            print(f"  ‚úì Saved PDF: {safe_filename}.pdf ({pdf_info['size']})")
            
            # Extract text from PDF using multiple methods
            text, num_pages, extraction_method = self.extract_text_from_pdf(pdf_content)
            
            if extraction_method != "Failed":
                print(f"  ‚úì Extracted text using {extraction_method}: {safe_filename}.txt ({num_pages} pages)")
            else:
                print(f"  ‚ö† Text extraction failed - PDF saved but text not extracted")
            
            # Save extracted text
            text_path = os.path.join(self.output_dir, f"{safe_filename}.txt")
            with open(text_path, 'w', encoding='utf-8') as f:
                f.write(f"Source: RBI Press Release\n")
                f.write(f"Webpage Page Number: {page_num}\n")
                f.write(f"PDF Index: {pdf_index}\n")
                f.write(f"Title: {pdf_info['title']}\n")
                f.write(f"Date: {pdf_info['date']}\n")
                f.write(f"File Size: {pdf_info['size']}\n")
                f.write(f"URL: {pdf_info['url']}\n")
                f.write(f"PDF Pages: {num_pages}\n")
                f.write(f"Extraction Method: {extraction_method}\n")
                f.write("="*80 + "\n\n")
                f.write(text)
            
            # Store metadata
            self.pdf_data.append({
                'page': page_num,
                'index': pdf_index,
                'title': pdf_info['title'],
                'date': pdf_info['date'],
                'url': pdf_info['url'],
                'size': pdf_info['size'],
                'filename': safe_filename,
                'text_length': len(text),
                'pdf_pages': num_pages,
                'extraction_method': extraction_method
            })
            
            return True
                
        except Exception as e:
            print(f"  ‚úó Error processing PDF: {e}")
            return False
    
    def get_total_pages(self):
        """Extract total number of pages from pagination"""
        try:
            # Wait for pagination to load
            time.sleep(2)
            
            # Look for pagination links
            pagination_links = self.driver.find_elements(By.CSS_SELECTOR, "a[href*='javascript:__doPostBack']")
            
            page_numbers = []
            for link in pagination_links:
                try:
                    text = link.text.strip()
                    if text.isdigit():
                        page_numbers.append(int(text))
                except:
                    continue
            
            if page_numbers:
                total = max(page_numbers)
                print(f"‚úì Detected {total} pages from pagination links")
                return total
            
            # If no pagination links found, assume single page
            print("‚ö† No pagination found, assuming single page")
            return 1
            
        except Exception as e:
            print(f"Error detecting total pages: {e}")
            return 1
    
    def click_next_page(self):
        """Click the next page button"""
        try:
            # Try to find and click "Next" or ">" button
            # Different possible selectors for next button
            next_selectors = [
                "a[title='Next Page']",
                "input[title='Next Page']",
                "a:contains('Next')",
                "img[alt='Next']"
            ]
            
            for selector in next_selectors:
                try:
                    next_button = self.driver.find_element(By.CSS_SELECTOR, selector)
                    
                    if next_button.is_displayed() and next_button.is_enabled():
                        # Scroll to button
                        self.driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
                        time.sleep(1)
                        
                        # Click
                        next_button.click()
                        time.sleep(3)
                        
                        return True
                except:
                    continue
            
            print("Could not find Next button")
            return False
                
        except Exception as e:
            print(f"Error clicking next page: {e}")
            return False
    
    def click_page_number(self, page_num):
        """Click on a specific page number link"""
        try:
            # Find all links that might be page numbers
            page_links = self.driver.find_elements(By.CSS_SELECTOR, "a[href*='javascript:__doPostBack']")
            
            for link in page_links:
                if link.text.strip() == str(page_num):
                    # Scroll to element
                    self.driver.execute_script("arguments[0].scrollIntoView(true);", link)
                    time.sleep(1)
                    
                    # Click the link
                    link.click()
                    time.sleep(3)
                    
                    return True
            
            return False
            
        except Exception as e:
            print(f"Error clicking page {page_num}: {e}")
            return False
    
    def save_metadata(self):
        """Save metadata to JSON file"""
        metadata_path = os.path.join(self.output_dir, 'metadata.json')
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(self.pdf_data, f, indent=2, ensure_ascii=False)
        print(f"\nMetadata saved to: {metadata_path}")
        
        # Also save a summary CSV
        csv_path = os.path.join(self.output_dir, 'summary.csv')
        with open(csv_path, 'w', encoding='utf-8') as f:
            f.write("Page,Index,Date,Title,Size,PDF_Pages,Extraction_Method,Filename\n")
            for pdf in self.pdf_data:
                f.write(f"{pdf['page']},{pdf['index']},\"{pdf['date']}\",\"{pdf['title']}\",\"{pdf['size']}\",{pdf.get('pdf_pages', 0)},\"{pdf.get('extraction_method', 'Unknown')}\",\"{pdf['filename']}\"\n")
        print(f"Summary CSV saved to: {csv_path}")
    
    def scrape_all_pages(self, max_pages=None):
        """Main method to scrape all pages
        
        Args:
            max_pages: Maximum number of pages to scrape (None = all pages)
        """
        print("="*80)
        print("RBI Press Release PDF Scraper - All Pages")
        print("="*80)
        
        if not self.init_driver():
            print("Failed to initialize browser driver. Exiting.")
            return []
        
        try:
            # Load the main page
            print(f"\nLoading website: {self.base_url}")
            self.driver.get(self.base_url)
            time.sleep(5)
            
            # Get total number of pages
            total_pages = self.get_total_pages()
            
            if max_pages and max_pages < total_pages:
                total_pages = max_pages
                print(f"Limiting scrape to {max_pages} pages")
            
            print(f"\n‚úì Will scrape {total_pages} page(s)")
            print("="*80)
            
            pdf_counter = 0
            
            # Process each page
            for page_num in range(1, total_pages + 1):
                print(f"\n{'='*80}")
                print(f"PROCESSING PAGE {page_num} of {total_pages}")
                print(f"{'='*80}")
                
                # Get current page HTML
                page_html = self.driver.page_source
                
                # Extract PDF information
                pdf_info_list = self.extract_pdf_info_from_html(page_html)
                print(f"\n‚úì Found {len(pdf_info_list)} PDFs on page {page_num}")
                
                # Download and extract each PDF
                for i, pdf_info in enumerate(pdf_info_list, 1):
                    pdf_counter += 1
                    print(f"\n[PDF {i}/{len(pdf_info_list)} on Page {page_num}] (Overall: {pdf_counter})")
                    self.download_and_extract_pdf(pdf_info, page_num, pdf_counter)
                    time.sleep(1)  # Be polite to the server
                
                print(f"\n‚úì Completed page {page_num}")
                
                # Navigate to next page (if not the last page)
                if page_num < total_pages:
                    print(f"\nNavigating to page {page_num + 1}...")
                    
                    # Try clicking specific page number first
                    if not self.click_page_number(page_num + 1):
                        # If that fails, try clicking Next button
                        if not self.click_next_page():
                            print(f"Failed to navigate to page {page_num + 1}. Stopping.")
                            break
                    
                    print(f"‚úì Successfully navigated to page {page_num + 1}")
                
                time.sleep(2)
            
            # Save metadata
            self.save_metadata()
            
        except Exception as e:
            print(f"\nError during scraping: {e}")
            import traceback
            traceback.print_exc()
            
        finally:
            # Close the browser
            print("\nClosing browser...")
            self.driver.quit()
        
        print("\n" + "="*80)
        print("SCRAPING COMPLETED!")
        print("="*80)
        print(f"Total PDFs processed: {len(self.pdf_data)}")
        
        # Count successful extractions
        successful = sum(1 for pdf in self.pdf_data if pdf.get('extraction_method') != 'Failed')
        failed = len(self.pdf_data) - successful
        print(f"Text extracted successfully: {successful}")
        print(f"Text extraction failed: {failed}")
        print(f"Output directory: {os.path.abspath(self.output_dir)}")
        print("="*80)
        
        return self.pdf_data


def main():
    """Run the RBI PDF scraper and print a grouped report of what it returned.

    Prints a banner, creates an ``RBIPDFScraper``, calls
    ``scrape_all_pages(max_pages=None)`` and, when the returned list is
    non-empty, prints totals plus short previews grouped by page, by date and
    by extraction method.

    Returns:
        None. Everything is written to stdout.
    """
    rule = "=" * 80

    def group_by(records, key, fallback):
        """Bucket ``records`` by ``record.get(key, fallback)``, first-seen order."""
        buckets = {}
        for record in records:
            buckets.setdefault(record.get(key, fallback), []).append(record)
        return buckets

    # Banner with the install hint.
    print("\n" + rule)
    print("RBI PDF SCRAPER")
    print(rule)
    print("\nRequired packages:")
    print("  pip install selenium beautifulsoup4 PyPDF2 pdfplumber requests")
    print("\nNote: pdfplumber is used as a fallback when PyPDF2 fails")
    print(rule + "\n")

    scraper = RBIPDFScraper()

    # max_pages=None scrapes all pages; pass a number (e.g. max_pages=5) to limit the run.
    results = scraper.scrape_all_pages(max_pages=None)

    # An empty result means there is nothing to summarise.
    if not results:
        return

    print("\n" + rule)
    print("DETAILED SUMMARY")
    print(rule)

    # Three independent views over the same records.
    pages = group_by(results, 'page', 1)
    dates = group_by(results, 'date', 'Unknown')
    methods = group_by(results, 'extraction_method', 'Unknown')

    print(f"\nTotal Pages Scraped: {len(pages)}")
    print(f"Total Dates: {len(dates)}")
    print(f"Total PDFs: {len(results)}")

    print("\n--- Extraction Methods ---")
    for method, pdfs in methods.items():
        print(f"  {method}: {len(pdfs)} PDFs")

    print("\n--- By Page ---")
    for page_num in sorted(pages.keys()):
        print(f"\nPage {page_num} ({len(pages[page_num])} PDFs):")
        # Preview only the first three entries of each page.
        for pdf in pages[page_num][:3]:
            method_icon = "‚úó" if pdf.get('extraction_method') == 'Failed' else "‚úì"
            print(f"  {method_icon} {pdf['index']:03d}. [{pdf['date']}] {pdf['title'][:55]}...")
        if len(pages[page_num]) > 3:
            print(f"  ... and {len(pages[page_num]) - 3} more")

    print("\n--- By Date ---")
    # NOTE(review): the keys are sorted as plain strings, so "5 most recent" only
    # holds if the date strings happen to sort chronologically - confirm the format.
    for date in sorted(dates.keys(), reverse=True)[:5]:
        print(f"\n{date} ({len(dates[date])} PDFs):")
        for pdf in dates[date][:3]:
            print(f"  ‚Ä¢ {pdf['title'][:65]}...")
        if len(dates[date]) > 3:
            print(f"  ... and {len(dates[date]) - 3} more")

    print("\n" + rule)
    print(f"All files saved in: {os.path.abspath(scraper.output_dir)}")
    print("Files created:")
    print(f"  ‚Ä¢ {len(results)} PDF files")
    print(f"  ‚Ä¢ {len(results)} TXT files (extracted text)")
    print(f"  ‚Ä¢ 1 metadata.json file")
    print(f"  ‚Ä¢ 1 summary.csv file")
    print(rule)


# Entry point: call main() only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


RBI PDF SCRAPER

Required packages:
  pip install selenium beautifulsoup4 PyPDF2 pdfplumber requests

Note: pdfplumber is used as a fallback when PyPDF2 fails

Created directory: rbi_pdfs
RBI Press Release PDF Scraper - All Pages

Loading website: https://rbi.org.in/Scripts/BS_PressreleaseDisplay.aspx
⚠ No pagination found, assuming single page

✓ Will scrape 1 page(s)

PROCESSING PAGE 1 of 1

✓ Found 55 PDFs on page 1

[PDF 1/55 on Page 1] (Overall: 1)
  Downloading: Treasury Bills: Full Auction Result...
  URL: https://rbidocs.rbi.org.in/rdocs/PressRelease/PDFs/PR1456CB0AABB86A474281B8AF568E7E58A7AD.PDF
  ✓ Saved PDF: page1_pdf001_Nov_06_2025_Treasury_Bills_Full_Auction_Result.pdf (284 kb)
  ⚠ pdfplumber extraction failed: pdfplumber failed: No /Root object! - Is this really a PDF?
  ⚠ PyPDF2 extraction failed: PyPDF2 failed: EOF marker not found
  ⚠ Text extraction failed - PDF saved but text not extracted

[PDF 2/55 on Page 1] (Overall: 2)
  Downloading: 91-Day, 182-

In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import time
import re

class RBIScraper:
    """Scrape RBI press-release listings and Treasury Bill tables over HTTP.

    All requests go through one shared ``requests.Session`` that sends a
    browser-like ``User-Agent`` header; responses are parsed with
    BeautifulSoup's ``html.parser``.

    Attributes:
        base_url: Base URL against which press-release hrefs are resolved.
        timeout: Seconds passed as ``timeout`` to every HTTP request.
        session: The shared ``requests.Session``.
    """

    # Class-level default so ``timeout`` is defined even on an instance that
    # was created without running ``__init__``.
    timeout = 30

    def __init__(self, timeout=30):
        """Create the HTTP session.

        Args:
            timeout: Per-request timeout in seconds. Requests made without a
                timeout can block indefinitely on a stalled connection, so one
                is always supplied.
        """
        self.base_url = "https://rbi.org.in/Scripts/"
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def _fetch_soup(self, url):
        """GET ``url`` and return the parsed document.

        Raises:
            Whatever ``session.get`` / ``raise_for_status`` raise (connection
            errors, timeouts, HTTP error statuses); callers handle these.
        """
        response = self.session.get(url, timeout=self.timeout)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')

    def get_press_releases(self, url):
        """Fetch all press releases linked from the listing page.

        Args:
            url: Address of the listing page.

        Returns:
            A list of dicts with keys ``title``, ``url`` (resolved against
            ``base_url``) and ``date`` (see ``_extract_date``). An empty list
            is returned if anything goes wrong; the error is printed.
        """
        try:
            soup = self._fetch_soup(url)

            releases = []
            # A release is any table row holding an <a class="link2"> with an href.
            for row in soup.find_all('tr'):
                link = row.find('a', class_='link2')
                if link and 'href' in link.attrs:
                    releases.append({
                        'title': link.get_text(strip=True),
                        'url': urljoin(self.base_url, link['href']),
                        'date': self._extract_date(row)
                    })

            return releases
        except Exception as e:
            print(f"Error fetching press releases: {e}")
            return []

    def _extract_date(self, row):
        """Return the text of the nearest preceding date header, or None.

        Looks backwards from ``row`` for a ``<td class="tableheader">`` and
        reads the ``<h2 class="dop_header">`` inside it.
        """
        prev = row.find_previous('td', class_='tableheader')
        if prev:
            h2 = prev.find('h2', class_='dop_header')
            if h2:
                return h2.get_text(strip=True)
        return None

    def scrape_table_from_page(self, url):
        """Collect the rows of every ``<table class="tablebg">`` on a detail page.

        Args:
            url: Address of the detail page.

        Returns:
            A flat list of rows (each a list of cell strings) from all matching
            tables, or None if the request or parsing fails (error is printed).
        """
        try:
            soup = self._fetch_soup(url)

            all_data = []
            for table in soup.find_all('table', class_='tablebg'):
                data = self._parse_table(table)
                if data:
                    all_data.extend(data)

            return all_data
        except Exception as e:
            print(f"Error scraping table from {url}: {e}")
            # None (not []) is kept for callers that distinguish failure from "no rows".
            return None

    def _parse_table(self, table):
        """Return the table's non-empty rows as lists of stripped cell text."""
        rows = []
        for tr in table.find_all('tr'):
            row_data = [cell.get_text(strip=True) for cell in tr.find_all(['td', 'th'])]
            # any() is False both for rows without cells and for rows whose
            # cells are all blank, so both kinds are skipped.
            if any(row_data):
                rows.append(row_data)
        return rows

    def scrape_treasury_bills(self, url):
        """Scrape the Treasury Bills table from a press-release page.

        Args:
            url: Address of the press-release page.

        Returns:
            ``{'title': str, 'data': [row, ...]}`` where each row is a dict with
            keys ``Section``, ``Description``, ``91-Day``, ``182-Day`` and
            ``364-Day``. Returns None when the expected tables are missing or
            an error occurs (the error is printed).
        """
        try:
            soup = self._fetch_soup(url)

            # Outer content table; the data table is looked up inside it.
            content_table = soup.find('table', class_='td')
            if not content_table:
                return None

            title_elem = soup.find('h2', class_='dop_header')
            title = title_elem.get_text(strip=True) if title_elem else "Unknown"

            data_table = content_table.find('table', class_='tablebg')
            if not data_table:
                return None

            value_columns = ('91-Day', '182-Day', '364-Day')
            data = []
            for tr in data_table.find_all('tr'):
                cells = tr.find_all('td')
                if len(cells) < 2:
                    continue

                # Header rows are marked with the "head" CSS class on the first cell.
                css_classes = cells[0].get('class')
                if css_classes and 'head' in css_classes:
                    continue

                texts = [cell.get_text(strip=True) for cell in cells]
                if len(texts) == 5:
                    # Section marker, description, then the three values.
                    section, description, *values = texts
                elif len(texts) == 4:
                    # Same layout without the leading section marker.
                    section = ''
                    description, *values = texts
                else:
                    # Any other cell count is not a data row.
                    continue

                if description:
                    row_data = {'Section': section, 'Description': description}
                    row_data.update(zip(value_columns, values))
                    data.append(row_data)

            return {
                'title': title,
                'data': data
            }
        except Exception as e:
            print(f"Error scraping treasury bills: {e}")
            return None

    def save_to_csv(self, data, filename):
        """Write ``data`` to ``filename`` as CSV (UTF-8 with BOM, no index column).

        Failures are printed rather than raised.
        """
        try:
            df = pd.DataFrame(data)
            df.to_csv(filename, index=False, encoding='utf-8-sig')
            print(f"Data saved to {filename}")
        except Exception as e:
            print(f"Error saving to CSV: {e}")

    def save_to_excel(self, data, filename):
        """Write ``data`` to ``filename`` as an Excel workbook via openpyxl.

        Failures are printed rather than raised.
        """
        try:
            df = pd.DataFrame(data)
            df.to_excel(filename, index=False, engine='openpyxl')
            print(f"Data saved to {filename}")
        except Exception as e:
            print(f"Error saving to Excel: {e}")


def main():
    """Fetch the RBI press-release list and export Treasury Bill rows.

    Only the first five releases are examined. For each one whose title
    mentions Treasury Bills / T-Bill, the detail page is scraped and every
    returned row is tagged with the release title, date and URL. The collected
    rows are written to CSV and Excel, and the full release list to CSV.
    """
    main_url = "https://rbi.org.in/Scripts/BS_PressreleaseDisplay.aspx"
    keywords = ('Treasury Bills', 'T-Bill')
    scraper = RBIScraper()

    print("Fetching press releases...")
    releases = scraper.get_press_releases(main_url)
    print(f"Found {len(releases)} press releases")

    treasury_bills_data = []

    # Limited to the first 5 releases for testing.
    for release in releases[:5]:
        print(f"\nProcessing: {release['title']}")

        if not any(keyword in release['title'] for keyword in keywords):
            continue

        # Pause before each detail request to be polite to the server.
        time.sleep(1)

        result = scraper.scrape_treasury_bills(release['url'])
        if not result:
            continue

        # Attach release metadata to each scraped row.
        for row in result['data']:
            row.update({
                'Release_Title': release['title'],
                'Release_Date': release['date'],
                'Release_URL': release['url'],
            })
            treasury_bills_data.append(row)

    if not treasury_bills_data:
        print("\nNo Treasury Bills data found")
    else:
        print(f"\n\nTotal records scraped: {len(treasury_bills_data)}")
        scraper.save_to_csv(treasury_bills_data, 'rbi_treasury_bills.csv')
        scraper.save_to_excel(treasury_bills_data, 'rbi_treasury_bills.xlsx')

    # The full listing is saved regardless of whether any table was scraped.
    if releases:
        scraper.save_to_csv(releases, 'rbi_press_releases_list.csv')


# Entry point: call main() only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

Fetching press releases...
Found 55 press releases

Processing: Treasury Bills: Full Auction Result

Processing: 91-Day, 182-Day and 364-Day T-Bill Auction Result: Cut-off

Processing: Underwriting Auction for sale of Government Security for ₹32,000 crore on November 07, 2025

Processing: Money Market Operations as on November 05, 2025

Processing: Money Market Operations as on November 04, 2025


Total records scraped: 22
Data saved to rbi_treasury_bills.csv
Data saved to rbi_treasury_bills.xlsx
Data saved to rbi_press_releases_list.csv


In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import time
import re
import os

class RBIScraper:
    """Collect RBI press-release links and dump the tables on each detail page.

    Requests are made through a single ``requests.Session`` carrying a
    browser-like ``User-Agent`` header and a 30-second timeout; HTML is parsed
    with BeautifulSoup's ``html.parser``.
    """

    def __init__(self):
        """Set the base URL and open the shared HTTP session."""
        self.base_url = "https://rbi.org.in/Scripts/"
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def get_press_release_links(self, main_url):
        """Return every press-release link found on the listing page.

        Rows are walked in document order. A row containing
        ``<td class="tableheader"><h2 class="dop_header">`` updates the current
        date; a row containing ``<a class="link2" href=...>`` yields a release
        stamped with that date.

        NOTE(review): the run recorded below this cell printed ``Date: None``
        for the releases shown, so these header selectors may not match the
        live markup - verify against the page.

        Returns:
            A list of dicts with keys ``title``, ``date``, ``url`` and
            ``relative_url``; an empty list if any error occurs (it is printed).
        """
        try:
            print(f"Fetching main page: {main_url}")
            response = self.session.get(main_url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            releases = []
            current_date = None

            for row in soup.find_all('tr'):
                # Date-header row: remember the date and move to the next row.
                header_cell = row.find('td', class_='tableheader')
                heading = header_cell.find('h2', class_='dop_header') if header_cell else None
                if heading:
                    current_date = heading.get_text(strip=True)
                    continue

                # Anything without a usable link is not a release row.
                anchor = row.find('a', class_='link2')
                if not anchor or 'href' not in anchor.attrs:
                    continue

                relative_url = anchor['href']
                releases.append({
                    'title': anchor.get_text(strip=True),
                    'date': current_date,
                    'url': urljoin(self.base_url, relative_url),
                    'relative_url': relative_url
                })

            print(f"Found {len(releases)} press releases")
            return releases

        except Exception as e:
            print(f"Error fetching press releases: {e}")
            return []

    def scrape_detail_page_tables(self, url, title):
        """Extract every non-empty ``<table>`` from a detail page.

        Args:
            url: Address of the detail page.
            title: Release title, used only in the progress message.

        Returns:
            A list of ``{'table_index': int, 'data': rows}`` dicts, where
            ``table_index`` is the table's position among all tables found on
            the page. None when the page has no tables, no table yields data,
            or an error occurs (it is printed).
        """
        try:
            print(f"\n  Fetching detail page: {title[:60]}...")
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            tables = soup.find_all('table')
            if not tables:
                print(f"    No tables found")
                return None

            # Keep only tables that produced at least one row, remembering
            # their original position.
            all_tables_data = [
                {'table_index': idx, 'data': table_data}
                for idx, table_data in enumerate(map(self._extract_table_data, tables))
                if table_data
            ]

            if not all_tables_data:
                print(f"    No valid table data extracted")
                return None

            print(f"    Found {len(all_tables_data)} table(s)")
            return all_tables_data

        except Exception as e:
            print(f"    Error scraping detail page: {e}")
            return None

    def _extract_table_data(self, table):
        """Return the table's non-empty rows as lists of whitespace-normalised text."""
        rows = []
        for tr in table.find_all('tr'):
            # Collapse any run of whitespace (including newlines) to one space.
            row_data = [
                ' '.join(cell.get_text(strip=True).split())
                for cell in tr.find_all(['td', 'th'])
            ]
            # Skips rows with no cells as well as rows whose cells are all blank.
            if any(row_data):
                rows.append(row_data)
        return rows

    def save_table_to_csv(self, table_data, filename):
        """Write rows to ``filename`` as a header-less CSV (UTF-8 with BOM).

        Short rows are right-padded with empty strings to the widest row.

        Returns:
            True when the file was written; False for empty input or on any
            error (which is printed).
        """
        try:
            if not table_data:
                return False

            max_cols = max(map(len, table_data))
            padded_data = [row + [''] * (max_cols - len(row)) for row in table_data]

            pd.DataFrame(padded_data).to_csv(
                filename, index=False, header=False, encoding='utf-8-sig'
            )
            print(f"    Saved: {filename}")
            return True

        except Exception as e:
            print(f"    Error saving CSV: {e}")
            return False

    def clean_filename(self, title):
        """Turn a title into a filename stem.

        Drops every character that is not a word character, whitespace or
        hyphen, replaces whitespace runs with ``_`` and truncates to 100
        characters.
        """
        without_specials = re.sub(r'[^\w\s-]', '', title)
        return re.sub(r'\s+', '_', without_specials)[:100]


def main():
    """Scrape every table from every RBI press release into CSV files.

    Fetches the press-release list, visits each detail page, and writes each
    extracted table to ``rbi_scraped_data/`` as its own CSV. A summary is
    printed at the end.

    The summary counts are per press release: a release is "successful" when at
    least one of its tables was saved and "failed" otherwise, so the two always
    add up to the number of releases processed. The number of CSV files written
    is reported separately.
    """
    output_dir = "rbi_scraped_data"
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    scraper = RBIScraper()

    # Main press release page URL
    main_url = "https://rbi.org.in/Scripts/BS_PressreleaseDisplay.aspx"

    print("="*80)
    print("RBI Press Release Scraper")
    print("="*80)

    # Step 1: Get all press release links
    releases = scraper.get_press_release_links(main_url)

    if not releases:
        print("\nNo press releases found. Exiting.")
        return

    print(f"\nStarting to scrape {len(releases)} press releases...")
    print("="*80)

    # Step 2: Visit each link and scrape tables
    successful_scrapes = 0   # releases with at least one saved table
    failed_scrapes = 0       # releases with no saved table
    csv_files_written = 0    # individual CSV files across all releases

    for i, release in enumerate(releases, 1):
        print(f"\n[{i}/{len(releases)}] {release['title'][:70]}")
        print(f"  Date: {release['date']}")
        print(f"  URL: {release['url']}")

        tables = scraper.scrape_detail_page_tables(release['url'], release['title'])

        saved_for_release = 0
        if tables:
            clean_title = scraper.clean_filename(release['title'])

            for table_info in tables:
                table_idx = table_info['table_index']
                table_data = table_info['data']

                # Only disambiguate with a table suffix when there are several tables.
                if len(tables) > 1:
                    filename = f"{output_dir}/{i:03d}_{clean_title}_table_{table_idx+1}.csv"
                else:
                    filename = f"{output_dir}/{i:03d}_{clean_title}.csv"

                if scraper.save_table_to_csv(table_data, filename):
                    saved_for_release += 1

        # Tables that were found but could not be saved count as a failure too.
        if saved_for_release:
            successful_scrapes += 1
            csv_files_written += saved_for_release
        else:
            failed_scrapes += 1

        # Be polite to the server; no need to wait after the final request.
        if i < len(releases):
            time.sleep(1)

    # Summary
    print("\n" + "="*80)
    print("SCRAPING COMPLETE")
    print("="*80)
    print(f"Total press releases processed: {len(releases)}")
    print(f"Successful scrapes: {successful_scrapes}")
    print(f"Failed scrapes: {failed_scrapes}")
    print(f"CSV files written: {csv_files_written}")
    print(f"\nAll CSV files saved in: {output_dir}/")
    print("="*80)


# Entry point: call main() only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

RBI Press Release Scraper
Fetching main page: https://rbi.org.in/Scripts/BS_PressreleaseDisplay.aspx
Found 55 press releases

Starting to scrape 55 press releases...

[1/55] Treasury Bills: Full Auction Result
  Date: None
  URL: https://rbi.org.in/Scripts/BS_PressReleaseDisplay.aspx?prid=61572

  Fetching detail page: Treasury Bills: Full Auction Result...
    Found 4 table(s)
    Saved: rbi_scraped_data/001_Treasury_Bills_Full_Auction_Result_table_1.csv
    Saved: rbi_scraped_data/001_Treasury_Bills_Full_Auction_Result_table_2.csv
    Saved: rbi_scraped_data/001_Treasury_Bills_Full_Auction_Result_table_3.csv
    Saved: rbi_scraped_data/001_Treasury_Bills_Full_Auction_Result_table_4.csv

[2/55] 91-Day, 182-Day and 364-Day T-Bill Auction Result: Cut-off
  Date: None
  URL: https://rbi.org.in/Scripts/BS_PressReleaseDisplay.aspx?prid=61571

  Fetching detail page: 91-Day, 182-Day and 364-Day T-Bill Auction Result: Cut-off...
    Found 4 table(s)
    Saved: rbi_scraped_data/002_91-Day_182