In [None]:
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from datetime import datetime, timedelta

In [7]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import time

# Configure the Firefox session used by every scraping helper in this notebook
options = Options()
# Uncomment the next line to run in headless mode (no browser window)
# options.add_argument("--headless")
# NOTE(review): the two switches below look like Chromium-style flags —
# confirm Firefox actually honors them, otherwise they can be dropped.
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")

# Start the browser; `driver` is a module-level global that the functions
# defined in later cells read directly.
driver = webdriver.Firefox(options=options)
driver.maximize_window()

# Navigate to the PIB Archives page
url = "https://archive.pib.gov.in/archive2/erelease.aspx"
driver.get(url)
# Fixed pause to give the page time to render. Nothing here checks that the
# page really finished loading, so the message below is printed regardless.
time.sleep(5)

print("Browser initialized and page loaded successfully!")

Browser initialized and page loaded successfully!


In [8]:
from bs4 import BeautifulSoup

def extract_headlines_from_current_page():
    """Return the headline texts on the page currently loaded in ``driver``.

    The page is snapshotted through ``driver.page_source`` and parsed with
    BeautifulSoup; every ``<li class="rel rel-list">`` element is treated as
    one headline.

    Returns:
        list[str]: Non-empty headline strings in document order, with runs of
        whitespace inside each headline collapsed to a single space.
    """
    soup = BeautifulSoup(driver.page_source, "html.parser")

    headlines = []
    for item in soup.find_all('li', class_='rel rel-list'):
        # get_text(strip=True) alone joins the text of adjacent child nodes
        # with "" (which can glue words together); passing a separator keeps
        # them apart, and split()/join() collapses newlines, tabs and
        # repeated spaces so each headline is a clean single line.
        text = " ".join(item.get_text(" ", strip=True).split())
        if text:
            headlines.append(text)

    return headlines

# Smoke-test the extractor against whatever page the browser is showing now
current_headlines = extract_headlines_from_current_page()
print(f"Found {len(current_headlines)} headlines on current page:")
for position, text in enumerate(current_headlines, start=1):
    print(f"{position}. {text}")

Found 4 headlines on current page:
1. Prime Minister congratulates the Indian contingent at 2025 Asian Athletics Championships for their stupendous performance
2. Prime Minister greets the people of Telangana on their Statehood Day
3. Government NotifiesGuidelines for Scheme to Promote Manufacturing of Electric Passenger Cars in India
4. Government Notifies Guidelines for Scheme to Promote Manufacturing of Electric Passenger Cars in India


In [10]:
def change_date_and_extract(day, month, year):
    """Pick a date in the archive's dropdowns and scrape that day's headlines.

    Args:
        day: Day of month, as an int or a numeric string.
        month: Month number, as an int or a numeric string.
        year: Four-digit year, as an int or a numeric string.

    Returns:
        dict | None: On success a dict with
            ``'date'``           - ISO-style ``YYYY-MM-DD`` string,
            ``'formatted_date'`` - text of the ``#relhead`` element, or
                                   ``"day-month-year"`` if it cannot be read,
            ``'headlines'``      - list returned by
                                   ``extract_headlines_from_current_page()``.
        ``None`` if anything fails; the error is printed, not raised.
    """
    try:
        # Normalise up front so the dropdown values ("2", not "02") and the
        # zero-padded date string below work for ints and numeric strings.
        day, month, year = int(day), int(month), int(year)

        wait = WebDriverWait(driver, 10)

        # Only the day dropdown is explicitly waited for; month and year are
        # looked up immediately afterwards.
        day_dropdown = wait.until(EC.element_to_be_clickable((By.ID, "rdateID")))
        Select(day_dropdown).select_by_value(str(day))
        Select(driver.find_element(By.ID, "rmonthID")).select_by_value(str(month))
        Select(driver.find_element(By.ID, "ryearID")).select_by_value(str(year))

        # NOTE(review): fixed pause rather than a wait on page content. If the
        # site responds more slowly than this, the headlines read below may
        # still belong to the previously selected date — consider waiting on
        # the #relhead text instead.
        time.sleep(3)

        headlines = extract_headlines_from_current_page()

        # The on-page heading is optional; fall back to a plain date label.
        # `Exception` (not a bare except) so Ctrl-C still interrupts the run.
        try:
            date_header = driver.find_element(By.ID, "relhead").text
        except Exception:
            date_header = f"{day}-{month}-{year}"

        return {
            'date': f"{year}-{month:02d}-{day:02d}",
            'formatted_date': date_header,
            'headlines': headlines
        }

    except Exception as e:
        print(f"Error extracting data for {day}-{month}-{year}: {str(e)}")
        return None

# Spot-check one hard-coded date (2 June 2025) and preview up to three headlines
test_result = change_date_and_extract(2, 6, 2025)
if not test_result:
    print("Failed to extract test data")
else:
    print(f"Date: {test_result['formatted_date']}")
    print(f"Found {len(test_result['headlines'])} headlines")
    for preview_line in test_result['headlines'][:3]:
        print(f"- {preview_line}")

Date: English Release 2-June 2025
Found 4 headlines
- Prime Minister congratulates the Indian contingent at 2025 Asian Athletics Championships for their stupendous performance
- Prime Minister greets the people of Telangana on their Statehood Day
- Government NotifiesGuidelines for Scheme to Promote Manufacturing of Electric Passenger Cars in India


In [11]:
def extract_headlines_date_range(start_date, end_date, output_file='pib_headlines.csv'):
    """Scrape every date from start_date to end_date (inclusive) into a CSV.

    Each date is fetched with ``change_date_and_extract``; every headline
    becomes one row with ``date``, ``formatted_date`` and ``headline``
    columns. Dates that return nothing (or fail) are reported and skipped.

    Args:
        start_date: First date to scrape (needs ``.day/.month/.year``).
        end_date: Last date to scrape, inclusive.
        output_file: Path of the CSV written when at least one row exists.

    Returns:
        pandas.DataFrame with all rows, or ``None`` when nothing was scraped
        (in which case no file is written).
    """

    def _days_inclusive(first, last):
        # Step one calendar day at a time; yields nothing if first > last.
        cursor = first
        while cursor <= last:
            yield cursor
            cursor += timedelta(days=1)

    rows = []

    print(f"Starting extraction from {start_date} to {end_date}")
    print(f"Output will be saved to: {output_file}")

    for when in _days_inclusive(start_date, end_date):
        day, month, year = when.day, when.month, when.year

        print(f"\nProcessing: {when.strftime('%Y-%m-%d')}")

        scraped = change_date_and_extract(day, month, year)
        found = scraped['headlines'] if scraped else None

        if found:
            rows.extend(
                {
                    'date': scraped['date'],
                    'formatted_date': scraped['formatted_date'],
                    'headline': title,
                }
                for title in found
            )
            print(f"  ✓ Found {len(found)} headlines")
        else:
            print(f"  ✗ No headlines found or error occurred")

        # Pause between dates to go easy on the server
        time.sleep(1)

    # Nothing collected: report it and skip writing a file
    if not rows:
        print("\n❌ No data was extracted")
        return None

    df = pd.DataFrame(rows)
    df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"\n🎉 Successfully saved {len(rows)} headlines to {output_file}")
    print(f"📊 Data covers {len(df['date'].unique())} unique dates")
    return df

# Extract headlines for one calendar year (defaults to 2025)
def extract_current_year_headlines(year=2025):
    """Scrape headlines from 1 January of ``year`` up to today.

    The end of the range is today's date at midnight, capped at 31 December
    of ``year``. The cap stops releases from a following year being written
    into the per-year CSV when this is run after ``year`` has ended.

    Args:
        year: Calendar year to scrape. Defaults to 2025, matching the
            original hard-coded behaviour.

    Returns:
        Whatever ``extract_headlines_date_range`` returns for that range; the
        CSV is written to ``pib_headlines_<year>.csv``.
    """
    start_date = datetime(year, 1, 1)
    today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
    end_date = min(today, datetime(year, 12, 31))

    return extract_headlines_date_range(start_date, end_date, f'pib_headlines_{year}.csv')

# Confirmation that the cell's definitions were evaluated
print(
    "Functions defined successfully!",
    "Ready to extract headlines from date ranges.",
    sep="\n",
)

Functions defined successfully!
Ready to extract headlines from date ranges.


In [13]:
# Trial run: today at midnight and the 7 days before it (both ends included)
print("Testing with past 7 days...")

end_date = datetime.combine(datetime.now().date(), datetime.min.time())
start_date = end_date - timedelta(days=7)

test_df = extract_headlines_date_range(start_date, end_date, 'pib_headlines_test.csv')

# A None result means nothing was scraped, so there is nothing to summarise
if test_df is not None:
    print("\n📋 Sample of extracted data:")
    print(test_df.head(10))
    print(
        f"\n📈 Summary:",
        f"Total headlines: {len(test_df)}",
        f"Date range: {test_df['date'].min()} to {test_df['date'].max()}",
        f"Unique dates: {test_df['date'].nunique()}",
        sep="\n",
    )

# Probe a handful of individual dates to see which ones have press releases
print("Testing individual dates to find ones with press releases...")


def _probe_date(label, day, month, year):
    """Scrape one date, print a one-line verdict, and return the raw result.

    Args:
        label: Human-readable date used in the progress message.
        day, month, year: Passed straight to ``change_date_and_extract``.

    Returns:
        The dict (or ``None``) returned by ``change_date_and_extract``.
    """
    print(f"\nTesting {label}:")
    result = change_date_and_extract(day, month, year)
    # Same success test as the date-range extractor: a result with an empty
    # headline list counts as "nothing found", not as a hit.
    if result and result['headlines']:
        print(f"✓ Found {len(result['headlines'])} headlines")
    else:
        print("✗ No headlines found")
    return result


# The result_* names are kept as-is: the CSV-export cell looks them up by name.
result_june2 = _probe_date("June 2, 2025", 2, 6, 2025)  # known to have headlines
result_june1 = _probe_date("June 1, 2025", 1, 6, 2025)
result_may31 = _probe_date("May 31, 2025", 31, 5, 2025)
result_may30 = _probe_date("May 30, 2025", 30, 5, 2025)

Testing with past 7 days...
Starting extraction from 2025-06-03 00:00:00 to 2025-06-10 00:00:00
Output will be saved to: pib_headlines_test.csv

Processing: 2025-06-03
  ✗ No headlines found or error occurred

Processing: 2025-06-04
  ✗ No headlines found or error occurred

Processing: 2025-06-05
  ✗ No headlines found or error occurred

Processing: 2025-06-06
  ✗ No headlines found or error occurred

Processing: 2025-06-07
  ✗ No headlines found or error occurred

Processing: 2025-06-08
  ✗ No headlines found or error occurred

Processing: 2025-06-09
  ✗ No headlines found or error occurred

Processing: 2025-06-10
  ✗ No headlines found or error occurred

❌ No data was extracted
Testing individual dates to find ones with press releases...

Testing June 2, 2025:
✓ Found 4 headlines

Testing June 1, 2025:
✓ Found 16 headlines

Testing May 31, 2025:
✓ Found 15 headlines

Testing May 30, 2025:
✓ Found 39 headlines


In [14]:
# Flatten the individually probed dates into one table and save it as CSV
successful_extractions = []

# Results are looked up by name so this cell still runs when some probes were
# never executed; missing or empty results are simply skipped.
_namespace = globals()
for _result_name in ('result_june2', 'result_june1', 'result_may31', 'result_may30'):
    _probe_result = _namespace.get(_result_name)
    if not _probe_result:
        continue
    # One output row per headline, repeating the date columns
    for headline in _probe_result['headlines']:
        successful_extractions.append({
            'date': _probe_result['date'],
            'formatted_date': _probe_result['formatted_date'],
            'headline': headline
        })

# Save to CSV if we have data
if successful_extractions:
    df_sample = pd.DataFrame(successful_extractions)
    df_sample.to_csv('pib_headlines_sample.csv', index=False, encoding='utf-8')
    print(f"\n🎉 Successfully saved {len(successful_extractions)} headlines to pib_headlines_sample.csv")
    print(f"📊 Data covers {len(df_sample['date'].unique())} unique dates")

    # Show sample data
    print("\n📋 Sample of extracted data:")
    print(df_sample.head(10))
else:
    print("\n❌ No successful extractions to save")


🎉 Successfully saved 74 headlines to pib_headlines_sample.csv
📊 Data covers 4 unique dates

📋 Sample of extracted data:
         date               formatted_date  \
0  2025-06-02  English Release 2-June 2025   
1  2025-06-02  English Release 2-June 2025   
2  2025-06-02  English Release 2-June 2025   
3  2025-06-02  English Release 2-June 2025   
4  2025-06-01  English Release 1-June 2025   
5  2025-06-01  English Release 1-June 2025   
6  2025-06-01  English Release 1-June 2025   
7  2025-06-01  English Release 1-June 2025   
8  2025-06-01  English Release 1-June 2025   
9  2025-06-01  English Release 1-June 2025   

                                            headline  
0  Prime Minister congratulates the Indian contin...  
1  Prime Minister greets the people of Telangana ...  
2  Government NotifiesGuidelines for Scheme to Pr...  
3  Government Notifies Guidelines for Scheme to P...  
4  PM to participate in International Air Transpo...  
5  Prime Minister meets the President of A

In [15]:
# Full run via extract_current_year_headlines(): every date from 1 January 2025 on.
# WARNING: This will take a long time and make many requests to the server

print(
    "\n🚀 Starting extraction for entire year 2025...",
    "This will take several minutes. Please be patient.",
    sep="\n",
)

full_year_df = extract_current_year_headlines()

# None means nothing was scraped; otherwise report what was written
if full_year_df is not None:
    print(
        "\n🎊 Full year extraction completed!",
        f"Total headlines for 2025: {len(full_year_df)}",
        f"Files created: pib_headlines_2025.csv",
        sep="\n",
    )


🚀 Starting extraction for entire year 2025...
This will take several minutes. Please be patient.
Starting extraction from 2025-01-01 00:00:00 to 2025-06-10 00:00:00
Output will be saved to: pib_headlines_2025.csv

Processing: 2025-01-01
  ✓ Found 36 headlines
  ✓ Found 36 headlines

Processing: 2025-01-02

Processing: 2025-01-02
  ✓ Found 26 headlines
  ✓ Found 26 headlines

Processing: 2025-01-03

Processing: 2025-01-03
  ✓ Found 45 headlines
  ✓ Found 45 headlines

Processing: 2025-01-04

Processing: 2025-01-04
  ✓ Found 28 headlines
  ✓ Found 28 headlines

Processing: 2025-01-05

Processing: 2025-01-05
  ✓ Found 21 headlines
  ✓ Found 21 headlines

Processing: 2025-01-06

Processing: 2025-01-06
  ✓ Found 38 headlines
  ✓ Found 38 headlines

Processing: 2025-01-07

Processing: 2025-01-07
  ✓ Found 36 headlines
  ✓ Found 36 headlines

Processing: 2025-01-08

Processing: 2025-01-08
  ✓ Found 37 headlines
  ✓ Found 37 headlines

Processing: 2025-01-09

Processing: 2025-01-09
  ✓ Found 

In [19]:
# Shut the browser down once scraping is finished
def cleanup():
    """Quit the Selenium session held in the global ``driver``.

    Best-effort: any ordinary error raised while quitting (including
    ``driver`` not being defined) is reported with its message instead of
    being propagated. KeyboardInterrupt and SystemExit are not intercepted.

    Returns:
        None
    """
    try:
        driver.quit()
        print("✅ Browser closed successfully")
    except Exception as exc:
        # `Exception` rather than a bare except, so Ctrl-C is never swallowed;
        # the message now says what actually went wrong.
        print(f"⚠️ Browser was already closed or error occurred: {exc}")


cleanup()

✅ Browser closed successfully
