In [1]:
!pip install selenium webdriver-manager pandas



In [2]:
# Cell 2: Imports and Basic Setup
import logging
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException

# Logging setup: timestamped, level-tagged records at INFO and above
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logging.info("Libraries imported and logging configured.")

# Page the WebDriver is pointed at in the navigation cell
TARGET_URL = "https://www.myscheme.gov.in/search"
print(f"Target URL set to: {TARGET_URL}")

2025-04-22 23:02:44,556 - INFO - Libraries imported and logging configured.


Target URL set to: https://www.myscheme.gov.in/search


In [3]:
# Cell 3: Initialize WebDriver and Navigate
# Starts a Chrome session through webdriver-manager and opens TARGET_URL.
# On any failure the browser is closed and `driver` is reset to None, so later
# cells never operate on a handle whose session has already been terminated.
logging.info("Initializing WebDriver...")
driver = None # Stays None unless Chrome is started successfully
try:
    # Setup Chrome options
    chrome_options = Options()
    # chrome_options.add_argument("--headless") # Uncomment for headless mode later if needed
    chrome_options.add_argument("--start-maximized") # Start maximized to capture full view
    chrome_options.add_argument("--disable-gpu") # Often recommended, especially for headless
    chrome_options.add_argument("--no-sandbox") # Bypass OS security model, sometimes needed
    chrome_options.add_argument("--disable-dev-shm-usage") # Overcome limited resource problems in some environments

    # Use webdriver-manager to automatically handle driver setup
    logging.info("Setting up ChromeDriver using WebDriverManager...")
    service = Service(ChromeDriverManager().install())
    logging.info("ChromeDriver setup complete.")

    # Initialize the Chrome driver
    driver = webdriver.Chrome(service=service, options=chrome_options)
    logging.info("WebDriver initialized successfully.")

    # Navigate to the target URL
    logging.info(f"Navigating to {TARGET_URL}...")
    driver.get(TARGET_URL)

    # Fixed pause for the initial render; the extraction cell performs its own
    # explicit WebDriverWait for the scheme cards before reading anything.
    wait_time = 10 # seconds
    logging.info(f"Waiting {wait_time} seconds for page elements to load...")
    time.sleep(wait_time)

    logging.info(f"Successfully navigated to {TARGET_URL} and waited.")
    print("WebDriver initialized and navigated to the target URL.")
    print(f"Current page title: '{driver.title}'") # Verify page load

except Exception as e:
    logging.error(f"Error initializing WebDriver or navigating: {e}", exc_info=True)
    print(f"An error occurred during WebDriver setup or navigation: {e}")
    # Ensure the browser is closed if it was started before the error occurred
    if driver is not None:
        try:
            driver.quit()
        except Exception as quit_error:
            # Best-effort cleanup: a failing quit must not mask the original error
            logging.warning(f"Could not cleanly quit WebDriver: {quit_error}")
        finally:
            # Drop the dead handle so downstream cells fail fast instead of
            # talking to a closed session
            driver = None

2025-04-22 23:03:34,241 - INFO - Initializing WebDriver...
2025-04-22 23:03:34,243 - INFO - Setting up ChromeDriver using WebDriverManager...
2025-04-22 23:03:38,773 - INFO - Get LATEST chromedriver version for google-chrome
2025-04-22 23:03:39,567 - INFO - Get LATEST chromedriver version for google-chrome
2025-04-22 23:03:39,997 - INFO - Get LATEST chromedriver version for google-chrome
2025-04-22 23:03:40,507 - INFO - WebDriver version 135.0.7049.95 selected
2025-04-22 23:03:40,512 - INFO - Modern chrome version https://storage.googleapis.com/chrome-for-testing-public/135.0.7049.95/win32/chromedriver-win32.zip
2025-04-22 23:03:40,513 - INFO - About to download new driver from https://storage.googleapis.com/chrome-for-testing-public/135.0.7049.95/win32/chromedriver-win32.zip
2025-04-22 23:03:40,848 - INFO - Driver downloading response is 200
2025-04-22 23:03:43,462 - INFO - Get LATEST chromedriver version for google-chrome
2025-04-22 23:03:44,149 - INFO - Driver has been saved in cach

WebDriver initialized and navigated to the target URL.
Current page title: 'Search Schemes'


In [4]:
# Cell 4 (Revised): Locate and Extract Scheme Data (First Page)
# Waits for the scheme cards on the currently loaded page, then builds one dict
# per card and collects them in `scheme_data_list` (reset on every run).

logging.info("Attempting to locate scheme elements with refined selectors...")

# --- Refined Locators based on provided HTML ---
scheme_card_selector = "div.mx-auto.rounded-xl.shadow-md" # Selector for the whole scheme card/box
scheme_link_selector = "h2 > a"                      # Selector for the link element (<a>) containing name and href
ministry_state_selector = "h2.mt-3.font-normal"       # Selector for the ministry/state h2 tag
description_selector = "span.line-clamp-2 > span"    # Selector for the short description span
tag_selector = "div[title] > span"                   # Selector for individual category tag spans
# --- ---

# Re-initialize list for this run, in case it was run before
scheme_data_list = []
max_wait_time = 20

try:
    # Wait for the scheme cards to be present
    wait = WebDriverWait(driver, max_wait_time)
    logging.info(f"Waiting up to {max_wait_time} seconds for scheme cards using selector: '{scheme_card_selector}'")
    scheme_cards = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, scheme_card_selector)))
    logging.info(f"Found {len(scheme_cards)} potential scheme cards on the current page.")

    if not scheme_cards:
        logging.warning("No scheme cards found using the refined selector. Check page or selector again.")
        print("Could not find any scheme cards on the page with the refined selector.")
    else:
        logging.info("Iterating through scheme cards and extracting data...")
        cards_processed = 0
        for card_index, card in enumerate(scheme_cards):
            scheme_info = {} # Dictionary for current scheme data
            try:
                # Extract Scheme Link and Name (required: a card without a link is skipped
                # via the NoSuchElementException handler below)
                link_element = card.find_element(By.CSS_SELECTOR, scheme_link_selector)
                href_value = link_element.get_attribute('href')
                # Selenium typically returns the browser-resolved absolute URL for href,
                # so the "keep as is" branch is the common path; the relative-path
                # branch is kept as a guard in case a raw "/schemes/..." value comes back.
                if href_value and href_value.startswith('/'):
                    scheme_info["Source URL"] = f"https://www.myscheme.gov.in{href_value}"
                elif href_value:
                    scheme_info["Source URL"] = href_value
                else:
                    scheme_info["Source URL"] = "N/A"

                scheme_info["Scheme Name"] = link_element.text.strip()

                # Extract Ministry/State (optional)
                try:
                    ministry_element = card.find_element(By.CSS_SELECTOR, ministry_state_selector)
                    scheme_info["Ministry"] = ministry_element.text.strip()
                except NoSuchElementException:
                    logging.debug(f"Ministry/State element not found in card {card_index} for scheme: '{scheme_info.get('Scheme Name', 'Unknown')}'")
                    scheme_info["Ministry"] = "N/A"

                # Extract Short Description (optional)
                try:
                    desc_element = card.find_element(By.CSS_SELECTOR, description_selector)
                    scheme_info["Description"] = desc_element.text.strip()
                except NoSuchElementException:
                    logging.debug(f"Description element not found in card {card_index} for scheme: '{scheme_info.get('Scheme Name', 'Unknown')}'")
                    scheme_info["Description"] = "N/A" # Keep it short as requested originally

                # Extract Category Tags.
                # find_elements returns an empty list (it does not raise) when nothing
                # matches, so no NoSuchElementException handler is needed here.
                # Each `.text` access is a round trip to the browser, so read it once
                # per tag and filter afterwards.
                tag_texts = (tag.text.strip() for tag in card.find_elements(By.CSS_SELECTOR, tag_selector))
                tags = [tag_text for tag_text in tag_texts if tag_text]
                if not tags:
                    logging.debug(f"Tags elements not found in card {card_index} for scheme: '{scheme_info.get('Scheme Name', 'Unknown')}'")
                scheme_info["Category"] = ", ".join(tags) if tags else "N/A"

                # Add placeholders for fields not available on search page (to be filled later if needed)
                scheme_info["Eligibility"] = "Details on Source URL" # Placeholder
                scheme_info["Benefits"] = "Details on Source URL" # Placeholder
                scheme_info["Application Process"] = "Details on Source URL" # Placeholder
                scheme_info["Documents"] = "Details on Source URL" # Placeholder
                scheme_info["Last Updated"] = "N/A" # Placeholder

                # Add to list only if a valid scheme name was extracted
                if scheme_info.get("Scheme Name"):
                    scheme_data_list.append(scheme_info)
                    logging.info(f"Extracted: '{scheme_info['Scheme Name']}' (URL: {scheme_info['Source URL']})")
                    cards_processed += 1
                else:
                    logging.warning(f"Card {card_index} processed, but scheme name was empty.")

            except StaleElementReferenceException:
                logging.warning(f"Stale element reference encountered for card {card_index}. The page might have updated dynamically. Skipping this card.")
            except NoSuchElementException as e:
                # Raised by the required link lookup at the top of the try block
                logging.warning(f"Could not find a required element in card {card_index}. Error: {e}. Skipping card.")
            except Exception as e:
                logging.error(f"An unexpected error occurred processing card {card_index} for scheme '{scheme_info.get('Scheme Name', 'Unknown')}': {e}", exc_info=True)

        logging.info(f"Successfully processed {cards_processed} schemes out of {len(scheme_cards)} cards found on the first page.")
        print(f"Successfully extracted initial data for {len(scheme_data_list)} schemes from the first page.")

        # Display the first extracted scheme's details for verification
        if scheme_data_list:
            print("\n--- First Scheme Extracted (Example) ---")
            first_scheme = scheme_data_list[0]
            for key, value in first_scheme.items():
                 # Truncate long descriptions/categories for cleaner printing here
                 if isinstance(value, str) and len(value) > 70:
                     print(f"- {key}: {value[:70]}...")
                 else:
                     print(f"- {key}: {value}")
            print("--- End Example ---")

except TimeoutException:
    logging.error(f"Timed out waiting for scheme cards ('{scheme_card_selector}') to appear. Check selector and page load state.")
    print(f"Error: Timed out waiting for scheme elements ({scheme_card_selector}) to load. Please verify the selector and ensure the page content is visible.")
except Exception as e:
    logging.error(f"A critical error occurred during scheme extraction: {e}", exc_info=True)
    print(f"A critical error occurred: {e}")

2025-04-22 23:07:48,829 - INFO - Attempting to locate scheme elements with refined selectors...
2025-04-22 23:07:48,834 - INFO - Waiting up to 20 seconds for scheme cards using selector: 'div.mx-auto.rounded-xl.shadow-md'
2025-04-22 23:07:48,891 - INFO - Found 10 potential scheme cards on the current page.
2025-04-22 23:07:48,893 - INFO - Iterating through scheme cards and extracting data...
2025-04-22 23:07:49,457 - INFO - Extracted: 'Financial Assistance To Disabled Students Pursuing (10th, 11th, 12th Equivalent Exams)' (URL: https://www.myscheme.gov.in/schemes/fadsp1012e)
2025-04-22 23:07:50,085 - INFO - Extracted: 'ICMR- Post Doctoral Fellowship' (URL: https://www.myscheme.gov.in/schemes/icmr-pdf)
2025-04-22 23:07:50,903 - INFO - Extracted: 'Tool Kit Grant for Traditional Handicrafts Experts' (URL: https://www.myscheme.gov.in/schemes/tkgthe)
2025-04-22 23:07:51,467 - INFO - Extracted: 'Snehasanthwanam' (URL: https://www.myscheme.gov.in/schemes/skerala)
2025-04-22 23:07:52,048 - INF

Successfully extracted initial data for 10 schemes from the first page.

--- First Scheme Extracted (Example) ---
- Source URL: https://www.myscheme.gov.in/schemes/fadsp1012e
- Scheme Name: Financial Assistance To Disabled Students Pursuing (10th, 11th, 12th E...
- Ministry: Kerala
- Description: The scheme “Financial Assistance to Disabled Students Pursuing (10th, ...
- Category: APL, BPL, Disabled, Financial Assistance, PwD, Student
- Eligibility: Details on Source URL
- Benefits: Details on Source URL
- Application Process: Details on Source URL
- Documents: Details on Source URL
- Last Updated: N/A
--- End Example ---


In [8]:
# Cell 5 (Attempt 4): Pagination by Clicking Page Number 'li'
# Clicks the numbered pagination items one page at a time, confirms the active
# page indicator changed, and appends every card on the new page to
# `scheme_data_list` until the target count or the page limit is reached.

from selenium.webdriver.common.action_chains import ActionChains # Keep ActionChains as a fallback click

logging.info("Starting pagination process (Attempt 4 - Clicking Page Number 'li')...")

# --- Configuration ---
target_scheme_count = 100
max_pages_to_scrape = 35 # Limit to prevent infinite loops
current_page_number = 1  # Start on page 1
# --- ---

# --- Selectors ---
# XPath template to find the <li> element by its text content (page number)
# normalize-space() handles potential extra whitespace around the number
page_number_xpath_template = "//ul[contains(@class, 'list-none')]//li[normalize-space(text())='{}']"
# XPath to find the currently active page <li> (for verification)
active_page_xpath = "//ul[contains(@class, 'list-none')]//li[contains(@class, 'bg-green-700')]"
# Other selectors remain the same
scheme_card_selector = "div.mx-auto.rounded-xl.shadow-md"
scheme_link_selector = "h2 > a"
ministry_state_selector = "h2.mt-3.font-normal"
description_selector = "span.line-clamp-2 > span"
tag_selector = "div[title] > span"
# --- ---


def _click_with_fallbacks(driver, element, label):
    """Click `element`, escalating through three strategies.

    Tries a native WebElement click, then a JavaScript click, then an
    ActionChains move-and-click. Returns None as soon as one succeeds.

    Args:
        driver: WebDriver used for the JavaScript and ActionChains attempts.
        element: The element to click.
        label: Page number text, used only in log messages.

    Raises:
        Exception: Whatever the ActionChains attempt raised, if all three
            strategies fail.
    """
    try:
        logging.info(f"Attempting direct click on page '{label}' button...")
        element.click()
        logging.info("Direct click successful.")
        return
    except Exception as e1:
        logging.warning(f"Direct click failed: {e1}. Trying JS click...")

    try:
        driver.execute_script("arguments[0].click();", element)
        logging.info("JS click successful.")
        return
    except Exception as e2:
        logging.warning(f"JS click failed: {e2}. Trying ActionChains click...")

    try:
        ActionChains(driver).move_to_element(element).pause(0.5).click().perform()
        logging.info("ActionChains click successful.")
    except Exception as e3:
        logging.error(f"All click methods failed for page '{label}' button: {e3}")
        raise # Bare raise keeps the original traceback


def _extract_scheme_from_card(card):
    """Build the record dict for one scheme card on the search page.

    Reads the module-level *_selector variables at call time.

    Args:
        card: WebElement for a single scheme card.

    Returns:
        dict with keys, in order: "Source URL", "Scheme Name", "Ministry",
        "Description", "Category", "Eligibility", "Benefits",
        "Application Process", "Documents", "Last Updated". "Scheme Name" may
        be an empty string; the caller decides whether to keep the record.

    Raises:
        NoSuchElementException: If the card has no scheme link (required).
        StaleElementReferenceException: If the card was detached from the DOM.
    """
    link_element = card.find_element(By.CSS_SELECTOR, scheme_link_selector)
    href_value = link_element.get_attribute('href')
    if href_value and href_value.startswith('/'):
        source_url = f"https://www.myscheme.gov.in{href_value}"
    elif href_value:
        source_url = href_value
    else:
        source_url = "N/A"

    record = {"Source URL": source_url, "Scheme Name": link_element.text.strip()}

    # Optional single-element fields fall back to "N/A" when absent
    optional_fields = (
        ("Ministry", ministry_state_selector),
        ("Description", description_selector),
    )
    for field_name, field_selector in optional_fields:
        try:
            record[field_name] = card.find_element(By.CSS_SELECTOR, field_selector).text.strip()
        except NoSuchElementException:
            record[field_name] = "N/A"

    # find_elements returns [] rather than raising; read each tag's text once
    # because every `.text` access is a round trip to the browser.
    tag_texts = (tag.text.strip() for tag in card.find_elements(By.CSS_SELECTOR, tag_selector))
    tags = [tag_text for tag_text in tag_texts if tag_text]
    record["Category"] = ", ".join(tags) if tags else "N/A"

    # Placeholders for fields that are not available on the search page
    record["Eligibility"] = "Details on Source URL"
    record["Benefits"] = "Details on Source URL"
    record["Application Process"] = "Details on Source URL"
    record["Documents"] = "Details on Source URL"
    record["Last Updated"] = "N/A"
    return record


# Ensure scheme_data_list exists and reset if needed
if 'scheme_data_list' not in locals() or not isinstance(scheme_data_list, list):
    logging.error("scheme_data_list not found. Run Cell 4.")
    print("Error: scheme_data_list missing.")
    raise NameError("scheme_data_list not defined")
elif len(scheme_data_list) > 10: # Reset if re-running
     logging.warning("Resetting scheme_data_list to first 10 entries for this run.")
     scheme_data_list = scheme_data_list[:10]

initial_count = len(scheme_data_list)
logging.info(f"Starting pagination loop. Have {initial_count} schemes from page {current_page_number}.")

while len(scheme_data_list) < target_scheme_count and current_page_number < max_pages_to_scrape:
    # Determine the next page number to click
    page_to_click = current_page_number + 1
    page_to_click_str = str(page_to_click)
    logging.info(f"--- Attempting to Navigate to Page {page_to_click} by clicking its number ---")
    page_start_time = time.time()
    page_number_to_click_xpath = page_number_xpath_template.format(page_to_click_str)

    try:
        # 1. Find the list item (button) for the next page number
        wait = WebDriverWait(driver, 30)
        logging.info(f"Looking for page number '{page_to_click_str}' button using XPath: {page_number_to_click_xpath}")
        time.sleep(1) # Pause before searching

        logging.info(f"Waiting for page '{page_to_click_str}' button presence...")
        try:
            page_button = wait.until(EC.presence_of_element_located((By.XPATH, page_number_to_click_xpath)))
        except TimeoutException:
            # WebDriverWait ignores NoSuchElementException while polling and reports a
            # never-appearing element as a timeout, so a missing page number surfaces here.
            logging.info(f"Page number button '{page_to_click_str}' (XPath: {page_number_to_click_xpath}) not found. Assuming end of available pages.")
            break
        logging.info(f"Page '{page_to_click_str}' button is present.")

        # Scroll into view
        driver.execute_script("arguments[0].scrollIntoView({block: 'center', behavior: 'smooth'});", page_button)
        time.sleep(1.0) # Pause after scroll

        logging.info(f"Waiting for page '{page_to_click_str}' button to be clickable...")
        page_button = wait.until(EC.element_to_be_clickable((By.XPATH, page_number_to_click_xpath)))
        logging.info(f"Page '{page_to_click_str}' button appears clickable.")

        # 2. Click the page number button (raises if every strategy fails)
        _click_with_fallbacks(driver, page_button, page_to_click_str)

        # 3. Wait for active page indicator to show the *new* page number
        logging.info(f"Waiting up to 30s for active page number indicator to become '{page_to_click_str}'...")
        wait.until(EC.text_to_be_present_in_element((By.XPATH, active_page_xpath), page_to_click_str))
        logging.info(f"Active page number confirmed as '{page_to_click_str}'. Page {page_to_click} loaded.")

        # Update current page number *after* successful navigation confirmation
        current_page_number = page_to_click
        time.sleep(4) # Allow render time

        # 4. Extract data from the new page
        logging.info(f"Extracting schemes from page {current_page_number}...")
        new_scheme_cards = driver.find_elements(By.CSS_SELECTOR, scheme_card_selector)
        logging.info(f"Found {len(new_scheme_cards)} scheme cards on page {current_page_number}.")
        page_schemes_extracted_count = 0

        for card_index, card in enumerate(new_scheme_cards):
            try:
                scheme_info = _extract_scheme_from_card(card)
            except StaleElementReferenceException:
                logging.warning(f"Page {current_page_number}, Card {card_index}: Stale element. Skipping.")
                continue
            except NoSuchElementException as e:
                logging.warning(f"Page {current_page_number}, Card {card_index}: Required element not found ({e}). Skipping.")
                continue
            except Exception as e:
                logging.error(f"Page {current_page_number}, Card {card_index}: Error processing card: {e}", exc_info=True)
                continue

            # Keep only records that carry a scheme name
            if scheme_info["Scheme Name"]:
                scheme_data_list.append(scheme_info)
                page_schemes_extracted_count += 1
            else:
                logging.warning(f"Page {current_page_number}, Card {card_index}: Scheme name empty.")

        page_duration = time.time() - page_start_time
        logging.info(f"Extracted {page_schemes_extracted_count} schemes from page {current_page_number} in {page_duration:.2f} seconds. Total schemes now: {len(scheme_data_list)}")

        if len(scheme_data_list) >= target_scheme_count:
            logging.info(f"Target of {target_scheme_count} schemes reached.")
            break

    except TimeoutException:
        # Reached when the button never became clickable or the active page
        # indicator never showed the new number
        logging.error(f"TimeoutException occurred while trying to navigate to or confirm page {page_to_click}.")
        try:
            current_active_page_text = driver.find_element(By.XPATH, active_page_xpath).text.strip()
            logging.error(f"Timeout occurred. Current active page: '{current_active_page_text}'. Failed to load/confirm page '{page_to_click_str}'.")
        except Exception as e_debug:
             logging.error(f"Could not determine active page after timeout. Error: {e_debug}")
        logging.error("Stopping pagination due to Timeout.")
        break
    except Exception as e:
        logging.error(f"A critical error occurred trying to process page {page_to_click}: {e}", exc_info=True)
        print(f"A critical error stopped the pagination process: {e}")
        break

# --- Loop Finished ---
final_count = len(scheme_data_list)
logging.info(f"Pagination loop finished. Processed up to page {current_page_number}. Total schemes collected: {final_count}")
print(f"\nPagination complete.")
print(f"Collected a total of {final_count} schemes across {current_page_number} pages.")

if final_count >= target_scheme_count:
    print(f"Successfully collected {final_count} schemes (target was {target_scheme_count}).")
elif current_page_number >= max_pages_to_scrape:
     print(f"Reached the maximum page limit ({max_pages_to_scrape}). Collected {final_count} schemes.")
else:
    print(f"Collected {final_count} schemes. Stopped because page navigation failed (Timeout/Error/Not Found).")

2025-04-22 23:20:39,567 - INFO - Starting pagination process (Attempt 4 - Clicking Page Number 'li')...
2025-04-22 23:20:39,569 - INFO - Starting pagination loop. Have 10 schemes from page 1.
2025-04-22 23:20:39,576 - INFO - --- Attempting to Navigate to Page 2 by clicking its number ---
2025-04-22 23:20:39,579 - INFO - Looking for page number '2' button using XPath: //ul[contains(@class, 'list-none')]//li[normalize-space(text())='2']
2025-04-22 23:20:40,581 - INFO - Waiting for page '2' button presence...
2025-04-22 23:20:40,620 - INFO - Page '2' button is present.
2025-04-22 23:20:41,645 - INFO - Waiting for page '2' button to be clickable...
2025-04-22 23:20:41,734 - INFO - Page '2' button appears clickable.
2025-04-22 23:20:41,735 - INFO - Attempting direct click on page '2' button...
2025-04-22 23:20:42,013 - INFO - Direct click successful.
2025-04-22 23:20:42,016 - INFO - Waiting up to 30s for active page number indicator to become '2'...
2025-04-22 23:20:42,885 - INFO - Active p


Pagination complete.
Collected a total of 100 schemes across 10 pages.
Successfully collected 100 schemes (target was 100).


In [9]:
# Cell 6: Save Data to File
# Writes the collected scheme records to a CSV and a JSON file in the working
# directory, then prints a five-row preview.

logging.info("Preparing to save the collected data...")

if 'scheme_data_list' not in locals() or not scheme_data_list:
    # Nothing was collected (or the extraction cells were never run)
    logging.warning("scheme_data_list is empty or not defined. No data available to save.")
    print("No data was collected or the data list is empty. Skipping file save.")
else:
    try:
        # Output targets
        csv_filename = "myscheme_schemes_data.csv"
        json_filename = "myscheme_schemes_data.json"

        # One row per scheme dict; dict keys become the column headers
        df = pd.DataFrame(scheme_data_list)
        logging.info(f"Created Pandas DataFrame with shape: {df.shape}") # (rows, columns)

        # CSV: no index column; utf-8-sig so Excel renders characters like ₹ correctly
        df.to_csv(
            csv_filename,
            index=False,
            encoding='utf-8-sig',
        )
        logging.info(f"Data successfully saved to CSV file: {csv_filename}")
        print(f"Data successfully saved to CSV: {csv_filename}")

        # JSON: array of row objects, pretty-printed, non-ASCII text kept verbatim
        df.to_json(
            json_filename,
            orient='records',
            indent=4,
            force_ascii=False,
        )
        logging.info(f"Data successfully saved to JSON file: {json_filename}")
        print(f"Data successfully saved to JSON: {json_filename}")

        # Quick preview: markdown table when available, plain repr otherwise
        print("\n--- First 5 Rows of Collected Data ---")
        try:
            print(df.head().to_markdown(index=False))
        except ImportError: # to_markdown depends on the optional tabulate package
            print(df.head())
        print("--- ---")

    except Exception as e:
        logging.error(f"An error occurred while creating DataFrame or saving data: {e}", exc_info=True)
        print(f"An error occurred while saving the data: {e}")

2025-04-22 23:23:30,207 - INFO - Preparing to save the collected data...
2025-04-22 23:23:30,247 - INFO - Created Pandas DataFrame with shape: (100, 10)
2025-04-22 23:23:30,294 - INFO - Data successfully saved to CSV file: myscheme_schemes_data.csv
2025-04-22 23:23:30,313 - INFO - Data successfully saved to JSON file: myscheme_schemes_data.json


Data successfully saved to CSV: myscheme_schemes_data.csv
Data successfully saved to JSON: myscheme_schemes_data.json

--- First 5 Rows of Collected Data ---
                                          Source URL  \
0     https://www.myscheme.gov.in/schemes/fadsp1012e   
1       https://www.myscheme.gov.in/schemes/icmr-pdf   
2         https://www.myscheme.gov.in/schemes/tkgthe   
3        https://www.myscheme.gov.in/schemes/skerala   
4  https://www.myscheme.gov.in/schemes/sgassobcan...   

                                         Scheme Name  \
0  Financial Assistance To Disabled Students Purs...   
1                     ICMR- Post Doctoral Fellowship   
2  Tool Kit Grant for Traditional Handicrafts Exp...   
3                                    Snehasanthwanam   
4  Scheme for Grant of Additional Scholarship to ...   

                              Ministry  \
0                               Kerala   
1  Ministry Of Health & Family Welfare   
2                               Kerala   


In [13]:
# Cell 7 (Revised for Plain Text): Scrape Detail Pages

# Validate the input list BEFORE anything reads it, so a missing list produces
# the explanatory error below instead of a bare NameError from the log line.
if 'scheme_data_list' not in locals() or not isinstance(scheme_data_list, list) or not scheme_data_list:
    logging.error("scheme_data_list not found or empty. Run previous cells first.")
    print("Error: scheme_data_list is missing or empty. Cannot scrape details.")
    raise NameError("scheme_data_list not defined or empty")

logging.info(f"Starting detail page scraping for plain text (up to {len(scheme_data_list)} schemes)...")

# --- Detail Page Selectors (Should still be correct) ---
# Maps each record field to the XPath of the content container inside the
# matching section of a scheme detail page.
selectors = {
    "Benefits": "//div[@id='benefits']//div[contains(@class, 'markdown-options')]",
    "Eligibility": "//div[@id='eligibility']//div[contains(@class, 'markdown-options')]",
    "Application Process": "//div[@id='application-process']//div[contains(@class, 'markdown-options')]",
    "Documents": "//div[@id='documents-required']//div[contains(@class, 'markdown-options')]",
}
# --- ---

# --- MODIFIED Helper function to extract PLAIN TEXT ---
def extract_section_content(driver, section_xpath, timeout=10):
    """Return the whitespace-normalised text of one section of the current page.

    Args:
        driver: Selenium WebDriver whose current page is searched.
        section_xpath: XPath locating the section's content element.
        timeout: Seconds to wait for the element to be present in the DOM.
            Defaults to 10, the value that used to be hard-coded.

    Returns:
        The element's ``.text`` with every run of whitespace (including
        newlines) collapsed to a single space, or one of these sentinels:
        ``"N/A (empty section)"`` when the element exists but has no text,
        ``"N/A (section content not found)"`` when the wait times out,
        ``"N/A (extraction error)"`` for any other exception.
    """
    try:
        # Wait for the content element within the section to be present
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, section_xpath))
        )
        content_text = element.text.strip()
    except TimeoutException:
        logging.warning(f"Timeout waiting for section content element: {section_xpath}")
        return "N/A (section content not found)"  # Element not found
    except Exception as e:
        # Best effort: one broken section must not abort the whole page.
        logging.warning(f"Error extracting detail with {section_xpath}: {e}")
        return "N/A (extraction error)"  # Error during extraction

    if not content_text:
        # The element exists but exposes no text.
        logging.warning(f"Element found with XPath '{section_xpath}' but contains no visible text.")
        return "N/A (empty section)"

    # Collapse the extra whitespace/newlines that .text yields on nested elements.
    return ' '.join(content_text.split())
# --- End of Modified Helper Function ---


# --- Loop through schemes (Process a subset for testing first) ---
# ===> Run for first 5 first to check plain text output <===
# The trial size is clamped to the list length: a plain range(5) would raise
# IndexError at scheme_data_list[i] whenever fewer than 5 schemes were collected.
schemes_to_process_indices = range(min(5, len(scheme_data_list)))
# ===> Then change to this line to process the whole list <===
# schemes_to_process_indices = range(len(scheme_data_list))

# Counts pages whose H1 appeared and whose sections were all attempted; individual
# sections may still hold an "N/A (...)" sentinel from extract_section_content.
processed_detail_count = 0
total_detail_to_process = len(schemes_to_process_indices)

logging.info(f"Will process details for the first {total_detail_to_process} schemes.")

for i in schemes_to_process_indices:
    scheme_info = scheme_data_list[i]
    url = scheme_info.get("Source URL")
    name = scheme_info.get("Scheme Name", f"Scheme at index {i}")
    logging.info(f"--- Processing Detail Page {i+1}/{total_detail_to_process} for plain text: '{name}' ---")

    # Skip entries without a navigable URL, but still fill every detail field
    # so all rows end up with the same set of keys.
    if not url or url == "N/A" or not url.startswith("http"):
        logging.warning(f"Skipping scheme '{name}' (Index {i}) due to invalid URL: {url}")
        for field in selectors.keys():
            scheme_data_list[i][field] = "N/A (Invalid URL)"  # Update original list
        continue

    try:
        logging.debug(f"Navigating to: {url}")
        driver.get(url)

        # The H1 title is used as the signal that the detail page has loaded.
        detail_page_wait_element_xpath = "//h1[contains(@class, 'font-bold text-xl sm:text-2xl')]"
        logging.debug("Waiting for detail page H1 title...")
        WebDriverWait(driver, 25).until(EC.presence_of_element_located((By.XPATH, detail_page_wait_element_xpath)))
        logging.debug("Detail page H1 title found.")
        time.sleep(3)  # Extra settle time after the title appears

        # Extract plain text details using the helper function
        for field, section_xpath in selectors.items():
            logging.debug(f"Extracting '{field}' plain text...")
            detail_content = extract_section_content(driver, section_xpath)
            scheme_data_list[i][field] = detail_content  # Update original list
            logging.debug(f" -> {field} updated with plain text.")

        processed_detail_count += 1
        logging.info(f"Finished extracting plain text details for: '{name}'")

    except TimeoutException:
        # Raised by the H1 wait above (section waits are handled inside the helper).
        logging.error(f"Timeout occurred loading detail page or finding H1 title for: {url} (Index {i})")
        for field in selectors.keys():
            scheme_data_list[i][field] = "N/A (Page Load Timeout)"
    except Exception as e:
        # Best effort: record the failure on this row and move on to the next scheme.
        logging.error(f"Unexpected error processing detail page {url} (Index {i}): {e}", exc_info=True)
        for field in selectors.keys():
            scheme_data_list[i][field] = "N/A (Processing Error)"

    # Pause between page loads.
    logging.debug("Waiting 3 seconds before next request...")
    time.sleep(3)

# --- Loop Finished ---
logging.info(f"Plain text detail scraping finished. Attempted {total_detail_to_process} URLs, successfully processed {processed_detail_count}.")
print(f"\nPlain text detail scraping phase complete for the first {total_detail_to_process} schemes.")
print(f"Successfully processed details for {processed_detail_count} schemes.")
print("The `scheme_data_list` has been updated in place with plain text content.")

# Display updated data for the first scheme processed
if total_detail_to_process > 0 and processed_detail_count > 0:
    # "Successfully processed" is judged by the Eligibility field holding real
    # text rather than one of the "N/A (...)" sentinels.
    first_processed_index = -1
    for idx in schemes_to_process_indices:
        if isinstance(scheme_data_list[idx].get("Eligibility"), str) and \
           not scheme_data_list[idx].get("Eligibility", "").startswith("N/A ("):
            first_processed_index = idx
            break
    if first_processed_index != -1:
        updated_first_scheme = scheme_data_list[first_processed_index]
        print(f"\n--- Updated PLAIN TEXT data for First Successfully Processed Scheme (Index {first_processed_index}) ---")
        for key, value in updated_first_scheme.items():
            print(f"- {key}: {str(value)[:300]}...")  # Print snippet
        print("--- ---")
    else:
        print("\nNote: Could not verify successful detail extraction based on 'Eligibility' field check.")
elif total_detail_to_process > 0:
    print("\nNo schemes appear to have been successfully processed in this detail scraping phase.")

2025-04-22 23:54:32,890 - INFO - Starting detail page scraping for plain text (up to 100 schemes)...
2025-04-22 23:54:32,896 - INFO - Will process details for the first 5 schemes.
2025-04-22 23:54:32,898 - INFO - --- Processing Detail Page 1/5 for plain text: 'Financial Assistance To Disabled Students Pursuing (10th, 11th, 12th Equivalent Exams)' ---
2025-04-22 23:54:38,343 - INFO - Finished extracting plain text details for: 'Financial Assistance To Disabled Students Pursuing (10th, 11th, 12th Equivalent Exams)'
2025-04-22 23:54:41,349 - INFO - --- Processing Detail Page 2/5 for plain text: 'ICMR- Post Doctoral Fellowship' ---
2025-04-22 23:54:46,723 - INFO - Finished extracting plain text details for: 'ICMR- Post Doctoral Fellowship'
2025-04-22 23:54:49,725 - INFO - --- Processing Detail Page 3/5 for plain text: 'Tool Kit Grant for Traditional Handicrafts Experts' ---
2025-04-22 23:54:55,598 - INFO - Finished extracting plain text details for: 'Tool Kit Grant for Traditional Handicra


Plain text detail scraping phase complete for the first 5 schemes.
Successfully processed details for 5 schemes.
The `scheme_data_list` has been updated in place with plain text content.

--- Updated PLAIN TEXT data for First Successfully Processed Scheme (Index 0) ---
- Source URL: https://www.myscheme.gov.in/schemes/fadsp1012e...
- Scheme Name: Financial Assistance To Disabled Students Pursuing (10th, 11th, 12th Equivalent Exams)...
- Ministry: Kerala...
- Description: The scheme “Financial Assistance to Disabled Students Pursuing (10th, 11th, 12th Equivalent Exams)” was launched by the Department of Social Justice, Government of Kerala....
- Category: APL, BPL, Disabled, Financial Assistance, PwD, Student...
- Eligibility: The applicant should be a resident of Kerala State. The differently abled students with 40% or more disability are eligible to apply under the scheme. Financial assistance will be provided to students falling in Above Poverty Line (APL) as well as Below Poverty L

In [14]:
# Cell 7 (Run for ALL Schemes - Plain Text): Scrape Detail Pages

# Validate the input BEFORE anything dereferences it. The start-up log line
# below calls len(scheme_data_list), so with the guard placed after it an
# undefined list would fail on that line and this friendly message would never run.
if 'scheme_data_list' not in locals() or not isinstance(scheme_data_list, list) or not scheme_data_list:
    logging.error("scheme_data_list not found or empty. Run previous cells first.")
    print("Error: scheme_data_list is missing or empty. Cannot scrape details.")
    raise NameError("scheme_data_list not defined or empty")

logging.info(f"Starting detail page scraping for plain text for ALL {len(scheme_data_list)} schemes...")

# --- Detail Page Selectors (Using ID-based selectors) ---
# Maps each output field name to the XPath of that section's content element
# on a scheme detail page (section container id -> inner 'markdown-options' div).
selectors = {
    "Benefits": "//div[@id='benefits']//div[contains(@class, 'markdown-options')]",
    "Eligibility": "//div[@id='eligibility']//div[contains(@class, 'markdown-options')]",
    "Application Process": "//div[@id='application-process']//div[contains(@class, 'markdown-options')]",
    "Documents": "//div[@id='documents-required']//div[contains(@class, 'markdown-options')]",
}
# --- ---

# Helper function to extract PLAIN TEXT from a section XPath
def extract_section_content(driver, section_xpath, timeout=10):
    """Fetch one section of the current page as single-line plain text.

    Args:
        driver: Selenium WebDriver whose current page is searched.
        section_xpath: XPath locating the section's content element.
        timeout: Seconds to wait for the element to be present in the DOM.
            Defaults to 10, the value that used to be hard-coded.

    Returns:
        The element's ``.text`` with every run of whitespace (including
        newlines) collapsed to a single space, or one of these sentinels:
        ``"N/A (empty section)"`` when the element exists but has no text,
        ``"N/A (section content not found)"`` when the wait times out,
        ``"N/A (extraction error)"`` for any other exception.
    """
    try:
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, section_xpath))
        )
        content_text = element.text.strip()
    except TimeoutException:
        logging.warning(f"Timeout waiting for section content element: {section_xpath}")
        return "N/A (section content not found)"
    except Exception as e:
        # Best effort: one broken section must not abort the whole page.
        logging.warning(f"Error extracting detail with {section_xpath}: {e}")
        return "N/A (extraction error)"

    if not content_text:
        # The element exists but exposes no text.
        logging.warning(f"Element found with XPath '{section_xpath}' but contains no visible text.")
        return "N/A (empty section)"

    # Clean up extra whitespace/newlines
    return ' '.join(content_text.split())
# --- End of Helper Function ---


# --- Full run: visit the detail page of every collected scheme ---
schemes_to_process_indices = range(len(scheme_data_list))
total_detail_to_process = len(schemes_to_process_indices)  # Should be 100
processed_detail_count = 0

logging.info(f"Will process details for all {total_detail_to_process} schemes.")

for i in schemes_to_process_indices:
    scheme_info = scheme_data_list[i]
    url = scheme_info.get("Source URL")
    name = scheme_info.get("Scheme Name", f"Scheme at index {i}")

    # The first page and every tenth page are announced at INFO, the rest at DEBUG.
    progress_level = logging.INFO if (i == 0 or (i + 1) % 10 == 0) else logging.DEBUG
    logging.log(progress_level, f"--- Processing Detail Page {i+1}/{total_detail_to_process} for plain text: '{name}' ---")

    # Rows without a navigable URL still receive every detail field.
    url_is_usable = bool(url) and url != "N/A" and url.startswith("http")
    if not url_is_usable:
        logging.warning(f"Skipping scheme '{name}' (Index {i}) due to invalid URL: {url}")
        scheme_info.update(dict.fromkeys(selectors, "N/A (Invalid URL)"))
        continue

    try:
        logging.debug(f"Navigating to: {url}")
        driver.get(url)

        # The H1 title serves as the "page has loaded" signal.
        detail_page_wait_element_xpath = "//h1[contains(@class, 'font-bold text-xl sm:text-2xl')]"
        logging.debug(f"Waiting for detail page H1 title...")
        title_located = EC.presence_of_element_located((By.XPATH, detail_page_wait_element_xpath))
        WebDriverWait(driver, 25).until(title_located)
        logging.debug("Detail page H1 title found.")
        time.sleep(3)  # Allow render time

        # Pull each section's plain text straight into this scheme's record.
        for field, section_xpath in selectors.items():
            logging.debug(f"Extracting '{field}' plain text...")
            scheme_info[field] = extract_section_content(driver, section_xpath)
            logging.debug(f" -> {field} updated with plain text.")

        processed_detail_count += 1
        logging.info(f"Finished extracting plain text details for: '{name}'")

    except TimeoutException:
        logging.error(f"Timeout occurred loading detail page or finding H1 title for: {url} (Index {i})")
        scheme_info.update(dict.fromkeys(selectors, "N/A (Page Load Timeout)"))
    except Exception as e:
        logging.error(f"Unexpected error processing detail page {url} (Index {i}): {e}", exc_info=True)
        scheme_info.update(dict.fromkeys(selectors, "N/A (Processing Error)"))

    # --- Rate Limiting ---
    logging.debug("Waiting 3 seconds before next request...")
    time.sleep(3)

# --- Summary ---
logging.info(f"Plain text detail scraping finished. Attempted {total_detail_to_process} URLs, successfully processed {processed_detail_count}.")
print(f"\nPlain text detail scraping phase complete for all {total_detail_to_process} schemes.")
print(f"Successfully processed details for {processed_detail_count} schemes.")
print("The `scheme_data_list` has been updated in place with plain text content.")

# Show a snippet of the first record whose Eligibility holds real text
# (i.e. is a string that is not one of the "N/A (...)" sentinels).
if total_detail_to_process > 0:
    if processed_detail_count > 0:
        first_processed_index = next(
            (
                idx
                for idx in schemes_to_process_indices
                if isinstance(scheme_data_list[idx].get("Eligibility"), str)
                and not scheme_data_list[idx].get("Eligibility", "").startswith("N/A (")
            ),
            -1,
        )
        if first_processed_index == -1:
            print("\nNote: Could not verify successful detail extraction based on 'Eligibility' field check.")
        else:
            updated_first_scheme = scheme_data_list[first_processed_index]
            print(f"\n--- Updated PLAIN TEXT data for First Successfully Processed Scheme (Index {first_processed_index}) ---")
            for key, value in updated_first_scheme.items():
                print(f"- {key}: {str(value)[:300]}...")  # first 300 characters only
            print("--- ---")
    else:
        print("\nNo schemes appear to have been successfully processed in this detail scraping phase.")

2025-04-22 23:58:58,119 - INFO - Starting detail page scraping for plain text for ALL 100 schemes...
2025-04-22 23:58:58,126 - INFO - Will process details for all 100 schemes.
2025-04-22 23:58:58,131 - INFO - --- Processing Detail Page 1/100 for plain text: 'Financial Assistance To Disabled Students Pursuing (10th, 11th, 12th Equivalent Exams)' ---
2025-04-22 23:59:03,571 - INFO - Finished extracting plain text details for: 'Financial Assistance To Disabled Students Pursuing (10th, 11th, 12th Equivalent Exams)'
2025-04-22 23:59:11,832 - INFO - Finished extracting plain text details for: 'ICMR- Post Doctoral Fellowship'
2025-04-22 23:59:20,643 - INFO - Finished extracting plain text details for: 'Tool Kit Grant for Traditional Handicrafts Experts'
2025-04-22 23:59:28,771 - INFO - Finished extracting plain text details for: 'Snehasanthwanam'
2025-04-22 23:59:36,997 - INFO - Finished extracting plain text details for: 'Scheme for Grant of Additional Scholarship to the Students of Other Ba


Plain text detail scraping phase complete for all 100 schemes.
Successfully processed details for 100 schemes.
The `scheme_data_list` has been updated in place with plain text content.

--- Updated PLAIN TEXT data for First Successfully Processed Scheme (Index 0) ---
- Source URL: https://www.myscheme.gov.in/schemes/fadsp1012e...
- Scheme Name: Financial Assistance To Disabled Students Pursuing (10th, 11th, 12th Equivalent Exams)...
- Ministry: Kerala...
- Description: The scheme “Financial Assistance to Disabled Students Pursuing (10th, 11th, 12th Equivalent Exams)” was launched by the Department of Social Justice, Government of Kerala....
- Category: APL, BPL, Disabled, Financial Assistance, PwD, Student...
- Eligibility: The applicant should be a resident of Kerala State. The differently abled students with 40% or more disability are eligible to apply under the scheme. Financial assistance will be provided to students falling in Above Poverty Line (APL) as well as Below Poverty Lin

In [15]:
# Cell 8: Save Final Updated Data

logging.info("Preparing to save the final updated data (plain text)...")

if 'scheme_data_list' not in locals() or not scheme_data_list:
    # Nothing was scraped (or earlier cells were not run): skip the export.
    logging.warning("scheme_data_list is empty or not defined. No final data to save.")
    print("No data available, skipping final file save.")
else:
    try:
        # One row per scheme dict, including the detail fields added by the scraping cell.
        df_final = pd.DataFrame(scheme_data_list)
        logging.info(f"Created final DataFrame with shape: {df_final.shape}")

        # Output targets, relative to the notebook's working directory.
        csv_filename_final = "myscheme_schemes_data_FINAL.csv"
        json_filename_final = "myscheme_schemes_data_FINAL.json"

        # CSV export
        df_final.to_csv(csv_filename_final, index=False, encoding='utf-8-sig')
        logging.info(f"Final data successfully saved to CSV: {csv_filename_final}")
        print(f"Final updated data saved to {csv_filename_final}")

        # JSON export: one object per scheme, non-ASCII characters written as-is
        df_final.to_json(json_filename_final, orient='records', indent=4, force_ascii=False)
        logging.info(f"Final data successfully saved to JSON: {json_filename_final}")
        print(f"Final updated data saved to {json_filename_final}")

        # Quick visual check of the first rows; fall back to the plain repr
        # when to_markdown raises ImportError.
        print("\n--- First 5 Rows of Final Data ---")
        preview = df_final.head()
        try:
            print(preview.to_markdown(index=False))
        except ImportError:
            print(preview)
        print("--- ---")

    except Exception as e:
        logging.error(f"Error saving final data: {e}", exc_info=True)
        print(f"An error occurred while saving the final data: {e}")

2025-04-23 00:14:33,922 - INFO - Preparing to save the final updated data (plain text)...
2025-04-23 00:14:34,020 - INFO - Created final DataFrame with shape: (100, 10)
2025-04-23 00:14:34,071 - INFO - Final data successfully saved to CSV: myscheme_schemes_data_FINAL.csv
2025-04-23 00:14:34,113 - INFO - Final data successfully saved to JSON: myscheme_schemes_data_FINAL.json


Final updated data saved to myscheme_schemes_data_FINAL.csv
Final updated data saved to myscheme_schemes_data_FINAL.json

--- First 5 Rows of Final Data ---
                                          Source URL  \
0     https://www.myscheme.gov.in/schemes/fadsp1012e   
1       https://www.myscheme.gov.in/schemes/icmr-pdf   
2         https://www.myscheme.gov.in/schemes/tkgthe   
3        https://www.myscheme.gov.in/schemes/skerala   
4  https://www.myscheme.gov.in/schemes/sgassobcan...   

                                         Scheme Name  \
0  Financial Assistance To Disabled Students Purs...   
1                     ICMR- Post Doctoral Fellowship   
2  Tool Kit Grant for Traditional Handicrafts Exp...   
3                                    Snehasanthwanam   
4  Scheme for Grant of Additional Scholarship to ...   

                              Ministry  \
0                               Kerala   
1  Ministry Of Health & Family Welfare   
2                               Kerala   
3

In [16]:
# Cell 9: Quit WebDriver

# Shut the browser down and clear the module-level handle. Resetting `driver`
# to None (the same "no browser" state the initialisation cell starts from)
# makes this cell safe to re-run: a second run takes the "already closed"
# branch instead of calling quit() on a dead session.
try:
    if 'driver' in locals() and driver:
        logging.info("Closing the WebDriver...")
        driver.quit()
        driver = None  # only cleared after a successful quit, so a failed quit can be retried
        logging.info("WebDriver closed successfully.")
        print("\nBrowser window closed.")
    else:
        logging.info("WebDriver instance not found or already closed.")
        print("\nBrowser window may already be closed or was not initialized.")
except NameError:
    # Defensive only: the membership test above already covers an undefined name.
    logging.info("WebDriver variable 'driver' not defined.")
    print("\nBrowser window may already be closed or was not initialized.")
except Exception as e:
    logging.error(f"An error occurred while closing the WebDriver: {e}")
    print(f"\nAn error occurred closing browser: {e}")

2025-04-23 00:23:34,430 - INFO - Closing the WebDriver...
2025-04-23 00:23:38,765 - INFO - WebDriver closed successfully.



Browser window closed.
