# In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from datetime import datetime
import logging
import time
import json
import os
import re
import sys

# Logging setup: every record is written to 'vahan_scraper.log' and echoed on the console
logging.basicConfig(
    handlers=[logging.FileHandler('vahan_scraper.log'), logging.StreamHandler()],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

class VahanScraper:
    """
    Handles the scraping process for Vahan portal data.

    For every entry in ``STATES_UT`` the scraper opens the dashboard, applies the
    report filters, walks the RTO dropdown, downloads one Excel report per RTO
    and files it under ``<download path>/<DD-MM-YYYY>/<state>/<rto>.xlsx``.
    Progress is persisted to a JSON file so an interrupted run can be resumed.
    """

    # Download folder used when none is passed to __init__ (customize as needed)
    DEFAULT_DOWNLOAD_PATH = r"C:\Users\ASUS\OneDrive\Documents\Ajax Reports"
    # Dashboard page that hosts the report filters
    DASHBOARD_URL = 'https://vahan.parivahan.gov.in/vahan4dashboard/vahan/view/reportview.xhtml'
    # Element IDs used on the dashboard page
    STATE_DROPDOWN_ID = 'j_idt38'
    RTO_DROPDOWN_ID = 'selectedRto'
    REFRESH_BUTTON_ID = 'j_idt68'
    DOWNLOAD_BUTTON_ID = 'groupingTable:j_idt83'
    # Option texts of the state dropdown, in the order they are scraped
    STATES_UT = (
        'Andaman & Nicobar Island(3)',
        'Andhra Pradesh(83)',
        'Arunachal Pradesh(29)',
        'Assam(33)',
        'Bihar(48)',
        'Chhattisgarh(30)',
        'Chandigarh(1)',
        'UT of DNH and DD(3)',
        'Delhi(16)',
        'Goa(13)',
        'Gujarat(37)',
        'Himachal Pradesh(96)',
        'Haryana(98)',
        'Jharkhand(25)',
        'Jammu and Kashmir(21)',
        'Karnataka(68)',
        'Kerala(87)',
        'Ladakh(3)',
        'Lakshadweep(4)',
        'Maharashtra(56)',
        'Meghalaya(13)',
        'Manipur(12)',
        'Madhya Pradesh(53)',
        'Mizoram(10)',
        'Nagaland(9)',
        'Odisha(39)',
        'Punjab(96)',
        'Puducherry(8)',
        'Rajasthan(59)',
        'Sikkim(9)',
        'Tamil Nadu(148)',
        'Tripura(9)',
        'Uttarakhand(21)',
        'Uttar Pradesh(77)',
        'West Bengal(57)',
    )

    def __init__(self, chrome_binary_path, base_download_path=None, year='2025'):
        """
        Initializes the scraper: creates the download folders, loads any saved
        progress and starts the Chrome driver.

        Args:
            chrome_binary_path (str): The path to your Chrome executable.
            base_download_path (str | None): Root folder for downloads; defaults
                to ``DEFAULT_DOWNLOAD_PATH`` when omitted.
            year (str | int): Year selected in the dashboard's year filter.
        """
        self.chrome_binary_path = chrome_binary_path  # Stores the Chrome binary path
        self.driver = None  # Selenium WebDriver instance
        self.wait = None  # WebDriverWait instance for explicit waits
        self.actions = None  # ActionChains instance, created together with the driver
        self.current_state_index = 0  # Index into STATES_UT of the state being processed
        self.current_rto_index = 0  # Index of the RTO being processed within that state
        self.year = str(year)  # Option text chosen in the year dropdown

        # Base download path - where files will be saved
        self.base_download_path = base_download_path or self.DEFAULT_DOWNLOAD_PATH
        os.makedirs(self.base_download_path, exist_ok=True)

        # One sub-folder per run date (DD-MM-YYYY)
        self.current_date = datetime.now().strftime("%d-%m-%Y")
        self.date_folder = os.path.join(self.base_download_path, self.current_date)
        os.makedirs(self.date_folder, exist_ok=True)

        # Load progress BEFORE starting the browser so that a failure here
        # cannot leave a Chrome process behind.
        self.progress_file = 'scraping_progress.json'
        self.load_progress()
        self.setup_driver()

    @staticmethod
    def _to_index(value):
        """Coerces a stored progress value to a non-negative int (0 when unusable)."""
        try:
            index = int(value)
        except (TypeError, ValueError):
            return 0
        return index if index >= 0 else 0

    def load_progress(self):
        """
        Loads scraping progress from the JSON progress file so the script can
        resume where it left off.

        A missing, unreadable or malformed file results in fresh progress
        (state 0, RTO 0).  Keys missing from an existing file are filled in, so
        ``self.progress`` always contains 'current_state_index',
        'current_rto_index' and a dict under 'completed_states'.
        """
        progress = None
        if os.path.exists(self.progress_file):
            try:
                with open(self.progress_file, 'r', encoding='utf-8') as f:
                    progress = json.load(f)
            except (OSError, ValueError) as e:  # json.JSONDecodeError is a ValueError
                logging.warning(f"Could not read progress file '{self.progress_file}', starting fresh: {e}")
            else:
                if not isinstance(progress, dict):
                    logging.warning(f"Progress file '{self.progress_file}' has an unexpected format, starting fresh")
                    progress = None

        if progress is None:
            progress = {}

        # Normalize the indices and guarantee the keys the rest of the class relies on
        self.current_state_index = self._to_index(progress.get('current_state_index', 0))
        self.current_rto_index = self._to_index(progress.get('current_rto_index', 0))
        progress['current_state_index'] = self.current_state_index
        progress['current_rto_index'] = self.current_rto_index
        if not isinstance(progress.get('completed_states'), dict):
            progress['completed_states'] = {}
        self.progress = progress

    def save_progress(self):
        """
        Saves the current state and RTO indices (plus the rest of
        ``self.progress``) to the JSON progress file.

        The data is written to a temporary sibling file and then moved over the
        real one, so an interruption mid-write cannot leave a truncated file.
        """
        self.progress['current_state_index'] = self.current_state_index
        self.progress['current_rto_index'] = self.current_rto_index
        tmp_path = f"{self.progress_file}.tmp"
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(self.progress, f)
        os.replace(tmp_path, self.progress_file)

    def _quit_driver(self):
        """Closes the current browser session, if any, and clears the driver handles."""
        driver = getattr(self, 'driver', None)
        if driver is None:
            return
        try:
            driver.quit()
        except Exception as e:
            # Best effort: a browser that already died must not abort the run
            logging.warning(f"Ignoring error while closing the browser: {str(e)}")
        finally:
            self.driver = None
            self.wait = None
            self.actions = None

    def setup_driver(self):
        """
        Initializes the Chrome driver with the required options.  Any driver
        that is still open is closed first so that repeated calls never leave
        an orphaned browser behind.
        """
        self._quit_driver()

        chrome_options = webdriver.ChromeOptions()
        chrome_options.binary_location = self.chrome_binary_path  # Chrome executable to launch
        chrome_options.add_argument('--start-maximized')
        chrome_options.add_argument('--disable-popup-blocking')
        chrome_options.add_argument('--disable-notifications')
        chrome_options.add_experimental_option('prefs', {
            'download.default_directory': self.date_folder,  # Downloads land in today's folder
            'download.prompt_for_download': False,
            'download.directory_upgrade': True,
            'safebrowsing.enabled': True,
        })

        self.driver = webdriver.Chrome(options=chrome_options)
        self.wait = WebDriverWait(self.driver, 30)  # Explicit waits time out after 30 seconds
        self.actions = ActionChains(self.driver)

    @staticmethod
    def _xpath_literal(text):
        """
        Returns ``text`` as an XPath string literal.

        XPath 1.0 has no escape character, so a value containing both quote
        kinds has to be assembled with ``concat()``.
        """
        if "'" not in text:
            return f"'{text}'"
        if '"' not in text:
            return f'"{text}"'
        pieces = [f"'{piece}'" for piece in text.split("'")]
        return "concat(" + ", \"'\", ".join(pieces) + ")"

    def select_primefaces_dropdown(self, dropdown_id, option_text, max_retries=3):
        """
        Selects an option from a PrimeFaces dropdown with retry mechanism.

        Args:
            dropdown_id (str): The ID of the dropdown element.
            option_text (str): The exact text of the option to select.
            max_retries (int): The maximum number of attempts.

        Returns:
            bool: True if the option is (or already was) selected, False if
            every attempt failed or ``max_retries`` is not positive.
        """
        # Quote the text safely so option names containing quotes keep the XPath valid
        option_xpath = (
            "//li[contains(@class, 'ui-selectonemenu-item') and "
            f"text()={self._xpath_literal(option_text)}]"
        )

        for attempt in range(max_retries):
            try:
                # Compare with the currently shown label to avoid unnecessary clicks
                current_value = self.driver.find_element(By.CSS_SELECTOR, f"#{dropdown_id}_label").text.strip()
                if current_value == option_text:
                    logging.info(f"'{option_text}' is already selected in dropdown '{dropdown_id}'")
                    return True

                # Open the dropdown through its trigger element
                trigger = self.wait.until(EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, f"#{dropdown_id} .ui-selectonemenu-trigger")
                ))
                self.driver.execute_script("arguments[0].click();", trigger)
                time.sleep(1)  # Give the option panel time to open

                # Click the requested option
                option = self.wait.until(EC.element_to_be_clickable((By.XPATH, option_xpath)))
                self.driver.execute_script("arguments[0].click();", option)
                time.sleep(1)  # Give the page time to react to the selection

                logging.info(f"Selected '{option_text}' from dropdown '{dropdown_id}'")
                return True
            except Exception as e:
                if attempt == max_retries - 1:
                    logging.error(f"Error selecting '{option_text}' from PrimeFaces dropdown '{dropdown_id}': {str(e)}")
                    return False
                logging.warning(f"Retry {attempt + 1} for dropdown '{dropdown_id}'")
                time.sleep(2)  # Pause before the next attempt

        # Only reached when max_retries <= 0: nothing was attempted
        return False

    def initialize_filters(self, year=None):
        """
        Sets the dashboard filters: X-axis "Month Wise", Y-axis "Maker" and the
        report year.

        Args:
            year (str | int | None): Year option to select; defaults to the
                year given to the constructor ('2025' if none was given).

        Returns:
            bool: True if all three filters were set, False otherwise.
        """
        year = getattr(self, 'year', '2025') if year is None else str(year)
        try:
            if not self.select_primefaces_dropdown('xaxisVar', 'Month Wise'):
                raise Exception("Failed to set X-axis to Month Wise")
            time.sleep(2)

            if not self.select_primefaces_dropdown('yaxisVar', 'Maker'):
                raise Exception("Failed to set Y-axis to Maker")
            time.sleep(2)

            if not self.select_primefaces_dropdown('selectedYear', year):
                raise Exception(f"Failed to set Year to {year}")
            time.sleep(2)

            return True
        except Exception as e:
            logging.error(f"Error initializing filters: {str(e)}")
            return False

    def sanitize_filename(self, filename):
        """
        Turns a dropdown label into a name usable as a file or folder name.

        The characters ``<>:"/\\|?*`` are replaced by underscores, parenthesised
        parts such as "(123)" are removed and whitespace is collapsed.

        Args:
            filename (str): The name to sanitize.

        Returns:
            str: The sanitized name, or 'unnamed' when nothing is left (an empty
            name would otherwise resolve to the parent folder itself).
        """
        invalid_chars = '<>:"/\\|?*'
        filename = filename.translate(str.maketrans(invalid_chars, '_' * len(invalid_chars)))

        # Remove parenthesised parts (e.g. the "(123)" office count) with leading spaces
        filename = re.sub(r'\s*\([^)]*\)', '', filename)

        # Collapse runs of whitespace and trim both ends
        filename = ' '.join(filename.split())
        return filename or 'unnamed'

    def wait_for_download_complete(self, timeout=30):
        """
        Blocks while Chrome's temporary ``.crdownload`` files are present in the
        download folder.

        Args:
            timeout (int): The maximum time in seconds to wait.

        Raises:
            TimeoutError: If a ``.crdownload`` file is still present after
                ``timeout`` seconds.
        """
        deadline = time.monotonic() + timeout  # monotonic: immune to wall-clock adjustments
        while any(name.endswith('.crdownload') for name in os.listdir(self.date_folder)):
            if time.monotonic() > deadline:
                raise TimeoutError("Download timed out")
            time.sleep(1)

    def move_file_to_state_folder(self, rto_name, state, existing_files=None):
        """
        Moves the most recently modified downloaded Excel file into a
        state-specific folder and renames it after the RTO.  An existing file of
        the same name is never overwritten; a numeric suffix is appended instead.

        Args:
            rto_name (str): The name of the RTO.
            state (str): The name of the state.
            existing_files (iterable of str | None): File names that were in the
                download folder before the download was triggered.  They are
                ignored, so a stale leftover cannot be filed under this RTO.

        Returns:
            bool: True if the file was moved successfully, False otherwise.
        """
        try:
            self.wait_for_download_complete()

            ignored = set(existing_files) if existing_files is not None else set()
            files = [
                f for f in os.listdir(self.date_folder)
                if f.endswith('.xlsx') and f not in ignored
            ]
            if not files:
                raise FileNotFoundError("No Excel files found in download directory")

            # The newest candidate is taken to be the file just downloaded
            latest_file = max(
                (os.path.join(self.date_folder, f) for f in files),
                key=os.path.getmtime,
            )

            # State folder inside today's folder
            state_folder = os.path.join(self.date_folder, self.sanitize_filename(state))
            os.makedirs(state_folder, exist_ok=True)

            # Target name; append _1, _2, ... until it is unused
            sanitized_rto = self.sanitize_filename(rto_name)
            new_filepath = os.path.join(state_folder, f"{sanitized_rto}.xlsx")
            counter = 1
            while os.path.exists(new_filepath):
                new_filepath = os.path.join(state_folder, f"{sanitized_rto}_{counter}.xlsx")
                counter += 1

            os.rename(latest_file, new_filepath)
            logging.info(f"Successfully moved file to: {new_filepath}")
            return True

        except Exception as e:
            logging.error(f"Error in move_file_to_state_folder: {str(e)}")
            return False

    @staticmethod
    def _option_text(element):
        """Returns an option's visible text, falling back to its raw textContent when empty."""
        text = element.text
        if text:
            return text
        # NOTE(review): .text is presumably empty for options that are not displayed - confirm
        return (element.get_attribute('textContent') or '').strip()

    def process_state(self, state):
        """
        Processes a single state: selects it, then for each RTO (starting at
        ``self.current_rto_index``) selects the RTO, refreshes the report,
        downloads the Excel file and files it away.  Progress is saved after
        every successful RTO.

        Args:
            state (str): The option text of the state to process.

        Returns:
            bool: True when the RTO loop ran to the end (failures of individual
            RTOs are logged and skipped), False when the state itself could not
            be selected or its RTO list could not be read.
        """
        try:
            if not self.select_primefaces_dropdown(self.STATE_DROPDOWN_ID, state):
                raise Exception(f"Failed to select state: {state}")
            time.sleep(4)  # Let the page react to the state change

            # Open the RTO dropdown
            rto_dropdown_id = self.RTO_DROPDOWN_ID
            trigger = self.wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, f"#{rto_dropdown_id} .ui-selectonemenu-trigger")
            ))
            self.driver.execute_script("arguments[0].click();", trigger)
            time.sleep(1)

            # Read every RTO name up front; element handles are re-located per
            # iteration because they may go stale once the page refreshes.
            items_selector = f"#{rto_dropdown_id}_items .ui-selectonemenu-item"
            rto_names = [
                self._option_text(option)
                for option in self.driver.find_elements(By.CSS_SELECTOR, items_selector)
            ]

            failed_rtos = []
            for i in range(self.current_rto_index, len(rto_names)):
                rto_text = rto_names[i]

                if "All Vahan4 Running Office" in rto_text:
                    logging.info(f"Skipping RTO: {rto_text}")
                    continue

                try:
                    # Re-locate the option by position (assumes the option order
                    # is unchanged for the selected state - TODO confirm)
                    current_options = self.driver.find_elements(By.CSS_SELECTOR, items_selector)
                    if i >= len(current_options):
                        raise Exception(f"RTO option {i} is no longer present in the dropdown")
                    self.driver.execute_script("arguments[0].click();", current_options[i])
                    time.sleep(2)

                    # Refresh the data
                    refresh_button = self.wait.until(
                        EC.presence_of_element_located((By.ID, self.REFRESH_BUTTON_ID))
                    )
                    self.driver.execute_script("arguments[0].click();", refresh_button)
                    time.sleep(4)

                    # Remember what is already in the folder so only the new file is moved
                    files_before_download = set(os.listdir(self.date_folder))

                    # Download the Excel file
                    download_button = self.wait.until(
                        EC.presence_of_element_located((By.ID, self.DOWNLOAD_BUTTON_ID))
                    )
                    self.driver.execute_script("arguments[0].click();", download_button)
                    time.sleep(4)

                    if not self.move_file_to_state_folder(rto_text, state, existing_files=files_before_download):
                        raise Exception("Failed to move downloaded file")

                    # Persist progress after each successful RTO
                    self.current_rto_index = i + 1
                    self.save_progress()
                    logging.info(f"Successfully processed RTO: {rto_text} in state: {state}")

                except Exception as e:
                    logging.error(f"Error processing RTO {rto_text}: {str(e)}")
                    failed_rtos.append(rto_text)
                    continue  # Best effort: carry on with the next RTO

            if failed_rtos:
                logging.warning(f"{len(failed_rtos)} RTO(s) failed in state {state}: {failed_rtos}")
            return True

        except Exception as e:
            logging.error(f"Error processing state {state}: {str(e)}")
            return False

    def scrape_data(self):
        """
        Main scraping method.  Processes the states in ``STATES_UT`` one by one,
        starting at ``self.current_state_index``, using a fresh browser session
        per attempt.  A state that fails is retried after a delay and skipped
        once it has failed three times; progress is saved after every attempt.
        """
        states_ut = self.STATES_UT
        max_retries = 3  # Failures allowed per state before it is skipped
        retry_delay = 60  # Seconds to wait after a failed attempt

        while self.current_state_index < len(states_ut):
            attempt_failed = False
            try:
                # One browser session per attempt; the session opened by
                # __init__ (if still alive) is reused for the first attempt.
                if self.driver is None:
                    self.setup_driver()
                self.driver.get(self.DASHBOARD_URL)
                time.sleep(4)

                if not self.initialize_filters():
                    raise Exception("Failed to initialize filters")

                state = states_ut[self.current_state_index]
                if not self.process_state(state):
                    raise Exception(f"Failed to process state: {state}")

                # Update progress only after successful completion
                self.progress.setdefault('completed_states', {})[state] = True
                self.current_state_index += 1
                self.current_rto_index = 0  # Next state starts at its first RTO
                self.save_progress()
                logging.info(f"Successfully completed state: {state}")

            except Exception as e:
                logging.error(f"Session error: {str(e)}")
                attempt_failed = True

                # Per-state failure counter, kept in the progress file
                retry_key = f'retry_count_{self.current_state_index}'
                retry_count = self.progress.get(retry_key, 0) + 1
                self.progress[retry_key] = retry_count

                if retry_count >= max_retries:
                    logging.error(f"Skipping state {states_ut[self.current_state_index]} after {max_retries} failures")
                    self.current_state_index += 1
                    self.current_rto_index = 0

                self.save_progress()

            finally:
                # Always close the browser, even on KeyboardInterrupt
                self._quit_driver()

            if attempt_failed:
                # Wait only after the browser is closed, so it does not sit idle
                logging.info(f"Waiting {retry_delay} seconds before retrying...")
                time.sleep(retry_delay)

        logging.info("Scraping completed!")

if __name__ == "__main__":
    # Location of the Chrome executable that Selenium should drive.
    # VERY IMPORTANT: replace with the actual path on your machine.
    chrome_path = r"C:/Program Files (x86)/chrome-win64/chrome.exe"
    try:
        if os.path.exists(chrome_path):
            # Build the scraper and run the full scraping process
            scraper = VahanScraper(chrome_path)
            scraper.scrape_data()
        else:
            raise FileNotFoundError(f"Chrome binary not found at: {chrome_path}")
    except Exception as e:
        # Any failure during startup or scraping is logged and ends the process with exit code 1
        logging.critical(f"Fatal error: {str(e)}")
        sys.exit(1)