In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from datetime import datetime
from typing import Tuple, Optional
import logging
import time
import json
import os
import sys
import random

# --- Configuration Constants ---
BASE_DOWNLOAD_PATH = r"C:\Users\ASUS\OneDrive\Documents\Ajax Reports\State Wise"  # Make configurable later
PROGRESS_FILE_NAME = 'scraping_progress.json'
MAX_RETRIES = 3
RETRY_DELAY = 40  # seconds
TIMEOUT_WEBDRIVER = 30 # seconds for WebDriverWait

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('vahan_scraper.log'),
        logging.StreamHandler()
    ]
)

class VahanScraper:
    def __init__(self, chrome_binary_path):
        self.chrome_binary_path = chrome_binary_path
        self.driver = None
        self.wait = None
        self.actions = None
        self.current_state_index = 0
        self.base_download_path = BASE_DOWNLOAD_PATH
        self.progress_file = PROGRESS_FILE_NAME
        self.max_retries = MAX_RETRIES
        self.retry_delay = RETRY_DELAY
        self.timeout_webdriver = TIMEOUT_WEBDRIVER

        # Initialize IDs to None
        self.state_dropdown_id = None
        self.refresh_button_id = None
        self.excel_icon_id = None

        self._setup_directories()
        self._load_or_init_progress()
        self._initialize_driver()


    def _setup_directories(self):
        """Sets up the base download directory and date-specific folder."""
        if not os.path.exists(self.base_download_path):
            os.makedirs(self.base_download_path)

        self.current_date = datetime.now().strftime("%d-%m-%Y")
        self.date_folder = os.path.join(self.base_download_path, self.current_date)
        if not os.path.exists(self.date_folder):
            os.makedirs(self.date_folder)

    def _load_or_init_progress(self):
        """Loads progress from file or initializes new progress."""
        if os.path.exists(self.progress_file):
            user_input = input("Previous progress file found. Continue from last session? (y/n): ")
            if user_input.lower() != 'y':
                os.remove(self.progress_file)
        self.load_progress()

    def _configure_chrome_options(self):
        """Configures Chrome options for the webdriver."""
        chrome_options = webdriver.ChromeOptions()
        chrome_options.binary_location = self.chrome_binary_path
        chrome_options.add_argument('--start-maximized')
        chrome_options.add_argument('--disable-popup-blocking')
        chrome_options.add_argument('--disable-notifications')
        chrome_options.add_experimental_option('prefs', {
            'download.default_directory': self.date_folder,
            'download.prompt_for_download': False,
            'download.directory_upgrade': True,
            'safebrowsing.enabled': True,
            'profile.default_content_setting_values.automatic_downloads': 1,
        })
        return chrome_options

    def _initialize_driver(self):
        """Initializes the Chrome webdriver."""
        try:
            chrome_options = self._configure_chrome_options()
            self.driver = webdriver.Chrome(options=chrome_options)
            self.wait = WebDriverWait(self.driver, self.timeout_webdriver)
            self.actions = ActionChains(self.driver)
        except Exception as e:
            logging.critical(f"Failed to initialize webdriver: {e}")
            raise

    def load_progress(self):
        """Load progress from file if it exists"""
        if os.path.exists(self.progress_file):
            with open(self.progress_file, 'r') as f:
                self.progress = json.load(f)
                self.current_state_index = int(self.progress.get('current_state_index', 0))
        else:
            self.progress = {
                'current_state_index': 0,
                'completed_states': {},
            }

    def save_progress(self):
        """Save progress to file"""
        self.progress['current_state_index'] = self.current_state_index
        with open(self.progress_file, 'w') as f:
            json.dump(self.progress, f)

    def select_primefaces_dropdown(self, dropdown_id: str, option_text: str, max_retries: int = 3) -> bool:
        """
        Handle PrimeFaces dropdown selection with retries.
        """
        for attempt in range(max_retries):
            try:
                # Open the dropdown using the dynamically retrieved ID
                trigger = self.wait.until(EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, f"#{dropdown_id} .ui-selectonemenu-trigger")
                ))
                self.driver.execute_script("arguments[0].click();", trigger)
                time.sleep(1)

                # Select the desired option
                option_xpath = f"//li[contains(@class, 'ui-selectonemenu-item') and text()='{option_text}']"
                option = self.wait.until(EC.element_to_be_clickable((By.XPATH, option_xpath)))
                self.driver.execute_script("arguments[0].click();", option)
                time.sleep(1)

                logging.info(f"Selected '{option_text}' from dropdown '{dropdown_id}'")
                return True

            except Exception as e:
                if attempt == max_retries - 1:
                    logging.error(f"Error selecting '{option_text}' from PrimeFaces dropdown '{dropdown_id}': {str(e)}")
                    return False
                logging.warning(f"Retry {attempt + 1} for dropdown '{dropdown_id}'")
                time.sleep(2)

    def initialize_filters(self):
        """Initialize all dropdown filters"""
        try:
            # Set X-axis to "Month Wise"
            if not self.select_primefaces_dropdown('xaxisVar', 'Month Wise'):
                raise Exception("Failed to set X-axis to Month Wise")
            time.sleep(2)

            # Set Y-axis to "Maker"
            if not self.select_primefaces_dropdown('yaxisVar', 'Maker'):
                raise Exception("Failed to set Y-axis to Maker")
            time.sleep(2)

            # Set year to 2024  (Corrected to 2025 as per original logic)
            if not self.select_primefaces_dropdown('selectedYear', '2025'):
                raise Exception("Failed to set Year to 2025")
            time.sleep(2)

            return True
        except Exception as e:
            logging.error(f"Error initializing filters: {str(e)}")
            return False

    def sanitize_filename(self, filename):
        """Sanitize filename by removing everything after '('"""
        if '(' in filename:
            filename = filename.split('(')[0].strip()  # Remove everything after '('
        return filename

    def wait_for_download_complete(self, timeout=30):
        """Wait for the download to complete by checking if the file is no longer being written"""
        start_time = time.time()
        while True:
            # Check if any .crdownload files exist (Chrome's temporary download files)
            if not any(f.endswith('.crdownload') for f in os.listdir(self.date_folder)):
                break
            if time.time() - start_time > timeout:
                raise Exception("Download timed out")
            time.sleep(1)

    def rename_file_to_state(self, state):
        """Rename the most recently downloaded file to the state name"""
        try:
            # Get the most recent Excel file in the download directory
            files = [f for f in os.listdir(self.date_folder) if f.endswith('.xlsx')]
            if not files:
                raise FileNotFoundError("No Excel files found in download directory")

            # Find the most recently downloaded file
            latest_file = max(
                [os.path.join(self.date_folder, f) for f in files],
                key=os.path.getctime  # Use creation time to find the most recent file
            )

            # Log the latest file for debugging
            logging.info(f"Latest downloaded file: {latest_file}")

            # Create sanitized state name
            sanitized_state = self.sanitize_filename(state)
            new_filename = f"{sanitized_state}.xlsx"
            new_filepath = os.path.join(self.date_folder, new_filename)

            # Handle duplicate filenames
            counter = 1
            while os.path.exists(new_filepath):
                new_filename = f"{sanitized_state}_{counter}.xlsx"
                new_filepath = os.path.join(self.date_folder, new_filename)
                counter += 1

            # Log the new file path for debugging
            logging.info(f"Renaming file to: {new_filepath}")

            # Rename the file
            os.rename(latest_file, new_filepath)
            logging.info(f"Successfully renamed file to: {new_filepath}")
            return True

        except Exception as e:
            logging.error(f"Error renaming file: {str(e)}")
            return False

    def get_dropdown_label(self) -> Optional[str]:
        """Dynamically retrieve the label of the state dropdown."""
        try:
            # Locate the dropdown label by its parent container
            dropdown_label = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//label[text()='State:']/following-sibling::div//label[contains(@class, 'ui-selectonemenu-label')]")
                )
            )
            label_text = dropdown_label.text.strip()
            logging.info(f"Dropdown label found: {label_text}")
            return label_text
        except Exception as e:
            logging.error(f"Failed to locate dropdown label: {str(e)}")
            return None

    def get_dynamic_ids(self) -> Tuple[Optional[str], Optional[str], Optional[str]]:
        """Dynamically retrieve HTML IDs for state dropdown, refresh button, and Excel icon."""
        state_id = None
        refresh_button_id = None
        excel_icon_id = None
        try:
            # Wait for the label "State:" to load
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//label[text()='State:']"))
            )

            # Locate the dropdown container using the label
            state_dropdown = self.driver.find_element(
                By.XPATH,
                "//label[text()='State:']/following-sibling::div[contains(@class, 'ui-selectonemenu')]"
            )
            state_id = state_dropdown.get_attribute("id")
            logging.info(f"State dropdown ID found: {state_id}")

            # Find the refresh button using a more specific XPath
            refresh_buttons = self.driver.find_elements(
                By.XPATH,
                "//button[contains(@class, 'ui-button') and contains(text(), 'Refresh')]"
            )
            if refresh_buttons:
                refresh_button_id = refresh_buttons[0].get_attribute("id")

            # Find the Excel icon using a more specific XPath
            excel_icons = self.driver.find_elements(
                By.XPATH,
                "//a[contains(@class, 'ui-button') and .//span[contains(@class, 'ui-icon-excel')]]"
            )
            if excel_icons:
                excel_icon_id = excel_icons[0].get_attribute("id")

            logging.info(f"Successfully located elements - State ID: {state_id}, Refresh ID: {refresh_button_id}, Excel ID: {excel_icon_id}")
            return state_id, refresh_button_id, excel_icon_id

          
    def refresh_data(self, refresh_button_id: str) -> bool:
        """Refresh the data using the refresh button ID."""
        try:
            refresh_button = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable((By.ID, refresh_button_id))
            )
            self.driver.execute_script("arguments[0].click();", refresh_button)

            # Wait for data to refresh (staleness of the button indicates refresh completion)
            WebDriverWait(self.driver, 10).until(
                EC.staleness_of(refresh_button)
            )
            return True
        except Exception as e:
            logging.error(f"Failed to refresh data: {str(e)}")
            return False

    

    def find_rto_dropdown(self) -> Optional[str]:
        """Find the RTO dropdown ID dynamically using PrimeFaces structure."""
        try:
            # Look for RTO dropdown with PrimeFaces-specific attributes
            rto_dropdown = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "div.ui-selectonemenu[id*='selectedRto']")
                )
            )
            return rto_dropdown.get_attribute("id")
        except Exception as e:
            logging.error(f"Failed to find RTO dropdown: {str(e)}")
            return None


    def handle_retry(self, state: str) -> None:
        """Handle retry logic for failed states."""
        try:
            retry_count = self.progress.get(f'retry_count_{self.current_state_index}', 0) + 1
            self.progress[f'retry_count_{self.current_state_index}'] = retry_count

            if retry_count >= self.max_retries:
                logging.warning(f"Skipping state {state} after {self.max_retries} failures")
                self.current_state_index += 1

            self.save_progress()

            logging.info(f"Waiting {self.retry_delay} seconds before retrying...")
            time.sleep(self.retry_delay)
        except Exception as e:
            logging.error(f"Error in retry handling: {str(e)}")


    def process_state(self, state: str) -> bool:
        """Process a single state and download the Excel file for 'All Vahan4 Running Office'"""
        success = False
        try:
            # Now using the IDs retrieved in scrape_data
            state_id = self.state_dropdown_id
            refresh_button_id = self.refresh_button_id
            excel_icon_id = self.excel_icon_id

            # Select the state
            if not self.select_primefaces_dropdown(state_id, state):
                raise Exception(f"Failed to select state: {state}")

            # Open the RTO dropdown
            rto_dropdown_id = self.find_rto_dropdown()
            if not rto_dropdown_id:
                raise Exception("Failed to locate RTO dropdown")

            # Select 'All Vahan4 Running Office(1373/1434)'
            if not self.select_rto_option(rto_dropdown_id):
                raise Exception("Failed to select RTO option")

            # Refresh the data
            if not self.refresh_data(refresh_button_id):
                raise Exception("Failed to refresh data")

            # Download the Excel file
            if not self.download_excel(excel_icon_id): # Assuming you have a download_excel method
                raise Exception("Failed to download Excel file")

            # Rename the file to the state name
            if not self.rename_file_to_state(state):
                raise Exception("Failed to rename downloaded file")

            logging.info(f"Successfully processed state: {state}")
            success = True
            return success

        except Exception as e:
            logging.error(f"Error processing state {state}: {str(e)}")
            self.handle_retry(state)
            return success

    def select_rto_option(self, rto_dropdown_id: str) -> bool:
        """Select the 'All Vahan4 Running Office' option dynamically."""
        try:
            # Open the RTO dropdown
            trigger = self.wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, f"#{rto_dropdown_id} .ui-selectonemenu-trigger")
            ))
            self.driver.execute_script("arguments[0].click();", trigger)
            time.sleep(1)

            # Select the desired option
            option_xpath = "//li[contains(@class, 'ui-selectonemenu-item') and contains(text(), 'All Vahan4 Running Office')]"
            option = self.wait.until(EC.element_to_be_clickable((By.XPATH, option_xpath)))
            self.driver.execute_script("arguments[0].click();", option)
            time.sleep(1)

            logging.info("Selected 'All Vahan4 Running Office' from RTO dropdown")
            return True

        except Exception as e:
            logging.error(f"Failed to select RTO option: {str(e)}")
            return False

    def download_excel(self, excel_icon_id: str) -> bool: # Added dummy download_excel method
        """Dummy method for downloading excel - replace with your actual logic"""
        try:
            logging.info(f"Downloading Excel using icon ID: {excel_icon_id}")
            excel_button = self.wait.until(EC.element_to_be_clickable((By.ID, excel_icon_id)))
            self.driver.execute_script("arguments[0].click();", excel_button)
            self.wait_for_download_complete() # Assuming you have this method
            return True
        except Exception as e:
            logging.error(f"Failed to download excel: {e}")
            return False


    def scrape_data(self):
        """Main scraping method with recovery mechanism"""
        states_ut = [
            'Andaman & Nicobar Island(3)',
            'Andhra Pradesh(83)',
            'Arunachal Pradesh(29)',
            'Assam(33)',
            'Bihar(48)',
            'Chhattisgarh(30)',
            'Chandigarh(1)',
            'UT of DNH and DD(3)',
            'Delhi(16)',
            'Goa(13)',
            'Gujarat(37)',
            'Himachal Pradesh(96)',
            'Haryana(98)',
            'Jharkhand(25)',
            'Jammu and Kashmir(21)',
            'Karnataka(68)',
            'Kerala(87)',
            'Ladakh(3)',
            'Lakshadweep(4)',
            'Maharashtra(56)',
            'Meghalaya(13)',
            'Manipur(12)',
            'Madhya Pradesh(53)',
            'Mizoram(10)',
            'Nagaland(9)',
            'Odisha(39)',
            'Punjab(96)',
            'Puducherry(8)',
            'Rajasthan(59)',
            'Sikkim(9)',
            'Tamil Nadu(148)',
            'Tripura(9)',
            'Uttarakhand(21)',
            'Uttar Pradesh(77)',
            'West Bengal(57)'
        ]

        max_retries = 3
        retry_delay = 60  # seconds

        # --- MOVE setup_driver() HERE, BEFORE the try block ---
        # Start new session for initial ID retrieval
        self.setup_driver()
        try: # Added try-except block for initial ID retrieval
            self.driver.get('https://vahan.parivahan.gov.in/vahan4dashboard/vahan/view/reportview.xhtml')
            time.sleep(2)

            # Initialize filters - do this before getting IDs as it might affect element presence
            if not self.initialize_filters():
                raise Exception("Failed to initialize filters before ID retrieval")

            # Dynamically retrieve IDs ONCE at the beginning
            self.state_dropdown_id, self.refresh_button_id, self.excel_icon_id = self.get_dynamic_ids()

            # Check if IDs were retrieved successfully
            if not all([self.state_dropdown_id, self.refresh_button_id, self.excel_icon_id]):
                logging.critical("Failed to retrieve all dynamic element IDs. Scraping cannot proceed.")
                return  # Exit the scrape_data method if IDs are missing

            logging.info("Successfully retrieved dynamic element IDs. Proceeding with scraping.")

        except Exception as initial_id_error:
            logging.critical(f"Fatal error during initial ID retrieval: {initial_id_error}")
            return # Exit if initial ID retrieval fails
        finally: # Ensure driver is quit even if initial ID retrieval fails
            if self.driver:
                try:
                    self.driver.quit()
                    self.driver = None # Reset driver to None after quitting
                except:
                    pass


        while self.current_state_index < len(states_ut):
            try:
                # Start new session for each state (if you still want to do this)
                self.setup_driver() # Re-setup driver for each state as before
                self.driver.get('https://vahan.parivahan.gov.in/vahan4dashboard/vahan/view/reportview.xhtml')
                time.sleep(2)

                # Initialize filters (you might want to do this for each state or only once initially - depends on website behavior)
                if not self.initialize_filters():
                    raise Exception("Failed to initialize filters for state processing")


                # Process the current state - now using stored IDs
                state = states_ut[self.current_state_index]
                if self.process_state(state):
                    # Update progress only after successful completion
                    self.progress['completed_states'][state] = True
                    self.save_progress()
                    logging.info(f"Successfully completed state: {state}")
                else:
                    raise Exception(f"Failed to process state: {state}")

            except Exception as e:
                logging.error(f"Session error during state processing: {str(e)}")

                retry_count = self.progress.get(f'retry_count_{self.current_state_index}', 0) + 1
                self.progress[f'retry_count_{self.current_state_index}'] = retry_count

                if retry_count >= max_retries:
                    logging.error(f"Skipping state {states_ut[self.current_state_index]} after {max_retries} failures")
                    self.current_state_index += 1

                self.save_progress()

                logging.info(f"Waiting {retry_delay} seconds before retrying...")
                time.sleep(retry_delay)

            finally:
                if self.driver:
                    try:
                        self.driver.quit()
                        self.driver = None # Reset driver to None after quitting
                    except:
                        pass

        logging.info("Scraping completed!")

if __name__ == "__main__":
    try:
        chrome_path = r"C:/Program Files (x86)/chrome-win64/chrome.exe"
        if not os.path.exists(chrome_path):
            raise FileNotFoundError(f"Chrome binary not found at: {chrome_path}")

        scraper = VahanScraper(chrome_path)
        scraper.scrape_data()
    except Exception as e:
        logging.critical(f"Fatal error: {str(e)}")
        sys.exit(1)

SyntaxError: expected 'except' or 'finally' block (703238.py, line 282)

In [None]:
import os
import pandas as pd

# Folder containing the Excel files
folder_path = r"C:\Users\ASUS\OneDrive\Documents\Ajax Reports\State Wise\24-01-2025"  # Replace with the path to your folder

# Output directory and file path
output_directory = r"C:\Users\ASUS\OneDrive\Documents\Ajax Reports\State Wise"
output_file = os.path.join(output_directory, "maker_consolidated_data.xlsx")

# Initialize an empty DataFrame to store consolidated data
consolidated_data = pd.DataFrame()

# Loop through each file in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith(".xlsx"):
        # Extract the state name from the file name
        state_name = file_name.replace(".xlsx", "").strip()

        # Full path to the file
        file_path = os.path.join(folder_path, file_name)

        # Read the Excel file, skipping the first row and using the second row as the header
        df = pd.read_excel(file_path, skiprows=1, header=0)

        # Drop the third row (unnecessary merged headers)
        df = df.drop(0)  # Drop the first row of data (which is the third row in the file)

        # Reset the index after dropping the row
        df = df.reset_index(drop=True)

        # Clean column names by stripping leading/trailing spaces
        df.columns = df.columns.str.strip()

        # Debug: Print column names for inspection
        print(f"Columns in {file_name}: {df.columns.tolist()}")

        # Add a "State" column with the state name
        df["State"] = state_name

        # Reorder columns to match the desired format
        columns = ["State", "S No", "Maker"] + [col for col in df.columns if col not in ["State", "S No", "Maker"]]
        df = df[columns]

        # Append the transformed data to the consolidated DataFrame
        consolidated_data = pd.concat([consolidated_data, df], ignore_index=True)

# Save the consolidated data to the specified output file
consolidated_data.to_excel(output_file, index=False)

print(f"Consolidation complete! Data saved to {output_file}")