In [None]:
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Start a Chrome session through Selenium (assumes a working Chrome/WebDriver
# setup on this machine). This module-level `driver` is the object used by
# navigate_to_page() and by the scraping loop further down, and it is closed
# by the driver.quit() call in that loop's `finally` clause.
driver = webdriver.Chrome()

def navigate_to_page(page_number):
    """
    Navigate to the specified page number within the pagination table.

    The pagination tab whose visible text equals ``page_number`` is clicked,
    and the function then waits until the route links of the selected page
    are available in the DOM.

    Args:
        page_number (int | str): The page number to navigate to. It is
            converted with ``str()`` and compared against each tab's text.

    Returns:
        bool: True if the tab was found and clicked and route links are
        present afterwards; False if no tab carries that number or a
        Selenium error (timeout, stale element, failed click, ...) occurred.
    """
    route_locator = (By.CSS_SELECTOR, 'a.route[href]')
    # Upper bound (seconds) for the old page's links to disappear after a click.
    page_swap_timeout = 3

    try:
        # Wait for the pagination table to be present
        pagination_table = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'DC_117_paginationTable'))
        )

        # Find all page tabs within the pagination table
        page_tabs = pagination_table.find_elements(By.CLASS_NAME, 'DC_117_pageTabs')

        wanted_label = str(page_number)
        for tab in page_tabs:
            if tab.text.strip() != wanted_label:
                continue

            # Keep a handle on a link of the page shown *before* the click.
            # Route links are already present at this point, so waiting only
            # for their presence after the click would succeed immediately
            # and the caller could read the previous page's content.
            links_before_click = driver.find_elements(*route_locator)

            # Using ActionChains to click on the page tab
            ActionChains(driver).move_to_element(tab).click().perform()

            if links_before_click:
                try:
                    # NOTE(review): assumes the site replaces the route anchors
                    # when the page changes - confirm against the live DOM.
                    WebDriverWait(driver, page_swap_timeout).until(
                        EC.staleness_of(links_before_click[0])
                    )
                except TimeoutException:
                    # The old link never went stale: the requested page was
                    # probably already displayed, or the links are updated in
                    # place. Fall through to the plain presence check.
                    pass

            # Wait for elements with class="route" and href attributes to be present
            WebDriverWait(driver, 10).until(EC.presence_of_element_located(route_locator))
            return True  # Successfully navigated to the page

        return False  # No tab carries the requested page number
    except WebDriverException as e:
        # Every Selenium failure derives from WebDriverException; errors that
        # are not Selenium-related are deliberately left to propagate.
        print(f"Error navigating to page {page_number}: {str(e)}")
        return False

# List of URLs to scrape bus data from (one redbus.in "online-booking" landing
# page per entry). The scraping loop visits them in this order.
# NOTE(review): the first URL's utm_source value ("rtchometil") differs from
# the "rtchometile" used by the other tagged URLs - possibly a typo; confirm.
bus_links_dfs = [
    "https://www.redbus.in/online-booking/ktcl/?utm_source=rtchometil",
    "https://www.redbus.in/online-booking/tsrtc/?utm_source=rtchometile",
    "https://www.redbus.in/online-booking/chandigarh-transport-undertaking-ctu",
    "https://www.redbus.in/online-booking/pepsu/?utm_source=rtchometile",
    "https://www.redbus.in/online-booking/wbtc-ctc",
    "https://www.redbus.in/online-booking/apsrtc",
    "https://www.redbus.in/online-booking/ksrtc-kerala",
    "https://www.redbus.in/online-booking/rsrtc",
    "https://www.redbus.in/online-booking/astc",
    "https://www.redbus.in/online-booking/meghalaya-transport-corporation-mtc"
]

# Variable names for DataFrames to store bus route data.
# This list is index-aligned with bus_links_dfs: the data scraped from
# bus_links_dfs[i] is stored under the global name bus_routes_dfs[i], and the
# same names are used later when printing, saving and concatenating the data.
bus_routes_dfs = ["df_kt", "df_t", "df_ch", "df_pu", "df_wb", "df_ap", "df_kr", "df_rj", "df_as", "df_mg"]

def _collect_route_links(bus_links, bus_routes):
    """
    Record every route link shown on the page currently loaded in `driver`.

    Args:
        bus_links (list): Receives the `href` of each `a.route[href]` element.
        bus_routes (list): Receives the `title` attribute of the same elements.
    """
    for element in driver.find_elements(By.CSS_SELECTOR, 'a.route[href]'):
        href_value = element.get_attribute('href')
        route_title = element.get_attribute('title')
        bus_links.append(href_value)
        bus_routes.append(route_title)
        print(f"bus_link: {href_value}, bus_route: {route_title}")


# Main block: scrape bus data from every URL in bus_links_dfs, walk through the
# pagination of each one, and store the result in the DataFrame variable named
# at the same position in bus_routes_dfs.
try:
    # The window only needs to be maximized once, not once per URL.
    driver.maximize_window()

    for df_name, url in zip(bus_routes_dfs, bus_links_dfs):
        # Initialize lists to store data
        bus_links = []
        bus_routes = []

        try:
            # Load the initial webpage
            driver.get(url)

            # Wait for elements with class="route" and href attributes to be present
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'a.route[href]'))
            )

            try:
                # Wait for the pagination table to be present
                pagination_table = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'DC_117_paginationTable'))
                )
            except TimeoutException:
                # NOTE(review): presumably an operator whose routes fit on one
                # page has no pagination table - confirm. The links already
                # on screen are scraped instead of aborting the run.
                page_numbers = []
            else:
                # Only the tab labels are kept; navigate_to_page() looks the
                # tab elements up again on every call.
                page_tabs = pagination_table.find_elements(By.CLASS_NAME, 'DC_117_pageTabs')
                page_numbers = [tab.text.strip() for tab in page_tabs]

            if not page_numbers:
                _collect_route_links(bus_links, bus_routes)

            # Loop through each page
            for page_number in page_numbers:
                if navigate_to_page(page_number):
                    _collect_route_links(bus_links, bus_routes)
                else:
                    print(f"Failed to navigate to page {page_number}")
        except WebDriverException as e:
            # A failure on one operator must not prevent the remaining ones
            # from being scraped; whatever was collected so far is kept.
            print(f"Failed to scrape {url}: {e}")

        # Always create the DataFrame (possibly empty) so that the code that
        # later reads these globals by name finds every one of them defined.
        globals()[df_name] = pd.DataFrame({
            'bus_link': bus_links,
            'bus_route': bus_routes
        })

finally:
    # Ensure that the WebDriver is closed properly after the scraping is
    # complete, even when an unexpected error interrupts the loop.
    driver.quit()

# Display every scraped DataFrame, in the order given by bus_routes_dfs,
# so the collected data can be inspected.
for df_name in bus_routes_dfs:
    scraped_frame = globals()[df_name]
    print(scraped_frame)

# Write each DataFrame to its own CSV file; the file name embeds the name of
# the variable holding the data, and the row index is left out.
for df_name in bus_routes_dfs:
    csv_path = f'bus_routes_{df_name}.csv'
    globals()[df_name].to_csv(csv_path, index=False)


In [4]:
# Concatenate all individual DataFrames into a single DataFrame.
# This combines the data scraped from every source into one table for easier
# analysis; ignore_index=True renumbers the rows from 0 in the result.
df_concatinated_data = pd.concat(
    [df_kt, df_t, df_ch, df_pu, df_wb, df_ap, df_kr, df_rj, df_as, df_mg],
    ignore_index=True,
)

# Print the column names of the concatenated DataFrame to verify.
# A bare expression is only displayed when it is the last statement of a
# notebook cell (and never in a script), so the value is printed explicitly.
print(df_concatinated_data.columns)

# Save the concatenated DataFrame to a CSV file.
# This file will contain all the scraped data combined into a single CSV file.
df_concatinated_data.to_csv('redbus_scarped_datas_routes_links.csv', index=False)