In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# Shared accumulator: navigate_through_pages() extends it with the bus-detail dicts
# it scrapes, and main() builds the output DataFrame from it.
all_bus_details = []

In [11]:
def initialize_driver():
    """Start a Chrome WebDriver session, maximize its window and return it."""
    chrome = webdriver.Chrome()
    chrome.maximize_window()
    return chrome

In [12]:
def load_page(driver, url, wait_seconds=5):
    """Open ``url`` in ``driver`` and pause so the page can finish rendering.

    Args:
        driver: Object exposing ``get(url)`` (a Selenium WebDriver in this notebook).
        url: Address to open.
        wait_seconds: Fixed delay in seconds applied after ``get`` returns.
            Defaults to 5, the pause this function has always used; a value
            of 0 or less skips the sleep entirely.
    """
    driver.get(url)
    if wait_seconds > 0:  # time.sleep() rejects negative values
        time.sleep(wait_seconds)

In [13]:
# Function to scrape bus routes
def scrape_bus_routes(driver):
    """Collect the link and display name of every route on the current page.

    Elements are looked up by the CSS class ``route``.

    Returns:
        A ``(links, names)`` tuple of two equally long lists: ``links`` holds
        each element's ``href`` attribute and ``names`` holds its visible text
        with surrounding whitespace stripped.
    """
    anchors = driver.find_elements(By.CLASS_NAME, 'route')

    links = []
    for anchor in anchors:
        links.append(anchor.get_attribute('href'))

    names = []
    for anchor in anchors:
        names.append(anchor.text.strip())

    return links, names

In [14]:
# Function to scrape bus details
def _element_text(elements, index, default="NA"):
    """Return ``elements[index].text``, or ``default`` when that position does not exist."""
    if index < len(elements):
        return elements[index].text
    return default


def _no_bus_record(route_name, url):
    """Build the placeholder row emitted when a route's bus list cannot be read."""
    record = {"Route_Name": route_name, "Route_Link": url}
    for field in ("Bus_Name", "Bus_Type", "Departing_Time", "Duration",
                  "Reaching_Time", "Star_Rating", "Price", "Seat_Availability"):
        record[field] = "NA"
    record["Bus_Exist"] = "NO"
    return record


def scrape_bus_details(driver, url, route_name):
    """Open a route page and return one dict per bus listed on it.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        url: Address of the route page.
        route_name: Human-readable route label copied into every row.

    Returns:
        * ``[]`` when the page itself cannot be opened (the error is printed).
        * A single placeholder row with ``"Bus_Exist": "NO"`` and ``"NA"`` in
          every bus field when the "View Buses" button cannot be clicked or
          reading the listing fails.
        * Otherwise a list with one row per bus-name element found (empty if
          the page shows none). A field whose element is missing for a given
          position is filled with ``"NA"`` (``"0"`` for ``Star_Rating`` and
          ``Seat_Availability``) instead of discarding the whole route.
    """
    try:
        driver.get(url)
        time.sleep(5)  # Allow the page to load
    except Exception as e:
        print(f"Error occurred while accessing {url}: {str(e)}")
        return []

    try:
        # Click the "View Buses" button; a timeout here lands in the handler below.
        view_buses_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CLASS_NAME, "button"))
        )
        driver.execute_script("arguments[0].click();", view_buses_button)
        time.sleep(5)  # Wait for buses to load

        # Scroll to the bottom once so that further bus items get a chance to load.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)  # Wait for the page to load more content

        # find_elements() returns an empty list when nothing matches, so no
        # per-lookup fallback is needed.
        # NOTE(review): the dotted CLASS_NAME values rely on Selenium turning
        # CLASS_NAME into a CSS class selector - confirm for the installed version.
        bus_name_elements = driver.find_elements(By.CLASS_NAME, "travels.lh-24.f-bold.d-color")
        bus_type_elements = driver.find_elements(By.CLASS_NAME, "bus-type.f-12.m-top-16.l-color.evBus")
        departing_time_elements = driver.find_elements(By.CLASS_NAME, "dp-time.f-19.d-color.f-bold")
        duration_elements = driver.find_elements(By.CLASS_NAME, "dur.l-color.lh-24")
        reaching_time_elements = driver.find_elements(By.CLASS_NAME, "bp-time.f-19.d-color.disp-Inline")
        star_rating_elements = driver.find_elements(By.XPATH, "//div[@class='rating-sec lh-24']")
        price_elements = driver.find_elements(By.CLASS_NAME, "fare.d-block")
        # XPath covers both seat-availability class variants.
        seat_availability_elements = driver.find_elements(
            By.XPATH,
            "//div[contains(@class, 'seat-left m-top-30') or contains(@class, 'seat-left m-top-16')]",
        )

        # NOTE: the lists are paired purely by position. If a bus lacks one of
        # the elements (e.g. no rating yet), values of later buses shift up.
        bus_details = []
        for i in range(len(bus_name_elements)):
            if i < len(seat_availability_elements):
                seats = ''.join(filter(str.isdigit, seat_availability_elements[i].text))
            else:
                seats = '0'
            bus_details.append({
                "Route_Name": route_name,
                "Route_Link": url,
                "Bus_Name": bus_name_elements[i].text,
                "Bus_Type": _element_text(bus_type_elements, i),
                "Departing_Time": _element_text(departing_time_elements, i),
                "Duration": _element_text(duration_elements, i),
                "Reaching_Time": _element_text(reaching_time_elements, i),
                "Star_Rating": _element_text(star_rating_elements, i, '0'),
                "Price": _element_text(price_elements, i).replace("INR ", ""),
                "Seat_Availability": seats,
                "Bus_Exist": "YES",
            })
        return bus_details

    except Exception:
        # Best-effort: any failure while expanding or reading the listing is
        # recorded as "no buses" for this route rather than aborting the run.
        return [_no_bus_record(route_name, url)]

In [15]:
# Function to navigate through pages
def navigate_through_pages(URL):
    """Scrape every paginated route listing under ``URL``.

    One browser session is opened for the whole operator. For each listing
    page the routes are collected with ``scrape_bus_routes`` and each route is
    passed to ``scrape_bus_details``; the resulting rows are appended to the
    module-level ``all_bus_details`` list. Errors are printed, never raised:
    a failure on one page does not stop the remaining pages.

    Args:
        URL: Address of the operator's route-listing page.

    Returns:
        None. Results are accumulated in ``all_bus_details``.
    """
    driver = None  # lets ``finally`` cope with a failed browser start-up
    try:
        driver = initialize_driver()
        load_page(driver, URL)

        # Find total number of pages; fall back to a single page when the
        # pagination tabs cannot be located (e.g. short listings).
        num_pages = 1
        try:
            page_tabs = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "DC_117_pageTabs"))
            )
            num_pages = max(len(page_tabs), 1)
            print(f"Total pages found: {num_pages}")
        except Exception as e:
            print(f"Error locating pagination tabs: {e}")
            print("Assuming a single page of routes.")

        # Loop through each page
        for page in range(1, num_pages + 1):
            try:
                print(f"Scraping page {page}...")

                # Handle pagination for pages > 1
                if page > 1:
                    # scrape_bus_details() left the browser on a route page, so
                    # go back to the listing with the same session (no new browser).
                    load_page(driver, URL)

                    # Locate the tab afresh after the reload to avoid stale references.
                    pagination_tab = WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable(
                            (By.XPATH, f"//div[contains(@class, 'DC_117_pageTabs') and text()='{page}']")
                        )
                    )
                    driver.execute_script("arguments[0].scrollIntoView();", pagination_tab)
                    driver.execute_script("arguments[0].click();", pagination_tab)

                    # Wait for routes to load
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_all_elements_located((By.CLASS_NAME, "route"))
                    )

                # Scrape routes from the current page
                bus_routes_link, bus_routes_name = scrape_bus_routes(driver)

                # Scrape details for each route
                for link, name in zip(bus_routes_link, bus_routes_name):
                    print(f"Scraping route: {name}")
                    bus_details = scrape_bus_details(driver, link, name)
                    if bus_details:
                        all_bus_details.extend(bus_details)

            except Exception as e:
                print(f"Error navigating to page {page}: {e}")

    except Exception as e:
        print(f"Error occurred while navigating pages: {e}")

    finally:
        if driver is not None:
            driver.quit()

In [19]:
def main(urls=None, output_path='bus_details.csv'):
    """Scrape every operator listing and save the collected rows as CSV.

    Args:
        urls: Iterable of operator listing URLs to scrape. ``None`` (default)
            uses the built-in list of redBus state-transport pages.
        output_path: Destination of the CSV file. Defaults to
            ``'bus_details.csv'`` in the working directory.

    The shared ``all_bus_details`` list is emptied first, so calling ``main()``
    again in the same kernel does not duplicate rows from an earlier run.
    """
    if urls is None:
        urls = ['https://www.redbus.in/online-booking/astc', 'https://www.redbus.in/online-booking/tnstc',
                'https://www.redbus.in/online-booking/upsrtc', 'https://www.redbus.in/online-booking/tsrtc',
                'https://www.redbus.in/online-booking/rsrtc', 'https://www.redbus.in/online-booking/gsrtc',
                'https://www.redbus.in/online-booking/apsrtc', 'https://www.redbus.in/online-booking/jksrtc',
                'https://www.redbus.in/online-booking/hrtc', 'https://www.redbus.in/online-booking/south-bengal-state-transport-corporation-sbstc',
                'https://www.redbus.in/online-booking/puducherry-road-transport-corporation-prtc', 'https://www.redbus.in/online-booking/ksrtc-karnataka',
                'https://www.redbus.in/online-booking/ksrtc-kerala']

    # Start from a clean slate; cleared in place because other functions
    # hold a reference to this same list object.
    all_bus_details.clear()

    for operator_url in urls:
        navigate_through_pages(operator_url)
        # Re-saved after every operator (as the original loop did), so the rows
        # gathered so far are already on disk if a later operator is interrupted.
        pd.DataFrame(all_bus_details).to_csv(output_path, index=False)
    

In [18]:
# Entry point: start the full scrape when this cell/script runs as the main module.
if __name__ == "__main__":
    main()

Total pages found: 5
Scraping page 1...
Scraping route: Tezpur to Guwahati
Scraping route: Guwahati to Tezpur
Scraping route: Nagaon (Assam) to Guwahati
Scraping route: Guwahati to Nagaon (Assam)
Scraping route: Goalpara to Guwahati
Scraping route: Dhubri to Guwahati
Scraping route: Sibsagar (Assam) to North Lakhimpur
Scraping route: North Lakhimpur to Sibsagar (Assam)
Scraping route: Guwahati to Dhubri
Scraping route: Jorhat to North Lakhimpur
Scraping page 2...
Scraping route: Dhekiajuli to Guwahati
Scraping route: Jorhat to Dibrugarh
Scraping route: North Lakhimpur to Jorhat
Scraping route: Jorhat to Dhemaji
Scraping route: Dhemaji to Jorhat
Scraping route: Jorhat to Tinsukia
Scraping route: Tezpur to Dibrugarh
Scraping route: Tinsukia to Jorhat
Scraping route: Dibrugarh to Jorhat
Scraping route: North Lakhimpur to Dibrugarh
Scraping page 3...
Scraping route: Guwahati to Biswanath Charali
Scraping route: Biswanath Charali to Guwahati
Scraping route: Biswanath Charali to Dibrugarh
Sc