Bus Route and Link Extraction

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import time

# Browser session shared by the route-extraction code in this cell
driver = webdriver.Chrome()

# Parallel lists: the i-th route name belongs to the i-th route link
route_link = {column: [] for column in ('routename', 'routelink')}

def extract_route(url):
    """Collect route names and links from one operator page into ``route_link``.

    Loads ``url`` in the module-level ``driver``, scrolls down, then clicks
    every ``DC_117_pageTabs`` element inside the ``D117_main`` container and
    records the text and ``href`` of the first anchor in each
    ``route_details`` element.

    Extraction is best-effort: a failure after the page has been requested is
    reported on stdout and ends processing of this URL without raising.
    Note that ``driver.get`` itself runs outside the guarded region, so a
    failure to load the page still propagates to the caller.

    Args:
        url: Address of the page to scrape.
    """
    driver.get(url)
    time.sleep(5)  # Wait for the page to load
    try:
        driver.execute_script("window.scrollBy(0, 1600);")  # Scroll down to load elements
        time.sleep(2)  # Allow time for elements to load after scroll
        web_container = driver.find_element(By.CLASS_NAME, 'D117_main')
        page_tabs = web_container.find_elements(By.CLASS_NAME, 'DC_117_pageTabs')
        for page in page_tabs:
            page.click()  # Click on each page tab
            time.sleep(5)  # Wait for the page content to load
            for element in web_container.find_elements(By.CLASS_NAME, 'route_details'):
                try:
                    anchor = element.find_element(By.TAG_NAME, 'a')
                    # Read both values before storing either one, so the two
                    # lists in route_link can never end up with different
                    # lengths when the second lookup fails.
                    name = anchor.text
                    link = anchor.get_attribute('href')
                except Exception as exc:
                    # Still best-effort, but no longer silent.
                    print(f"Skipping a route entry on {url}: {exc!r}")
                    continue
                route_link['routename'].append(name)
                route_link['routelink'].append(link)
            time.sleep(1)  # Brief pause before moving to the next tab
    except Exception as exc:
        # Keep going with the next URL, but tell the user this one was cut short.
        print(f"Route extraction stopped early for {url}: {exc!r}")

# URLs of the operator pages to extract route information from
# (kept under the name ``bus_dict`` although it is a list)
bus_dict = [
    'https://www.redbus.in/online-booking/apsrtc',
    'https://www.redbus.in/online-booking/tsrtc',
    'https://www.redbus.in/online-booking/rsrtc',
    'https://www.redbus.in/online-booking/hrtc',
    'https://www.redbus.in/online-booking/uttar-pradesh-state-road-transport-corporation-upsrtc',
    'https://www.redbus.in/online-booking/pepsu',
    'https://www.redbus.in/online-booking/pepsu-punjab',
    'https://www.redbus.in/online-booking/ksrtc-kerala',
    'https://www.redbus.in/online-booking/sikkim-nationalised-transport-snt',
    'https://www.redbus.in/online-booking/meghalaya-transport-corporation-mtc'
]

# Extract route information from each URL in the list.
# The browser is closed in ``finally`` so it does not stay open when a page
# fails to load (``extract_route`` calls ``driver.get`` outside its own try).
try:
    for url in bus_dict:
        extract_route(url)
finally:
    driver.quit()  # Close the browser

# Convert the route data to a DataFrame and save it as a CSV file
route_data = pd.DataFrame(route_link)
route_data.to_csv('route_data.csv', index=False)



Bus Details Extraction

In [None]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

# One list per output column; entries at the same index describe the same bus
route_dict = {
    column: []
    for column in (
        'busname', 'bustype', 'starttime', 'endtime',
        'dploc', 'ratings', 'fare', 'seats',
    )
}

# Start the Chrome session used by the functions below
driver = webdriver.Chrome()

def scroll(driver, pause=2, max_rounds=None):
    """Scroll to the bottom of the page until its height stops growing.

    After every scroll the function waits ``pause`` seconds and re-reads
    ``document.body.scrollHeight``; it stops as soon as two consecutive
    readings are equal.

    Args:
        driver: Object exposing ``execute_script(js)``, e.g. a Selenium
            WebDriver.
        pause: Seconds to wait after each scroll before measuring again.
            Defaults to 2, the previously hard-coded value.
        max_rounds: Upper bound on the number of scrolls, or ``None`` (the
            default) for no limit. Set it to avoid looping forever on a page
            whose height keeps increasing.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    rounds = 0
    while max_rounds is None or rounds < max_rounds:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break  # Nothing new was loaded by the last scroll
        last_height = new_height
        rounds += 1

def extract_time(url):
    """Scrape the bus listings on one route page into ``route_dict``.

    Loads ``url`` in the module-level ``driver``, clicks the ``p-left-10``
    elements (in reverse order, only when more than one is found), scrolls
    to the bottom of the page and then reads one row per
    ``bus-item-details`` element.

    A row is appended to ``route_dict`` only once all of its fields have been
    read, so every column always has the same length. A listing that lacks a
    required field is skipped with a message on stdout; a missing rating is
    stored as ``"*"``.

    Args:
        url: Address of the route page to scrape.
    """
    # Local import: selenium is already a dependency of this file, and keeping
    # the import here leaves the file's import section untouched.
    from selenium.common.exceptions import NoSuchElementException

    driver.get(url)
    time.sleep(4)  # Wait for the page to load
    try:
        buttons = driver.find_elements(By.CLASS_NAME, 'p-left-10')
    except Exception:  # not a bare except: Ctrl-C must still interrupt the run
        buttons = []
    # NOTE(review): presumably these buttons expand collapsed listing groups — confirm.
    if len(buttons) > 1:
        for button in reversed(buttons):
            button.click()
            time.sleep(1)  # Allow time for content to load after click
    scroll(driver)
    time.sleep(2)  # Wait for content to load after scrolling

    # Output column -> CSS class of the element holding its text
    required_fields = (
        ('busname', 'travels'),
        ('bustype', 'bus-type'),
        ('starttime', 'dp-time'),
        ('endtime', 'bp-time'),
        ('dploc', 'dp-loc'),
        ('fare', 'fare'),
        ('seats', 'seat-left'),
    )
    for element in driver.find_elements(By.CLASS_NAME, 'bus-item-details'):
        time.sleep(3)  # Pause to allow for element loading
        try:
            row = {
                column: element.find_element(By.CLASS_NAME, css_class).text
                for column, css_class in required_fields
            }
        except NoSuchElementException as exc:
            print(f"Skipping incomplete bus entry on {url}: {exc!r}")
            continue
        try:
            rating = element.find_element(By.CLASS_NAME, 'rating')
            row['ratings'] = rating.find_element(By.TAG_NAME, 'span').text
        except Exception:
            row['ratings'] = "*"  # Default rating if not found
        # Store the finished row in one go so the columns stay aligned.
        for column, value in row.items():
            route_dict[column].append(value)

# Visit every route link collected earlier
# (assumes route_link is already defined and populated by the previous cell)
try:
    for i, link in enumerate(route_link['routelink']):
        print(f"Processing link {i + 1}/{len(route_link['routelink'])}: {link}")
        extract_time(link)
finally:
    # Close the WebDriver even when a page fails, so no browser is left running
    driver.quit()

# Convert the bus data to a DataFrame and save it as a CSV file
bus_data = pd.DataFrame(route_dict)
bus_data.to_csv('bus_data.csv', index=False)

