In [51]:
#1. JKRTC Bus Routes & Timings

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException
import pandas as pd
import time

num_pages = 3
# Initialize the WebDriver
driver = webdriver.Chrome()

# Function to extract route data from the current page
def extract_routes(driver):
    route_container = driver.find_element(By.CLASS_NAME, 'D117_main')
    route_links = route_container.find_elements(By.TAG_NAME, 'a')
    
    route_name = []
    route_link = []
    
    for link in route_links:
        href = link.get_attribute('href')
        name = link.get_attribute('title')
        route_link.append(href)
        route_name.append(name)
    
    return route_name, route_link

# Function to scroll to the bottom of the page
def scroll_to_bottom(driver):
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)  # Wait for new content to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

# Function to click an element
def click_element(driver, element):
    try:
        action = ActionChains(driver)
        action.move_to_element(element).perform()  # Move to the element
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable(element)).click()
    except ElementClickInterceptedException:
        print("Element is not clickable. It might be obscured.")
    except Exception as e:
        print(f"Exception occurred while clicking: {e}")

# Function to extract bus data from the current page
def extract_bus_data(driver, route_name, link):
    try:
        wait = WebDriverWait(driver, 30)
        buscontainer = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='clearfix bus-item']")))

        for bus in buscontainer:
            try:
                busname = bus.find_element(By.XPATH, ".//div[@class='travels lh-24 f-bold d-color']").text
            except:
                busname = "N/A"
            try:
                bustype = bus.find_element(By.XPATH, ".//div[@class='bus-type f-12 m-top-16 l-color evBus']").text
            except:
                bustype = "N/A"
            try:
                busstrttm = bus.find_element(By.XPATH, ".//div[@class='dp-time f-19 d-color f-bold']").text
            except:
                busstrttm = "N/A"
            try:
                busdur = bus.find_element(By.XPATH, ".//div[@class='dur l-color lh-24']").text
            except:
                busdur = "N/A"
            try:
                busendtm = bus.find_element(By.XPATH, ".//div[@class='bp-time f-19 d-color disp-Inline']").text
            except:
                busendtm = "N/A"
            try:
                str_rating = bus.find_element(By.XPATH, ".//span[@class='']").text
            except:
                str_rating = "0"
            try:
                seat = bus.find_element(By.XPATH, ".//div[@class='seat-left m-top-16']").text.replace(" Seats available", "")
            except:
                seat = "0"
            try:
                price = bus.find_element(By.XPATH, ".//div[@class='fare d-block']").text.replace("INR ", "")
            except:
                price = "N/A"

            BUS_NAME.append(busname)
            BUS_TYPE.append(bustype)
            START_TM.append(busstrttm)
            DUR_TM.append(busdur)
            END_TM.append(busendtm)
            STR.append(str_rating)
            SEATAV.append(seat)
            PRICE.append(price)
            ROUTE_NAME.append(route_name)
            ROUTE_LINK.append(link)

    except TimeoutException:
        print("Timeout while waiting for bus elements on link:", link)

# Function to extract data from all pages
def extract_all_routes_and_buses(driver, num_pages):
    all_route_names_1 = []
    all_route_links_1 = []

    # Scrape data from each page
    for page_number in range(1, num_pages + 1):
        # Extract routes from the current page
        route_names, route_links = extract_routes(driver)
        all_route_names_1.extend(route_names)
        all_route_links_1.extend(route_links)
        
        # Find and click the 'Next' button if it exists
        if page_number < num_pages:
            try:
                pagination_container = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, '//*[@id="root"]/div/div[4]/div[12]'))
                )
                next_page_button = pagination_container.find_element(By.XPATH, f'.//div[contains(@class,"DC_117_pageTabs") and text()="{page_number + 1}"]')
                
                # Scroll to the 'Next' button and click it
                actions = ActionChains(driver)
                actions.move_to_element(next_page_button).perform()
                time.sleep(2)
                next_page_button.click()
                
                # Wait for the next page to load
                WebDriverWait(driver, 10).until(
                    EC.text_to_be_present_in_element((By.XPATH, f'//div[contains(@class, "DC_117_pageTabs DC_117_pageActive") and text()="{page_number + 1}"]'), str(page_number + 1))
                )
                time.sleep(5)  # Ensure this wait time is enough for the next page to load
            except Exception as e:
                #print(f"Could not navigate to page {page_number + 1}: {e}")
                break

    return all_route_names_1, all_route_links_1

# Initialize lists to hold bus data
BUS_NAME = []
BUS_TYPE = []
START_TM = []
DUR_TM = []
END_TM = []
STR = []
SEATAV = []
PRICE = []
ROUTE_NAME = []
ROUTE_LINK = []

# Open the website
driver.get("https://www.redbus.in/online-booking/jksrtc")

# Wait for the page to load
wait = WebDriverWait(driver, 10)
time.sleep(5)

# Extract all routes and bus details
all_route_names_1, all_route_links_1 = extract_all_routes_and_buses(driver, num_pages=num_pages)

# Extract bus details for each route link
for route_name, link in zip(all_route_names_1, all_route_links_1):
    driver.get(link)
    time.sleep(5)

    # Scroll down to ensure all content is loaded
    scroll_to_bottom(driver)

    # Click all "View Buses" buttons and extract data
    while True:
        view_buses_buttons = driver.find_elements(By.XPATH, '//div[@class="button" and contains(text(),"View Buses")]')

        if view_buses_buttons:
            seen_buttons = set()  # Track buttons already clicked
            for button in view_buses_buttons:
                button_id = button.get_attribute('id')  # Use a unique attribute like ID or another identifier
                if button_id not in seen_buttons:
                    seen_buttons.add(button_id)
                    try:
                        click_element(driver, button)
                        print("Clicked on 'View Buses' button")
                        time.sleep(5)  # Wait for the page to load

                        # Extract bus data
                        extract_bus_data(driver, route_name, link)
                        
                        # Scroll to the bottom to load more content
                        scroll_to_bottom(driver)
                        
                        time.sleep(5)  # Give some time before clicking the next button
                    except (NoSuchElementException, ElementClickInterceptedException, TimeoutException) as e:
                        print(f"Exception occurred: {e}")
                        continue
        else:
            # No "View Buses" buttons found, directly attempt to extract bus data
            extract_bus_data(driver, route_name, link)
            # Scroll to ensure all bus data is loaded
            scroll_to_bottom(driver)
            break  # Move to the next route after processing

all_data_1 = pd.DataFrame({
    "route_name": ROUTE_NAME,
    "route_link": ROUTE_LINK,
    "busname": BUS_NAME,
    "bustype": BUS_TYPE,
    "departing_time": START_TM,
    "duration": DUR_TM,
    "reaching_time": END_TM,
    "star_rating": STR,
    "price": PRICE,
    "seats_available": SEATAV
})

# Display the DataFrame
print(all_data_1)


Timeout while waiting for bus elements on link: https://www.redbus.in/bus-tickets/delhi-to-srinagar
Timeout while waiting for bus elements on link: https://www.redbus.in/bus-tickets/srinagar-to-jammu
Timeout while waiting for bus elements on link: https://www.redbus.in/bus-tickets/mendhar-j-k-to-jammu
Timeout while waiting for bus elements on link: https://www.redbus.in/bus-tickets/jammu-to-poonch
Timeout while waiting for bus elements on link: https://www.redbus.in/bus-tickets/jammu-to-kishtwar
Timeout while waiting for bus elements on link: https://www.redbus.in/bus-tickets/jammu-to-mendhar-j-k
                      route_name  \
0    Jammu (j and k) to Srinagar   
1    Jammu (j and k) to Srinagar   
2    Jammu (j and k) to Srinagar   
3    Jammu (j and k) to Srinagar   
4    Jammu (j and k) to Srinagar   
..                           ...   
209  Jammu (j and k) to Amritsar   
210  Jammu (j and k) to Amritsar   
211  Jammu (j and k) to Amritsar   
212  Jammu (j and k) to Amritsar   


In [52]:
#2. West bengal transport corporation Bus Routes & Timings

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException
import pandas as pd
import time

num_pages = 2
# Initialize the WebDriver
driver = webdriver.Chrome()

# Function to extract route data from the current page
def extract_routes(driver):
    route_container = driver.find_element(By.CLASS_NAME, 'D117_main')
    route_links = route_container.find_elements(By.TAG_NAME, 'a')
    
    route_name = []
    route_link = []
    
    for link in route_links:
        href = link.get_attribute('href')
        name = link.get_attribute('title')
        route_link.append(href)
        route_name.append(name)
    
    return route_name, route_link

# Function to scroll to the bottom of the page
def scroll_to_bottom(driver):
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)  # Wait for new content to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

# Function to click an element
def click_element(driver, element):
    try:
        action = ActionChains(driver)
        action.move_to_element(element).perform()  # Move to the element
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable(element)).click()
    except ElementClickInterceptedException:
        print("Element is not clickable. It might be obscured.")
    except Exception as e:
        print(f"Exception occurred while clicking: {e}")

# Function to extract bus data from the current page
def extract_bus_data(driver, route_name, link):
    try:
        wait = WebDriverWait(driver, 30)
        buscontainer = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='clearfix bus-item']")))

        for bus in buscontainer:
            try:
                busname = bus.find_element(By.XPATH, ".//div[@class='travels lh-24 f-bold d-color']").text
            except:
                busname = "N/A"
            try:
                bustype = bus.find_element(By.XPATH, ".//div[@class='bus-type f-12 m-top-16 l-color evBus']").text
            except:
                bustype = "N/A"
            try:
                busstrttm = bus.find_element(By.XPATH, ".//div[@class='dp-time f-19 d-color f-bold']").text
            except:
                busstrttm = "N/A"
            try:
                busdur = bus.find_element(By.XPATH, ".//div[@class='dur l-color lh-24']").text
            except:
                busdur = "N/A"
            try:
                busendtm = bus.find_element(By.XPATH, ".//div[@class='bp-time f-19 d-color disp-Inline']").text
            except:
                busendtm = "N/A"
            try:
                str_rating = bus.find_element(By.XPATH, ".//span[@class='']").text
            except:
                str_rating = "0"
            try:
                seat = bus.find_element(By.XPATH, ".//div[@class='seat-left m-top-16']").text.replace(" Seats available", "")
            except:
                seat = "0"
            try:
                price = bus.find_element(By.XPATH, ".//div[@class='fare d-block']").text.replace("INR ", "")
            except:
                price = "N/A"

            BUS_NAME.append(busname)
            BUS_TYPE.append(bustype)
            START_TM.append(busstrttm)
            DUR_TM.append(busdur)
            END_TM.append(busendtm)
            STR.append(str_rating)
            SEATAV.append(seat)
            PRICE.append(price)
            ROUTE_NAME.append(route_name)
            ROUTE_LINK.append(link)

    except TimeoutException:
        print("Timeout while waiting for bus elements on link:", link)

# Function to extract data from all pages
def extract_all_routes_and_buses(driver, num_pages):
    all_route_names_1 = []
    all_route_links_1 = []

    # Scrape data from each page
    for page_number in range(1, num_pages + 1):
        # Extract routes from the current page
        route_names, route_links = extract_routes(driver)
        all_route_names_1.extend(route_names)
        all_route_links_1.extend(route_links)
        
        # Find and click the 'Next' button if it exists
        if page_number < num_pages:
            try:
                pagination_container = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, '//*[@id="root"]/div/div[4]/div[12]'))
                )
                next_page_button = pagination_container.find_element(By.XPATH, f'.//div[contains(@class,"DC_117_pageTabs") and text()="{page_number + 1}"]')
                
                # Scroll to the 'Next' button and click it
                actions = ActionChains(driver)
                actions.move_to_element(next_page_button).perform()
                time.sleep(2)
                next_page_button.click()
                
                # Wait for the next page to load
                WebDriverWait(driver, 10).until(
                    EC.text_to_be_present_in_element((By.XPATH, f'//div[contains(@class, "DC_117_pageTabs DC_117_pageActive") and text()="{page_number + 1}"]'), str(page_number + 1))
                )
                time.sleep(5)  # Ensure this wait time is enough for the next page to load
            except Exception as e:
                #print(f"Could not navigate to page {page_number + 1}: {e}")
                break

    return all_route_names_1, all_route_links_1

# Initialize lists to hold bus data
BUS_NAME = []
BUS_TYPE = []
START_TM = []
DUR_TM = []
END_TM = []
STR = []
SEATAV = []
PRICE = []
ROUTE_NAME = []
ROUTE_LINK = []

# Open the website
driver.get("https://www.redbus.in/online-booking/west-bengal-transport-corporation?utm_source=rtchometile")

# Wait for the page to load
wait = WebDriverWait(driver, 10)
time.sleep(5)

# Extract all routes and bus details
all_route_names_1, all_route_links_1 = extract_all_routes_and_buses(driver, num_pages=num_pages)

# Extract bus details for each route link
for route_name, link in zip(all_route_names_1, all_route_links_1):
    driver.get(link)
    time.sleep(5)

    # Scroll down to ensure all content is loaded
    scroll_to_bottom(driver)

    # Click all "View Buses" buttons and extract data
    while True:
        view_buses_buttons = driver.find_elements(By.XPATH, '//div[@class="button" and contains(text(),"View Buses")]')

        if view_buses_buttons:
            seen_buttons = set()  # Track buttons already clicked
            for button in view_buses_buttons:
                button_id = button.get_attribute('id')  # Use a unique attribute like ID or another identifier
                if button_id not in seen_buttons:
                    seen_buttons.add(button_id)
                    try:
                        click_element(driver, button)
                        print("Clicked on 'View Buses' button")
                        time.sleep(5)  # Wait for the page to load

                        # Extract bus data
                        extract_bus_data(driver, route_name, link)
                        
                        # Scroll to the bottom to load more content
                        scroll_to_bottom(driver)
                        
                        time.sleep(5)  # Give some time before clicking the next button
                    except (NoSuchElementException, ElementClickInterceptedException, TimeoutException) as e:
                        print(f"Exception occurred: {e}")
                        continue
        else:
            # No "View Buses" buttons found, directly attempt to extract bus data
            extract_bus_data(driver, route_name, link)
            # Scroll to ensure all bus data is loaded
            scroll_to_bottom(driver)
            break  # Move to the next route after processing

all_data_2 = pd.DataFrame({
    "route_name": ROUTE_NAME,
    "route_link": ROUTE_LINK,
    "busname": BUS_NAME,
    "bustype": BUS_TYPE,
    "departing_time": START_TM,
    "duration": DUR_TM,
    "reaching_time": END_TM,
    "star_rating": STR,
    "price": PRICE,
    "seats_available": SEATAV
})

# Display the DataFrame
print(all_data_2)


Clicked on 'View Buses' button
Timeout while waiting for bus elements on link: https://www.redbus.in/bus-tickets/kolkata-to-bakkhali
                route_name                                         route_link  \
0         Kolkata to Digha  https://www.redbus.in/bus-tickets/kolkata-to-d...   
1         Kolkata to Digha  https://www.redbus.in/bus-tickets/kolkata-to-d...   
2         Kolkata to Digha  https://www.redbus.in/bus-tickets/kolkata-to-d...   
3         Kolkata to Digha  https://www.redbus.in/bus-tickets/kolkata-to-d...   
4         Kolkata to Digha  https://www.redbus.in/bus-tickets/kolkata-to-d...   
..                     ...                                                ...   
166  Kolkata to Mandarmani  https://www.redbus.in/bus-tickets/kolkata-to-m...   
167  Kolkata to Mandarmani  https://www.redbus.in/bus-tickets/kolkata-to-m...   
168  Kolkata to Mandarmani  https://www.redbus.in/bus-tickets/kolkata-to-m...   
169  Kolkata to Mandarmani  https://www.redbus.in/bus-tic

In [55]:
#3.KSRTC (Kerala) Bus Routes & Timings

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException
import pandas as pd
import time

num_pages = 3
# Initialize the WebDriver
driver = webdriver.Chrome()

# Function to extract route data from the current page
def extract_routes(driver):
    route_container = driver.find_element(By.CLASS_NAME, 'D117_main')
    route_links = route_container.find_elements(By.TAG_NAME, 'a')
    
    route_name = []
    route_link = []
    
    for link in route_links:
        href = link.get_attribute('href')
        name = link.get_attribute('title')
        route_link.append(href)
        route_name.append(name)
    
    return route_name, route_link

# Function to scroll to the bottom of the page
def scroll_to_bottom(driver):
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)  # Wait for new content to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

# Function to click an element
def click_element(driver, element):
    try:
        action = ActionChains(driver)
        action.move_to_element(element).perform()  # Move to the element
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable(element)).click()
    except ElementClickInterceptedException:
        print("Element is not clickable. It might be obscured.")
    except Exception as e:
        print(f"Exception occurred while clicking: {e}")

# Function to extract bus data from the current page
def extract_bus_data(driver, route_name, link):
    try:
        wait = WebDriverWait(driver, 30)
        buscontainer = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='clearfix bus-item']")))

        for bus in buscontainer:
            try:
                busname = bus.find_element(By.XPATH, ".//div[@class='travels lh-24 f-bold d-color']").text
            except:
                busname = "N/A"
            try:
                bustype = bus.find_element(By.XPATH, ".//div[@class='bus-type f-12 m-top-16 l-color evBus']").text
            except:
                bustype = "N/A"
            try:
                busstrttm = bus.find_element(By.XPATH, ".//div[@class='dp-time f-19 d-color f-bold']").text
            except:
                busstrttm = "N/A"
            try:
                busdur = bus.find_element(By.XPATH, ".//div[@class='dur l-color lh-24']").text
            except:
                busdur = "N/A"
            try:
                busendtm = bus.find_element(By.XPATH, ".//div[@class='bp-time f-19 d-color disp-Inline']").text
            except:
                busendtm = "N/A"
            try:
                str_rating = bus.find_element(By.XPATH, ".//span[@class='']").text
            except:
                str_rating = "0"
            try:
                seat = bus.find_element(By.XPATH, ".//div[@class='seat-left m-top-16']").text.replace(" Seats available", "")
            except:
                seat = "0"
            try:
                price = bus.find_element(By.XPATH, ".//div[@class='fare d-block']").text.replace("INR ", "")
            except:
                price = "N/A"

            BUS_NAME.append(busname)
            BUS_TYPE.append(bustype)
            START_TM.append(busstrttm)
            DUR_TM.append(busdur)
            END_TM.append(busendtm)
            STR.append(str_rating)
            SEATAV.append(seat)
            PRICE.append(price)
            ROUTE_NAME.append(route_name)
            ROUTE_LINK.append(link)

    except TimeoutException:
        print("Timeout while waiting for bus elements on link:", link)

# Function to extract data from all pages
def extract_all_routes_and_buses(driver, num_pages):
    all_route_names_1 = []
    all_route_links_1 = []

    # Scrape data from each page
    for page_number in range(1, num_pages + 1):
        # Extract routes from the current page
        route_names, route_links = extract_routes(driver)
        all_route_names_1.extend(route_names)
        all_route_links_1.extend(route_links)
        
        # Find and click the 'Next' button if it exists
        if page_number < num_pages:
            try:
                pagination_container = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, '//*[@id="root"]/div/div[4]/div[12]'))
                )
                next_page_button = pagination_container.find_element(By.XPATH, f'.//div[contains(@class,"DC_117_pageTabs") and text()="{page_number + 1}"]')
                
                # Scroll to the 'Next' button and click it
                actions = ActionChains(driver)
                actions.move_to_element(next_page_button).perform()
                time.sleep(2)
                next_page_button.click()
                
                # Wait for the next page to load
                WebDriverWait(driver, 10).until(
                    EC.text_to_be_present_in_element((By.XPATH, f'//div[contains(@class, "DC_117_pageTabs DC_117_pageActive") and text()="{page_number + 1}"]'), str(page_number + 1))
                )
                time.sleep(5)  # Ensure this wait time is enough for the next page to load
            except Exception as e:
                #print(f"Could not navigate to page {page_number + 1}: {e}")
                break

    return all_route_names_1, all_route_links_1

# Initialize lists to hold bus data
BUS_NAME = []
BUS_TYPE = []
START_TM = []
DUR_TM = []
END_TM = []
STR = []
SEATAV = []
PRICE = []
ROUTE_NAME = []
ROUTE_LINK = []

# Open the website
driver.get("https://www.redbus.in/online-booking/ksrtc-kerala/?utm_source=rtchometile")

# Wait for the page to load
wait = WebDriverWait(driver, 10)
time.sleep(5)

# Extract all routes and bus details
all_route_names_1, all_route_links_1 = extract_all_routes_and_buses(driver, num_pages=num_pages)

# Extract bus details for each route link
for route_name, link in zip(all_route_names_1, all_route_links_1):
    driver.get(link)
    time.sleep(5)

    # Scroll down to ensure all content is loaded
    scroll_to_bottom(driver)

    # Click all "View Buses" buttons and extract data
    while True:
        view_buses_buttons = driver.find_elements(By.XPATH, '//div[@class="button" and contains(text(),"View Buses")]')

        if view_buses_buttons:
            seen_buttons = set()  # Track buttons already clicked
            for button in view_buses_buttons:
                button_id = button.get_attribute('id')  # Use a unique attribute like ID or another identifier
                if button_id not in seen_buttons:
                    seen_buttons.add(button_id)
                    try:
                        click_element(driver, button)
                        print("Clicked on 'View Buses' button")
                        time.sleep(5)  # Wait for the page to load

                        # Extract bus data
                        extract_bus_data(driver, route_name, link)
                        
                        # Scroll to the bottom to load more content
                        scroll_to_bottom(driver)
                        
                        time.sleep(5)  # Give some time before clicking the next button
                    except (NoSuchElementException, ElementClickInterceptedException, TimeoutException) as e:
                        print(f"Exception occurred: {e}")
                        continue
        else:
            # No "View Buses" buttons found, directly attempt to extract bus data
            extract_bus_data(driver, route_name, link)
            # Scroll to ensure all bus data is loaded
            scroll_to_bottom(driver)
            break  # Move to the next route after processing

all_data_3 = pd.DataFrame({
    "route_name": ROUTE_NAME,
    "route_link": ROUTE_LINK,
    "busname": BUS_NAME,
    "bustype": BUS_TYPE,
    "departing_time": START_TM,
    "duration": DUR_TM,
    "reaching_time": END_TM,
    "star_rating": STR,
    "price": PRICE,
    "seats_available": SEATAV
})

# Display the DataFrame
print(all_data_3)


Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
                         route_name  \
0            Bangalore to Kozhikode   
1            Bangalore to Kozhikode   
2            Bangalore to Kozhikode   
3            Bangalore to Kozhikode   
4            Bangalore to Kozhikode   
..                              ...   
595  Kalpetta (kerala) to Kozhikode   
596  Kalpetta (kerala) to Kozhikode   
597  Kalpetta (kerala) to Kozhikode   
598  Kalpetta (kerala) to Kozhikode   
599  Kalpetta (kerala) to Kozhikode   

                                            route_link                busname  \
0    https://www

In [58]:
#4.Bihar state road transport corporation (BSRTC) Bus Routes & Timings

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException
import pandas as pd
import time

num_pages = 5
# Initialize the WebDriver
driver = webdriver.Chrome()

# Function to extract route data from the current page
def extract_routes(driver):
    route_container = driver.find_element(By.CLASS_NAME, 'D117_main')
    route_links = route_container.find_elements(By.TAG_NAME, 'a')
    
    route_name = []
    route_link = []
    
    for link in route_links:
        href = link.get_attribute('href')
        name = link.get_attribute('title')
        route_link.append(href)
        route_name.append(name)
    
    return route_name, route_link

# Function to scroll to the bottom of the page
def scroll_to_bottom(driver):
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)  # Wait for new content to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

# Function to click an element
def click_element(driver, element):
    try:
        action = ActionChains(driver)
        action.move_to_element(element).perform()  # Move to the element
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable(element)).click()
    except ElementClickInterceptedException:
        print("Element is not clickable. It might be obscured.")
    except Exception as e:
        print(f"Exception occurred while clicking: {e}")

# Function to extract bus data from the current page
def extract_bus_data(driver, route_name, link):
    try:
        wait = WebDriverWait(driver, 30)
        buscontainer = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='clearfix bus-item']")))

        for bus in buscontainer:
            try:
                busname = bus.find_element(By.XPATH, ".//div[@class='travels lh-24 f-bold d-color']").text
            except:
                busname = "N/A"
            try:
                bustype = bus.find_element(By.XPATH, ".//div[@class='bus-type f-12 m-top-16 l-color evBus']").text
            except:
                bustype = "N/A"
            try:
                busstrttm = bus.find_element(By.XPATH, ".//div[@class='dp-time f-19 d-color f-bold']").text
            except:
                busstrttm = "N/A"
            try:
                busdur = bus.find_element(By.XPATH, ".//div[@class='dur l-color lh-24']").text
            except:
                busdur = "N/A"
            try:
                busendtm = bus.find_element(By.XPATH, ".//div[@class='bp-time f-19 d-color disp-Inline']").text
            except:
                busendtm = "N/A"
            try:
                str_rating = bus.find_element(By.XPATH, ".//span[@class='']").text
            except:
                str_rating = "0"
            try:
                seat = bus.find_element(By.XPATH, ".//div[@class='seat-left m-top-30']").text.replace(" Seats available", "")
            except:
                seat = "0"
            try:
                price = bus.find_element(By.XPATH, ".//div[@class='fare d-block']").text.replace("INR ", "")
            except:
                price = "N/A"

            BUS_NAME.append(busname)
            BUS_TYPE.append(bustype)
            START_TM.append(busstrttm)
            DUR_TM.append(busdur)
            END_TM.append(busendtm)
            STR.append(str_rating)
            SEATAV.append(seat)
            PRICE.append(price)
            ROUTE_NAME.append(route_name)
            ROUTE_LINK.append(link)

    except TimeoutException:
        print("Timeout while waiting for bus elements on link:", link)

# Function to extract data from all pages
def extract_all_routes_and_buses(driver, num_pages):
    all_route_names_1 = []
    all_route_links_1 = []

    # Scrape data from each page
    for page_number in range(1, num_pages + 1):
        # Extract routes from the current page
        route_names, route_links = extract_routes(driver)
        all_route_names_1.extend(route_names)
        all_route_links_1.extend(route_links)
        
        # Find and click the 'Next' button if it exists
        if page_number < num_pages:
            try:
                pagination_container = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, '//*[@id="root"]/div/div[4]/div[12]'))
                )
                next_page_button = pagination_container.find_element(By.XPATH, f'.//div[contains(@class,"DC_117_pageTabs") and text()="{page_number + 1}"]')
                
                # Scroll to the 'Next' button and click it
                actions = ActionChains(driver)
                actions.move_to_element(next_page_button).perform()
                time.sleep(2)
                next_page_button.click()
                
                # Wait for the next page to load
                WebDriverWait(driver, 10).until(
                    EC.text_to_be_present_in_element((By.XPATH, f'//div[contains(@class, "DC_117_pageTabs DC_117_pageActive") and text()="{page_number + 1}"]'), str(page_number + 1))
                )
                time.sleep(5)  # Ensure this wait time is enough for the next page to load
            except Exception as e:
                #print(f"Could not navigate to page {page_number + 1}: {e}")
                break

    return all_route_names_1, all_route_links_1

# Initialize lists to hold bus data
BUS_NAME = []
BUS_TYPE = []
START_TM = []
DUR_TM = []
END_TM = []
STR = []
SEATAV = []
PRICE = []
ROUTE_NAME = []
ROUTE_LINK = []

# Open the website
driver.get("https://www.redbus.in/online-booking/bihar-state-road-transport-corporation-bsrtc/?utm_source=rtchometile")

# Wait for the page to load
wait = WebDriverWait(driver, 10)
time.sleep(5)

# Extract all routes and bus details
all_route_names_1, all_route_links_1 = extract_all_routes_and_buses(driver, num_pages=num_pages)

# Extract bus details for each route link
for route_name, link in zip(all_route_names_1, all_route_links_1):
    driver.get(link)
    time.sleep(5)

    # Scroll down to ensure all content is loaded
    scroll_to_bottom(driver)

    # Click all "View Buses" buttons and extract data
    while True:
        view_buses_buttons = driver.find_elements(By.XPATH, '//div[@class="button" and contains(text(),"View Buses")]')

        if view_buses_buttons:
            seen_buttons = set()  # Track buttons already clicked
            for button in view_buses_buttons:
                button_id = button.get_attribute('id')  # Use a unique attribute like ID or another identifier
                if button_id not in seen_buttons:
                    seen_buttons.add(button_id)
                    try:
                        click_element(driver, button)
                        print("Clicked on 'View Buses' button")
                        time.sleep(5)  # Wait for the page to load

                        # Extract bus data
                        extract_bus_data(driver, route_name, link)
                        
                        # Scroll to the bottom to load more content
                        scroll_to_bottom(driver)
                        
                        time.sleep(5)  # Give some time before clicking the next button
                    except (NoSuchElementException, ElementClickInterceptedException, TimeoutException) as e:
                        print(f"Exception occurred: {e}")
                        continue
        else:
            # No "View Buses" buttons found, directly attempt to extract bus data
            extract_bus_data(driver, route_name, link)
            # Scroll to ensure all bus data is loaded
            scroll_to_bottom(driver)
            break  # Move to the next route after processing

all_data_4 = pd.DataFrame({
    "route_name": ROUTE_NAME,
    "route_link": ROUTE_LINK,
    "busname": BUS_NAME,
    "bustype": BUS_TYPE,
    "departing_time": START_TM,
    "duration": DUR_TM,
    "reaching_time": END_TM,
    "star_rating": STR,
    "price": PRICE,
    "seats_available": SEATAV
})

# Display the DataFrame
print(all_data_4)


Timeout while waiting for bus elements on link: https://www.redbus.in/bus-tickets/delhi-to-motihari
Timeout while waiting for bus elements on link: https://www.redbus.in/bus-tickets/patna-to-kathmandu
Timeout while waiting for bus elements on link: https://www.redbus.in/bus-tickets/muzaffarpur-to-kathmandu
Timeout while waiting for bus elements on link: https://www.redbus.in/bus-tickets/kathmandu-to-patna
Timeout while waiting for bus elements on link: https://www.redbus.in/bus-tickets/motihari-to-kathmandu
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Timeout while waiting for bus elements on link: https://www.redbus.in/bus-tickets/hajipur-to-kathmandu
                       route_name  \
0        Patna (Bihar) to Bettiah   
1        Patna (Bihar) to Bettiah   
2        Patna (Bihar) to Bettiah   
3        Patna (Bihar) to Bettiah   
4      Gopalganj (Bihar) to Delhi   
..                            ...   
130  Lucknow to Gopalganj (Bihar)   
131  Lucknow to Gopalganj 

In [59]:
#5.NORTH BENGAL STATE TRANSPORT CORPORATION Bus Routes & Timings

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException
import pandas as pd
import time

num_pages = 6
# Initialize the WebDriver
driver = webdriver.Chrome()

# Function to extract route data from the current page
def extract_routes(driver):
    route_container = driver.find_element(By.CLASS_NAME, 'D117_main')
    route_links = route_container.find_elements(By.TAG_NAME, 'a')
    
    route_name = []
    route_link = []
    
    for link in route_links:
        href = link.get_attribute('href')
        name = link.get_attribute('title')
        route_link.append(href)
        route_name.append(name)
    
    return route_name, route_link

# Function to scroll to the bottom of the page
def scroll_to_bottom(driver):
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)  # Wait for new content to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

# Function to click an element
def click_element(driver, element):
    try:
        action = ActionChains(driver)
        action.move_to_element(element).perform()  # Move to the element
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable(element)).click()
    except ElementClickInterceptedException:
        print("Element is not clickable. It might be obscured.")
    except Exception as e:
        print(f"Exception occurred while clicking: {e}")

# Function to extract bus data from the current page
def extract_bus_data(driver, route_name, link):
    try:
        wait = WebDriverWait(driver, 30)
        buscontainer = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='clearfix bus-item']")))

        for bus in buscontainer:
            try:
                busname = bus.find_element(By.XPATH, ".//div[@class='travels lh-24 f-bold d-color']").text
            except:
                busname = "N/A"
            try:
                bustype = bus.find_element(By.XPATH, ".//div[@class='bus-type f-12 m-top-16 l-color evBus']").text
            except:
                bustype = "N/A"
            try:
                busstrttm = bus.find_element(By.XPATH, ".//div[@class='dp-time f-19 d-color f-bold']").text
            except:
                busstrttm = "N/A"
            try:
                busdur = bus.find_element(By.XPATH, ".//div[@class='dur l-color lh-24']").text
            except:
                busdur = "N/A"
            try:
                busendtm = bus.find_element(By.XPATH, ".//div[@class='bp-time f-19 d-color disp-Inline']").text
            except:
                busendtm = "N/A"
            try:
                str_rating = bus.find_element(By.XPATH, ".//span[@class='']").text
            except:
                str_rating = "0"
            try:
                seat = bus.find_element(By.XPATH, ".//div[@class='seat-left m-top-30']").text.replace(" Seats available", "")
            except:
                seat = "0"
            try:
                price = bus.find_element(By.XPATH, ".//div[@class='fare d-block']").text.replace("INR ", "")
            except:
                price = "N/A"

            BUS_NAME.append(busname)
            BUS_TYPE.append(bustype)
            START_TM.append(busstrttm)
            DUR_TM.append(busdur)
            END_TM.append(busendtm)
            STR.append(str_rating)
            SEATAV.append(seat)
            PRICE.append(price)
            ROUTE_NAME.append(route_name)
            ROUTE_LINK.append(link)

    except TimeoutException:
        print("Timeout while waiting for bus elements on link:", link)

# Function to extract data from all pages
def extract_all_routes_and_buses(driver, num_pages):
    all_route_names_1 = []
    all_route_links_1 = []

    # Scrape data from each page
    for page_number in range(1, num_pages + 1):
        # Extract routes from the current page
        route_names, route_links = extract_routes(driver)
        all_route_names_1.extend(route_names)
        all_route_links_1.extend(route_links)
        
        # Find and click the 'Next' button if it exists
        if page_number < num_pages:
            try:
                pagination_container = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, '//*[@id="root"]/div/div[4]/div[12]'))
                )
                next_page_button = pagination_container.find_element(By.XPATH, f'.//div[contains(@class,"DC_117_pageTabs") and text()="{page_number + 1}"]')
                
                # Scroll to the 'Next' button and click it
                actions = ActionChains(driver)
                actions.move_to_element(next_page_button).perform()
                time.sleep(2)
                next_page_button.click()
                
                # Wait for the next page to load
                WebDriverWait(driver, 10).until(
                    EC.text_to_be_present_in_element((By.XPATH, f'//div[contains(@class, "DC_117_pageTabs DC_117_pageActive") and text()="{page_number + 1}"]'), str(page_number + 1))
                )
                time.sleep(5)  # Ensure this wait time is enough for the next page to load
            except Exception as e:
                #print(f"Could not navigate to page {page_number + 1}: {e}")
                break

    return all_route_names_1, all_route_links_1

# Initialize lists to hold bus data
BUS_NAME = []
BUS_TYPE = []
START_TM = []
DUR_TM = []
END_TM = []
STR = []
SEATAV = []
PRICE = []
ROUTE_NAME = []
ROUTE_LINK = []

# Open the website
driver.get("https://www.redbus.in/online-booking/north-bengal-state-transport-corporation")

# Wait for the page to load
wait = WebDriverWait(driver, 10)
time.sleep(5)

# Extract all routes and bus details
all_route_names_1, all_route_links_1 = extract_all_routes_and_buses(driver, num_pages=num_pages)

# Extract bus details for each route link
for route_name, link in zip(all_route_names_1, all_route_links_1):
    driver.get(link)
    time.sleep(5)

    # Scroll down to ensure all content is loaded
    scroll_to_bottom(driver)

    # Click all "View Buses" buttons and extract data
    while True:
        view_buses_buttons = driver.find_elements(By.XPATH, '//div[@class="button" and contains(text(),"View Buses")]')

        if view_buses_buttons:
            seen_buttons = set()  # Track buttons already clicked
            for button in view_buses_buttons:
                button_id = button.get_attribute('id')  # Use a unique attribute like ID or another identifier
                if button_id not in seen_buttons:
                    seen_buttons.add(button_id)
                    try:
                        click_element(driver, button)
                        print("Clicked on 'View Buses' button")
                        time.sleep(5)  # Wait for the page to load

                        # Extract bus data
                        extract_bus_data(driver, route_name, link)
                        
                        # Scroll to the bottom to load more content
                        scroll_to_bottom(driver)
                        
                        time.sleep(5)  # Give some time before clicking the next button
                    except (NoSuchElementException, ElementClickInterceptedException, TimeoutException) as e:
                        print(f"Exception occurred: {e}")
                        continue
        else:
            # No "View Buses" buttons found, directly attempt to extract bus data
            extract_bus_data(driver, route_name, link)
            # Scroll to ensure all bus data is loaded
            scroll_to_bottom(driver)
            break  # Move to the next route after processing

all_data_5 = pd.DataFrame({
    "route_name": ROUTE_NAME,
    "route_link": ROUTE_LINK,
    "busname": BUS_NAME,
    "bustype": BUS_TYPE,
    "departing_time": START_TM,
    "duration": DUR_TM,
    "reaching_time": END_TM,
    "star_rating": STR,
    "price": PRICE,
    "seats_available": SEATAV
})

# Display the DataFrame
print(all_data_5)


Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
                                route_name  \
0                      Kolkata to Siliguri   
1                      Kolkata to Siliguri   
2                      Kolkata to Siliguri   
3                      Kolkata to Siliguri   
4                      Kolkata to Siliguri   
..                                     ...   
598  Cooch Behar (West Bengal) to Siliguri   
599  Cooch Behar (West Bengal) to Siliguri   
600  Cooch Behar (West Bengal) to Siliguri   
601  Cooch Behar (West Bengal) to Siliguri   
602  Cooch Behar (West Bengal

In [64]:
#6.PEPSU (Punjab) Bus Routes & Timings

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException
import pandas as pd
import time

num_pages = 4
# Initialize the WebDriver
driver = webdriver.Chrome()

# Function to extract route data from the current page
def extract_routes(driver):
    route_container = driver.find_element(By.CLASS_NAME, 'D117_main')
    route_links = route_container.find_elements(By.TAG_NAME, 'a')
    
    route_name = []
    route_link = []
    
    for link in route_links:
        href = link.get_attribute('href')
        name = link.get_attribute('title')
        route_link.append(href)
        route_name.append(name)
    
    return route_name, route_link

# Function to scroll to the bottom of the page
def scroll_to_bottom(driver):
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)  # Wait for new content to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

# Function to click an element
def click_element(driver, element):
    try:
        action = ActionChains(driver)
        action.move_to_element(element).perform()  # Move to the element
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable(element)).click()
    except ElementClickInterceptedException:
        print("Element is not clickable. It might be obscured.")
    except Exception as e:
        print(f"Exception occurred while clicking: {e}")

# Function to extract bus data from the current page
def extract_bus_data(driver, route_name, link):
    try:
        wait = WebDriverWait(driver, 30)
        buscontainer = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='clearfix bus-item']")))

        for bus in buscontainer:
            try:
                busname = bus.find_element(By.XPATH, ".//div[@class='travels lh-24 f-bold d-color']").text
            except:
                busname = "N/A"
            try:
                bustype = bus.find_element(By.XPATH, ".//div[@class='bus-type f-12 m-top-16 l-color evBus']").text
            except:
                bustype = "N/A"
            try:
                busstrttm = bus.find_element(By.XPATH, ".//div[@class='dp-time f-19 d-color f-bold']").text
            except:
                busstrttm = "N/A"
            try:
                busdur = bus.find_element(By.XPATH, ".//div[@class='dur l-color lh-24']").text
            except:
                busdur = "N/A"
            try:
                busendtm = bus.find_element(By.XPATH, ".//div[@class='bp-time f-19 d-color disp-Inline']").text
            except:
                busendtm = "N/A"
            try:
                str_rating = bus.find_element(By.XPATH, ".//span[@class='']").text
            except:
                str_rating = "0"
            try:
                seat = bus.find_element(By.XPATH, ".//div[@class='seat-left m-top-30']").text.replace(" Seats available", "")
            except:
                seat = "0"
            try:
                price = bus.find_element(By.XPATH, ".//div[@class='fare d-block']").text.replace("INR ", "")
            except:
                price = "N/A"

            BUS_NAME.append(busname)
            BUS_TYPE.append(bustype)
            START_TM.append(busstrttm)
            DUR_TM.append(busdur)
            END_TM.append(busendtm)
            STR.append(str_rating)
            SEATAV.append(seat)
            PRICE.append(price)
            ROUTE_NAME.append(route_name)
            ROUTE_LINK.append(link)

    except TimeoutException:
        print("Timeout while waiting for bus elements on link:", link)

# Function to extract data from all pages
def extract_all_routes_and_buses(driver, num_pages):
    all_route_names_1 = []
    all_route_links_1 = []

    # Scrape data from each page
    for page_number in range(1, num_pages + 1):
        # Extract routes from the current page
        route_names, route_links = extract_routes(driver)
        all_route_names_1.extend(route_names)
        all_route_links_1.extend(route_links)
        
        # Find and click the 'Next' button if it exists
        if page_number < num_pages:
            try:
                pagination_container = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, '//*[@id="root"]/div/div[4]/div[12]'))
                )
                next_page_button = pagination_container.find_element(By.XPATH, f'.//div[contains(@class,"DC_117_pageTabs") and text()="{page_number + 1}"]')
                
                # Scroll to the 'Next' button and click it
                actions = ActionChains(driver)
                actions.move_to_element(next_page_button).perform()
                time.sleep(2)
                next_page_button.click()
                
                # Wait for the next page to load
                WebDriverWait(driver, 10).until(
                    EC.text_to_be_present_in_element((By.XPATH, f'//div[contains(@class, "DC_117_pageTabs DC_117_pageActive") and text()="{page_number + 1}"]'), str(page_number + 1))
                )
                time.sleep(5)  # Ensure this wait time is enough for the next page to load
            except Exception as e:
                #print(f"Could not navigate to page {page_number + 1}: {e}")
                break

    return all_route_names_1, all_route_links_1

# Initialize lists to hold bus data
BUS_NAME = []
BUS_TYPE = []
START_TM = []
DUR_TM = []
END_TM = []
STR = []
SEATAV = []
PRICE = []
ROUTE_NAME = []
ROUTE_LINK = []

# Open the website
driver.get("https://www.redbus.in/online-booking/pepsu/?utm_source=rtchometile")

# Wait for the page to load
wait = WebDriverWait(driver, 10)
time.sleep(5)

# Extract all routes and bus details
all_route_names_1, all_route_links_1 = extract_all_routes_and_buses(driver, num_pages=num_pages)

# Extract bus details for each route link
for route_name, link in zip(all_route_names_1, all_route_links_1):
    driver.get(link)
    time.sleep(5)

    # Scroll down to ensure all content is loaded
    scroll_to_bottom(driver)

    # Click all "View Buses" buttons and extract data
    while True:
        view_buses_buttons = driver.find_elements(By.XPATH, '//div[@class="button" and contains(text(),"View Buses")]')

        if view_buses_buttons:
            seen_buttons = set()  # Track buttons already clicked
            for button in view_buses_buttons:
                button_id = button.get_attribute('id')  # Use a unique attribute like ID or another identifier
                if button_id not in seen_buttons:
                    seen_buttons.add(button_id)
                    try:
                        click_element(driver, button)
                        print("Clicked on 'View Buses' button")
                        time.sleep(5)  # Wait for the page to load

                        # Extract bus data
                        extract_bus_data(driver, route_name, link)
                        
                        # Scroll to the bottom to load more content
                        scroll_to_bottom(driver)
                        
                        time.sleep(5)  # Give some time before clicking the next button
                    except (NoSuchElementException, ElementClickInterceptedException, TimeoutException) as e:
                        print(f"Exception occurred: {e}")
                        continue
        else:
            # No "View Buses" buttons found, directly attempt to extract bus data
            extract_bus_data(driver, route_name, link)
            # Scroll to ensure all bus data is loaded
            scroll_to_bottom(driver)
            break  # Move to the next route after processing

all_data_6 = pd.DataFrame({
    "route_name": ROUTE_NAME,
    "route_link": ROUTE_LINK,
    "busname": BUS_NAME,
    "bustype": BUS_TYPE,
    "departing_time": START_TM,
    "duration": DUR_TM,
    "reaching_time": END_TM,
    "star_rating": STR,
    "price": PRICE,
    "seats_available": SEATAV
})

# Display the DataFrame
print(all_data_6)


Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
                 route_name  \
0          Patiala to Delhi   
1          Patiala to Delhi   
2          Patiala to Delhi   
3          Patiala to Delhi   
4          Patiala to Delhi   
...                     ...   
1535  Chandigarh to Patiala   
1536  Chandigarh to Patiala   
1537  Chandigarh to Patiala   
1538  Chandigarh to Patiala   
1539  Chandigarh to Patiala   

                                             route_link  \
0     https://www.redbus.in/bus-tickets/p

In [66]:
#7.Chandigarh Transport Undertaking (CTU) Bus Routes & Timings

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException
import pandas as pd
import time

num_pages = 6
# Initialize the WebDriver
driver = webdriver.Chrome()

# Function to extract route data from the current page
def extract_routes(driver):
    route_container = driver.find_element(By.CLASS_NAME, 'D117_main')
    route_links = route_container.find_elements(By.TAG_NAME, 'a')
    
    route_name = []
    route_link = []
    
    for link in route_links:
        href = link.get_attribute('href')
        name = link.get_attribute('title')
        route_link.append(href)
        route_name.append(name)
    
    return route_name, route_link

# Function to scroll to the bottom of the page
def scroll_to_bottom(driver):
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)  # Wait for new content to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

# Function to click an element
def click_element(driver, element):
    try:
        action = ActionChains(driver)
        action.move_to_element(element).perform()  # Move to the element
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable(element)).click()
    except ElementClickInterceptedException:
        print("Element is not clickable. It might be obscured.")
    except Exception as e:
        print(f"Exception occurred while clicking: {e}")

# Function to extract bus data from the current page
def extract_bus_data(driver, route_name, link):
    try:
        wait = WebDriverWait(driver, 30)
        buscontainer = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='clearfix bus-item']")))

        for bus in buscontainer:
            try:
                busname = bus.find_element(By.XPATH, ".//div[@class='travels lh-24 f-bold d-color']").text
            except:
                busname = "N/A"
            try:
                bustype = bus.find_element(By.XPATH, ".//div[@class='bus-type f-12 m-top-16 l-color evBus']").text
            except:
                bustype = "N/A"
            try:
                busstrttm = bus.find_element(By.XPATH, ".//div[@class='dp-time f-19 d-color f-bold']").text
            except:
                busstrttm = "N/A"
            try:
                busdur = bus.find_element(By.XPATH, ".//div[@class='dur l-color lh-24']").text
            except:
                busdur = "N/A"
            try:
                busendtm = bus.find_element(By.XPATH, ".//div[@class='bp-time f-19 d-color disp-Inline']").text
            except:
                busendtm = "N/A"
            try:
                str_rating = bus.find_element(By.XPATH, ".//span[@class='']").text
            except:
                str_rating = "0"
            try:
                seat = bus.find_element(By.XPATH, ".//div[@class='seat-left m-top-30']").text.replace(" Seats available", "")
            except:
                seat = "0"
            try:
                price = bus.find_element(By.XPATH, ".//div[@class='fare d-block']").text.replace("INR ", "")
            except:
                price = "N/A"

            BUS_NAME.append(busname)
            BUS_TYPE.append(bustype)
            START_TM.append(busstrttm)
            DUR_TM.append(busdur)
            END_TM.append(busendtm)
            STR.append(str_rating)
            SEATAV.append(seat)
            PRICE.append(price)
            ROUTE_NAME.append(route_name)
            ROUTE_LINK.append(link)

    except TimeoutException:
        print("Timeout while waiting for bus elements on link:", link)

# Function to extract data from all pages
def extract_all_routes_and_buses(driver, num_pages):
    all_route_names_1 = []
    all_route_links_1 = []

    # Scrape data from each page
    for page_number in range(1, num_pages + 1):
        # Extract routes from the current page
        route_names, route_links = extract_routes(driver)
        all_route_names_1.extend(route_names)
        all_route_links_1.extend(route_links)
        
        # Find and click the 'Next' button if it exists
        if page_number < num_pages:
            try:
                pagination_container = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, '//*[@id="root"]/div/div[4]/div[12]'))
                )
                next_page_button = pagination_container.find_element(By.XPATH, f'.//div[contains(@class,"DC_117_pageTabs") and text()="{page_number + 1}"]')
                
                # Scroll to the 'Next' button and click it
                actions = ActionChains(driver)
                actions.move_to_element(next_page_button).perform()
                time.sleep(2)
                next_page_button.click()
                
                # Wait for the next page to load
                WebDriverWait(driver, 10).until(
                    EC.text_to_be_present_in_element((By.XPATH, f'//div[contains(@class, "DC_117_pageTabs DC_117_pageActive") and text()="{page_number + 1}"]'), str(page_number + 1))
                )
                time.sleep(5)  # Ensure this wait time is enough for the next page to load
            except Exception as e:
                #print(f"Could not navigate to page {page_number + 1}: {e}")
                break

    return all_route_names_1, all_route_links_1

# Initialize lists to hold bus data
BUS_NAME = []
BUS_TYPE = []
START_TM = []
DUR_TM = []
END_TM = []
STR = []
SEATAV = []
PRICE = []
ROUTE_NAME = []
ROUTE_LINK = []

# Open the website
driver.get("https://www.redbus.in/online-booking/chandigarh-transport-undertaking-ctu")

# Wait for the page to load
wait = WebDriverWait(driver, 10)
time.sleep(5)

# Extract all routes and bus details
all_route_names_1, all_route_links_1 = extract_all_routes_and_buses(driver, num_pages=num_pages)

# Extract bus details for each route link
for route_name, link in zip(all_route_names_1, all_route_links_1):
    driver.get(link)
    time.sleep(5)

    # Scroll down to ensure all content is loaded
    scroll_to_bottom(driver)

    # Click all "View Buses" buttons and extract data
    while True:
        view_buses_buttons = driver.find_elements(By.XPATH, '//div[@class="button" and contains(text(),"View Buses")]')

        if view_buses_buttons:
            seen_buttons = set()  # Track buttons already clicked
            for button in view_buses_buttons:
                button_id = button.get_attribute('id')  # Use a unique attribute like ID or another identifier
                if button_id not in seen_buttons:
                    seen_buttons.add(button_id)
                    try:
                        click_element(driver, button)
                        print("Clicked on 'View Buses' button")
                        time.sleep(5)  # Wait for the page to load

                        # Extract bus data
                        extract_bus_data(driver, route_name, link)
                        
                        # Scroll to the bottom to load more content
                        scroll_to_bottom(driver)
                        
                        time.sleep(5)  # Give some time before clicking the next button
                    except (NoSuchElementException, ElementClickInterceptedException, TimeoutException) as e:
                        print(f"Exception occurred: {e}")
                        continue
        else:
            # No "View Buses" buttons found, directly attempt to extract bus data
            extract_bus_data(driver, route_name, link)
            # Scroll to ensure all bus data is loaded
            scroll_to_bottom(driver)
            break  # Move to the next route after processing

all_data_7 = pd.DataFrame({
    "route_name": ROUTE_NAME,
    "route_link": ROUTE_LINK,
    "busname": BUS_NAME,
    "bustype": BUS_TYPE,
    "departing_time": START_TM,
    "duration": DUR_TM,
    "reaching_time": END_TM,
    "star_rating": STR,
    "price": PRICE,
    "seats_available": SEATAV
})

# Display the DataFrame
print(all_data_7)


Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked 

In [72]:
# 8. WBTC (CTC) Bus Routes & Timings

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException
import pandas as pd
import time

num_pages = 6
# Initialize the WebDriver
driver = webdriver.Chrome()

# Function to extract route data from the current page
def extract_routes(driver):
    route_container = driver.find_element(By.CLASS_NAME, 'D117_main')
    route_links = route_container.find_elements(By.TAG_NAME, 'a')
    
    route_name = []
    route_link = []
    
    for link in route_links:
        href = link.get_attribute('href')
        name = link.get_attribute('title')
        route_link.append(href)
        route_name.append(name)
    
    return route_name, route_link

# Function to scroll to the bottom of the page
def scroll_to_bottom(driver):
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)  # Wait for new content to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

# Function to click an element
def click_element(driver, element):
    try:
        action = ActionChains(driver)
        action.move_to_element(element).perform()  # Move to the element
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable(element)).click()
    except ElementClickInterceptedException:
        print("Element is not clickable. It might be obscured.")
    except Exception as e:
        print(f"Exception occurred while clicking: {e}")

# Function to extract bus data from the current page
def extract_bus_data(driver, route_name, link):
    try:
        wait = WebDriverWait(driver, 30)
        buscontainer = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='clearfix bus-item']")))

        for bus in buscontainer:
            try:
                busname = bus.find_element(By.XPATH, ".//div[@class='travels lh-24 f-bold d-color']").text
            except:
                busname = "N/A"
            try:
                bustype = bus.find_element(By.XPATH, ".//div[@class='bus-type f-12 m-top-16 l-color evBus']").text
            except:
                bustype = "N/A"
            try:
                busstrttm = bus.find_element(By.XPATH, ".//div[@class='dp-time f-19 d-color f-bold']").text
            except:
                busstrttm = "N/A"
            try:
                busdur = bus.find_element(By.XPATH, ".//div[@class='dur l-color lh-24']").text
            except:
                busdur = "N/A"
            try:
                busendtm = bus.find_element(By.XPATH, ".//div[@class='bp-time f-19 d-color disp-Inline']").text
            except:
                busendtm = "N/A"
            try:
                str_rating = bus.find_element(By.XPATH, ".//span[@class='']").text
            except:
                str_rating = "0"
            try:
                seat = bus.find_element(By.XPATH, ".//div[@class='seat-left m-top-30']").text.replace(" Seats available", "")
            except:
                seat = "0"
            try:
                price = bus.find_element(By.XPATH, ".//div[@class='fare d-block']").text.replace("INR ", "")
            except:
                price = "N/A"

            BUS_NAME.append(busname)
            BUS_TYPE.append(bustype)
            START_TM.append(busstrttm)
            DUR_TM.append(busdur)
            END_TM.append(busendtm)
            STR.append(str_rating)
            SEATAV.append(seat)
            PRICE.append(price)
            ROUTE_NAME.append(route_name)
            ROUTE_LINK.append(link)

    except TimeoutException:
        print("Timeout while waiting for bus elements on link:", link)

# Function to extract data from all pages
def extract_all_routes_and_buses(driver, num_pages):
    all_route_names_1 = []
    all_route_links_1 = []

    # Scrape data from each page
    for page_number in range(1, num_pages + 1):
        # Extract routes from the current page
        route_names, route_links = extract_routes(driver)
        all_route_names_1.extend(route_names)
        all_route_links_1.extend(route_links)
        
        # Find and click the 'Next' button if it exists
        if page_number < num_pages:
            try:
                pagination_container = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, '//*[@id="root"]/div/div[4]/div[12]'))
                )
                next_page_button = pagination_container.find_element(By.XPATH, f'.//div[contains(@class,"DC_117_pageTabs") and text()="{page_number + 1}"]')
                
                # Scroll to the 'Next' button and click it
                actions = ActionChains(driver)
                actions.move_to_element(next_page_button).perform()
                time.sleep(2)
                next_page_button.click()
                
                # Wait for the next page to load
                WebDriverWait(driver, 10).until(
                    EC.text_to_be_present_in_element((By.XPATH, f'//div[contains(@class, "DC_117_pageTabs DC_117_pageActive") and text()="{page_number + 1}"]'), str(page_number + 1))
                )
                time.sleep(5)  # Ensure this wait time is enough for the next page to load
            except Exception as e:
                #print(f"Could not navigate to page {page_number + 1}: {e}")
                break

    return all_route_names_1, all_route_links_1

# Initialize lists to hold bus data
BUS_NAME = []
BUS_TYPE = []
START_TM = []
DUR_TM = []
END_TM = []
STR = []
SEATAV = []
PRICE = []
ROUTE_NAME = []
ROUTE_LINK = []

# Open the website
driver.get("https://www.redbus.in/online-booking/wbtc-ctc/?utm_source=rtchometile")

# Wait for the page to load
wait = WebDriverWait(driver, 10)
time.sleep(5)

# Extract all routes and bus details
all_route_names_1, all_route_links_1 = extract_all_routes_and_buses(driver, num_pages=num_pages)

# Extract bus details for each route link
for route_name, link in zip(all_route_names_1, all_route_links_1):
    driver.get(link)
    time.sleep(5)

    # Scroll down to ensure all content is loaded
    scroll_to_bottom(driver)

    # Click all "View Buses" buttons and extract data
    while True:
        view_buses_buttons = driver.find_elements(By.XPATH, '//div[@class="button" and contains(text(),"View Buses")]')

        if view_buses_buttons:
            seen_buttons = set()  # Track buttons already clicked
            for button in view_buses_buttons:
                button_id = button.get_attribute('id')  # Use a unique attribute like ID or another identifier
                if button_id not in seen_buttons:
                    seen_buttons.add(button_id)
                    try:
                        click_element(driver, button)
                        print("Clicked on 'View Buses' button")
                        time.sleep(5)  # Wait for the page to load

                        # Extract bus data
                        extract_bus_data(driver, route_name, link)
                        
                        # Scroll to the bottom to load more content
                        scroll_to_bottom(driver)
                        
                        time.sleep(5)  # Give some time before clicking the next button
                    except (NoSuchElementException, ElementClickInterceptedException, TimeoutException) as e:
                        print(f"Exception occurred: {e}")
                        continue
        else:
            # No "View Buses" buttons found, directly attempt to extract bus data
            extract_bus_data(driver, route_name, link)
            # Scroll to ensure all bus data is loaded
            scroll_to_bottom(driver)
            break  # Move to the next route after processing

all_data_8 = pd.DataFrame({
    "route_name": ROUTE_NAME,
    "route_link": ROUTE_LINK,
    "busname": BUS_NAME,
    "bustype": BUS_TYPE,
    "departing_time": START_TM,
    "duration": DUR_TM,
    "reaching_time": END_TM,
    "star_rating": STR,
    "price": PRICE,
    "seats_available": SEATAV
})

# Display the DataFrame
print(all_data_8)


Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked 

In [73]:
# 9. Assam State Transport Corporation (ASTC) Bus Routes & Timings

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException
import pandas as pd
import time

num_pages = 6
# Initialize the WebDriver
driver = webdriver.Chrome()

# Function to extract route data from the current page
def extract_routes(driver):
    route_container = driver.find_element(By.CLASS_NAME, 'D117_main')
    route_links = route_container.find_elements(By.TAG_NAME, 'a')
    
    route_name = []
    route_link = []
    
    for link in route_links:
        href = link.get_attribute('href')
        name = link.get_attribute('title')
        route_link.append(href)
        route_name.append(name)
    
    return route_name, route_link

# Function to scroll to the bottom of the page
def scroll_to_bottom(driver):
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)  # Wait for new content to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

# Function to click an element
def click_element(driver, element):
    try:
        action = ActionChains(driver)
        action.move_to_element(element).perform()  # Move to the element
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable(element)).click()
    except ElementClickInterceptedException:
        print("Element is not clickable. It might be obscured.")
    except Exception as e:
        print(f"Exception occurred while clicking: {e}")

# Function to extract bus data from the current page
def extract_bus_data(driver, route_name, link):
    try:
        wait = WebDriverWait(driver, 30)
        buscontainer = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='clearfix bus-item']")))

        for bus in buscontainer:
            try:
                busname = bus.find_element(By.XPATH, ".//div[@class='travels lh-24 f-bold d-color']").text
            except:
                busname = "N/A"
            try:
                bustype = bus.find_element(By.XPATH, ".//div[@class='bus-type f-12 m-top-16 l-color evBus']").text
            except:
                bustype = "N/A"
            try:
                busstrttm = bus.find_element(By.XPATH, ".//div[@class='dp-time f-19 d-color f-bold']").text
            except:
                busstrttm = "N/A"
            try:
                busdur = bus.find_element(By.XPATH, ".//div[@class='dur l-color lh-24']").text
            except:
                busdur = "N/A"
            try:
                busendtm = bus.find_element(By.XPATH, ".//div[@class='bp-time f-19 d-color disp-Inline']").text
            except:
                busendtm = "N/A"
            try:
                str_rating = bus.find_element(By.XPATH, ".//span[@class='']").text
            except:
                str_rating = "0"
            try:
                seat = bus.find_element(By.XPATH, ".//div[@class='seat-left m-top-30']").text.replace(" Seats available", "")
            except:
                seat = "0"
            try:
                price = bus.find_element(By.XPATH, ".//div[@class='fare d-block']").text.replace("INR ", "")
            except:
                price = "N/A"

            BUS_NAME.append(busname)
            BUS_TYPE.append(bustype)
            START_TM.append(busstrttm)
            DUR_TM.append(busdur)
            END_TM.append(busendtm)
            STR.append(str_rating)
            SEATAV.append(seat)
            PRICE.append(price)
            ROUTE_NAME.append(route_name)
            ROUTE_LINK.append(link)

    except TimeoutException:
        print("Timeout while waiting for bus elements on link:", link)

# Function to extract data from all pages
def extract_all_routes_and_buses(driver, num_pages):
    all_route_names_1 = []
    all_route_links_1 = []

    # Scrape data from each page
    for page_number in range(1, num_pages + 1):
        # Extract routes from the current page
        route_names, route_links = extract_routes(driver)
        all_route_names_1.extend(route_names)
        all_route_links_1.extend(route_links)
        
        # Find and click the 'Next' button if it exists
        if page_number < num_pages:
            try:
                pagination_container = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, '//*[@id="root"]/div/div[4]/div[12]'))
                )
                next_page_button = pagination_container.find_element(By.XPATH, f'.//div[contains(@class,"DC_117_pageTabs") and text()="{page_number + 1}"]')
                
                # Scroll to the 'Next' button and click it
                actions = ActionChains(driver)
                actions.move_to_element(next_page_button).perform()
                time.sleep(2)
                next_page_button.click()
                
                # Wait for the next page to load
                WebDriverWait(driver, 10).until(
                    EC.text_to_be_present_in_element((By.XPATH, f'//div[contains(@class, "DC_117_pageTabs DC_117_pageActive") and text()="{page_number + 1}"]'), str(page_number + 1))
                )
                time.sleep(5)  # Ensure this wait time is enough for the next page to load
            except Exception as e:
                #print(f"Could not navigate to page {page_number + 1}: {e}")
                break

    return all_route_names_1, all_route_links_1

# Initialize lists to hold bus data
BUS_NAME = []
BUS_TYPE = []
START_TM = []
DUR_TM = []
END_TM = []
STR = []
SEATAV = []
PRICE = []
ROUTE_NAME = []
ROUTE_LINK = []

# Open the website
driver.get("https://www.redbus.in/online-booking/astc/?utm_source=rtchometile")

# Wait for the page to load
wait = WebDriverWait(driver, 10)
time.sleep(5)

# Extract all routes and bus details
all_route_names_1, all_route_links_1 = extract_all_routes_and_buses(driver, num_pages=num_pages)

# Extract bus details for each route link
for route_name, link in zip(all_route_names_1, all_route_links_1):
    driver.get(link)
    time.sleep(5)

    # Scroll down to ensure all content is loaded
    scroll_to_bottom(driver)

    # Click all "View Buses" buttons and extract data
    while True:
        view_buses_buttons = driver.find_elements(By.XPATH, '//div[@class="button" and contains(text(),"View Buses")]')

        if view_buses_buttons:
            seen_buttons = set()  # Track buttons already clicked
            for button in view_buses_buttons:
                button_id = button.get_attribute('id')  # Use a unique attribute like ID or another identifier
                if button_id not in seen_buttons:
                    seen_buttons.add(button_id)
                    try:
                        click_element(driver, button)
                        print("Clicked on 'View Buses' button")
                        time.sleep(5)  # Wait for the page to load

                        # Extract bus data
                        extract_bus_data(driver, route_name, link)
                        
                        # Scroll to the bottom to load more content
                        scroll_to_bottom(driver)
                        
                        time.sleep(5)  # Give some time before clicking the next button
                    except (NoSuchElementException, ElementClickInterceptedException, TimeoutException) as e:
                        print(f"Exception occurred: {e}")
                        continue
        else:
            # No "View Buses" buttons found, directly attempt to extract bus data
            extract_bus_data(driver, route_name, link)
            # Scroll to ensure all bus data is loaded
            scroll_to_bottom(driver)
            break  # Move to the next route after processing

all_data_9 = pd.DataFrame({
    "route_name": ROUTE_NAME,
    "route_link": ROUTE_LINK,
    "busname": BUS_NAME,
    "bustype": BUS_TYPE,
    "departing_time": START_TM,
    "duration": DUR_TM,
    "reaching_time": END_TM,
    "star_rating": STR,
    "price": PRICE,
    "seats_available": SEATAV
})

# Display the DataFrame
print(all_data_9)


Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Timeout while waiting for bus elements on link: https://www.redbus.in/bus-tickets/guwahati-to-nagaon
Timeout while waiting for bus elements on link: https://www.redbus.in/bus-tickets/jorhat-to-dhemaji
Timeout while waiting for bus elements on link: https://www.redbus.in/bus-tickets/sibsagar-to-north-lakhimpur
Timeout while waiting for bus elements on link: https://www.redbus.in/bus-tickets/dhemaji-to-jorhat
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Clicked on 'View Buses' button
Timeout while waiting for bus elements on link: https://www.redbus.in/bus-tickets/jorhat-to-gogamukh
Clicked on 'View Buses' button
Timeout while waiting for bus elements on link: https://www.redbus.in/bus-tickets/north-lakhimpur-to-golaghat
Timeout while waiting for bus elements on link: https://www.redbus.in/bus-tickets/golaghat-to-north-lakhimpur
Timeout while waiting for bus elements on link: htt

In [75]:
# 10. Kadamba Transport Corporation Limited (KTCL) Bus Routes & Timings

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException
import pandas as pd
import time

num_pages = 5
# Initialize the WebDriver
driver = webdriver.Chrome()

# Function to extract route data from the current page
def extract_routes(driver):
    route_container = driver.find_element(By.CLASS_NAME, 'D117_main')
    route_links = route_container.find_elements(By.TAG_NAME, 'a')
    
    route_name = []
    route_link = []
    
    for link in route_links:
        href = link.get_attribute('href')
        name = link.get_attribute('title')
        route_link.append(href)
        route_name.append(name)
    
    return route_name, route_link

# Function to scroll to the bottom of the page
def scroll_to_bottom(driver):
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)  # Wait for new content to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

# Function to click an element
def click_element(driver, element):
    try:
        action = ActionChains(driver)
        action.move_to_element(element).perform()  # Move to the element
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable(element)).click()
    except ElementClickInterceptedException:
        print("Element is not clickable. It might be obscured.")
    except Exception as e:
        print(f"Exception occurred while clicking: {e}")

# Function to extract bus data from the current page
def extract_bus_data(driver, route_name, link):
    try:
        wait = WebDriverWait(driver, 30)
        buscontainer = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='clearfix bus-item']")))

        for bus in buscontainer:
            try:
                busname = bus.find_element(By.XPATH, ".//div[@class='travels lh-24 f-bold d-color']").text
            except:
                busname = "N/A"
            try:
                bustype = bus.find_element(By.XPATH, ".//div[@class='bus-type f-12 m-top-16 l-color evBus']").text
            except:
                bustype = "N/A"
            try:
                busstrttm = bus.find_element(By.XPATH, ".//div[@class='dp-time f-19 d-color f-bold']").text
            except:
                busstrttm = "N/A"
            try:
                busdur = bus.find_element(By.XPATH, ".//div[@class='dur l-color lh-24']").text
            except:
                busdur = "N/A"
            try:
                busendtm = bus.find_element(By.XPATH, ".//div[@class='bp-time f-19 d-color disp-Inline']").text
            except:
                busendtm = "N/A"
            try:
                str_rating = bus.find_element(By.XPATH, ".//span[@class='']").text
            except:
                str_rating = "0"
            try:
                seat = bus.find_element(By.XPATH, ".//div[@class='seat-left m-top-30']").text.replace(" Seats available", "")
            except:
                seat = "0"
            try:
                price = bus.find_element(By.XPATH, ".//div[@class='fare d-block']").text.replace("INR ", "")
            except:
                price = "N/A"

            BUS_NAME.append(busname)
            BUS_TYPE.append(bustype)
            START_TM.append(busstrttm)
            DUR_TM.append(busdur)
            END_TM.append(busendtm)
            STR.append(str_rating)
            SEATAV.append(seat)
            PRICE.append(price)
            ROUTE_NAME.append(route_name)
            ROUTE_LINK.append(link)

    except TimeoutException:
        print("Timeout while waiting for bus elements on link:", link)

# Function to extract data from all pages
def extract_all_routes_and_buses(driver, num_pages):
    all_route_names_1 = []
    all_route_links_1 = []

    # Scrape data from each page
    for page_number in range(1, num_pages + 1):
        # Extract routes from the current page
        route_names, route_links = extract_routes(driver)
        all_route_names_1.extend(route_names)
        all_route_links_1.extend(route_links)
        
        # Find and click the 'Next' button if it exists
        if page_number < num_pages:
            try:
                pagination_container = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, '//*[@id="root"]/div/div[4]/div[12]'))
                )
                next_page_button = pagination_container.find_element(By.XPATH, f'.//div[contains(@class,"DC_117_pageTabs") and text()="{page_number + 1}"]')
                
                # Scroll to the 'Next' button and click it
                actions = ActionChains(driver)
                actions.move_to_element(next_page_button).perform()
                time.sleep(2)
                next_page_button.click()
                
                # Wait for the next page to load
                WebDriverWait(driver, 10).until(
                    EC.text_to_be_present_in_element((By.XPATH, f'//div[contains(@class, "DC_117_pageTabs DC_117_pageActive") and text()="{page_number + 1}"]'), str(page_number + 1))
                )
                time.sleep(5)  # Ensure this wait time is enough for the next page to load
            except Exception as e:
                #print(f"Could not navigate to page {page_number + 1}: {e}")
                break

    return all_route_names_1, all_route_links_1

# Initialize lists to hold bus data
BUS_NAME = []
BUS_TYPE = []
START_TM = []
DUR_TM = []
END_TM = []
STR = []
SEATAV = []
PRICE = []
ROUTE_NAME = []
ROUTE_LINK = []

# Open the website
driver.get("https://www.redbus.in/online-booking/ktcl/?utm_source=rtchometile")

# Wait for the page to load
wait = WebDriverWait(driver, 10)
time.sleep(5)

# Extract all routes and bus details
all_route_names_1, all_route_links_1 = extract_all_routes_and_buses(driver, num_pages=num_pages)

# Extract bus details for each route link
for route_name, link in zip(all_route_names_1, all_route_links_1):
    driver.get(link)
    time.sleep(5)

    # Scroll down to ensure all content is loaded
    scroll_to_bottom(driver)

    # Click all "View Buses" buttons and extract data
    while True:
        view_buses_buttons = driver.find_elements(By.XPATH, '//div[@class="button" and contains(text(),"View Buses")]')

        if view_buses_buttons:
            seen_buttons = set()  # Track buttons already clicked
            for button in view_buses_buttons:
                button_id = button.get_attribute('id')  # Use a unique attribute like ID or another identifier
                if button_id not in seen_buttons:
                    seen_buttons.add(button_id)
                    try:
                        click_element(driver, button)
                        print("Clicked on 'View Buses' button")
                        time.sleep(5)  # Wait for the page to load

                        # Extract bus data
                        extract_bus_data(driver, route_name, link)
                        
                        # Scroll to the bottom to load more content
                        scroll_to_bottom(driver)
                        
                        time.sleep(5)  # Give some time before clicking the next button
                    except (NoSuchElementException, ElementClickInterceptedException, TimeoutException) as e:
                        print(f"Exception occurred: {e}")
                        continue
        else:
            # No "View Buses" buttons found, directly attempt to extract bus data
            extract_bus_data(driver, route_name, link)
            # Scroll to ensure all bus data is loaded
            scroll_to_bottom(driver)
            break  # Move to the next route after processing

all_data_10 = pd.DataFrame({
    "route_name": ROUTE_NAME,
    "route_link": ROUTE_LINK,
    "busname": BUS_NAME,
    "bustype": BUS_TYPE,
    "departing_time": START_TM,
    "duration": DUR_TM,
    "reaching_time": END_TM,
    "star_rating": STR,
    "price": PRICE,
    "seats_available": SEATAV
})

# Display the DataFrame
print(all_data_10)


Timeout while waiting for bus elements on link: https://www.redbus.in/bus-tickets/ponda-to-belagavi
Timeout while waiting for bus elements on link: https://www.redbus.in/bus-tickets/marcel-to-belagavi
Timeout while waiting for bus elements on link: https://www.redbus.in/bus-tickets/belagavi-to-marcel
Timeout while waiting for bus elements on link: https://www.redbus.in/bus-tickets/belagavi-to-saquelim
Timeout while waiting for bus elements on link: https://www.redbus.in/bus-tickets/saquelim-to-belagavi
Clicked on 'View Buses' button
                    route_name  \
0                  Pune to Goa   
1                  Pune to Goa   
2                  Pune to Goa   
3                  Pune to Goa   
4                  Pune to Goa   
..                         ...   
443  Calangute (goa) to Panaji   
444      Goa Airport to Panaji   
445      Goa Airport to Panaji   
446      Goa Airport to Panaji   
447      Goa Airport to Panaji   

                                            route_li

In [76]:
red_bus_data=pd.concat([all_data_1,all_data_2,all_data_3,all_data_4,all_data_5,all_data_6,all_data_7,all_data_8,all_data_9,all_data_10],ignore_index=True)

In [122]:
red_bus_data=pd.DataFrame(red_bus_data)
red_bus_data

Unnamed: 0,route_name,route_link,busname,bustype,departing_time,duration,reaching_time,star_rating,price,seats_available
0,Jammu (j and k) to Srinagar,https://www.redbus.in/bus-tickets/jammu-to-sri...,New Pal Travels,NON A/C Seater / Sleeper (2+2),21:15,10h 00m,07:15,1.4,719.0,0
1,Jammu (j and k) to Srinagar,https://www.redbus.in/bus-tickets/jammu-to-sri...,North Kashmir Tour and Travels,NON A/C Sleeper (2+2),16:30,10h 00m,02:30,1.0,699.0,0
2,Jammu (j and k) to Srinagar,https://www.redbus.in/bus-tickets/jammu-to-sri...,Harikesh Tour N Travels,NON A/C Seater / Sleeper (2+2),20:30,09h 30m,06:00,1.0,719.0,0
3,Jammu (j and k) to Srinagar,https://www.redbus.in/bus-tickets/jammu-to-sri...,Jamna Travels-Jammu,Non AC Seater (2+2),21:30,10h 30m,08:00,2.3,522.0,0
4,Jammu (j and k) to Srinagar,https://www.redbus.in/bus-tickets/jammu-to-sri...,North Kashmir Tour and Travels,NON A/C Sleeper (2+2),17:15,10h 00m,03:15,1.0,699.0,0
...,...,...,...,...,...,...,...,...,...,...
8570,Calangute (goa) to Panaji,https://www.redbus.in/bus-tickets/calangute-go...,Kadamba Transport Corporation Limited (KTCL) -...,AC Seater (2+2),15:30,00h 30m,16:00,3.7,100.0,0
8571,Goa Airport to Panaji,https://www.redbus.in/bus-tickets/goa-airport-...,Kadamba Transport Corporation Limited (KTCL) -...,AC Seater (2+2),07:30,01h 00m,08:30,4.4,150.0,0
8572,Goa Airport to Panaji,https://www.redbus.in/bus-tickets/goa-airport-...,Kadamba Transport Corporation Limited (KTCL) -...,Volvo AC Seater 2+2,08:00,01h 00m,09:00,4.4,150.0,0
8573,Goa Airport to Panaji,https://www.redbus.in/bus-tickets/goa-airport-...,Kadamba Transport Corporation Limited (KTCL) -...,AC Seater (2+2),13:45,01h 00m,14:45,4.4,150.0,0


In [109]:
red_bus_data=red_bus_data.dropna()

In [111]:
red_bus_data.to_csv('red_bus_data.csv',index=False)

In [112]:
red_bus_data=pd.read_csv('red_bus_data.csv')

In [None]:
import pymysql
myconnection=pymysql.connect(host="127.0.0.1",user="root",passwd="Farvez@12345")
myconnection.cursor().execute("create database red_bus_project")

In [117]:
a = ",".join(f"{i} {j}"
for i,j in zip(red_bus_data.columns,red_bus_data.dtypes)).replace("object","text").replace("float64","float").replace("int64","int")

# Replace specific types
a = a.replace("departing_time text", "departing_time time")
a = a.replace("reaching_time text", "reaching_time time")
a = a.replace("seats_available int", "seats_available int")
a = a.replace("price int", "price decimal")

# Add primary key column
a = "id INT AUTO_INCREMENT PRIMARY KEY," + a

print(a)

id INT AUTO_INCREMENT PRIMARY KEY,route_name text,route_link text,busname text,bustype text,departing_time time,duration text,reaching_time time,star_rating float,price float,seats_available text


In [118]:
f"create table red_bus_data ({a})"

'create table red_bus_data (id INT AUTO_INCREMENT PRIMARY KEY,route_name text,route_link text,busname text,bustype text,departing_time time,duration text,reaching_time time,star_rating float,price float,seats_available text)'

In [119]:
myconnection.cursor().execute(f"create table red_bus_project.red_bus_data ({a})")

0

In [120]:
len(red_bus_data)

8575

In [105]:
red_bus_data.isnull().sum()

route_name           0
route_link           0
busname            563
bustype            581
departing_time     563
duration           563
reaching_time      563
star_rating        563
price              563
seats_available    131
dtype: int64

In [106]:
df=red_bus_data.dropna()

In [110]:
red_bus_data.isnull().sum()

route_name         0
route_link         0
busname            0
bustype            0
departing_time     0
duration           0
reaching_time      0
star_rating        0
price              0
seats_available    0
dtype: int64

In [108]:
len(df)

8575

In [133]:
import pymysql

# Establish the database connection
myconnection = pymysql.connect(host="127.0.0.1", user="root", passwd="Farvez@12345", database="red_bus_project")

# Create a cursor object
cursor = myconnection.cursor()

# Define the insert query
insert_query = """
INSERT INTO red_bus_data (route_name, route_link, busname, bustype, departing_time, duration, reaching_time, star_rating, price, seats_available) 
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""

# Iterate over the DataFrame and insert each row
for index, row in red_bus_data.iterrows():
    cursor.execute(insert_query, (row['route_name'], row['route_link'], row['busname'], row['bustype'], row['departing_time'], row['duration'], row['reaching_time'], row['star_rating'], row['price'], row['seats_available']))

# Commit the transaction
myconnection.commit()

# Close the cursor and connection
cursor.close()
myconnection.close()
