In [2]:
"""
Scrape bus information from a list of URLs, store the data in DataFrames, and save the combined data to a CSV file.

This script initializes a Selenium WebDriver, navigates to each URL, interacts with elements on the page to collect bus details, and appends the data to a list of DataFrames. The data is then concatenated into a single DataFrame and saved to a CSV file for further analysis.

Exceptions are handled for navigation and element interaction errors. The final DataFrame is saved to 'redbus_final_data.csv' if data is collected.
"""

import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import WebDriverException, TimeoutException, NoSuchElementException

# Column order of every per-URL DataFrame (and therefore of the final CSV).
_COLUMNS = [
    "Bus Name", "Bus Type", "Departure Time", "Arrival Time", "Duration",
    "Fare", "Rating", "Seat Availability", "Bus Link", "Bus Route",
]

# Locator shared by the "wait for content" step and the extraction step.
_BUS_ITEM_XPATH = '//div[@class="clearfix bus-item"]'


def _text_or_default(parent, by, selector, default=None):
    """Return the stripped text of the first child of *parent* matching *selector*.

    Returns *default* when no matching child exists.
    """
    try:
        return parent.find_element(by, selector).text.strip()
    except NoSuchElementException:
        return default


def _click_view_buses(driver):
    """Click every element with class "button" whose text is "VIEW BUSES".

    Errors are reported with ``print`` and never propagated, so a page
    without such buttons is still scraped by the caller.
    """
    try:
        # Wait until at least one button with the desired class name is present
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "button"))
        )

        for button in driver.find_elements(By.CLASS_NAME, "button"):
            if button.text != "VIEW BUSES":
                continue
            try:
                # Scroll to the button to bring it into view
                driver.execute_script("arguments[0].scrollIntoView(true);", button)
                time.sleep(1)  # Small delay to ensure scrolling is complete

                # Click via JavaScript rather than WebElement.click()
                driver.execute_script("arguments[0].click();", button)
                time.sleep(2)  # Wait for data to load after clicking
            except Exception as e:
                # Best effort: a failed click must not stop the other buttons.
                print(f"Error clicking button: {e}")

    except NoSuchElementException as e:
        print(f"No such element: {e}")
    except TimeoutException as e:
        print(f"Timeout: {e}")
    except WebDriverException as e:
        print(f"WebDriverException: {e}")


def _scroll_to_bottom(driver):
    """Scroll to the page bottom repeatedly until the document height stops growing."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)  # Give newly requested content time to render
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def _parse_bus_item(result, url, bus_route):
    """Build one output row (a tuple ordered like ``_COLUMNS``) from a bus-item element.

    Missing fields become ``None``, except the rating, which defaults to '0'.
    """
    bus_name = _text_or_default(
        result, By.XPATH, './/div[@class="travels lh-24 f-bold d-color"]')
    bus_type = _text_or_default(
        result, By.XPATH, './/div[@class="bus-type f-12 m-top-16 l-color evBus"]')
    depart_time = _text_or_default(
        result, By.XPATH, './/div[@class="dp-time f-19 d-color f-bold"]')
    arr_time = _text_or_default(
        result, By.XPATH, './/div[@class="bp-time f-19 d-color disp-Inline"]')
    dur = _text_or_default(
        result, By.XPATH, './/div[@class="dur l-color lh-24"]')
    fare = _text_or_default(result, By.CSS_SELECTOR, 'span.f-19.f-bold')
    rating = _text_or_default(
        result, By.XPATH,
        './/div[contains(@class, "rating-sec") and contains(@class, "lh-24")]',
        default='0')

    seats_text = _text_or_default(
        result, By.XPATH,
        './/div[contains(@class, "seat-left") and contains(@class, "m-top-30")]')
    # Keep only the first whitespace-separated token. An element that exists
    # but has empty text yields None instead of raising IndexError.
    seat_availability = seats_text.split()[0] if seats_text else None

    return (bus_name, bus_type, depart_time, arr_time, dur, fare, rating,
            seat_availability, url, bus_route)


# Initialize Chrome WebDriver
driver = webdriver.Chrome()

# Initialize a list to store all DataFrames
all_dfs = []

# Everything that uses the browser runs inside try/finally so the Chrome
# process is shut down even if an unexpected exception interrupts the run.
try:
    driver.maximize_window()

    # Read URLs from CSV into a DataFrame, including bus_link and bus_route
    urls_df = pd.read_csv('redbus_scraped_datas.csv', usecols=['bus_link', 'bus_route'])

    # Loop through each URL in the DataFrame
    for i, row in urls_df.iterrows():
        url = str(row['bus_link']).strip()  # Convert to string and strip whitespace
        bus_route = row['bus_route']

        # A missing cell is read as NaN, which str() turns into 'nan'.
        if not url or url.lower() == 'nan':
            print(f"Skipping invalid URL at index {i}: {url}")
            continue

        try:
            # Navigate to the URL
            driver.get(url)
            print(f"Scraping data from: {url}")

            _click_view_buses(driver)

            # Wait for the main content to be loaded
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.XPATH, _BUS_ITEM_XPATH))
            )

            # Scroll down to the bottom of the page to load all content
            _scroll_to_bottom(driver)

            # Extract the web elements and turn each one into a row
            results = driver.find_elements(By.XPATH, _BUS_ITEM_XPATH)
            data_list = [_parse_bus_item(result, url, bus_route) for result in results]

            # Create DataFrame for the current URL and keep it for the final concat
            df = pd.DataFrame(data_list, columns=_COLUMNS)
            all_dfs.append(df)

        except (WebDriverException, TimeoutException) as e:
            print(f"Error navigating to URL: {url}")
            print(str(e))  # Print the exception details for debugging purposes
finally:
    # Close the Selenium driver
    driver.quit()

# Concatenate all DataFrames in the list into a single DataFrame
if all_dfs:
    total_results = pd.concat(all_dfs, ignore_index=True)
    # Save the concatenated DataFrame to a CSV file
    total_results.to_csv('redbus_final_data.csv', index=False)
    # Print the final concatenated DataFrame
    print(total_results)
else:
    print("No data collected.")


Scraping data from: https://www.redbus.in/bus-tickets/pune-to-goa
Scraping data from: https://www.redbus.in/bus-tickets/goa-to-pune
Scraping data from: https://www.redbus.in/bus-tickets/mumbai-to-goa
Scraping data from: https://www.redbus.in/bus-tickets/goa-to-mumbai
Scraping data from: https://www.redbus.in/bus-tickets/pandharpur-to-goa
Scraping data from: https://www.redbus.in/bus-tickets/bangalore-to-goa
Scraping data from: https://www.redbus.in/bus-tickets/goa-to-pandharpur
Scraping data from: https://www.redbus.in/bus-tickets/belagavi-to-goa
Scraping data from: https://www.redbus.in/bus-tickets/goa-to-bangalore
Scraping data from: https://www.redbus.in/bus-tickets/solapur-to-goa
Scraping data from: https://www.redbus.in/bus-tickets/goa-to-kolhapur-maharashtra
Scraping data from: https://www.redbus.in/bus-tickets/goa-to-solapur
Scraping data from: https://www.redbus.in/bus-tickets/goa-to-sangola
Scraping data from: https://www.redbus.in/bus-tickets/sangola-to-goa
Scraping data from