In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import re
import pandas as pd

#scrolling function
def scroll_to_bottom(driver):
    scrolling = True
    while scrolling:
        old_page_source = driver.page_source
        body=driver.find_element(By.TAG_NAME, 'body')
        body.send_keys(Keys.END)
        time.sleep(3)
        new_page_source = driver.page_source
        if old_page_source == new_page_source:
            scrolling = False

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 20)

driver.get("https://www.redbus.in/online-booking/chandigarh-transport-undertaking-ctu");
driver.maximize_window()
scroll_to_bottom(driver)


#to check viewbus button for gvt buses
def click_view_buses_buttons(driver, wait):
    while True:
        try:
            # Find all "View Buses" buttons
            view_buses_buttons = driver.find_elements(By.XPATH, '//div[@class="clearfix"]//div[contains(@class, "button") and contains(text(), "View Buses")]')
            if not view_buses_buttons:
                break

            for view_buses_button in view_buses_buttons:
                try:
                    # Scroll the button into view 
                    actions = ActionChains(driver)
                    actions.move_to_element(view_buses_button).perform()
                    time.sleep(2)

                    # Click the button
                    view_buses_button.click()
                    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'travels')))
                    time.sleep(3)
                except Exception as e:
                    print("Error at clicking view bus button:",e)
        except Exception as e:
            print("Error at finding view bus button:",e)
            break

#extracting all bus details
def extract_all_bus_items(driver, wait, route_link, route_title):
    driver.get(route_link)
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'travels')))
    time.sleep(2)

    scroll_to_bottom(driver)
    click_view_buses_buttons(driver, wait)

    bus_items = []
    bus_elements = driver.find_elements(By.CLASS_NAME, 'clearfix.bus-item')
    for bus in bus_elements:
        try:
            # Seat availability text
            seat_left_text = bus.find_element(By.CLASS_NAME, 'seat-left').text
                
            # Regular expression to find the first integer in the text
            match = re.search(r'\d+', seat_left_text)
            seat_left = int(match.group(0)) if match else 0
            
            details = {
               'route_name': route_title,
                'route_link': route_link,
                'busname': '',
                'bustype': '',
                'departing_time': '',
                'departing_location': '',
                'duration': '',
                'reaching_time': '',
                'boardingpoint_location': '',
                'star_rating': '',
                'price': '',
                'seats_available': ''
            }

            try:
                details['busname'] = bus.find_element(By.CLASS_NAME, 'travels').text
            except Exception as e:
                print("Error at extracting busname:",e)

            try:
                details['bustype'] = bus.find_element(By.CLASS_NAME, 'bus-type').text
            except Exception as e:
                print("Error at extracting bustype:",e)

            try:
                details['departing_time'] = bus.find_element(By.CLASS_NAME, 'dp-time').text
            except Exception as e:
                print("Error at extracting departing time:",e)

            try:
                details['departing_location'] = bus.find_element(By.CLASS_NAME, 'dp-loc').get_attribute('title')
            except Exception as e:
                print("Error at extracting departing location:",e)

            try:
                details['duration'] = bus.find_element(By.CLASS_NAME, 'dur').text
            except Exception as e:
                print("Error at extracting duration:",e)

            try:
                details['reaching_time'] = bus.find_element(By.CLASS_NAME, 'bp-time').text
            except Exception as e:
                print("Error at extracting reaching time:",e)

            try:
                details['boardingpoint_location'] = bus.find_element(By.CLASS_NAME, 'bp-loc').get_attribute('title')
            except Exception as e:
                print("Error at extracting boarding point location:",e)

            try:
                details['star_rating'] = bus.find_element(By.CLASS_NAME, 'rating').text
            except Exception as e:
                print("Error at extracting star rating:",e)

            try:
                details['price'] =  int(bus.find_element(By.CSS_SELECTOR, '.seat-fare .fare.d-block .f-19.f-bold').text)
            except Exception as e:
                print("Error at extracting price:",e)

            try:
                details['seats_available'] = seat_left
            except Exception as e:
                print("Error at extracting seats available:",e)

            bus_items.append(details)
            print(f"Loaded bus details: {details}")
        except Exception as e:
            print(f"Error extracting bus details: {e}")
            continue
    return bus_items

#find routename and route link
all_routes = []
while True:
    try:
        ktcl_routes = driver.find_elements(By.CLASS_NAME, "route")
        for route in ktcl_routes:
            title = route.text
            link = route.get_attribute('href')
            if title and link:
                all_routes.append({"title": title, "link": link})

        pagination_container = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "DC_117_paginationTable")))
        current_page = pagination_container.find_element(By.CLASS_NAME, "DC_117_pageActive").text
        next_page_number = int(current_page) + 1
        next_page_button = pagination_container.find_element(By.XPATH, f'.//div[contains(@class, "DC_117_pageTabs") and text()="{next_page_number}"]')
        actions = ActionChains(driver)
        actions.move_to_element(next_page_button).perform()
        time.sleep(1)
        next_page_button.click()
        wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "route")))
        time.sleep(2)
    except Exception as e:
        break

all_bus_details = []

print("Loading,...")

try:
    for route in all_routes:
        route_title = route['title']
        route_link = route['link']
        try:
            bus_items = extract_all_bus_items(driver, wait, route_link, route_title)
            all_bus_details.extend(bus_items)
        except Exception as e:
            print("Error at extracting route and link with bus details;",e)
except Exception as e:
    print("Error:",e)
finally:
    driver.quit()

df= pd.DataFrame(all_bus_details)

print("\nBus Details done:")


Loading,...
Error at extracting price: invalid literal for int() with base 10: '404.76'
Loaded bus details: {'route_name': 'Chandigarh to Delhi', 'route_link': 'https://www.redbus.in/bus-tickets/chandigarh-to-delhi', 'busname': 'Chandigarh Transport Undertaking (CTU) - 165720', 'bustype': 'HVAC Seater (2+3)', 'departing_time': '15:30', 'departing_location': 'Sector 17', 'duration': '05h 35m', 'reaching_time': '21:05', 'boardingpoint_location': 'ISBT Kashmiri Gate', 'star_rating': '4.1', 'price': '', 'seats_available': 47}
Error at extracting price: invalid literal for int() with base 10: '404.76'
Loaded bus details: {'route_name': 'Chandigarh to Delhi', 'route_link': 'https://www.redbus.in/bus-tickets/chandigarh-to-delhi', 'busname': 'Chandigarh Transport Undertaking (CTU) - 165722', 'bustype': 'HVAC Seater (2+3)', 'departing_time': '16:30', 'departing_location': 'Sector 17', 'duration': '05h 35m', 'reaching_time': '22:05', 'boardingpoint_location': 'ISBT Kashmiri Gate', 'star_rating':

In [2]:
df

Unnamed: 0,route_name,route_link,busname,bustype,departing_time,departing_location,duration,reaching_time,boardingpoint_location,star_rating,price,seats_available
0,Chandigarh to Delhi,https://www.redbus.in/bus-tickets/chandigarh-t...,Chandigarh Transport Undertaking (CTU) - 165720,HVAC Seater (2+3),15:30,Sector 17,05h 35m,21:05,ISBT Kashmiri Gate,4.1,,47
1,Chandigarh to Delhi,https://www.redbus.in/bus-tickets/chandigarh-t...,Chandigarh Transport Undertaking (CTU) - 165722,HVAC Seater (2+3),16:30,Sector 17,05h 35m,22:05,ISBT Kashmiri Gate,3.4,,46
2,Chandigarh to Delhi,https://www.redbus.in/bus-tickets/chandigarh-t...,Chandigarh Transport Undertaking (CTU) - 165724,HVAC Seater (2+3),17:00,Sector 17,05h 35m,22:35,ISBT Kashmiri Gate,3.5,,47
3,Chandigarh to Delhi,https://www.redbus.in/bus-tickets/chandigarh-t...,Chandigarh Transport Undertaking (CTU) - 165726,HVAC Seater (2+3),17:30,Sector 17,05h 35m,23:05,ISBT Kashmiri Gate,4.5,,47
4,Chandigarh to Delhi,https://www.redbus.in/bus-tickets/chandigarh-t...,RSRTC - 192348,Super Luxury Volvo AC Seater Pushback 2+2,21:15,SECTOR 17,04h 45m,02:00,DELHI,3.2,676,32
...,...,...,...,...,...,...,...,...,...,...,...,...
828,Narnaul to Chandigarh,https://www.redbus.in/bus-tickets/narnaul-to-c...,Chandigarh Transport Undertaking (CTU) - 165855,HVAC Seater (2+3),17:00,NARNAUL Bus Teminal,06h 00m,23:00,Sector 17,4.6,,42
829,Narnaul to Chandigarh,https://www.redbus.in/bus-tickets/narnaul-to-c...,Hari Das Tour &Travels,Bharat Benz A/C Seater /Sleeper (2+1),22:15,NH D Narnaul Bypass,06h 05m,04:20,Sec 43,3.7,664,27
830,Chandigarh to Jawala Ji,https://www.redbus.in/bus-tickets/chandigarh-t...,HRTC - 1514,Ordinary 3+2 Non AC Seater,16:00,CHANDIGARH,05h 45m,21:45,Bus Stand,3.9,318,37
831,Chandigarh to Jawala Ji,https://www.redbus.in/bus-tickets/chandigarh-t...,HRTC - 1285,Ordinary 3+2 Non AC Seater,18:40,CHANDIGARH,05h 20m,00:00,Bus Stand,3.9,293,37


In [3]:
#convert dataframe into csv
path = "D:\\CHITRA\\REDBUSPROJECT\\rb_chandigarh.csv"
df.to_csv(path, index=False)

In [None]:
#connect python to mysql

In [4]:
import pandas as pd
import pymysql
import re

#clean data
def clean_data(value, data_type):
    if pd.isna(value):  # Handle NaN values
        return None
    if data_type == 'int':
        return int(''.join(filter(str.isdigit, str(value))))   # example: a="123,4" str()='1','2','3','4', to 1234
    elif data_type == 'float':
        return float(''.join(filter(lambda x: x.isdigit() or x == '.', str(value))))
    else:
        return value

#convert duration to 'HH:MM:SS' format
def convert_duration(duration):
    if pd.isna(duration):  # Handle NaN values
        return None
    match = re.match(r'(\d+)h\s*(\d+)m', duration)  # s* for replace space
    if match:
        hours = int(match.group(1))
        minutes = int(match.group(2))
        return f'{hours:02}:{minutes:02}:00'    # like 02:30:00
    return None  # return None if format does not match

# Read CSV file
csv_file_path = r"D:\\CHITRA\\REDBUSPROJECT\\rb_chandigarh.csv"
df = pd.read_csv(csv_file_path)

# Connect to MySQL
connection = pymysql.connect(
    host='127.0.0.1',
    user='root',
    password='2210',
    database='finalredbusproject'
)
cursor = connection.cursor()

# Create table
create_table_query = """
CREATE TABLE IF NOT EXISTS rb_chandigarh (
    id INT AUTO_INCREMENT PRIMARY KEY,
    route_name VARCHAR(255),
    route_link VARCHAR(255),
    busname VARCHAR(255),
    bustype VARCHAR(255),
    departing_time TIME,
    duration TIME,
    reaching_time TIME,
    star_rating FLOAT,
    price INT,
    seats_available INT
);
"""
cursor.execute(create_table_query)

# all NaNs (not a number) are replaced with None
df = df.where(pd.notnull(df), None)

# Insert the extracted rows into the MySQL table...
# ensuring that all data from the DataFrame is inserted into the database table row by row.
for index, row in df.iterrows():  # df.iterrows each row
    insert_query = """
    INSERT INTO rb_chandigarh (route_name, route_link, busname, bustype, departing_time, duration, reaching_time, star_rating, price, seats_available)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    # %s is a placeholder for each column
    row_data = (
        row['route_name'],
        row['route_link'],
        row['busname'],
        row['bustype'],
        row['departing_time'],
        convert_duration(row['duration']),
        row['reaching_time'],
        clean_data(row['star_rating'], 'float'),
        clean_data(row['price'], 'int'),
        clean_data(row['seats_available'], 'int')
    )
    
    try:
        cursor.execute(insert_query, row_data)
    except Exception as e:
        print(f"Error inserting row {index}: {e}")
        print(f"Row data: {row_data}")

# Commit and close the connection
connection.commit()
cursor.close()
connection.close()

print("Data inserted successfully!")


Data inserted successfully!
