# Kayak - Data Scraping 

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from bs4 import BeautifulSoup
import time
import pandas as pd

In [2]:
# Path to your ChromeDriver
driver_path = '/usr/local/bin/chromedriver'  # Ensure the path is correct and escaped properly

# Set up the service
service = Service(driver_path)

# Set up Chrome options (if needed)
#options = webdriver.ChromeOptions()

# Initialize the WebDriver
driver = webdriver.Chrome()

# Open the webpage
url = 'https://www.ca.kayak.com/flights/YYZ-HYD/2024-10-02?sort=bestflight_a'
driver.get(url)

# Wait for the page to load completely
time.sleep(10)  # Initial wait time

# Function to click the "Show more results" button
def click_show_more_button(driver):
    try:
        show_more_button = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div.ULvh-button.show-more-button'))
        )
        show_more_button.click()
        return True
    except (NoSuchElementException, TimeoutException):
        return False

# Click the "Show more results" button until it no longer exists
''''n = 5
while n!=0:
    time.sleep(3)  # Wait for new results to load
    click_show_more_button(driver)
    n -= 1'''
time.sleep(15)
# Get the page source after loading all results
page_source = driver.page_source

# Close the WebDriver
driver.quit()

# Parse the page source with BeautifulSoup
soup = BeautifulSoup(page_source, 'html.parser')

# Find all flight result containers
flights = soup.find_all('div', class_='nrc6 nrc6-mod-pres-default')#class_='Fxw9')

# Initialize lists to store the data
airlines = []
sources = []
destinations = []
departures = []
arrivals = []
num_stops = []
stopover_details_list = []
prices = []
classes = []

# Extract and print specific details
for flight in flights:
    airline = flight.find('div', class_='J0g6-operator-text').text.strip() if flight.find('div', class_='J0g6-operator-text') else ''
    source = flight.find_all('div', class_='c_cgF c_cgF-mod-variant-full-airport-wide')[0]['title'].strip() if len(flight.find_all('div', class_='c_cgF c_cgF-mod-variant-full-airport-wide')) > 0 else ''
    destination = flight.find_all('div', class_='c_cgF c_cgF-mod-variant-full-airport-wide')[1]['title'].strip() if len(flight.find_all('div', class_='c_cgF c_cgF-mod-variant-full-airport-wide')) > 1 else ''
    departure_arrival_div = flight.find('div', class_='vmXl vmXl-mod-variant-large')
    departure_span, arrival_span = departure_arrival_div.find_all('span')[:3:2] if departure_arrival_div else (None, None)
    departure = departure_span.text.strip() if departure_span else ''
    arrival = arrival_span.text.strip() if arrival_span else ''
 
    jweo_div = flight.find('div', class_='JWEO')
    num_stops_div = jweo_div.find('div', class_='vmXl vmXl-mod-variant-default') if jweo_div else None
    num_stops_text = num_stops_div.find('span', class_='JWEO-stops-text').text.strip() if num_stops_div else ''
    num_stops.append(num_stops_text)
    
    stopover_div = jweo_div.find('div', class_='c_cgF c_cgF-mod-variant-full-airport') if jweo_div else None
    stopover_details = ', '.join([span.get('title', '') for span in stopover_div.find_all('span')]) if stopover_div else ''
    stopover_details_list.append(stopover_details)  # Corrected variable name

    price = flight.find('div', class_='f8F1-price-text').text.strip() if flight.find('div', class_='f8F1-price-text') else ''
    travel_class = flight.find('div', class_='aC3z-name')['title'].strip() if flight.find('div', class_='aC3z-name') else ''

    airlines.append(airline)
    sources.append(source)
    destinations.append(destination)
    departures.append(departure)
    arrivals.append(arrival)
    #number_of_stops.append(num_stops)
    #stopover_details_list.append(stopover_details)
    prices.append(price)
    classes.append(travel_class)

# Create a DataFrame
df = pd.DataFrame({
    'Airline': airlines,
    'Source': sources,
    'Destination': destinations,
    'Departure': departures,
    'Arrival': arrivals,
    'Number of Stops': num_stops,
    'Stopover Details': stopover_details_list,
    'Price': prices,
    'Class': classes
})

#print(df)

df.to_csv('flight_data.csv', index=False)



The chromedriver version (124.0.6367.207) detected in PATH at /usr/local/bin/chromedriver might not be compatible with the detected chrome version (125.0.6422.78); currently, chromedriver 125.0.6422.78 is recommended for chrome 125.*, so it is advised to delete the driver in PATH and retry


In [3]:
df.head()

Unnamed: 0,Airline,Source,Destination,Departure,Arrival,Number of Stops,Stopover Details,Price,Class
0,Air India,Toronto Pearson Intl,Hyderabad Rajiv Gandhi Intl,12:15 pm,4:45 pm+1,1 stop,", 3h 20m layover, <b>New Delhi Indira Gandhi I...","C$ 1,257",Economy
1,"Air Canada, Etihad Airways • Operated by Air C...",Toronto Pearson Intl,Hyderabad Rajiv Gandhi Intl,6:15 am,7:55 pm+1,2 stops,", 7h 42m layover, <b>Airport change LGA-JFK</b...",C$ 841,Basic Economy
2,"Air India, Vistara",Toronto Pearson Intl,Hyderabad Rajiv Gandhi Intl,12:15 pm,5:00 pm+1,1 stop,", 3h 20m layover, <b>New Delhi Indira Gandhi I...","C$ 1,283",Economy
3,Etihad Airways,Toronto Pearson Intl,Hyderabad Rajiv Gandhi Intl,10:10 pm,3:00 am+2,1 stop,", 2h 25m layover, <b>Abu Dhabi Zayed Intl</b>","C$ 1,338",Economy Basic
4,Emirates,Toronto Pearson Intl,Hyderabad Rajiv Gandhi Intl,2:30 pm,8:15 pm+1,1 stop,", 3h 50m layover, <b>Dubai Intl</b>","C$ 1,432",Eco Special
