# Final Script - Web scraping from Kayak's website

In [None]:
import pandas as pd
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException
from bs4 import BeautifulSoup
import time
import os

# Lookup table from the airport label used in the input spreadsheet to the
# three-letter code that goes into the search URL.
airport_codes = dict(
    [
        ("Toronto Pearson International (YYZ)", "YYZ"),
        ("Bengaluru (BLR)", "BLR"),
        ("Indira Gandhi International (DEL)", "DEL"),
        ("Mumbai (BOM)", "BOM"),
        ("Hyderabad (HYD)", "HYD"),
        ("Chennai (MAA)", "MAA"),
        ("Ahmedabad (AMD)", "AMD"),
        ("Kochi (COK)", "COK"),
        ("Colombo Bandaranayake (CMB)", "CMB"),
        ("Kathmandu (KTM)", "KTM"),
        ("Mexico City Juarez International (MEX)", "MEX"),
        ("Sao Paulo Guarulhos (GRU)", "GRU"),
        ("Aminu Kano Intl (KAN)", "KAN"),
    ]
)

def generate_flight_url(source, destination, date, travel_class="economy"):
    """Build the Kayak (ca.kayak.com) flight-search URL for a route and date.

    Args:
        source: Code of the departure airport, placed first in the route.
        destination: Code of the arrival airport, placed second in the route.
        date: Travel date, inserted into the URL path as given.
        travel_class: "premium" or "business" adds that word as a path
            segment; any other value (including the default "economy")
            leaves the segment empty.

    Returns:
        The search URL, sorted by "bestflight_a".
    """
    cabin_segment = travel_class if travel_class in ("premium", "business") else ""
    route = f"{source}-{destination}"
    return "https://www.ca.kayak.com/flights/" + f"{route}/{date}/{cabin_segment}" + "?sort=bestflight_a"

# Read the route list from Excel. The 'Source' and 'Destination' columns are
# looked up in airport_codes, so they must hold the labels used as its keys.
input_data = pd.read_excel("flight_data.xlsx")

# Get today's date; it is the first travel date that gets searched
today = datetime.today()

# List to store all generated URLs
all_urls = []

# Cabin classes searched for every route and date, in the order they are queued
travel_classes = ("economy", "premium", "business")

# Iterate over rows in the Excel file
for index, row in input_data.iterrows():
    source = airport_codes.get(row['Source'], "")
    destination = airport_codes.get(row['Destination'], "")

    if not (source and destination):
        # Report unmapped airports instead of silently dropping the route, so
        # a typo in the spreadsheet does not go unnoticed.
        print(
            f"Skipping row {index}: no airport code for "
            f"{row['Source']!r} -> {row['Destination']!r}"
        )
        continue

    # Generate URLs for 50 consecutive days, starting with today
    for day in range(50):
        formatted_date = (today + timedelta(days=day)).strftime("%Y-%m-%d")

        # One search URL per cabin class for this route and date
        for travel_class in travel_classes:
            all_urls.append(
                generate_flight_url(source, destination, formatted_date, travel_class=travel_class)
            )

def click_show_more_button(driver):
    """Click the "show more" button on the current page, if one turns up.

    Waits up to 10 seconds for an element matching
    ``div.ULvh-button.show-more-button`` to be present in the DOM, then
    clicks it through JavaScript.

    Args:
        driver: The Selenium WebDriver whose current page is searched.

    Returns:
        True if the button was found and the click script ran, False if one
        of the handled Selenium exceptions (including the wait timing out)
        was raised.
    """
    button_locator = (By.CSS_SELECTOR, 'div.ULvh-button.show-more-button')
    try:
        button = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(button_locator)
        )
        # Click via script rather than a native WebDriver click
        driver.execute_script("arguments[0].click();", button)
    except (NoSuchElementException, TimeoutException, ElementClickInterceptedException):
        return False
    return True

# Column order of the DataFrame returned by scrape()
_FLIGHT_COLUMNS = [
    'Airline', 'Source', 'Destination', 'Departure', 'Arrival',
    'Number of Stops', 'Stopover Details', 'Price', 'Class', 'Date',
]


def _text_of(node):
    """Return the stripped text of a parsed element, or '' if it is None."""
    return node.text.strip() if node is not None else ''


def _title_of(node):
    """Return the stripped ``title`` attribute of an element, or '' if missing."""
    return node.get('title', '').strip() if node is not None else ''


def _load_results_page(url, driver_path, max_show_more):
    """Open ``url`` in Chrome, expand the result list and return the page HTML."""
    driver = webdriver.Chrome(service=Service(driver_path))
    try:
        driver.get(url)

        # Fixed wait to give the page time to load
        time.sleep(5)

        # Click "show more" at most max_show_more times, stopping early as
        # soon as the button can no longer be found.
        for _ in range(max_show_more):
            time.sleep(3)  # Wait for new results to load
            if not click_show_more_button(driver):
                break
        time.sleep(5)

        return driver.page_source
    finally:
        # Always end the browser session, even if loading or clicking failed
        driver.quit()


def _parse_flight(flight, date):
    """Turn one flight result container into a row dict keyed by column name."""
    # The first two "full-airport-wide" divs carry source and destination
    airports = flight.find_all('div', class_='c_cgF c_cgF-mod-variant-full-airport-wide')
    source = _title_of(airports[0]) if len(airports) > 0 else ''
    destination = _title_of(airports[1]) if len(airports) > 1 else ''

    # Departure and arrival are the 1st and 3rd <span> of the time block;
    # either stays '' when the block has fewer spans than expected.
    times_div = flight.find('div', class_='vmXl vmXl-mod-variant-large')
    time_spans = times_div.find_all('span') if times_div is not None else []
    departure = _text_of(time_spans[0]) if len(time_spans) > 0 else ''
    arrival = _text_of(time_spans[2]) if len(time_spans) > 2 else ''

    # Stop count and stopover airports both live under the 'JWEO' div
    jweo_div = flight.find('div', class_='JWEO')
    stops_div = None
    stopover_div = None
    if jweo_div is not None:
        stops_div = jweo_div.find('div', class_='vmXl vmXl-mod-variant-default')
        stopover_div = jweo_div.find('div', class_='c_cgF c_cgF-mod-variant-full-airport')
    stops_span = stops_div.find('span', 'JWEO-stops-text') if stops_div is not None else None
    if stopover_div is not None:
        stopover_details = ', '.join(
            span.get('title', '') for span in stopover_div.find_all('span')
        )
    else:
        stopover_details = ''

    return {
        'Airline': _text_of(flight.find('div', class_='J0g6-operator-text')),
        'Source': source,
        'Destination': destination,
        'Departure': departure,
        'Arrival': arrival,
        'Number of Stops': _text_of(stops_span),
        'Stopover Details': stopover_details,
        'Price': _text_of(flight.find('div', class_='f8F1-price-text')),
        'Class': _title_of(flight.find('div', class_='aC3z-name')),
        'Date': date,
    }


def scrape(url, driver_path='/usr/local/bin/chromedriver', max_show_more=5):
    """Scrape the flight results shown at ``url`` into a DataFrame.

    Loads the page in Chrome, clicks the "show more" button up to
    ``max_show_more`` times, then parses every
    ``div.nrc6.nrc6-mod-pres-default`` container in the resulting HTML.
    Fields that cannot be found in a container are returned as ''.

    Args:
        url: Search URL. Its 6th '/'-separated part is used as the date,
            which matches the URLs built by generate_flight_url.
        driver_path: Filesystem path of the chromedriver executable.
        max_show_more: Maximum number of "show more" clicks.

    Returns:
        A DataFrame with one row per result container and the columns
        Airline, Source, Destination, Departure, Arrival, Number of Stops,
        Stopover Details, Price, Class and Date (all strings).

    Raises:
        IndexError: If ``url`` has fewer than six '/'-separated parts.
    """
    # Extract the date first so a malformed URL fails before a browser starts
    date_in_url = url.split('/')[5]

    page_source = _load_results_page(url, driver_path, max_show_more)

    # Parse the page source with BeautifulSoup
    soup = BeautifulSoup(page_source, 'html.parser')

    # Find all flight result containers
    flights = soup.find_all('div', class_='nrc6 nrc6-mod-pres-default')

    rows = [_parse_flight(flight, date_in_url) for flight in flights]

    # Passing columns keeps the header and order even when no rows were found
    return pd.DataFrame(rows, columns=_FLIGHT_COLUMNS)

# Filepath for the CSV
output_file = 'scraped_flight_data.csv'

# The header row is only written when the CSV does not exist yet
file_exists = os.path.isfile(output_file)

# Scrape every queued URL in turn, pausing 10 seconds between them
for url in all_urls:
    df = scrape(url)

    # First write creates the file with a header; later writes append rows only
    df.to_csv(
        output_file,
        mode='a' if file_exists else 'w',
        header=not file_exists,
        index=False,
    )
    file_exists = True

    print(f"Data from {url} saved to {output_file}")

    # Pause before processing the next URL
    time.sleep(10)

print("Data scraping complete and saved to scraped_flight_data.csv")


Data from https://www.ca.kayak.com/flights/YYZ-BLR/2024-06-02/?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BLR/2024-06-02/premium?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BLR/2024-06-02/business?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BLR/2024-06-03/?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BLR/2024-06-03/premium?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BLR/2024-06-03/business?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BLR/2024-06-04/?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BLR/2024-06-04/premium?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BLR/

Data from https://www.ca.kayak.com/flights/YYZ-BLR/2024-06-25/?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BLR/2024-06-25/premium?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BLR/2024-06-25/business?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BLR/2024-06-26/?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BLR/2024-06-26/premium?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BLR/2024-06-26/business?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BLR/2024-06-27/?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BLR/2024-06-27/premium?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BLR/

Data from https://www.ca.kayak.com/flights/YYZ-BLR/2024-07-18/?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BLR/2024-07-18/premium?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BLR/2024-07-18/business?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BLR/2024-07-19/?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BLR/2024-07-19/premium?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BLR/2024-07-19/business?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BLR/2024-07-20/?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BLR/2024-07-20/premium?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BLR/

Data from https://www.ca.kayak.com/flights/YYZ-BOM/2024-06-21/?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BOM/2024-06-21/premium?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BOM/2024-06-21/business?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BOM/2024-06-22/?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BOM/2024-06-22/premium?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BOM/2024-06-22/business?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BOM/2024-06-23/?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BOM/2024-06-23/premium?sort=bestflight_a saved to scraped_flight_data.csv
Data from https://www.ca.kayak.com/flights/YYZ-BOM/