In [2]:
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc
from urllib.parse import urlparse
import time
import pandas as pd
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
from collections import Counter
import openpyxl
from selenium.common.exceptions import NoSuchElementException
from openpyxl import Workbook
from openpyxl.utils import get_column_letter

from selenium.common.exceptions import TimeoutException

In [4]:
def over_view(driver):
    """Return the text of the airline description block on the current page.

    Args:
        driver: Selenium WebDriver positioned on an airline detail page.

    Returns:
        str: The ``.text`` of the first element matching the overview
        CSS selector.

    Note:
        Waits up to 10 seconds for the element to be present; the wait
        itself raises if the element never appears.
    """
    overview_locator = (
        By.CSS_SELECTOR,
        "div.max-w-3xl.mb-12.whitespace-pre-wrap.prose",
    )
    overview_element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(overview_locator)
    )
    return overview_element.text

In [5]:
def overall(driver):
    """Collect the rating tiles shown in the page's ``metrics-section``.

    Only the first seven tiles are read.

    Args:
        driver: Selenium WebDriver positioned on an airline detail page.

    Returns:
        list[dict]: One dict per tile with the keys ``title``, ``rating`` and
        ``reviews`` (all strings exactly as displayed). If an expected
        element is missing, the tiles collected up to that point are
        returned instead of raising.
    """
    metrics_data = []

    try:
        metrics_section = driver.find_element(By.ID, 'metrics-section')

        # Several classes at once are a CSS selector, not a single class
        # name, so spell the locator out as CSS explicitly.
        metric_divs = metrics_section.find_elements(
            By.CSS_SELECTOR, '.flex.flex-col.items-center.px-2.py-4'
        )

        for div in metric_divs[:7]:
            # Category label, e.g. 'Overall Value for Money'
            category = div.find_element(By.TAG_NAME, 'p').text

            # Displayed score, e.g. '3.4'
            score = div.find_element(By.CSS_SELECTOR, '.text-xl.font-medium').text

            # Review caption, e.g. 'From 83 reviews'
            reviews_text = div.find_element(
                By.CSS_SELECTOR, '.text-xs.text-foreground-500'
            ).text

            # The count is the second space-separated token. A caption that
            # is empty or has a single word yields '' instead of an
            # IndexError that would abort the whole page.
            tokens = reviews_text.split(" ")
            reviews = tokens[1] if len(tokens) > 1 else ""

            metrics_data.append({
                'title': category,
                'rating': score,
                'reviews': reviews
            })

    except NoSuchElementException:
        # Best effort: keep whatever was gathered before the missing element.
        pass

    return metrics_data

In [6]:
def safety(driver):
    """Open the ``/safety`` sub-page of the current URL and read its ratings.

    Args:
        driver: Selenium WebDriver positioned on an airline detail page.
            The driver is navigated to the safety sub-page as a side effect.

    Returns:
        list[dict]: Dicts with the keys ``heading``, ``status`` and ``score``.
        The first entry always has the heading ``'Current Safety Rating'``,
        with the "Updated ..." text stored under ``status``. One further
        entry follows per rating card found in the cards container.

    Raises:
        NoSuchElementException: If any of the expected elements is missing.
    """
    # Append "/safety" to the path only, so a trailing slash or a query
    # string in the current URL does not produce a malformed address.
    current = urlparse(driver.current_url)
    safety_url = current._replace(
        path=f"{current.path.rstrip('/')}/safety"
    ).geturl()
    driver.get(safety_url)

    # Headline rating: the star score plus the "Updated ..." line.
    star_rating = driver.find_element(By.XPATH, "//p[contains(@class, 'ml-2')]").text
    last_updated = driver.find_element(By.XPATH, "//p[contains(text(), 'Updated')]").text

    safety_data = [{
        'heading': 'Current Safety Rating',
        'status': last_updated,
        'score': star_rating
    }]

    # Container holding the individual rating cards.
    parent_container = driver.find_element(
        By.XPATH, "//div[@class='flex flex-col space-y-4 py-4']"
    )
    incident_divs = parent_container.find_elements(
        By.XPATH,
        ".//div[@class='w-full flex flex-col p-4 rounded-xl border dark:border-slate-800']"
    )

    for rating_container in incident_divs:
        heading = rating_container.find_element(
            By.XPATH, ".//p[@class='font-medium font-heading text-lg mb-1']"
        ).text
        status = rating_container.find_element(
            By.XPATH, ".//div[@class='w-full space-x-1 items-center flex flex-row mb-3']//p"
        ).text
        score = rating_container.find_element(
            By.XPATH, ".//p[@class='text-sm text-foreground-500 ml-2']"
        ).text

        safety_data.append({
            'heading': heading,
            'status': status,
            'score': score
        })

    return safety_data

In [7]:
def _open_blank_tab(driver):
    """Open an empty tab, switch the driver to it and return its handle."""
    known_handles = set(driver.window_handles)
    driver.execute_script("window.open('');")
    # Identify the new tab by comparing handles rather than by position.
    fresh_handles = [h for h in driver.window_handles if h not in known_handles]
    if not fresh_handles:
        raise Exception("New tab could not be opened")
    driver.switch_to.window(fresh_handles[0])
    return fresh_handles[0]


def _discard_tab(driver, tab_handle, main_window):
    """Close ``tab_handle`` if it is still open, then focus ``main_window``."""
    if (
        tab_handle is not None
        and tab_handle != main_window
        and tab_handle in driver.window_handles
    ):
        driver.switch_to.window(tab_handle)
        driver.close()
    driver.switch_to.window(main_window)


def airline_data(driver, name, link, retries=5, retry_delay=8):
    """Scrape one airline's detail pages in a temporary tab, with retries.

    Each attempt opens a new tab, loads ``link``, checks for the site's
    error page and then calls ``over_view``, ``overall`` and ``safety``.
    The temporary tab is closed and focus returns to the original window
    after every attempt, whether it succeeded or failed.

    Args:
        driver: Selenium WebDriver; its current window is treated as the
            main window and is never closed by this function.
        name: Airline name. Not used for navigation (``link`` is loaded
            directly); kept so existing callers keep working.
        link: URL of the airline detail page.
        retries: Maximum number of attempts.
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        tuple: ``(over_view_data, overall_data, safety_data)`` on success.
        ``None`` only when ``retries`` is zero or negative.

    Raises:
        Exception: When the last attempt fails; the underlying error is
            attached as ``__cause__``.
    """
    main_window = driver.current_window_handle

    for attempt in range(retries):
        # Stays None until a tab really exists, so the cleanup below can
        # never close the main window by mistake.
        tab_handle = None
        try:
            tab_handle = _open_blank_tab(driver)
            driver.get(link)

            # The site's error page carries an <h1> containing "Oh o".
            if driver.find_elements(By.XPATH, "//h1[contains(text(), 'Oh o')]"):
                print("Error page detected, retrying...")
                raise Exception("Error page detected")

            over_view_data = over_view(driver)
            overall_data = overall(driver)
            safety_data = safety(driver)

            _discard_tab(driver, tab_handle, main_window)
            return over_view_data, overall_data, safety_data

        except Exception as e:
            # Deliberately broad: any failure during an attempt is retried.
            _discard_tab(driver, tab_handle, main_window)
            print(f"Attempt {attempt + 1} failed with error: {e}")

            if attempt == retries - 1:
                raise Exception("Failed to retrieve data after multiple attempts.") from e

            time.sleep(retry_delay)

    return None  # Only reachable when retries <= 0

In [12]:
def is_exist(header, value, airlines_filename):
    """Tell whether ``value`` appears under column ``header`` in the workbook.

    Args:
        header: Column title to look for in the first row of the active sheet.
        value: Cell value to search for below that title.
        airlines_filename: Path of the ``.xlsx`` file to inspect.

    Returns:
        bool: ``True`` when a cell in that column (row 2 onwards) equals
        ``value``. ``False`` when the file is missing, the header is
        absent, or no cell matches; the first two cases also print a
        message.
    """
    try:
        wb = openpyxl.load_workbook(airlines_filename)
    except FileNotFoundError:
        print(f"The file {airlines_filename} does not exist.")
        return False
    ws = wb.active

    # Titles live in the first row of the sheet.
    column_titles = [cell.value for cell in ws[1]]
    if header not in column_titles:
        print(f"Header '{header}' not found.")
        return False

    # Worksheet columns are 1-based, list positions 0-based.
    col = column_titles.index(header) + 1
    column_cells = ws.iter_rows(
        min_row=2,
        max_row=ws.max_row,
        min_col=col,
        max_col=col,
        values_only=True,
    )
    # Each yielded row is a 1-tuple because only one column is requested.
    return any(cells[0] == value for cells in column_cells)

In [15]:
def save_to_excel(airlines_info, airlines_filename):
    """Append one record to the workbook, adding header columns as needed.

    The first row of the active sheet is the header. Keys of
    ``airlines_info`` that are not yet in the header get a new column at the
    right. The record is skipped when an existing row already holds the same
    value for every key the record shares with the header.

    Args:
        airlines_info: Mapping of column title to cell value.
        airlines_filename: Path of the ``.xlsx`` file; created when missing.

    Returns:
        None. The workbook is saved to ``airlines_filename`` unless the
        record was detected as a duplicate.
    """
    try:
        workbook = openpyxl.load_workbook(airlines_filename)
    except FileNotFoundError:
        workbook = openpyxl.Workbook()
    sheet = workbook.active

    # A brand-new sheet still reports one row, whose only cell is empty.
    # Treat an all-empty first row as "no header yet", so the first key
    # goes into column A instead of leaving a blank leading column.
    header = [cell.value for cell in sheet[1]]
    if all(title is None for title in header):
        header = []

    # Duplicate detection against the existing data rows.
    for row in sheet.iter_rows(min_row=2, max_row=sheet.max_row, values_only=True):
        existing_data = dict(zip(header, row))
        shared_keys = [key for key in airlines_info if key in existing_data]

        # Require at least one shared column: with none, all() over an empty
        # sequence is True and every record would be dropped as a duplicate.
        if shared_keys and all(existing_data[key] == airlines_info[key] for key in shared_keys):
            print("This entry already exists in the file. Skipping.")
            return

    last_row = sheet.max_row + 1  # Row that receives the new record

    for key, value in airlines_info.items():
        if key in header:
            column_index = header.index(key) + 1  # Worksheet columns are 1-based
        else:
            # Unknown key: add a new header column on the right.
            column_index = len(header) + 1
            sheet.cell(row=1, column=column_index).value = key
            header.append(key)

        sheet.cell(row=last_row, column=column_index).value = value

    workbook.save(airlines_filename)

In [19]:
def airlineratings(driver, airlines_filename="airlines_details.xlsx"):
    """Scrape every airline row of the listing table currently displayed.

    Rows whose name is already stored under the ``Name`` column of
    ``airlines_filename`` are skipped. For each remaining row the detail
    pages are scraped through ``airline_data`` and the combined record is
    written with ``save_to_excel``.

    Args:
        driver: Selenium WebDriver showing the airline listing table.
        airlines_filename: Workbook that receives the records. The default
            is the file name this function always used.

    Returns:
        None.
    """
    # Sets the element-lookup timeout; it does not pause by itself, so
    # issuing it once is enough.
    driver.implicitly_wait(10)

    rows = driver.find_elements(By.CSS_SELECTOR, 'table tbody tr')

    for row in rows:
        name = row.find_element(By.CSS_SELECTOR, 'td[role="rowheader"] p').text

        if is_exist("Name", name, airlines_filename):
            continue

        link = row.find_element(
            By.CSS_SELECTOR, 'a[class="w-full h-full inline-block p-2"]'
        ).get_attribute('href')

        # Fetch the cells once; columns 1-4 hold country, passenger rating,
        # product rating and safety rating, in that order.
        cells = row.find_elements(By.TAG_NAME, 'td')
        country = cells[1].text.strip()
        passenger_rating = cells[2].text.strip()
        product_rating = cells[3].text.strip()
        safety_rating = cells[4].text.strip()

        over_view_data, overall_data, safety_data = airline_data(driver, name, link)

        airlines_info = {
            "Name": name,
            "Country": country,
            "Passenger_Rating": passenger_rating,
            "Product_Rating": product_rating,
            "Safety_Rating": safety_rating,
            "About_The_Airline" : over_view_data,
        }

        # One column for each metric's score and one for its review count.
        for item in overall_data:
            airlines_info[item['title']] = item['rating']
            airlines_info[f"score for {item['title']} from this number of reviews"] = item['reviews']

        # The first safety entry is the headline rating, whose status is the
        # "last updated" text; later entries get their own status column.
        for position, item in enumerate(safety_data):
            airlines_info[item['heading']] = item['score']
            if position == 0:
                airlines_info['last_update'] = item['status']
            else:
                airlines_info[f"{item['heading']} Status"] = item['status']

        save_to_excel(airlines_info, airlines_filename)

In [22]:
# Bounds of the pagination loop: it iterates over
# range(start_from, last_page_number).
start_from, last_page_number = 0, 21

# Listing page the driver opens first.
target_url = "https://www.airlineratings.com/airlines"

In [24]:
# Setup ChromeDriver. The executable location can be overridden with the
# CHROMEDRIVER_PATH environment variable; the previous hard-coded Windows
# path remains the default, so existing setups behave as before.
chrome_driver_path = os.environ.get(
    "CHROMEDRIVER_PATH", "C:\\chromedriver\\chromedriver.exe"
)
service = Service(chrome_driver_path)
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=service, options=options)

driver.get(target_url)

# Save the main window handle
main_window = driver.current_window_handle

# This does not pause execution: it sets how long element lookups keep
# polling (up to 10 seconds) before failing.
driver.implicitly_wait(10)

In [25]:
# Reads the first table row's text inside the browser, which avoids holding
# element references that go stale while the table re-renders.
_FIRST_ROW_TEXT_JS = (
    "const row = document.querySelector('table tbody tr');"
    "return row ? row.innerText : null;"
)

# NOTE(review): start_from only shortens the loop; nothing here navigates to
# page `start_from` before scraping. Confirm whether resuming from a later
# page was intended.
for page_index in range(start_from, int(last_page_number)):

    # The first iteration scrapes the page that is already loaded; every
    # later iteration moves to the next page first.
    if page_index > start_from:
        first_row_before = driver.execute_script(_FIRST_ROW_TEXT_JS)

        next_button = driver.find_element(By.CSS_SELECTOR, 'li[aria-label="next page button"]')
        next_button.click()

        # implicitly_wait() only configures lookup timeouts and does not wait
        # for the new page. Block until the first row differs from the one
        # seen before the click, so the old page is not scraped again.
        try:
            WebDriverWait(driver, 10).until(
                lambda d: d.execute_script(_FIRST_ROW_TEXT_JS) not in (None, first_row_before)
            )
        except TimeoutException:
            # Best effort: carry on, as the previous code did.
            print("Table did not change within 10 seconds after clicking next; continuing.")

    airlineratings(driver)