In [1]:
import ast
import os
import shutil
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Retrieve data 

In [2]:
# Review sections to scrape; each entry is substituted into the listing URL path.
categories = [
    'airline-reviews',
    'seat-reviews',
    'lounge-reviews',
]

In [3]:
# Function to get the total number of pages
def get_total_pages(url):
    """Return the highest page number linked from a review listing page.

    Parameters
    ----------
    url : str
        Listing URL. ``str.format(1)`` is applied to it, so either a concrete
        URL or a template with one positional ``{}`` placeholder is accepted.

    Returns
    -------
    int
        The largest purely numeric ``<a>`` link text found on the page, or
        ``1`` when the page has no numeric links at all.
    """
    response = requests.get(url.format(1))
    soup = BeautifulSoup(response.content, 'html.parser')

    # Keep only links whose text is a plain decimal number. isdecimal() is used
    # rather than isdigit() because int() rejects characters such as
    # superscript digits that isdigit() accepts.
    page_numbers = [
        int(link.text)
        for link in soup.find_all("a")
        if link.text.isdecimal()
    ]

    # With no numeric links, max() on an empty list would raise ValueError;
    # fall back to a single page instead.
    return max(page_numbers, default=1)

In [4]:
# Function to scrape reviews from a single page
def get_page_review(category, page_number, page_url):
    """Scrape every review on one listing page into a DataFrame.

    Parameters
    ----------
    category : str
        Review category the page belongs to. Kept for interface
        compatibility; it is not needed to parse the page.
    page_number : int
        Page index substituted into ``page_url`` with ``str.format``.
    page_url : str
        URL template with one positional ``{}`` placeholder for the page
        number.

    Returns
    -------
    pandas.DataFrame
        One row per ``<article itemprop="review">`` element, with columns
        ``Title``, ``Author``, ``Date Published``, ``Review Body`` and
        ``Review Stats``. ``Review Stats`` is a dict mapping each rating
        header to a filled-star count (int) or to the cell text (str).
        The frame is empty, but keeps these columns, when the page has no
        reviews.
    """
    response = requests.get(page_url.format(page_number))
    soup = BeautifulSoup(response.content, 'html.parser')

    columns = ["Title", "Author", "Date Published", "Review Body", "Review Stats"]
    review_data = []

    # Each review lives in its own <article itemprop="review"> container.
    for review in soup.find_all("article", itemprop="review"):
        title_tag = review.find("h2", class_="text_header")
        author_tag = review.find("span", itemprop="name")
        date_tag = review.find("meta", itemprop="datePublished")
        body_tag = review.find("div", itemprop="reviewBody")

        # Extract review statistics (e.g., seat comfort, cabin staff, etc.)
        review_stats = {}
        for stat in review.find_all("tr"):
            header = stat.find("td", class_="review-rating-header")
            value = stat.find("td", class_="review-value")
            if value is None:
                value = stat.find("td", class_="review-rating-stars stars")

            # Rows without both a header and a value cell carry no statistic.
            # Skipping them avoids writing a placeholder under an unrelated key.
            if header is None or value is None:
                continue

            # A separate local name keeps the `category` argument intact.
            stat_name = header.text.strip()
            if "stars" in value.get("class", []):
                # Star cells are scored by counting their filled stars.
                review_stats[stat_name] = get_star_rating(value.find_all("span", class_="star"))
            else:
                review_stats[stat_name] = value.text.strip()

        review_data.append({
            "Title": title_tag.text.strip() if title_tag is not None else "No Title",
            "Author": author_tag.text.strip() if author_tag is not None else "No Author",
            "Date Published": date_tag["content"] if date_tag is not None else "No Date",
            "Review Body": body_tag.text.strip() if body_tag is not None else "No Review Body",
            "Review Stats": review_stats,
        })

    # Passing `columns` keeps the schema stable even when no review was found.
    return pd.DataFrame(review_data, columns=columns)

In [5]:
def get_star_rating(star_elements):
    """Count how many of the given star elements are filled.

    An element counts as filled when ``"fill"`` is contained in the value
    returned by ``element.get("class", [])``.

    Parameters
    ----------
    star_elements : iterable
        Objects exposing ``.get(key, default)``; the caller passes the
        ``<span class="star">`` tags of one rating cell.

    Returns
    -------
    int
        Number of filled stars (``0`` for an empty iterable).
    """
    # Each membership test is a bool, so summing them yields the count.
    return sum("fill" in element.get("class", []) for element in star_elements)

In [6]:
# Scrape every listing page of each review category and save one CSV per category.
for cat in categories:
    # Page 1 is requested first only to discover how many listing pages exist.
    url_start = f"https://www.airlinequality.com/{cat}/british-airways/page/1/?sortby=post_date%3ADesc&pagesize=100" # Get link
    review_data = []
    # Define highest page number
    higest_page = get_total_pages(url_start)

    # Loop over each review page
    for page in range(1, higest_page + 1):
        url_page = f"https://www.airlinequality.com/{cat}/british-airways/page/{page}/?sortby=post_date%3ADesc&pagesize=100"
        response = requests.get(url_page) # Request to access the link
        time.sleep(2)  # pause between consecutive page requests
        soup = BeautifulSoup(response.content, 'html.parser') # Converts HTML content into a navigable, searchable BeautifulSoup object structure

        # Loop through each review container and extract the required information
        for review in soup.find_all("article", itemprop="review"):
            # Look each element up once, then fall back to a placeholder if it is missing.
            title_tag = review.find("h2", class_="text_header")
            author_tag = review.find("span", itemprop="name")
            date_tag = review.find("meta", itemprop="datePublished")
            body_tag = review.find("div", itemprop="reviewBody")

            # Extract review statistics (e.g., seat comfort, cabin staff, etc.)
            review_stats = {}
            for stat in review.find_all("tr"):
                header = stat.find("td", class_="review-rating-header")
                value = stat.find("td", class_="review-value")
                if value is None:
                    value = stat.find("td", class_="review-rating-stars stars")

                # Rows without both a header and a value cell carry no statistic.
                # Skipping them avoids indexing review_stats with a name that is
                # unbound or left over from the previous row.
                if header is None or value is None:
                    continue

                stat_name = header.text.strip()
                if "stars" in value.get("class", []):
                    # Star cells are scored by counting their filled stars.
                    review_stats[stat_name] = get_star_rating(value.find_all("span", class_="star"))
                else:
                    review_stats[stat_name] = value.text.strip()

            # Append the review data to the list
            review_data.append({
                "Title": title_tag.text.strip() if title_tag is not None else "No Title",
                "Author": author_tag.text.strip() if author_tag is not None else "No Author",
                "Date Published": date_tag["content"] if date_tag is not None else "No Date",
                "Review Body": body_tag.text.strip() if body_tag is not None else "No Review Body",
                "Review Stats": review_stats,
            })

    # Convert the review data into a DataFrame
    df = pd.DataFrame(review_data)

    # Save to '<category>.csv', the file name the "Read data files" cells load.
    df.to_csv("{}.csv".format(cat), index=False)

## Read data files

In [7]:
# Load the airline reviews CSV (path is relative to the working directory).
airline = pd.read_csv('airline-reviews.csv')

In [8]:
# Load the seat reviews CSV (path is relative to the working directory).
seat = pd.read_csv('seat-reviews.csv')

In [9]:
# Load the lounge reviews CSV (path is relative to the working directory).
lounge = pd.read_csv('lounge-reviews.csv')

In [10]:
# Inspect the raw 'Review Stats' column: after the CSV round trip each entry is
# the string form of a dict, so it has to be parsed before the stats can be used.
airline['Review Stats'].values

array(["{'Type Of Traveller': 'Family Leisure', 'Seat Type': 'Economy Class', 'Route': 'Larnaca to Glasgow via Heathrow', 'Date Flown': 'November 2024', 'Seat Comfort': 1, 'Cabin Staff Service': 1, 'Food & Beverages': 1, 'Inflight Entertainment': 2, 'Ground Service': 1, 'Value For Money': 1, 'Recommended': 'no'}",
       "{'Type Of Traveller': 'Couple Leisure', 'Seat Type': 'Economy Class', 'Route': 'London to Rome', 'Date Flown': 'December 2024', 'Seat Comfort': 2, 'Cabin Staff Service': 4, 'Ground Service': 1, 'Value For Money': 2, 'Recommended': 'no'}",
       "{'Aircraft': 'Boeing 777 / A350', 'Type Of Traveller': 'Business', 'Seat Type': 'Business Class', 'Route': 'Washington to London', 'Date Flown': 'December 2024', 'Seat Comfort': 4, 'Cabin Staff Service': 3, 'Food & Beverages': 1, 'Inflight Entertainment': 5, 'Ground Service': 1, 'Wifi & Connectivity': 4, 'Value For Money': 1, 'Recommended': 'no'}",
       ...,
       "{'Seat Type': 'Economy Class', 'Value For Money': 4, 'Reco

In [11]:
import ast

## Data cleaning

In [12]:
def expand_review_stats(df):
    """Expand the stringified dicts in ``'Review Stats'`` into separate columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Review Stats'`` column whose entries are Python dict
        literals stored as strings.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every original column followed by one column per
        key found in the parsed dicts. Rows whose dict lacks a key get NaN in
        that column. The input frame's index is preserved.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when an entry is not a valid
        Python literal (for example a missing value).
    """
    # literal_eval only evaluates literals, so the strings are never executed.
    parsed_data = [ast.literal_eval(entry) for entry in df['Review Stats']]
    new_df = pd.json_normalize(parsed_data)

    # json_normalize returns a fresh 0..n-1 index, while concat(axis=1) aligns
    # on index labels. Reusing the input index keeps each parsed row next to
    # its source row even when `df` was filtered or re-indexed beforehand.
    new_df.index = df.index
    return pd.concat([df, new_df], axis=1)

In [13]:
# Add one column per review statistic to the airline frame.
# NOTE: this overwrites `airline`, so re-running the cell appends the stat columns a second time.
airline = expand_review_stats(airline)

In [14]:
# Add one column per review statistic to the seat frame.
# NOTE: this overwrites `seat`, so re-running the cell appends the stat columns a second time.
seat = expand_review_stats(seat)

In [15]:
# Add one column per review statistic to the lounge frame.
# NOTE: this overwrites `lounge`, so re-running the cell appends the stat columns a second time.
lounge = expand_review_stats(lounge)

In [16]:
# Column dtypes and non-null counts, used to see which airline stats are often missing.
airline.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3903 entries, 0 to 3902
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Title                   3903 non-null   object 
 1   Author                  3903 non-null   object 
 2   Date Published          3903 non-null   object 
 3   Review Body             3903 non-null   object 
 4   Review Stats            3903 non-null   object 
 5   Type Of Traveller       3132 non-null   object 
 6   Seat Type               3901 non-null   object 
 7   Route                   3127 non-null   object 
 8   Date Flown              3125 non-null   object 
 9   Seat Comfort            3777 non-null   float64
 10  Cabin Staff Service     3762 non-null   float64
 11  Food & Beverages        3468 non-null   float64
 12  Inflight Entertainment  2668 non-null   float64
 13  Ground Service          3053 non-null   float64
 14  Value For Money         3903 non-null   

In [17]:
# Column dtypes and non-null counts, used to see which seat stats are often missing.
seat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              200 non-null    object 
 1   Author             200 non-null    object 
 2   Date Published     200 non-null    object 
 3   Review Body        200 non-null    object 
 4   Review Stats       200 non-null    object 
 5   Seat Type          200 non-null    object 
 6   Aircraft Type      200 non-null    object 
 7   Seat Layout        200 non-null    object 
 8   Date Flown         123 non-null    object 
 9   Type Of Traveller  123 non-null    object 
 10  Seat Legroom       190 non-null    float64
 11  Seat Recline       190 non-null    float64
 12  Seat Width         190 non-null    float64
 13  Aisle Space        190 non-null    float64
 14  Viewing Tv Screen  164 non-null    float64
 15  Power Supply       61 non-null     float64
 16  Seat Storage       122 non

In [18]:
# Column dtypes and non-null counts, used to see which lounge stats are often missing.
lounge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421 entries, 0 to 420
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              421 non-null    object 
 1   Author             421 non-null    object 
 2   Date Published     421 non-null    object 
 3   Review Body        421 non-null    object 
 4   Review Stats       421 non-null    object 
 5   Lounge Name        270 non-null    object 
 6   Airport            416 non-null    object 
 7   Type Of Lounge     393 non-null    object 
 8   Date Visit         188 non-null    object 
 9   Type Of Traveller  290 non-null    object 
 10  Comfort            419 non-null    float64
 11  Cleanliness        419 non-null    float64
 12  Bar & Beverages    409 non-null    float64
 13  Catering           412 non-null    float64
 14  Washrooms          367 non-null    float64
 15  Staff Service      405 non-null    float64
 16  Recommended        421 non

In [19]:
# Remove rows that are missing any of the fields the analysis relies on.
# dropna() is called without inplace=True on purpose: with inplace=True it
# returns None, and assigning that result would replace each frame with None.
airline = airline.dropna(subset=['Type Of Traveller', 'Seat Comfort', 'Cabin Staff Service', 'Food & Beverages', 'Inflight Entertainment', 'Ground Service'])
seat = seat.dropna(subset=['Type Of Traveller', 'Seat Legroom', 'Seat Width'])
lounge = lounge.dropna(subset=['Type Of Lounge', 'Type Of Traveller', 'Comfort', 'Cleanliness', 'Catering', 'Wifi Connectivity', 'Staff Service'])

In [21]:
# Display `airline` as left by the NA-removal cell.
airline