In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [None]:
# Scrape configuration.
# Number of reviews requested per listing page (sent as the `pagesize` query parameter).
page_size = 100
# Number of listing pages to walk, starting from page 1.
pages = 10
# Root of the paginated British Airways review listing.
base_url = "https://www.airlinequality.com/airline-reviews/british-airways"

In [None]:
# Schema of the scraped dataset: the free-text review followed by the
# per-review attributes collected from each review's ratings table.
columns = [
    'review',
    'seat_type',
    'seat_comfort',
    'cabin_staff_service',
    'food_beverages',
    'inflight_entertainment',
    'value_for_money',
    'recommended',
]

# Start from an empty frame that already carries the schema above.
df = pd.DataFrame(columns=columns)

In [None]:
# Seconds to wait for the server before giving up on a page request.
REQUEST_TIMEOUT_SECONDS = 30

# Columns that hold star counts; stored as nullable integers so that a
# missing rating stays missing instead of turning the column into floats.
RATING_COLUMNS = [
    'seat_comfort',
    'cabin_staff_service',
    'food_beverages',
    'inflight_entertainment',
    'value_for_money',
]


def _sibling_text(review_element, header_class):
    """Return the text of the cell next to the header cell with ``header_class``.

    Returns None when either the header cell or its sibling value cell is absent.
    """
    header = review_element.find("td", {"class": header_class})
    if header is None:
        return None
    value = header.find_next_sibling("td")
    return value.get_text() if value is not None else None


def _star_ratings(review_element):
    """Map each ratings-table row label to the number of filled stars in that row.

    Labels are normalised to lower case with spaces replaced by underscores
    (so "Food & Beverages" becomes "food_&_beverages"). Rows without any
    filled star map to 0; an absent table yields an empty dict.
    """
    ratings = {}
    table = review_element.find("table", {"class": "review-ratings"})
    if table is None:
        return ratings
    for row in table.find_all("tr"):
        header = row.find("td", {"class": "review-rating-header"})
        if header is None:
            continue
        category = header.get_text().strip().lower().replace(" ", "_")
        ratings[category] = len(row.find_all("span", {"class": "star fill"}))
    return ratings


def _parse_review(review_element):
    """Extract one review ``<article>`` element into a flat record dict."""
    body = review_element.find("div", {"class": "text_content"})
    ratings = _star_ratings(review_element)
    return {
        # A review without a text block is kept with a missing body rather
        # than aborting the whole scrape.
        'review': body.get_text() if body is not None else None,
        # The seat type lives in the "cabin_flown" row; the
        # "type_of_traveller" row holds values such as "Solo Leisure".
        'seat_type': _sibling_text(review_element, "review-rating-header cabin_flown"),
        'seat_comfort': ratings.get('seat_comfort'),
        'cabin_staff_service': ratings.get('cabin_staff_service'),
        'food_beverages': ratings.get('food_&_beverages'),
        'inflight_entertainment': ratings.get('inflight_entertainment'),
        'value_for_money': ratings.get('value_for_money'),
        'recommended': _sibling_text(review_element, "review-rating-header recommended"),
    }


# Records are collected in a plain list and turned into a DataFrame once at
# the end; growing a DataFrame row by row with pd.concat is quadratic.
records = []

for i in range(1, pages + 1):
    print(f"Scraping page {i}")

    # Create URL to collect links from paginated data
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"

    # Collect HTML data from this page; fail loudly on a hung connection or
    # an HTTP error instead of silently parsing an error page as "no reviews".
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()

    # Parse content and extract every review element found on the page
    parsed_content = BeautifulSoup(response.content, 'html.parser')
    review_elements = parsed_content.find_all("article", {"itemprop": "review"})
    records.extend(_parse_review(review_element) for review_element in review_elements)

    print(f"   ---> {len(records)} total reviews")

# Rebuild `df` from scratch so that re-running this cell does not append
# duplicate rows to an already populated frame.
df = pd.DataFrame(records, columns=columns).astype(
    {name: "Int64" for name in RATING_COLUMNS}
)


Scraping page 1
   ---> 100 total reviews
Scraping page 2
   ---> 200 total reviews
Scraping page 3
   ---> 300 total reviews
Scraping page 4
   ---> 400 total reviews
Scraping page 5
   ---> 500 total reviews
Scraping page 6
   ---> 600 total reviews
Scraping page 7
   ---> 700 total reviews
Scraping page 8
   ---> 800 total reviews
Scraping page 9
   ---> 900 total reviews
Scraping page 10
   ---> 1000 total reviews


In [None]:
# Show the scraped reviews; as the last expression of the cell the frame
# is rendered with its rich (truncated) table representation.
df

Unnamed: 0,review,seat_type,seat_comfort,cabin_staff_service,food_beverages,inflight_entertainment,value_for_money,recommended
0,✅ Trip Verified | Late boarding led to a one ...,Business,3,3,1,,1,no
1,✅ Trip Verified | As usual the flight is delay...,Solo Leisure,,,,,1,no
2,✅ Trip Verified | I had the most fantastic BA...,Solo Leisure,5,5,5,5,5,yes
3,✅ Trip Verified | Couldn’t book in online. Ar...,Couple Leisure,3,3,1,,1,no
4,✅ Trip Verified | London Heathrow to Mumbai in...,Couple Leisure,4,5,5,3,4,yes
...,...,...,...,...,...,...,...,...
995,✅ Trip Verified | London to Bangkok. Flew Bri...,Solo Leisure,1,3,2,1,1,no
996,✅ Trip Verified | London Heathrow to Miami. T...,Family Leisure,5,5,5,4,5,yes
997,✅ Trip Verified | London to Singapore. It was...,Solo Leisure,5,5,5,4,5,yes
998,Not Verified | Dublin to London. I was trying ...,Solo Leisure,1,1,,,1,no


In [None]:
# Persist the scraped reviews next to the notebook. The row index is written
# as the first, unnamed column of the file (pandas' default, spelled out here).
df.to_csv("BA_reviews.csv", index=True)

In [None]:
# Define the new column order
columns = ['verification', 'reviews', 'seat_type', 'seat_comfort', 'cabin_staff_service', 'food_beverages', 'inflight_entertainment', 'value_for_money', 'recommended']

# The scraped text has the form "<marker> | <body>", e.g.
# "✅ Trip Verified | Late boarding ...". Split it into the two new columns
# before reordering: reindexing alone would drop 'review' and create
# 'verification' / 'reviews' filled entirely with NaN.
# The guard keeps the cell safe to re-run once 'review' has been replaced.
if 'review' in df.columns:
    # Leading symbols/whitespace are skipped, the marker must mention
    # "verified" (any case) and end at the first "|"; the rest is the body.
    marker_pattern = r'(?is)^\W*(?P<verification>[^|]*?verified[^|]*?)\s*\|\s*(?P<reviews>.*?)\s*$'
    parsed = df['review'].str.extract(marker_pattern)
    df = df.assign(
        verification=parsed['verification'],
        # Reviews without a recognisable marker keep their full text.
        reviews=parsed['reviews'].fillna(df['review'].str.strip()),
    )

# Rearrange the columns in the DataFrame
df = df.reindex(columns=columns)