In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
def extract_rating(reviews, category):
    """Read one review-stats row from every review section.

    Args:
        reviews: Iterable of parsed elements (the caller passes the
            ``div.review-stats`` tags), each searched for a ``td`` whose class
            is ``review-rating-header <category>``.
        category: Class suffix identifying the row, e.g. ``'seat_comfort'``.

    Returns:
        A list with one entry per section, in input order:
        * an ``int`` count of ``span.fill`` elements when the value cell
          carries the ``stars`` class,
        * the stripped cell text for any other value cell,
        * ``np.nan`` when the header cell or its sibling ``td`` is missing.
    """
    header_class = f'review-rating-header {category}'

    def value_for(section):
        header_cell = section.find('td', class_=header_class)
        if not header_cell:
            # NaN for review components that received no feedback.
            return np.nan

        value_cell = header_cell.find_next_sibling('td')
        if not value_cell:
            return np.nan

        if 'stars' in value_cell.get('class', []):
            # Count the filled stars to obtain the numeric rating.
            return len(value_cell.find_all('span', class_='fill'))

        return value_cell.get_text().strip()

    return [value_for(section) for section in reviews]

def _split_verification(text):
    """Split a review body into its verification status and the review text.

    Returns a ``(status, content)`` tuple. ``status`` is ``'Trip Verified'`` or
    ``'Not Verified'`` when that marker occurs in ``text`` (checked in that
    order), otherwise ``'Unknown'``. ``content`` is what follows the marker,
    with everything up to the first ``'|'`` removed and whitespace stripped.
    """
    for marker in ('Trip Verified', 'Not Verified'):
        if marker in text:
            status = marker
            # Drop the marker and everything that precedes it.
            remainder = text.split(marker, 1)[-1]
            break
    else:
        status = 'Unknown'
        remainder = text
    # split(...)[-1] returns the whole string when no '|' is present, so no
    # separate membership test is needed.
    return status, remainder.split('|', 1)[-1].strip()


# Process the parsed response of one listing page and extract its review data.
def get_data(soup):
    """Build a DataFrame with one row per review found in ``soup``.

    Args:
        soup: Parsed page (a ``BeautifulSoup`` object in the calling cell).

    Returns:
        ``pd.DataFrame`` with the columns 'Date published', 'Name',
        'Review_header', 'Review_body', 'Verified_review', the per-review
        stats columns, 'Overall_rating' and 'Recommended', in that order.

    Raises:
        ValueError: If the independently collected columns do not all have the
            same number of entries. The columns come from separate page-wide
            searches, so a mismatch means rows can no longer be paired up
            reliably; the message lists every column's length.
    """
    date_published = [meta['content'] for meta in soup.find_all('meta', itemprop='datePublished')]
    # The first ratingValue on the page is skipped — presumably it is the
    # airline-level aggregate score rather than a review's own rating
    # (TODO confirm against the page markup).
    rating_values = [value.get_text() for value in soup.find_all('span', itemprop='ratingValue')][1:]
    headers = [header.get_text() for header in soup.find_all('h2', class_='text_header')]
    names = [name.get_text() for name in soup.find_all('span', itemprop='name')]

    verified_status = []
    reviews = []
    for body in soup.find_all('div', itemprop='reviewBody'):
        status, content = _split_verification(body.get_text())
        verified_status.append(status)
        reviews.append(content)

    # Output column -> class suffix of the matching review-stats row.
    stat_categories = {
        'Type_of_traveller': 'type_of_traveller',
        'Seat_type': 'cabin_flown',
        'Route': 'route',
        'Date_flown': 'date_flown',
        'Aircraft': 'aircraft',
        'Seat_comfort': 'seat_comfort',
        'Cabin_staff_service': 'cabin_staff_service',
        'Food_and_beverages': 'food_and_beverages',
        'Inflight_entertainments': 'inflight_entertainment',
        'Ground_service': 'ground_service',
        'Value_for_money': 'value_for_money',
        'Wifi_and_connectivity': 'wifi_and_connectivity',
    }
    review_sections = soup.find_all('div', class_='review-stats')  # per-review rating tables

    data = {
        'Date published': date_published,
        'Name': names,
        'Review_header': headers,
        'Review_body': reviews,
        'Verified_review': verified_status,
    }
    for column, category in stat_categories.items():
        data[column] = extract_rating(review_sections, category)
    data['Overall_rating'] = rating_values
    data['Recommended'] = extract_rating(review_sections, 'recommended')

    # Fail with a diagnosable message instead of pandas' generic
    # "All arrays must be of the same length".
    lengths = {column: len(values) for column, values in data.items()}
    if len(set(lengths.values())) > 1:
        raise ValueError(f"Review fields are misaligned; entries per column: {lengths}")

    return pd.DataFrame(data)

In [3]:
base_url = 'https://www.airlinequality.com/airline-reviews/british-airways/page/'
url_parameters = '/?sortby=post_date%3ADesc&pagesize=100'

PAGE_COUNT = 30  # number of listing pages to fetch, numbered from 1
REQUEST_TIMEOUT_SECONDS = 30  # without a timeout a stalled server would hang this cell forever

urls = [f"{base_url}{page_number}{url_parameters}" for page_number in range(1, PAGE_COUNT + 1)]

# Collect one DataFrame per page and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
page_frames = []
for url in urls:
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            page_frames.append(get_data(soup))
        else:
            print(f"Failed to fetch data from {url} - Status code: {response.status_code}")
    except Exception as e:
        # Best effort: report the failing page (network error, timeout, parse
        # problem) and carry on with the remaining pages.
        print(f"An error occurred while processing {url}: {e}")

# pd.concat raises on an empty list, so fall back to an empty frame when every page failed.
main_df = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()

main_df.head(10)

Unnamed: 0,Date published,Name,Review_header,Review_body,Verified_review,Type_of_traveller,Seat_type,Route,Date_flown,Aircraft,Seat_comfort,Cabin_staff_service,Food_and_beverages,Inflight_entertainments,Ground_service,Value_for_money,Wifi_and_connectivity,Overall_rating,Recommended
0,2025-11-22,L Nayler,“Never again”,I had sworn never again with BA after a flight...,Trip Verified,Solo Leisure,Business Class,San Francisco to London,October 2025,Boeing 777,3.0,2.0,,,1.0,2,,3,no
1,2025-11-17,C Hadid,“not able to check-in online”,Flying business and not able to check-in onlin...,Trip Verified,Business,Business Class,Basel to London,November 2025,A320,3.0,5.0,5.0,,,1,,1,no
2,2025-11-16,J Tydeman,"""a very budget airline these days""",Flew to and from St Lucia with BA. On the outb...,Not Verified,Couple Leisure,Economy Class,St Lucia to London Gatwick,November 2025,Boeing 777,1.0,1.0,1.0,3.0,3.0,3,,3,no
3,2025-11-16,E Parjam,"""a nice experience""",This was a surprisingly OK experience. The pla...,Trip Verified,Solo Leisure,Economy Class,London to New Orleans,November 2025,,4.0,5.0,4.0,4.0,3.0,3,,7,yes
4,2025-11-07,Ben Everson,"""it appears bullying is ok""",We were taking my 90 year old mother to Teneri...,Trip Verified,Couple Leisure,Economy Class,Gatwick to Tenerife South,November 2025,,1.0,1.0,,,1.0,1,,1,no
5,2025-11-05,K Pellberg,“crew terribly disorganised and inattentive”,Delayed departure but arrival almost on time. ...,Not Verified,Solo Leisure,Business Class,Málaga to London,November 2025,A321,3.0,1.0,3.0,,4.0,3,,3,no
6,2025-11-03,Rayan Hunjan,"""BA was going to be cheap""",First flight diverted - Oct 25 BA524 not BAs f...,Not Verified,Family Leisure,Economy Class,Heathrow to Florence,October 2025,A320neo,4.0,5.0,,,3.0,5,,4,yes
7,2025-10-30,E Miller,“Two hours of hell”,Two hours of hell seated at the very back of t...,Trip Verified,Couple Leisure,Economy Class,Alicante to Gatwick,October 2025,,1.0,2.0,1.0,,4.0,3,,1,no
8,2025-10-28,L Han,"""overall experience was good""",My overall experience was good. The table was ...,Trip Verified,Solo Leisure,Economy Class,London to Washington,June 2025,,5.0,4.0,3.0,2.0,5.0,4,2.0,9,yes
9,2025-10-17,S Keale,"""seat is unsuitable for use""",Unable to check in and book seats due to BA we...,Trip Verified,Couple Leisure,Economy Class,London Heathrow to Palma,September 2025,A320,1.0,1.0,1.0,,2.0,2,,1,no


In [4]:
# Number of review rows collected across all scraped pages.
main_df.shape[0]

3000

In [5]:
# Write the collected reviews to data.json as a list of row records,
# indented by two spaces and with non-ASCII characters left unescaped.
main_df.to_json(
    "data.json",
    orient="records",
    indent=2,
    force_ascii=False,
)