In [82]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import unicodedata
import re

In [136]:
# Collect airline names from the A-Z index page, one letter section at a time.
airline_name = []

# Browser-like request headers. The header *name* has to be "User-Agent";
# this dict is also reused by the review-scraping cell further down.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0"
    )
}

url = 'https://www.airlinequality.com/review-pages/a-z-airline-reviews/'

# Every letter section is looked up in the same page, so download and parse
# it once instead of once per letter.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()  # fail loudly here rather than on a missing div below
html_get = response.text
soup = BeautifulSoup(html_get, 'lxml')

# Only the "A" and "B" sections are scraped.
for letter in range(ord('A'), ord('B') + 1):
    uri = f"a2z-ldr-{chr(letter)}"
    container_div = soup.find("div", id=uri)
    if container_div is None:
        # No section for this letter on the page; nothing to collect.
        continue
    for div in container_div.find_all("li"):
        airline_name.append(div.text)

In [137]:
# Number of airline names collected from the index page.
len(airline_name)

151

In [138]:
# First ten scraped names, in list order (not a ranking). These are the only
# airlines whose review pages the later cells request.
airline_name_top_10 = airline_name[:10]
airline_name_top_10

['AB Aviation',
 'Adria Airways',
 'Aegean Airlines',
 'Aer Lingus',
 'Aero VIP',
 'Aerocaribbean',
 'Aeroflot Russian Airlines',
 'AeroItalia',
 'Aerolineas Argentinas',
 'Aeromar']

In [139]:
# Build the review-listing URL for each selected airline.
start = "https://www.airlinequality.com/airline-reviews/"
end = "?sortby=post_date%3ADesc&pagesize=100"


def _airline_slug(name):
    """Lower-case `name`, drop characters with no ASCII form, hyphenate spaces."""
    folded = unicodedata.normalize('NFKD', name.lower())
    ascii_only = folded.encode('ASCII', 'ignore').decode('utf-8')
    return ascii_only.replace(" ", "-")


airline_url = [
    start + _airline_slug(name) + '/' + end
    for name in airline_name_top_10
]

# One row per airline: display name plus the URL of its review listing.
df_airline = pd.DataFrame({"Name": airline_name_top_10, "Links": airline_url})
df_airline

Unnamed: 0,Name,Links
0,AB Aviation,https://www.airlinequality.com/airline-reviews...
1,Adria Airways,https://www.airlinequality.com/airline-reviews...
2,Aegean Airlines,https://www.airlinequality.com/airline-reviews...
3,Aer Lingus,https://www.airlinequality.com/airline-reviews...
4,Aero VIP,https://www.airlinequality.com/airline-reviews...
5,Aerocaribbean,https://www.airlinequality.com/airline-reviews...
6,Aeroflot Russian Airlines,https://www.airlinequality.com/airline-reviews...
7,AeroItalia,https://www.airlinequality.com/airline-reviews...
8,Aerolineas Argentinas,https://www.airlinequality.com/airline-reviews...
9,Aeromar,https://www.airlinequality.com/airline-reviews...


In [140]:
# Labels that the review-scraping cell looks up in each review's ratings table.
columns = [
    "Aircraft",
    "Type Of Traveller",
    "Seat Type",
    "Route",
    "Date Flown",
    "Seat Comfort",
    "Cabin Staff Service",
    "Food & Beverages",
    "Ground Service",
    "Inflight Entertainment",
    "Wifi & Connectivity",
    "Value For Money",
    "Recommended",
]

# Column order of the final reviews table: review-level fields first,
# followed by the ratings-table labels above.
df_columns = [
    "Airline Name",
    "Overall_Rating",
    "Review_Title",
    "Review Date",
    "Verified",
    "Review",
    *columns,
]

In [141]:
# Scrape the review listing of every airline in df_airline into one flat
# table and save it as CSV.
REQUEST_TIMEOUT = 30  # seconds; keeps one stalled request from hanging the run


def _overall_rating(article):
    """Return the overall score of a review as a digit string, or None."""
    node = article.find("div", {"class": "rating-10"})
    if node is None:
        return None
    # Take the whole leading number; keeping a single character would turn
    # a score of 10 into "1".
    match = re.search(r"\d+", node.text)
    return match.group() if match else None


def _review_date(article):
    """Return the text of the <time> inside the review's <h3>, or None."""
    heading = article.find("h3")
    stamp = heading.find("time") if heading is not None else None
    return stamp.text if stamp is not None else None


def _split_review(article):
    """Return (verified, review_text) parsed from the review body.

    The body may start with a status marker followed by '|'. Only the
    marker '✅ Trip Verified' sets verified to True. Text without a '|'
    is returned whole with verified False.
    """
    node = article.find("div", {"class": "text_content"})
    if node is None:
        return False, None
    # Split on the first '|' only, so a '|' inside the review text does not
    # cut the review short.
    marker, separator, body = node.text.partition('|')
    if not separator:
        return False, marker.strip()
    return marker.strip() == '✅ Trip Verified', body.strip()


def _ratings_table(article):
    """Return {label: value} read from the review's ratings table.

    Returns an empty dict when the review has no ratings table.
    """
    table = article.find("table", {"class": "review-ratings"})
    if table is None:
        return {}
    values = {}
    for tr in table.find_all("tr"):
        label = None
        for td in tr.find_all("td"):
            if 'review-rating-header' in td.get('class', []):
                label = td.text.strip()
            elif label is not None:  # ignore value cells that have no header
                filled = td.find_all("span", class_="star fill")
                # NOTE(review): presumably each star is its own numbered span,
                # so the last filled one carries the score; with a single
                # span this equals the first. Confirm against the live markup.
                values[label] = filled[-1].text.strip() if filled else td.text.strip()
    return values


records = []

# Walk the airlines listed in df_airline, one review page each.
for airline, link in zip(df_airline["Name"], df_airline["Links"]):
    html = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT).text
    soup = BeautifulSoup(html, 'lxml')
    containers = soup.find("article", {"class": "comp comp_reviews-airline querylist position-content"})
    if containers is None:
        # Page has no review container (e.g. unknown airline slug): skip it.
        continue

    for item in containers.find_all("article"):
        title_node = item.find("h2", {"class": "text_header"})
        verified, review = _split_review(item)
        tab = _ratings_table(item)

        # Append one flat record per review.
        record = {
            "Airline Name": airline,
            "Overall_Rating": _overall_rating(item),
            "Review_Title": title_node.text.strip() if title_node is not None else None,
            "Review Date": _review_date(item),
            "Verified": verified,
            "Review": review,
        }
        # Ratings-table fields; labels missing from a review become None.
        record.update({label: tab.get(label) for label in columns})
        records.append(record)

# Build the DataFrame from the collected records, in df_columns order.
reviews = pd.DataFrame(records, columns=df_columns)

# Save the reviews table to CSV.
reviews.to_csv("duy.csv", index=False)