# In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests


# Review-listing endpoint pieces; the query string asks for newest-first
# ordering and _PAGE_SIZE reviews per page.
_BASE_URL = "https://www.airlinequality.com/airline-reviews/"
_PAGE_SIZE = 100
_QUERY = f"/?sortby=post_date%3ADesc&pagesize={_PAGE_SIZE}"

# Seconds to wait for the server before giving up on a single request.
_REQUEST_TIMEOUT = 30

# (URL slug, country, continent) for every airline scraped by default.
_DEFAULT_AIRLINES = (
    # Asia
    ("indigo-airlines", "India", "Asia"),
    ("vistara", "India", "Asia"),
    ("spicejet", "India", "Asia"),
    ("airasia-india", "India", "Asia"),
    ("air-india", "India", "Asia"),
    ("air-india-express", "India", "Asia"),
    ("goair", "India", "Asia"),
    ("emirates", "UAE", "Asia"),
    ("china-eastern-airlines", "China", "Asia"),
    ("air-china", "China", "Asia"),
    ("qatar-airways", "Qatar", "Asia"),
    ("lion-air", "Indonesia", "Asia"),
    ("japan-airlines", "Japan", "Asia"),
    ("vietjetair", "Vietnam", "Asia"),
    ("korean-air", "South Korea", "Asia"),
    ("singapore-airlines", "Singapore", "Asia"),
    ("ana-all-nippon-airways", "Japan", "Asia"),
    ("china-southern-airlines", "China", "Asia"),
    ("jet-airways", "India", "Asia"),
    # Africa
    ("south-african-airways", "South Africa", "Africa"),
    ("ethiopian-airlines", "Ethiopia", "Africa"),
    ("egyptair", "Egypt", "Africa"),
    ("royal-air-maroc", "Morocco", "Africa"),
    ("air-algerie", "Algeria", "Africa"),
    ("kulula", "South Africa", "Africa"),
    ("tunisair", "Tunisia", "Africa"),
    ("flysafair", "South Africa", "Africa"),
    ("kenya-airways", "Kenya", "Africa"),
    ("air-mauritius", "Mauritius", "Africa"),
    ("libyan-airlines", "Libya", "Africa"),
    # Europe
    ("ryanair", "Ireland", "Europe"),
    ("lufthansa", "Germany", "Europe"),
    ("air-france", "France", "Europe"),
    ("turkish-airlines", "Turkey", "Europe"),
    ("easyjet", "United Kingdom", "Europe"),
    ("wizz-air", "Hungary", "Europe"),
    ("aeroflot", "Russia", "Europe"),
    ("pegasus-airlines", "Turkey", "Europe"),
    ("norwegian", "Norway", "Europe"),
    ("s7-siberia-airlines", "Russia", "Europe"),
    ("jet2", "United Kingdom", "Europe"),
    ("tap-portugal", "Portugal", "Europe"),
    ("aegean-airlines", "Greece", "Europe"),
    ("tui-airways", "United Kingdom", "Europe"),
    ("ural-airlines", "Russia", "Europe"),
    ("lot-polish-airlines", "Poland", "Europe"),
    ("finnair", "Finland", "Europe"),
    ("utair-aviation", "Russia", "Europe"),
    ("volotea", "Spain", "Europe"),
    ("air-europa", "Spain", "Europe"),
    ("nordwind-airlines", "Russia", "Europe"),
    # North America
    ("american-airlines", "United States", "North America"),
    ("delta-air-lines", "United States", "North America"),
    ("southwest-airlines", "United States", "North America"),
    ("united-airlines", "United States", "North America"),
    ("alaska-airlines", "United States", "North America"),
    ("jetblue-airways", "United States", "North America"),
    ("spirit-airlines", "United States", "North America"),
    ("air-canada", "Canada", "North America"),
    ("westjet", "Canada", "North America"),
    ("volaris", "Mexico", "North America"),
    ("frontier-airlines", "United States", "North America"),
    ("aero-mexico", "Mexico", "North America"),
    ("viva-aerobus", "Mexico", "North America"),
    ("allegiant-air", "United States", "North America"),
    ("interjet", "Mexico", "North America"),
    ("hawaiian-airlines", "United States", "North America"),
    ("sun-country-airlines", "United States", "North America"),
    # South America
    ("latam-airlines", "Chile", "South America"),
    ("azul-airlines", "Brazil", "South America"),
    ("gol-transportes-aereos", "Brazil", "South America"),
    ("avianca", "Colombia", "South America"),
    ("aerolineas-argentinas", "Argentina", "South America"),
    ("viva-air", "Colombia", "South America"),
    ("sky-airline", "Chile", "South America"),
    ("jetSMART", "Chile", "South America"),
    ("boliviana-de-aviacion", "Bolivia", "South America"),
    ("flybondi", "Argentina", "South America"),
    ("wingo", "Colombia", "South America"),
    ("easyfly", "Colombia", "South America"),
    ("ultra-air", "Peru", "South America"),
    ("satena", "Colombia", "South America"),
    ("amaszonas", "Bolivia", "South America"),
    ("star-peru", "Peru", "South America"),
    ("voepass", "Brazil", "South America"),
    ("ecojet", "Bolivia", "South America"),
    ("atsa-airlines", "Peru", "South America"),
    # Oceania: no airlines listed yet.
)

# Column order of the returned DataFrame.
_OUTPUT_COLUMNS = (
    "Airline",
    "Country",
    "Review",
    "Date_Published",
    "Type of Traveller",
    "Seat Type",
    "Route",
    "Seat Comfort",
    "Cabin Staff Service",
    "Food & Beverages",
    "Inflight Entertainment",
    "Ground Service",
    "Value for Money",
    "Recommended",
)


def _cell_text(cell):
    """Return the raw text of a ratings-table value cell."""
    return cell.text


def _star_rating(cell):
    """Return the text of the last filled star in a cell, or NaN if none is filled."""
    filled = cell.find_all("span", class_="star fill")
    return filled[-1].text if filled else np.nan


# Ratings-table row label -> (output column, extractor for the value cell).
_RATING_FIELDS = {
    "Type Of Traveller": ("Type of Traveller", _cell_text),
    "Seat Type": ("Seat Type", _cell_text),
    "Route": ("Route", _cell_text),
    "Seat Comfort": ("Seat Comfort", _star_rating),
    "Cabin Staff Service": ("Cabin Staff Service", _star_rating),
    "Food & Beverages": ("Food & Beverages", _star_rating),
    "Inflight Entertainment": ("Inflight Entertainment", _star_rating),
    "Ground Service": ("Ground Service", _star_rating),
    "Value For Money": ("Value for Money", _star_rating),
    "Recommended": ("Recommended", _cell_text),
}


def _page_url(air_name, page):
    """Build the review-listing URL for one airline slug and 1-based page number."""
    if page == 1:
        return f"{_BASE_URL}{air_name}{_QUERY}"
    return f"{_BASE_URL}{air_name}/page/{page}{_QUERY}"


def _fetch_review_blocks(air_name, page):
    """Download one listing page and return its per-review ``div.tc_mobile`` tags."""
    response = requests.get(_page_url(air_name, page), timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html5lib")
    return soup.find_all("div", class_="tc_mobile")


def _parse_review(article, air_name, country_name):
    """Turn one review block into a row dict keyed by ``_OUTPUT_COLUMNS``.

    Every field that cannot be located in the markup is left as NaN, so each
    review always produces exactly one complete row.
    """
    row = dict.fromkeys(_OUTPUT_COLUMNS, np.nan)
    row["Airline"] = air_name
    row["Country"] = country_name

    body = article.find(
        "div", attrs={"class": "text_content", "itemprop": "reviewBody"}
    )
    if body is not None:
        row["Review"] = body.text

    # The publication date is looked up from this review's own enclosing
    # ``.media`` element so it can never drift out of step with the review.
    # NOTE(review): assumes each ``div.tc_mobile`` sits inside the ``.media``
    # element that carries its <time> tag -- confirm against live markup.
    container = article.find_parent(class_="media")
    time_tag = container.find("time") if container is not None else None
    if time_tag is not None:
        row["Date_Published"] = time_tag.attrs.get("datetime", np.nan)

    table = article.find("table", attrs={"class": "review-ratings"})
    table_rows = table.find_all("tr") if table is not None else []
    for table_row in table_rows:
        cells = table_row.find_all("td")
        if len(cells) < 2:
            # Not a label/value row; nothing to extract.
            continue
        field = _RATING_FIELDS.get(cells[0].text)
        if field is not None:
            column, extract = field
            row[column] = extract(cells[1])
    return row


def airline_scrape(airlines=None, max_pages=None) -> pd.DataFrame:
    """Scrape airline reviews from airlinequality.com into a DataFrame.

    For each airline the listing pages are requested newest-first, 100 reviews
    per page, until a page returns fewer reviews than the page size (the last
    page), ``max_pages`` is reached, or a request fails.

    Args:
        airlines: Optional iterable of ``(slug, country, ...)`` tuples, where
            ``slug`` is the airline's path segment in the review URL. Extra
            tuple elements (such as the continent) are accepted and ignored.
            Defaults to the built-in airline list.
        max_pages: Optional upper bound on pages fetched per airline.
            ``None`` means no limit; ``0`` fetches nothing.

    Returns:
        A DataFrame with one row per review and the columns "Airline",
        "Country", "Review", "Date_Published", "Type of Traveller",
        "Seat Type", "Route", "Seat Comfort", "Cabin Staff Service",
        "Food & Beverages", "Inflight Entertainment", "Ground Service",
        "Value for Money" and "Recommended". Missing values are NaN; star
        ratings are kept as the text scraped from the page.
    """
    if airlines is None:
        airlines = _DEFAULT_AIRLINES

    rows = []
    for air_name, country_name, *_ in airlines:
        page = 1
        while max_pages is None or page <= max_pages:
            try:
                articles = _fetch_review_blocks(air_name, page)
            except requests.RequestException as exc:
                # Keep what has been collected so far and move on to the next
                # airline instead of losing the whole run to one bad request.
                print(f"Skipping rest of {air_name!r} at page {page}: {exc}")
                break

            rows.extend(
                _parse_review(article, air_name, country_name)
                for article in articles
            )

            # A short (or empty) page means there is nothing left to fetch.
            if len(articles) < _PAGE_SIZE:
                break
            page += 1

    return pd.DataFrame(rows, columns=list(_OUTPUT_COLUMNS))


# Run the scraper, preview the first five rows, then save the full table
# to CSV without the index column.
airline_df = airline_scrape()
print(airline_df.head(n=5))
airline_df.to_csv(path_or_buf="test_df.csv", index=False)