# Task 1

---

## Web scraping and analysis

I use `requests` to download the review pages and a package called `BeautifulSoup` to parse them, then save the collected reviews to a local `data/BA_reviews.csv` file.

### Scraping data from Skytrax

In [None]:
import time
import requests
from bs4 import BeautifulSoup

# Scrape British Airways review text from Skytrax (airlinequality.com), one
# paginated listing page at a time, accumulating the raw text in `reviews`.
base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
pages = 10             # number of listing pages to request
page_size = 100        # value sent as the `pagesize` query parameter
request_timeout = 30   # seconds; without a timeout `requests` can wait forever
request_delay = 2      # seconds to pause between consecutive page requests

reviews = []
session = requests.Session()

for i in range(1, pages + 1):
    print(f"Scraping page {i}")

    # Create URL to collect links from paginated data
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"

    try:
        # Collect HTML data from this page. Timeout errors are subclasses of
        # RequestException, so they are reported by the handler below.
        response = session.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=request_timeout,
        )
        response.raise_for_status()

        # Parse content: every <div class="text_content"> is treated as one review
        content = response.content
        parsed_content = BeautifulSoup(content, 'html.parser')
        for para in parsed_content.find_all("div", {"class": "text_content"}):
            reviews.append(para.get_text())

        print(f"   ---> {len(reviews)} total reviews")

    except requests.exceptions.RequestException as e:
        # Best effort: report the failed page and carry on with the next one.
        print(f"Error scraping page {i}: {e}")

    # Add a delay between requests to avoid overloading the server. This runs
    # after failures as well as successes, and is skipped after the last page
    # because no further request follows it.
    if i < pages:
        time.sleep(request_delay)

print(f"Finished scraping. Total reviews collected: {len(reviews)}")


In [6]:
import os
import pandas as pd

# Write the scraped reviews to <output_dir>/BA_reviews.csv as a single-column
# table with the header "reviews" and no index column.
output_dir = "data"
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

# Build the frame in one step; `reviews` is the list filled by the scraping cell.
df = pd.DataFrame({"reviews": reviews})

output_path = os.path.join(output_dir, "BA_reviews.csv")
df.to_csv(output_path, index=False)

print(f"Reviews saved to {output_path}")

Reviews saved to data/BA_reviews.csv
