In [1]:
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd

In [2]:
# Sanity-check that the review listing URL is reachable before scraping.
test_url = "https://www.airlinequality.com/airline-reviews/british-airways"
# A timeout keeps this cell from hanging indefinitely if the server never answers.
response = requests.get(test_url, timeout=30)
print(f"Status Code: {response.status_code}")
print(response.url)  # To see the final URL after any redirects

Status Code: 200
https://www.airlinequality.com/airline-reviews/british-airways


In [3]:
def _parse_review(container):
    """Extract one review's fields from its ``<article itemprop="review">`` node.

    Every lookup is made on ``container`` (never on the whole page), so the
    returned values always belong to this review.

    Raises:
        AttributeError, TypeError, KeyError: if the rating, title, body or
            date element (or the date's ``content`` attribute) is missing.
    """
    rating = container.find('span', itemprop='ratingValue').text.strip()
    title = container.find('h2', class_='text_header').text.strip()
    content = container.find('div', itemprop='reviewBody').text.strip()
    date = container.find('meta', itemprop='datePublished')['content']

    # The seat type lives in this review's own ratings table; searching the
    # page-wide soup here would return the first review's value for every row.
    seat_type = 'Not Found'
    for row in container.find_all('tr'):
        header = row.find('td', class_='review-rating-header')
        if header is not None and 'Seat Type' in header.text:
            value_cell = row.find('td', class_='review-value')
            if value_cell is not None:
                seat_type = value_cell.text.strip()
            break

    return {
        'Date': date,
        'Title': title,
        'Content': content,
        'Seat Type': seat_type,
        'Rating': rating,
    }


def scrapeReviews(aw_url, num_pages):
    """Scrape paginated airline reviews into a DataFrame.

    Pages are requested as ``{aw_url}/page/{n}/`` for n = 1..num_pages.
    Scraping stops early (keeping what was collected so far) when a request
    fails, the server answers with a non-200 status, or a page contains no
    review articles.

    Args:
        aw_url: Base URL of the airline's review listing.
        num_pages: Maximum number of listing pages to request.

    Returns:
        pandas.DataFrame with the columns ``Date``, ``Title``, ``Content``,
        ``Seat Type`` and ``Rating`` (all present even when no review was
        collected). Reviews missing a required element are skipped with a
        printed message.
    """
    columns = ['Date', 'Title', 'Content', 'Seat Type', 'Rating']
    reviews = []
    # Avoid a double slash in the page URL when the caller passes a trailing '/'.
    base_url = aw_url.rstrip('/')

    for page in range(1, num_pages + 1):
        print(f"Scraping page {page}...")

        # Request the page; the timeout prevents an unresponsive server from
        # hanging the whole run.
        url = f"{base_url}/page/{page}/"
        try:
            response = requests.get(url, timeout=30)
        except requests.RequestException as e:
            print(f"Request for page {page} failed: {e}. Ending scraping.")
            break

        if response.status_code == 404:
            print("Page not found. Ending scraping.")
            break
        if response.status_code != 200:
            # Error pages (403, 5xx, ...) contain no reviews worth parsing.
            print(f"Unexpected status code {response.status_code}. Ending scraping.")
            break

        soup = BeautifulSoup(response.content, 'html.parser')

        # Find review containers
        review_container = soup.find_all('article', itemprop='review')
        if not review_container:
            # Past the last page of reviews: stop instead of requesting the
            # remaining pages for nothing.
            print("No reviews found on this page. Ending scraping.")
            break

        # Extract review details
        for container in review_container:
            try:
                reviews.append(_parse_review(container))
            except (AttributeError, TypeError, KeyError) as e:
                # A malformed review is skipped; it must not abort the scrape.
                print(f"Error while parsing review: {e}")

        # Sleep to avoid overwhelming the server with requests
        if page < num_pages:
            time.sleep(2)

    # Fixed column list keeps the schema stable even when nothing was scraped.
    return pd.DataFrame(reviews, columns=columns)

# Run the scraper against the listing URL verified earlier, capped at
# MAX_PAGES listing pages.
MAX_PAGES = 300
aw_url = test_url
reviews_df = scrapeReviews(aw_url, num_pages=MAX_PAGES)

print("Scraping Completed.")

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scraping page 50...
Scraping 

In [4]:
# Persist the scraped reviews to disk; index=False leaves the DataFrame's
# row index out of the CSV.
csv_path = 'skytrax_reviews.csv'
reviews_df.to_csv(csv_path, index=False)
print("Scraping completed and data saved to skytrax_reviews.csv")

Scraping completed and data saved to skytrax_reviews.csv
