 # Web Scraping and Analysis 

In [1]:
import csv
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

In [2]:
# Base URL of the paginated review listing to scrape
url = "https://www.airlinequality.com/airline-reviews/british-airways"

# Accumulates one [header, sub_header, content] row per scraped review
data = []

# Pagination settings: first page, step between pages, reviews requested per page
page_num = 1
page_incr = 1
page_size = 100
# maximum number of pages to be scraped
max_pages = 20

# Seconds to wait for the server before giving up. requests applies no
# timeout by default, so without this a stalled connection would hang forever.
request_timeout = 30

# A while loop to paginate through the webpage and scrape the data
while page_num <= max_pages:

    print(f"Scraping page {page_num}")

    # Build the URL of the current page (done in one place only)
    paginated_url = f"{url}/page/{page_num}/?sortby=post_date%3ADesc&pagesize={page_size}"

    # A GET request to the paginated URL
    response = requests.get(paginated_url, timeout=request_timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as "0 reviews"
    response.raise_for_status()

    # Parsing the response using BeautifulSoup
    parsed_content = BeautifulSoup(response.text, "html.parser")

    # Finding all the elements on the page that contain the data to be scraped
    elements = parsed_content.find_all("div", class_="body")

    # Remember how many rows we had so we can tell whether this page added any
    rows_before = len(data)

    # Looping through the elements and extracting the three text fields
    for element in elements:
        header = element.find("h2", class_="text_header")
        sub_header = element.find("h3", class_="text_sub_header")
        content = element.find("div", class_="text_content")

        # find() returns None when the tag is absent; skip blocks that do not
        # carry all three parts rather than crashing with AttributeError
        if header is None or sub_header is None or content is None:
            continue

        data.append([
            header.text.replace("\n", " "),
            sub_header.text.replace("\n", " "),
            content.text.replace("\n", " "),
        ])

    print(f"   ---> {len(data)} total reviews")

    # A page that contributes no reviews means the listing is exhausted;
    # stop instead of requesting the remaining (empty) pages
    if len(data) == rows_before:
        break

    # Advance to the next page
    page_num += page_incr

Scraping page 1
   ---> 100 total reviews
Scraping page 2
   ---> 200 total reviews
Scraping page 3
   ---> 300 total reviews
Scraping page 4
   ---> 400 total reviews
Scraping page 5
   ---> 500 total reviews
Scraping page 6
   ---> 600 total reviews
Scraping page 7
   ---> 700 total reviews
Scraping page 8
   ---> 800 total reviews
Scraping page 9
   ---> 900 total reviews
Scraping page 10
   ---> 1000 total reviews
Scraping page 11
   ---> 1100 total reviews
Scraping page 12
   ---> 1200 total reviews
Scraping page 13
   ---> 1300 total reviews
Scraping page 14
   ---> 1400 total reviews
Scraping page 15
   ---> 1500 total reviews
Scraping page 16
   ---> 1600 total reviews
Scraping page 17
   ---> 1700 total reviews
Scraping page 18
   ---> 1800 total reviews
Scraping page 19
   ---> 1900 total reviews
Scraping page 20
   ---> 2000 total reviews


In [3]:
# Turn the scraped rows into a DataFrame with one named column per field
column_names = ["REVIEW", "PERSONAL INFO", "CONTENT"]
df = pd.DataFrame(data)
df.columns = column_names

# First text-preprocessing pass: strip the "✅ Trip Verified |" marker
# (together with the whitespace around it) wherever it appears in a cell
trip_verified_marker = re.compile(r'\s*✅ Trip Verified \|\s*')
df = df.replace(trip_verified_marker, '')
df

Unnamed: 0,REVIEW,PERSONAL INFO,CONTENT
0,"""Very good flight""",Guy Senior (United Kingdom) 20th January 2025,Not Verified | Very good flight following an ...
1,"""relatively comfortable elderly plane""",Simon Channon (United Kingdom) 19th January ...,Not Verified | An hour's delay due to late ar...
2,"""70 days chasing BA’s complaints department""",R Layne (United Kingdom) 15th January 2025,I booked through BA because Loganair don’t hav...
3,"""BA refused to reimburse me""",Michael Chastain (United States) 9th January...,British airways lost bags in LHR then found th...
4,"""the flight was delayed""",S Herron (Netherlands) 5th January 2025,The check in process and reward/loyalty progra...
...,...,...,...
1995,"""plane in both directions old and shabby""",K Tatten (United Kingdom) 8th March 2017,✅ Verified Review | London to Vancouver retur...
1996,"""lack of enforcement of the carry on allowance""",1 reviews L Irving (United Kingdom) 8th M...,✅ Verified Review | Flew London Heathrow to G...
1997,"""Inflight service was basic""",Kah Kay Au (Singapore) 8th March 2017,✅ Verified Review | This is my first time fly...
1998,"""nothing short of chaotic""",T Robinson (United Kingdom) 7th March 2017,✅ Verified Review | British Airways are reall...


In [8]:
# Write the review DataFrame to a CSV file on disk.
# The default to_csv settings are used, so the row index is written as well.
output_csv_path = r"C:\Users\HomePC\Downloads\Cleaned_BA_reviews.csv"
df.to_csv(output_csv_path)

In [9]:
# Keep only the review body for sentiment analysis by dropping the
# headline and reviewer-info columns (df itself is left untouched)
columns_to_drop = ["REVIEW", "PERSONAL INFO"]
sentiment_analysis_df = df.drop(columns=columns_to_drop)

# Strip the older "✅ Verified Review |" marker and its surrounding whitespace
verified_review_marker = re.compile(r'\s*✅ Verified Review \|\s*')
sentiment_analysis_df = sentiment_analysis_df.replace(verified_review_marker, '')
sentiment_analysis_df

Unnamed: 0,CONTENT
0,Not Verified | Very good flight following an ...
1,Not Verified | An hour's delay due to late ar...
2,I booked through BA because Loganair don’t hav...
3,British airways lost bags in LHR then found th...
4,The check in process and reward/loyalty progra...
...,...
1995,London to Vancouver return. British Airways is...
1996,Flew London Heathrow to Gibraltar. This is pro...
1997,This is my first time flying on BA's A380 dail...
1998,British Airways are really trying to take all ...
