In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Scrape the paginated British Airways review listing and collect the raw
# text of every review into `reviews`.
base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
pages = 10
page_size = 100
request_timeout = 30  # seconds to wait on the server before giving up

# Reset on every run so re-executing the cell does not duplicate reviews.
reviews = []

for i in range(1, pages + 1):

    print(f"Scraping page {i}")

    # Build the URL for page `i` of the listing, newest reviews first
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"

    # Fetch the page; the timeout stops a stalled connection from hanging
    # the kernel indefinitely.
    response = requests.get(url, timeout=request_timeout)

    # Fail loudly on an HTTP error instead of parsing an error page and
    # silently adding zero reviews for this page.
    response.raise_for_status()

    # Parse the HTML and keep the text of every review body container
    parsed_content = BeautifulSoup(response.content, 'html.parser')
    reviews.extend(
        para.get_text()
        for para in parsed_content.find_all("div", {"class": "text_content"})
    )

    print(f"   ---> {len(reviews)} total reviews")

Scraping page 1
   ---> 100 total reviews
Scraping page 2
   ---> 200 total reviews
Scraping page 3
   ---> 300 total reviews
Scraping page 4
   ---> 400 total reviews
Scraping page 5
   ---> 500 total reviews
Scraping page 6
   ---> 600 total reviews
Scraping page 7
   ---> 700 total reviews
Scraping page 8
   ---> 800 total reviews
Scraping page 9
   ---> 900 total reviews
Scraping page 10
   ---> 1000 total reviews


In [5]:
# Wrap the scraped review texts in a single-column DataFrame and preview it.
df = pd.DataFrame({"reviews": reviews})
df.head()

Unnamed: 0,reviews
0,✅ Trip Verified | Worst seats I have ever enco...
1,Not Verified | Top Ten REASONS to not use Brit...
2,Not Verified | Easy check in on the way to He...
3,✅ Trip Verified | Online check in worked fine...
4,✅ Trip Verified |. The BA first lounge at Term...


In [8]:
# Print a summary of the frame: row count, column dtypes, non-null counts
# and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   reviews  1000 non-null   object
dtypes: object(1)
memory usage: 7.9+ KB


In [16]:
# Each scraped review looks like "<verification status> | <review body>".
# Keep the body, i.e. everything after the FIRST '|':
#   * n=1 splits only once, so a '|' inside the body does not truncate it;
#   * .str[-1] takes the last piece, so a review with no '|' at all is kept
#     whole instead of turning into NaN (and re-running the cell is harmless
#     as long as the body itself contains no '|');
#   * .str.strip() drops the whitespace left around the separator.
# The previous version kept piece [0], which is the verification label
# rather than the review text.
df["reviews"] = df["reviews"].str.split("|", n=1).str[-1].str.strip()
df

Unnamed: 0,reviews
0,Worst seats I have ever encountered in econom...
1,Top Ten REASONS to not use British Airways To...
2,Easy check in on the way to Heathrow. The fl...
3,Online check in worked fine. Quick security ...
4,. The BA first lounge at Terminal 5 was a zoo...
...,...
995,Bermuda to Gatwick. Usual window and Aisle s...
996,Aberdeen to Boston via London Heathrow. I fo...
997,London to Hamburg. Baggage self-check-in was...
998,Flew London Heathrow to Hong Kong with Britis...


**Text cleaning**

In [21]:
import re

# Define a function to clean the text
def clean(text):
    """Reduce ``text`` to ASCII letters separated by single spaces.

    Every run of characters other than A-Z / a-z (digits, punctuation,
    whitespace, emoji, ...) is replaced by one space, and leading/trailing
    spaces are removed.

    Parameters
    ----------
    text : object
        Value to clean. Non-string values are converted with ``str()``.
        Missing values (``None`` or a float NaN) yield an empty string so
        the literal words "None" / "nan" never leak into the cleaned text.

    Returns
    -------
    str
        The cleaned text, possibly empty.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Collapse every non-letter run to a single space, then trim the edges.
    return re.sub(r'[^A-Za-z]+', ' ', str(text)).strip()

# Store the cleaned version of every review in a new 'Cleaned Reviews' column
df['Cleaned Reviews'] = df['reviews'].map(clean)
df.head()

Unnamed: 0,reviews,Cleaned Reviews
0,Worst seats I have ever encountered in econom...,Worst seats I have ever encountered in econom...
1,Top Ten REASONS to not use British Airways To...,Top Ten REASONS to not use British Airways To...
2,Easy check in on the way to Heathrow. The fl...,Easy check in on the way to Heathrow The flig...
3,Online check in worked fine. Quick security ...,Online check in worked fine Quick security ch...
4,. The BA first lounge at Terminal 5 was a zoo...,The BA first lounge at Terminal was a zoo at ...


In [22]:
# NLTK itself plus the helpers referenced by the POS-tagging cell below.
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize

# Download the NLTK data packages (same packages, same order as before).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...


In [27]:
# Additional NLTK data packages downloaded by this cell.
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

# Lookup from the first letter of a POS tag to the matching WordNet constant
pos_dict = {
    'J': wordnet.ADJ,
    'V': wordnet.VERB,
    'N': wordnet.NOUN,
    'R': wordnet.ADV,
}
def token_stop_pos(text):
    """Tokenize ``text``, POS-tag the tokens and drop English stopwords.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    list of tuple
        One ``(word, pos)`` pair per remaining token, in original order.
        ``pos`` is ``pos_dict.get(tag[0])``, i.e. the entry for the first
        letter of the token's tag, or ``None`` when ``pos_dict`` has no
        entry for that letter.
    """
    # Build the stopword set once per call. It was previously rebuilt for
    # every single token, re-reading the corpus each time.
    stop_words = set(stopwords.words('english'))

    # Tag the full token sequence before filtering, as before, so the
    # tagger still sees the stopwords as context.
    tags = pos_tag(word_tokenize(text))

    # Stopword matching is case-insensitive; the kept word retains its case.
    return [
        (word, pos_dict.get(tag[0]))
        for word, tag in tags
        if word.lower() not in stop_words
    ]

# Tokenize, tag and stopword-filter every cleaned review into 'POS tagged'
df['POS tagged'] = df['Cleaned Reviews'].map(token_stop_pos)
df.head()

Unnamed: 0,reviews,Cleaned Reviews,POS tagged
0,Worst seats I have ever encountered in econom...,Worst seats I have ever encountered in econom...,"[(Worst, n), (seats, n), (ever, r), (encounter..."
1,Top Ten REASONS to not use British Airways To...,Top Ten REASONS to not use British Airways To...,"[(Top, a), (Ten, n), (REASONS, n), (use, v), (..."
2,Easy check in on the way to Heathrow. The fl...,Easy check in on the way to Heathrow The flig...,"[(Easy, a), (check, n), (way, n), (Heathrow, v..."
3,Online check in worked fine. Quick security ...,Online check in worked fine Quick security ch...,"[(Online, n), (check, n), (worked, a), (fine, ..."
4,. The BA first lounge at Terminal 5 was a zoo...,The BA first lounge at Terminal was a zoo at ...,"[(BA, n), (first, a), (lounge, n), (Terminal, ..."


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,reviews,Cleaned Reviews,POS tagged
0,Worst seats I have ever encountered in econom...,Worst seats I have ever encountered in econom...,"[(Worst, n), (seats, n), (ever, r), (encounter..."
1,Top Ten REASONS to not use British Airways To...,Top Ten REASONS to not use British Airways To...,"[(Top, a), (Ten, n), (REASONS, n), (use, v), (..."
2,Easy check in on the way to Heathrow. The fl...,Easy check in on the way to Heathrow The flig...,"[(Easy, a), (check, n), (way, n), (Heathrow, v..."
3,Online check in worked fine. Quick security ...,Online check in worked fine Quick security ch...,"[(Online, n), (check, n), (worked, a), (fine, ..."
4,. The BA first lounge at Terminal 5 was a zoo...,The BA first lounge at Terminal was a zoo at ...,"[(BA, n), (first, a), (lounge, n), (Terminal, ..."


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
