In [48]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [49]:
base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
pages = 10
page_size = 100

reviews = []

# for i in range(1, pages + 1):
for i in range(1, pages + 1):

    print(f"Scraping page {i}")

    # Create URL to collect links from paginated data
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"

    # Collect HTML data from this page
    response = requests.get(url)

    # Parse content
    content = response.content
    parsed_content = BeautifulSoup(content, 'html.parser')
    for para in parsed_content.find_all("div", {"class": "text_content"}):
        reviews.append(para.get_text())

    print(f"   ---> {len(reviews)} total reviews")

Scraping page 1
   ---> 100 total reviews
Scraping page 2
   ---> 200 total reviews
Scraping page 3
   ---> 300 total reviews
Scraping page 4
   ---> 400 total reviews
Scraping page 5
   ---> 500 total reviews
Scraping page 6
   ---> 600 total reviews
Scraping page 7
   ---> 700 total reviews
Scraping page 8
   ---> 800 total reviews
Scraping page 9
   ---> 900 total reviews
Scraping page 10
   ---> 1000 total reviews


In [50]:
df = pd.DataFrame()
df["reviews"] = reviews
df.head()

Unnamed: 0,reviews
0,"Not Verified | I flew with numerous airlines, ..."
1,✅ Trip Verified | We were traveling as a fami...
2,✅ Trip Verified | Flight at 8.40am from DUB to...
3,✅ Trip Verified | Terrible. I have traveled t...
4,✅ Trip Verified | The customer service is ugl...


In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   reviews  1000 non-null   object
dtypes: object(1)
memory usage: 7.9+ KB


In [57]:
df.reviews= df.reviews.str.split('|',expand=True)[0]
df.head()

Unnamed: 0,reviews,Cleaned Reviews,POS tagged,Lemma
0,Not Verified,Not Verified,"[(Verified, v)]",Verified
1,✅ Trip Verified,Trip Verified,"[(Trip, n), (Verified, v)]",Trip Verified
2,✅ Trip Verified,Trip Verified,"[(Trip, n), (Verified, v)]",Trip Verified
3,✅ Trip Verified,Trip Verified,"[(Trip, n), (Verified, v)]",Trip Verified
4,✅ Trip Verified,Trip Verified,"[(Trip, n), (Verified, v)]",Trip Verified


**Text cleaning**

In [53]:
import re

# Define a function to clean the text
def clean(text):
# Removes all special characters and numericals leaving the alphabets
    text = re.sub('[^A-Za-z]+', ' ', str(text))
    return text

# Cleaning the text in the review column
df['Cleaned Reviews'] = df['reviews'].apply(clean)
df.head()

Unnamed: 0,reviews,Cleaned Reviews
0,Not Verified,Not Verified
1,✅ Trip Verified,Trip Verified
2,✅ Trip Verified,Trip Verified
3,✅ Trip Verified,Trip Verified
4,✅ Trip Verified,Trip Verified


In [54]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [55]:
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

# POS tagger dictionary
pos_dict = {'J':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN, 'R':wordnet.ADV}
def token_stop_pos(text):
    tags = pos_tag(word_tokenize(text))
    newlist = []
    for word, tag in tags:
        if word.lower() not in set(stopwords.words('english')):
          newlist.append(tuple([word, pos_dict.get(tag[0])]))

    return newlist

df['POS tagged'] = df['Cleaned Reviews'].apply(token_stop_pos)
df.head()

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,reviews,Cleaned Reviews,POS tagged
0,Not Verified,Not Verified,"[(Verified, v)]"
1,✅ Trip Verified,Trip Verified,"[(Trip, n), (Verified, v)]"
2,✅ Trip Verified,Trip Verified,"[(Trip, n), (Verified, v)]"
3,✅ Trip Verified,Trip Verified,"[(Trip, n), (Verified, v)]"
4,✅ Trip Verified,Trip Verified,"[(Trip, n), (Verified, v)]"


In [56]:
 ##Obtaining the stem words – Lemmatization

from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatize(pos_data):
    lemma_rew = " "
    for word, pos in pos_data:
     if not pos:
        lemma = word
        lemma_rew = lemma_rew + " " + lemma
     else:
        lemma = wordnet_lemmatizer.lemmatize(word, pos=pos)
        lemma_rew = lemma_rew + " " + lemma
    return lemma_rew

df['Lemma'] = df['POS tagged'].apply(lemmatize)
df.head()

Unnamed: 0,reviews,Cleaned Reviews,POS tagged,Lemma
0,Not Verified,Not Verified,"[(Verified, v)]",Verified
1,✅ Trip Verified,Trip Verified,"[(Trip, n), (Verified, v)]",Trip Verified
2,✅ Trip Verified,Trip Verified,"[(Trip, n), (Verified, v)]",Trip Verified
3,✅ Trip Verified,Trip Verified,"[(Trip, n), (Verified, v)]",Trip Verified
4,✅ Trip Verified,Trip Verified,"[(Trip, n), (Verified, v)]",Trip Verified


In [58]:
df[['reviews','Lemma']]

Unnamed: 0,reviews,Lemma
0,Not Verified,Verified
1,✅ Trip Verified,Trip Verified
2,✅ Trip Verified,Trip Verified
3,✅ Trip Verified,Trip Verified
4,✅ Trip Verified,Trip Verified
...,...,...
995,✅ Trip Verified,Trip Verified
996,✅ Trip Verified,Trip Verified
997,✅ Trip Verified,Trip Verified
998,✅ Trip Verified,Trip Verified


***Sentiment Analysis using VADER***


In [59]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [60]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()


# function to calculate vader sentiment
def vadersentimentanalysis(review):
    vs = analyzer.polarity_scores(review)
    return vs['compound']

df['Sentiment'] = df['Lemma'].apply(vadersentimentanalysis)

# function to analyse
def vader_analysis(compound):
    if compound >= 0.5:
        return 'Positive'
    elif compound < 0 :
        return 'Negative'
    else:
        return 'Neutral'
df['Analysis'] = df['Sentiment'].apply(vader_analysis)
df.head()

Unnamed: 0,reviews,Cleaned Reviews,POS tagged,Lemma,Sentiment,Analysis
0,Not Verified,Not Verified,"[(Verified, v)]",Verified,0.0,Neutral
1,✅ Trip Verified,Trip Verified,"[(Trip, n), (Verified, v)]",Trip Verified,0.0,Neutral
2,✅ Trip Verified,Trip Verified,"[(Trip, n), (Verified, v)]",Trip Verified,0.0,Neutral
3,✅ Trip Verified,Trip Verified,"[(Trip, n), (Verified, v)]",Trip Verified,0.0,Neutral
4,✅ Trip Verified,Trip Verified,"[(Trip, n), (Verified, v)]",Trip Verified,0.0,Neutral


In [61]:
vader_counts = df['Analysis'].value_counts()
vader_counts

Neutral    1000
Name: Analysis, dtype: int64