In [None]:
# Standard Libraries

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# For visualizations
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls


# For Web Scrapping
import requests
from bs4 import BeautifulSoup

# For Data Pre-processing
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer   
from textblob import Word, TextBlob
from wordcloud import WordCloud , STOPWORDS

# For topic modeling
from sklearn.decomposition import NMF, LatentDirichletAllocation

# For sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

In [None]:
## Extracting reviews from "Skytrax" website

In [3]:
website = "https://www.airlinequality.com/airline-reviews/british-airways"
pages = 363              # Total no. of pages selected for review extraction
page_size = 100          

reviews = []            # object to store extracted reviews from website

for i in range(1, pages + 1):      

    print(f"Scraping data from Page {i}")

    # Create URL to collect links 
    url = f"{website}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"

    
    response = requests.get(url)     # Collect HTML data from this page

    # Parse content
    content = response.content
    parsed_content = BeautifulSoup(content, 'html.parser')
    for para in parsed_content.find_all("div", {"class": "text_content"}):
        reviews.append(para.get_text())
    
    print(f"   ---> {len(reviews)} Total Reviews")

Scraping data from Page 1
   ---> 100 Total Reviews
Scraping data from Page 2
   ---> 200 Total Reviews
Scraping data from Page 3
   ---> 300 Total Reviews
Scraping data from Page 4
   ---> 400 Total Reviews
Scraping data from Page 5
   ---> 500 Total Reviews
Scraping data from Page 6
   ---> 600 Total Reviews
Scraping data from Page 7
   ---> 700 Total Reviews
Scraping data from Page 8
   ---> 800 Total Reviews
Scraping data from Page 9
   ---> 900 Total Reviews
Scraping data from Page 10
   ---> 1000 Total Reviews
Scraping data from Page 11
   ---> 1100 Total Reviews
Scraping data from Page 12
   ---> 1200 Total Reviews
Scraping data from Page 13
   ---> 1300 Total Reviews
Scraping data from Page 14
   ---> 1400 Total Reviews
Scraping data from Page 15
   ---> 1500 Total Reviews
Scraping data from Page 16
   ---> 1600 Total Reviews
Scraping data from Page 17
   ---> 1700 Total Reviews
Scraping data from Page 18
   ---> 1800 Total Reviews
Scraping data from Page 19
   ---> 1900 Total 

   ---> 3625 Total Reviews
Scraping data from Page 153
   ---> 3625 Total Reviews
Scraping data from Page 154
   ---> 3625 Total Reviews
Scraping data from Page 155
   ---> 3625 Total Reviews
Scraping data from Page 156
   ---> 3625 Total Reviews
Scraping data from Page 157
   ---> 3625 Total Reviews
Scraping data from Page 158
   ---> 3625 Total Reviews
Scraping data from Page 159
   ---> 3625 Total Reviews
Scraping data from Page 160
   ---> 3625 Total Reviews
Scraping data from Page 161
   ---> 3625 Total Reviews
Scraping data from Page 162
   ---> 3625 Total Reviews
Scraping data from Page 163
   ---> 3625 Total Reviews
Scraping data from Page 164
   ---> 3625 Total Reviews
Scraping data from Page 165
   ---> 3625 Total Reviews
Scraping data from Page 166
   ---> 3625 Total Reviews
Scraping data from Page 167
   ---> 3625 Total Reviews
Scraping data from Page 168
   ---> 3625 Total Reviews
Scraping data from Page 169
   ---> 3625 Total Reviews
Scraping data from Page 170
   ---> 36

   ---> 3625 Total Reviews
Scraping data from Page 302
   ---> 3625 Total Reviews
Scraping data from Page 303
   ---> 3625 Total Reviews
Scraping data from Page 304
   ---> 3625 Total Reviews
Scraping data from Page 305
   ---> 3625 Total Reviews
Scraping data from Page 306
   ---> 3625 Total Reviews
Scraping data from Page 307
   ---> 3625 Total Reviews
Scraping data from Page 308
   ---> 3625 Total Reviews
Scraping data from Page 309
   ---> 3625 Total Reviews
Scraping data from Page 310
   ---> 3625 Total Reviews
Scraping data from Page 311
   ---> 3625 Total Reviews
Scraping data from Page 312
   ---> 3625 Total Reviews
Scraping data from Page 313
   ---> 3625 Total Reviews
Scraping data from Page 314
   ---> 3625 Total Reviews
Scraping data from Page 315
   ---> 3625 Total Reviews
Scraping data from Page 316
   ---> 3625 Total Reviews
Scraping data from Page 317
   ---> 3625 Total Reviews
Scraping data from Page 318
   ---> 3625 Total Reviews
Scraping data from Page 319
   ---> 36

In [4]:
# Storing reviews in a pandas dataframe 
df = pd.DataFrame()               
df["reviews"] = reviews             
df.head()

Unnamed: 0,reviews
0,✅ Trip Verified | My family flew from Washing...
1,✅ Trip Verified | Easy check in a T5. Galleri...
2,"Not Verified | Flight delayed by an hour, it ..."
3,Not Verified | The staff are very rude and not...
4,✅ Trip Verified | Good domestic flight operat...


In [5]:
df.reviews= df.reviews.str.split('|',expand=True)[1]

In [6]:
df.head()

Unnamed: 0,reviews
0,My family flew from Washington to London on ...
1,Easy check in a T5. Galleries south and Nort...
2,"Flight delayed by an hour, it happens, no bi..."
3,The staff are very rude and not trained prope...
4,Good domestic flight operated by BA Cityflye...


In [7]:
def replace(text):            # Define a function to clean the text
    text = re.sub(r'[^A-Za-z]+', ' ', str(text)) # Replaces all special characters and numericals with blanks and leaving the alphabets
    return text
# Cleaning the text in the review column
df['reviews']= df["reviews"].apply(replace)
df.head()

Unnamed: 0,reviews
0,My family flew from Washington to London on a...
1,Easy check in a T Galleries south and North l...
2,Flight delayed by an hour it happens no biggi...
3,The staff are very rude and not trained prope...
4,Good domestic flight operated by BA Cityflyer...


In [8]:
df['reviews'] = df['reviews'].str.lower()

In [9]:
df.head()

Unnamed: 0,reviews
0,my family flew from washington to london on a...
1,easy check in a t galleries south and north l...
2,flight delayed by an hour it happens no biggi...
3,the staff are very rude and not trained prope...
4,good domestic flight operated by ba cityflyer...


In [10]:
df['reviews'] = df['reviews'].str.replace('[^\w\s]', '')

In [11]:
df.head()

Unnamed: 0,reviews
0,my family flew from washington to london on a...
1,easy check in a t galleries south and north l...
2,flight delayed by an hour it happens no biggi...
3,the staff are very rude and not trained prope...
4,good domestic flight operated by ba cityflyer...


In [12]:
df['reviews'] = df['reviews'].str.replace('\d', '')

In [13]:
df.head()

Unnamed: 0,reviews
0,my family flew from washington to london on a...
1,easy check in a t galleries south and north l...
2,flight delayed by an hour it happens no biggi...
3,the staff are very rude and not trained prope...
4,good domestic flight operated by ba cityflyer...


In [14]:
df.to_csv("BA_reviews.csv")