# Task 1

---

## Web scraping and analysis


### Scraping data from Skytrax

Visit <https://www.airlinequality.com> and you will see that there is a lot of data there. For this task, we are only interested in reviews related to British Airways and the airline itself.

If you navigate to <https://www.airlinequality.com/airline-reviews/british-airways>, you will see this data. Now we can use `Python` and `BeautifulSoup` to collect all the links to the reviews and then collect the text data on each of the individual review links.

## Importing of libraries

In [None]:
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup

import string

In [None]:
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


# ETL

## Helper functions

In [None]:
def extract_webscrape(base_url="https://www.airlinequality.com/airline-reviews/british-airways", page_size=10, pages=365, timeout=30):
    '''
    Scrape review headers and review texts from paginated listing pages.

    Pages 1..pages of base_url are requested one after another; each request
    asks for page_size reviews sorted by post date, descending.

    Pairings of page size and page count from the original notes:

    page_size --> pages
    10 --> 365
    20 --> 183
    50 --> 73
    100 --> 37

    Parameters
    ----------
    base_url : str
        Listing URL without the "/page/<n>/" suffix.
    page_size : int
        Value sent as the "pagesize" query parameter.
    pages : int
        Number of listing pages to request, starting at page 1.
    timeout : float
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    tuple of (list of str, list of str)
        Text of every <h2 class="text_header"> element and of every
        <div class="text_content"> element, in page order.

    Raises
    ------
    requests.RequestException
        If a request times out or a page answers with an HTTP error status.
    '''

    header = []
    review = []

    for i in range(1, pages + 1):

        print(f"Scraping page {i}", end=" | ")

        # Create URL to collect links from paginated data
        url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"

        # Collect HTML data from this page; without a timeout a stalled
        # connection would block the whole scrape indefinitely.
        response = requests.get(url, timeout=timeout)

        # Fail loudly on 4xx/5xx instead of parsing an error page as "no reviews".
        response.raise_for_status()

        # Parse content
        parsed_content = BeautifulSoup(response.content, 'html.parser')

        # header
        header.extend(
            h.get_text()
            for h in parsed_content.find_all("h2", {"class": "text_header"})
        )

        # review
        review.extend(
            r.get_text()
            for r in parsed_content.find_all("div", {"class": "text_content"})
        )

    print()
    print(f"Page size: {page_size}")
    print(f"Pages: {pages}")
    print()
    print(f"Total number of headers: {len(header)}")
    print(f"Total number of reviews: {len(review)}")

    return header, review

In [None]:
def extract_to_df(name_of_columns=['Header', 'Review'], values=()):
    '''
    Build a DataFrame by pairing column names with sequences of values.

    Parameters
    ----------
    name_of_columns : sequence of str
        Column names, in output order.
    values : sequence of sequences
        One sequence of cell values per column, in the same order as
        name_of_columns. Names and values are paired positionally; unpaired
        trailing entries on either side are ignored.

    Returns
    -------
    pandas.DataFrame
        One column per name/value pair. When no values are supplied, an empty
        frame carrying the requested column names is returned.
    '''
    # Pair positionally so any number of columns works, not just two.
    data = dict(zip(name_of_columns, values))

    if not data:
        # Nothing to load: keep the schema so downstream column lookups still work.
        return pd.DataFrame(columns=list(name_of_columns))

    return pd.DataFrame(data)

In [None]:
def transform_remove_verification(df, column):
    '''
    Strip the verification markers from a text column.

    Every occurrence of "Not Verified", "Trip Verified", the check-mark emoji
    and the "|" separator is removed, then surrounding whitespace is trimmed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place and also returned.
    column : str
        Name of the text column to clean.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the cleaned column.
    '''
    cleaned = df[column]
    for marker in ("Not Verified", "Trip Verified", "✅", "|"):
        # regex=False: the markers are literal text, and "|" is a regex
        # metacharacter, so never leave this to the pandas default.
        cleaned = cleaned.str.replace(marker, "", regex=False)

    df[column] = cleaned.str.strip()
    return df

In [None]:
def transform_lower_case(df, columns):
    '''
    Lower-case every listed text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform. It is modified in place and also returned.
    columns : sequence of str
        Names of the string columns to lower-case; any number is accepted.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the listed columns lower-cased.
    '''
    for column in columns:
        df[column] = df[column].str.lower()
    return df

In [None]:
def transform_remove_punctuation(df, columns):
    '''
    Delete every character of string.punctuation from the listed columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform. It is modified in place and also returned.
    columns : sequence of str
        Names of the string columns to clean; any number is accepted.

    Returns
    -------
    pandas.DataFrame
        The same frame, with punctuation removed from the listed columns.
    '''
    # A translation table deletes all punctuation in one pass per column and
    # treats every character literally, so ".", "|", "*", "(" etc. can never
    # be interpreted as regular-expression syntax.
    strip_table = str.maketrans("", "", string.punctuation)

    for column in columns:
        df[column] = df[column].str.translate(strip_table)
    return df

In [None]:
def transform_token_stopw(df):
    '''
    Add stop-word-free versions of the 'Header' and 'Review' columns.

    Each text is tokenized with word_tokenize, tokens found in the English
    stop-word list are dropped, and the remaining tokens are re-joined with
    single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns 'Header' and 'Review'. It is modified in
        place and also returned.

    Returns
    -------
    pandas.DataFrame
        The same frame, with 'Tokenized Header' and 'Tokenized Review' added.
    '''
    # A set gives constant-time membership tests; the list returned by
    # stopwords.words() would be scanned once per token.
    stop_words = set(stopwords.words('english'))

    def strip_stop_words(text):
        # Tokenize, drop stop words, and glue the survivors back together.
        return " ".join(
            token
            for token in word_tokenize(text, 'english')
            if token not in stop_words
        )

    df['Tokenized Header'] = [strip_stop_words(text) for text in df['Header']]
    df['Tokenized Review'] = [strip_stop_words(text) for text in df['Review']]
    return df

## ETL process

In [None]:
def etl(page_size=100, pages=37, output_path='header_review.csv'):
    '''
    Run the extract / transform / load pipeline and write the result as CSV.

    Extract: scrape review headers and texts and put them in a DataFrame.
    Transform: remove verification markers from 'Review', lower-case both
    text columns, remove punctuation, and add stop-word-free columns.
    Load: write the frame to output_path.

    Parameters
    ----------
    page_size : int
        Reviews requested per listing page.
    pages : int
        Number of listing pages to scrape.
    output_path : str
        Destination CSV file.

    Returns
    -------
    None
    '''
    # Extraction

    print("E X T R A C T I O N ...")

    data = extract_webscrape(page_size=page_size, pages=pages)
    df = extract_to_df(values=data)

    print()

    # Transform

    print("T R A N S F O R M ...")

    df = transform_remove_verification(df, 'Review')
    df = transform_lower_case(df, ['Header', 'Review'])
    df = transform_remove_punctuation(df, ['Header', 'Review'])
    df = transform_token_stopw(df)

    print()

    # Load

    print("L O A D ...")
    print("Converting into csv.")

    # index=False: the RangeIndex carries no information and would otherwise
    # come back as a stray "Unnamed: 0" column when the file is read again.
    df.to_csv(output_path, index=False)

In [None]:
# Run the whole pipeline: scrapes the review pages and writes header_review.csv.
etl()

E X T R A C T I O N ...
Scraping page 1 | Scraping page 2 | Scraping page 3 | Scraping page 4 | Scraping page 5 | Scraping page 6 | Scraping page 7 | Scraping page 8 | Scraping page 9 | Scraping page 10 | Scraping page 11 | Scraping page 12 | Scraping page 13 | Scraping page 14 | Scraping page 15 | Scraping page 16 | Scraping page 17 | Scraping page 18 | Scraping page 19 | Scraping page 20 | Scraping page 21 | Scraping page 22 | Scraping page 23 | Scraping page 24 | Scraping page 25 | Scraping page 26 | Scraping page 27 | Scraping page 28 | Scraping page 29 | Scraping page 30 | Scraping page 31 | Scraping page 32 | Scraping page 33 | Scraping page 34 | Scraping page 35 | Scraping page 36 | Scraping page 37 | 
Page size: 100
Pages: 37

Total number of headers: 3650
Total number of reviews: 3650

T R A N S F O R M ...


  df[column] = df[column].str.replace("Not Verified", "").str.replace("Trip Verified", "").str.replace("✅", "").str.replace("|", "").str.strip()
  df[columns[0]] = df[columns[0]].str.replace(punc, "")
  df[columns[1]] = df[columns[1]].str.replace(punc, "")



L O A D ...
Converting into csv.


# ANALYSIS

## EDA

In [None]:
# Load the cleaned reviews written by the ETL step.
df = pd.read_csv(
    'header_review.csv',
    # Skip any saved index column ("Unnamed: 0"); it is not a feature.
    usecols=lambda name: not name.startswith('Unnamed'),
    # Every kept column is free text: keep tokens such as "null" or "na"
    # (and empty strings) as strings instead of turning them into NaN.
    keep_default_na=False,
)
df.head(3)

Unnamed: 0.1,Unnamed: 0,Header,Review,Tokenized Header,Tokenized Review
0,0,the worst airline in the uk,british airways has confirmed itself as the wo...,worst airline uk,british airways confirmed worst airline uk las...
1,1,worst ba experience,worst ba experience i was supposed to fly out ...,worst ba experience,worst ba experience supposed fly italy 7 septe...
2,2,the worst airline service,my daughter and i were denied boarding on our ...,worst airline service,daughter denied boarding business class flight...


In [None]:
# Check column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3650 entries, 0 to 3649
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        3650 non-null   int64 
 1   Header            3650 non-null   object
 2   Review            3650 non-null   object
 3   Tokenized Header  3650 non-null   object
 4   Tokenized Review  3650 non-null   object
dtypes: int64(1), object(4)
memory usage: 142.7+ KB


## Feature Engineering

### Part of Speech (POS) Tagging

In [None]:
# Tag each stop-word-free review: one list of (token, tag) pairs per row.
pos_review = [
    nltk.pos_tag(word_tokenize(tokenized_text))
    for tokenized_text in df['Tokenized Review']
]

df['POS Review'] = pos_review

In [None]:
# Tags whose words are kept for the sentiment step.
# NOTE(review): plain 'JJ' is not in the list, so of the adjective tags only
# 'JJR' and 'JJS' are kept - confirm that is intended.
target_pos = ['JJR', 'JJS', 'RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

# For every review keep only the words whose tag is in target_pos and join
# them back into one string. The loop names deliberately differ from
# `pos_review`, so the tagged list built in the previous cell is not overwritten.
pos_review_filtered = [
    " ".join(word for word, tag in tagged_review if tag in target_pos)
    for tagged_review in df['POS Review']
]

df['POS Review Filtered'] = pos_review_filtered

### Sentiments

In [None]:
def sentiment_analyze(text, analyzer=None):
    '''
    Label a text as "Negative" or "Positive" from its polarity scores.

    Only the 'neg' and 'pos' entries of the analyzer's polarity_scores()
    result are consulted. The larger one decides the label; a tie (including
    0.0 versus 0.0) is reported as "Negative".

    Parameters
    ----------
    text : str
        Text to score.
    analyzer : object, optional
        Anything exposing polarity_scores(text) -> mapping with 'neg' and
        'pos'. When omitted, one SentimentIntensityAnalyzer is created on
        first use and reused for all later calls.

    Returns
    -------
    tuple of (str, float)
        The label and the score that produced it.
    '''
    if analyzer is None:
        # Building the analyzer once avoids repeating its setup for every
        # single review; cache it on the function itself.
        analyzer = getattr(sentiment_analyze, "_analyzer", None)
        if analyzer is None:
            analyzer = SentimentIntensityAnalyzer()
            sentiment_analyze._analyzer = analyzer

    score = analyzer.polarity_scores(text)
    neg = score['neg']
    pos = score['pos']

    if neg >= pos:
        return "Negative", neg
    return "Positive", pos

In [None]:
# Score the filtered words of every review; each entry is a (label, score) pair.
sentiment_r = [
    sentiment_analyze(filtered_text)
    for filtered_text in df['POS Review Filtered']
]

df['Sentiment Review'] = sentiment_r

In [None]:
# Preview the frame with the POS and sentiment columns added.
df.head(3)

Unnamed: 0.1,Unnamed: 0,Header,Review,Tokenized Header,Tokenized Review,POS Review,POS Review Filtered,Sentiment Review
0,0,the worst airline in the uk,british airways has confirmed itself as the wo...,worst airline uk,british airways confirmed worst airline uk las...,"[(british, JJ), (airways, NNS), (confirmed, VB...",confirmed worst happened,"(Negative, 0.672)"
1,1,worst ba experience,worst ba experience i was supposed to fly out ...,worst ba experience,worst ba experience supposed fly italy 7 septe...,"[(worst, JJS), (ba, NN), (experience, NN), (su...",worst supposed italy arrived already checkedin...,"(Negative, 0.176)"
2,2,the worst airline service,my daughter and i were denied boarding on our ...,worst airline service,daughter denied boarding business class flight...,"[(daughter, NN), (denied, VBD), (boarding, VBG...",denied boarding heathrow march given denied bo...,"(Negative, 0.21)"


### Frequency Distribution

In [None]:
# Collect the kept words into three flat lists, chosen by the first letter
# of their tag (R -> adverbs, V -> verbs, J -> adjectives).
pos_adverbs, pos_verbs, pos_adjectives = [], [], []

bucket_by_initial = {
    'R': pos_adverbs,
    'V': pos_verbs,
    'J': pos_adjectives,
}

for tagged_review in df['POS Review']:
    for word, pos in tagged_review:
        if pos not in target_pos:
            continue
        bucket = bucket_by_initial.get(pos[:1])
        if bucket is not None:
            bucket.append(word)

print(f'Adverbs: {pos_adverbs[:10]}, count: {len(pos_adverbs)}')
print(f'Verbs: {pos_verbs[:10]}, count: {len(pos_verbs)}')
print(f'Adjectives: {pos_adjectives[:10]}, count: {len(pos_adjectives)}')

Adverbs: ['italy', 'already', 'however', 'later', 'back', 'still', 'back', 'south', 'first', 'still'], count: 23229
Verbs: ['confirmed', 'happened', 'supposed', 'arrived', 'checkedin', 'online', 'evening', 'boarding', 'told', 'connecting'], count: 61810
Adjectives: ['worst', 'worst', 'best', 'best', 'least', 'better', 'worst', 'worst', 'least', 'worse'], count: 2239


In [None]:
# Split the (label, score) pairs into two parallel lists.
sentiments = [pair[0] for pair in df['Sentiment Review']]
sentiments_score = [pair[1] for pair in df['Sentiment Review']]

print(sentiments[:3])
print(sentiments_score[:3])

['Negative', 'Negative', 'Negative']
[0.672, 0.176, 0.21]


# Converting to CSV for Visualization

In [None]:
# One single-column frame per word class, plus one frame for the sentiments.
adverbs = pd.DataFrame(pos_adverbs, columns=['Adverbs'])
verbs = pd.DataFrame(pos_verbs, columns=['Verbs'])
adjectives = pd.DataFrame(pos_adjectives, columns=['Adjectives'])
sentiment = pd.DataFrame({'Sentiment': sentiments, 'Score': sentiments_score})

# Write each frame to its own CSV file for the visualization step.
for frame, csv_path in (
    (adverbs, 'adverbs.csv'),
    (verbs, 'verbs.csv'),
    (adjectives, 'adjectives.csv'),
    (sentiment, 'sentiment.csv'),
):
    frame.to_csv(csv_path)