In [1]:
import pandas as pd
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# importing VADER (Valence Aware Dictionary and sEntiment Reasoner) :
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import warnings

# English stop words as a set, used for membership tests in preprocess_text.
stop_words = set(stopwords.words('english'))
# Single lemmatizer instance shared by every preprocess_text call.
lemmatizer = WordNetLemmatizer()
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter presumably also hides pandas warnings such
# as SettingWithCopyWarning - consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Load the British Airways reviews dataset:
data = pd.read_csv('data/BA_reviews.csv')

In [3]:
data.head()

Unnamed: 0,review_headers,reviews,time
0,"""It was a nightmare""",Not Verified | They changed our Flights from ...,18th April 2023
1,"""Abysmal service""",Not Verified | At Copenhagen the most chaotic...,18th April 2023
2,"""trained to give you the runaround""",✅ Trip Verified | Worst experience of my life...,17th April 2023
3,"""they only had one choice of meal""",✅ Trip Verified | Due to code sharing with Ca...,17th April 2023
4,"""relentless BA cost cutting""",✅ Trip Verified | LHR check in was quick at t...,16th April 2023


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3531 entries, 0 to 3530
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   review_headers  3531 non-null   object
 1   reviews         3531 non-null   object
 2   time            3531 non-null   object
dtypes: object(3)
memory usage: 82.9+ KB


In [5]:
# Function to preprocess the review text:

In [6]:
def preprocess_text(text):
    """
    Normalise a piece of text into a space-separated string of lemmas.

    The text is tokenized with word_tokenize; each token is lowercased,
    dropped if its lowercase form is in stop_words, and otherwise lemmatized.
    The surviving lemmas are joined with single spaces and returned.
    """
    kept_lemmas = []
    for token in word_tokenize(text):
        lowered = token.lower()
        # Stop-word filtering happens on the lowercase form of the token.
        if lowered in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(lowered))
    return " ".join(kept_lemmas)

In [7]:
# extract year of review: the third space-separated token of 'time'
# (e.g. '18th April 2023' -> '2023'); kept as a string
data['year'] = data['time'].str.split(' ').str[2]

# split review column into the verification status and the review body.
# n=1 splits on the first '|' only, so a review whose body itself contains a
# '|' still yields exactly two columns instead of breaking the assignment.
data[['ver_status','reviews_']] = data['reviews'].str.split('|', n=1, expand=True)

# removing quotations from 'review_headers' (literal match, not a regex)
data['review_headers'] = data['review_headers'].str.replace('"', '', regex=False)

# preprocess the reviews_ column; str() keeps a non-string entry (e.g. a
# missing value) from breaking word_tokenize
data['reviews_'] = data['reviews_'].apply(lambda x : preprocess_text(str(x)))

In [8]:
# Subset the data for the 2021 - 2023 reviews:
# 'year' holds strings, so the labels to keep are strings too.  .copy() makes
# `subset` an independent frame, so the columns added to it in later cells are
# not written into a slice of `data` (the cause of SettingWithCopyWarning).
subset = data[data['year'].isin(['2021', '2022', '2023'])].copy()

In [9]:
subset

Unnamed: 0,review_headers,reviews,time,year,ver_status,reviews_
0,It was a nightmare,Not Verified | They changed our Flights from ...,18th April 2023,2023,Not Verified,changed flight brussels london heathrow lax 4/...
1,Abysmal service,Not Verified | At Copenhagen the most chaotic...,18th April 2023,2023,Not Verified,copenhagen chaotic ticket counter assignment h...
2,trained to give you the runaround,✅ Trip Verified | Worst experience of my life...,17th April 2023,2023,✅ Trip Verified,worst experience life trying deal customer ser...
3,they only had one choice of meal,✅ Trip Verified | Due to code sharing with Ca...,17th April 2023,2023,✅ Trip Verified,due code sharing cathay pacific downgraded ba ...
4,relentless BA cost cutting,✅ Trip Verified | LHR check in was quick at t...,16th April 2023,2023,✅ Trip Verified,lhr check quick first wing quickly security . ...
...,...,...,...,...,...,...
379,A poor show with BA,✅ Trip Verified | Had booked the above mention...,21st February 2021,2021,✅ Trip Verified,booked mentioned flight . travel restriction c...
380,promised to give me compensation,✅ Trip Verified | My return flight from Antig...,20th February 2021,2021,✅ Trip Verified,return flight antigua london cancelled 2 time ...
381,Do not recommend to anyone in Covid restrictions,Not Verified | 4 cancellations for 1 month! A...,10th February 2021,2021,Not Verified,4 cancellation 1 month ! applying voucher find...
382,She was an amazing person,✅ Trip Verified | I had travelled to India fo...,4th February 2021,2021,✅ Trip Verified,"travelled india eye surgery , flight back lond..."


In [10]:
# Collapse the raw status into two labels.  The raw values keep the whitespace
# that preceded the '|' separator (e.g. 'Not Verified '), so strip before
# comparing instead of matching on the exact trailing space.  As before, any
# value other than 'Not Verified' is labelled 'Verified'.
# assign() returns a new frame, so no column is written into a slice of `data`.
subset = subset.assign(
    verified_flight_status=subset['ver_status'].apply(
        lambda x : 'Not Verified' if str(x).strip() == 'Not Verified' else 'Verified'
    )
)

In [11]:
subset.head()

Unnamed: 0,review_headers,reviews,time,year,ver_status,reviews_,verified_flight_status
0,It was a nightmare,Not Verified | They changed our Flights from ...,18th April 2023,2023,Not Verified,changed flight brussels london heathrow lax 4/...,Not Verified
1,Abysmal service,Not Verified | At Copenhagen the most chaotic...,18th April 2023,2023,Not Verified,copenhagen chaotic ticket counter assignment h...,Not Verified
2,trained to give you the runaround,✅ Trip Verified | Worst experience of my life...,17th April 2023,2023,✅ Trip Verified,worst experience life trying deal customer ser...,Verified
3,they only had one choice of meal,✅ Trip Verified | Due to code sharing with Ca...,17th April 2023,2023,✅ Trip Verified,due code sharing cathay pacific downgraded ba ...,Verified
4,relentless BA cost cutting,✅ Trip Verified | LHR check in was quick at t...,16th April 2023,2023,✅ Trip Verified,lhr check quick first wing quickly security . ...,Verified


In [12]:
subset['ver_status'].unique()

array(['Not Verified ', '✅ Trip Verified '], dtype=object)

In [15]:
# Shared analyzer, built on first use and reused by later calls so that it is
# not reconstructed for every single review.
_VADER_ANALYZER = None


def sentiment_analyzer(preprocessed_text, analyzer=None):

    '''Generate polarity scores for the preprocessed text and
    output its sentiment based on the compound value.

    Parameters
    ----------
    preprocessed_text : str
        Text to score.
    analyzer : optional
        Object exposing ``polarity_scores(text)`` that returns a mapping with
        a 'compound' entry.  When omitted, a shared
        SentimentIntensityAnalyzer is created once and reused.

    Returns
    -------
    str
        'Positive review!' when the compound score is greater than 0,
        otherwise 'Negative review!'.  A compound score of exactly 0 is
        therefore labelled negative.
    '''

    global _VADER_ANALYZER
    if analyzer is None:
        if _VADER_ANALYZER is None:
            _VADER_ANALYZER = SentimentIntensityAnalyzer()
        analyzer = _VADER_ANALYZER

    compound = analyzer.polarity_scores(preprocessed_text)['compound']
    # Plain if/else so that every input produces a label (no implicit None).
    if compound > 0:
        return 'Positive review!'
    return 'Negative review!'
    

In [16]:
# Label every preprocessed review.  The function is passed directly (no lambda
# wrapper needed), and assign() returns a new frame rather than writing a
# column into what may still be a slice of `data`.
subset = subset.assign(result=subset['reviews_'].apply(sentiment_analyzer))

In [17]:
subset.head()

Unnamed: 0,review_headers,reviews,time,year,ver_status,reviews_,verified_flight_status,result
0,It was a nightmare,Not Verified | They changed our Flights from ...,18th April 2023,2023,Not Verified,changed flight brussels london heathrow lax 4/...,Not Verified,Negative review!
1,Abysmal service,Not Verified | At Copenhagen the most chaotic...,18th April 2023,2023,Not Verified,copenhagen chaotic ticket counter assignment h...,Not Verified,Negative review!
2,trained to give you the runaround,✅ Trip Verified | Worst experience of my life...,17th April 2023,2023,✅ Trip Verified,worst experience life trying deal customer ser...,Verified,Negative review!
3,they only had one choice of meal,✅ Trip Verified | Due to code sharing with Ca...,17th April 2023,2023,✅ Trip Verified,due code sharing cathay pacific downgraded ba ...,Verified,Negative review!
4,relentless BA cost cutting,✅ Trip Verified | LHR check in was quick at t...,16th April 2023,2023,✅ Trip Verified,lhr check quick first wing quickly security . ...,Verified,Positive review!


In [19]:
# outputs: write the year, sentiment label and verification label of every
# review in the subset to CSV, without the index column
subset.loc[:, ['year', 'result', 'verified_flight_status']].to_csv(
    'data/sentiment_analysis_result.csv',
    index=False,
)

In [20]:
subset[['year','result','verified_flight_status']]

Unnamed: 0,year,result,verified_flight_status
0,2023,Negative review!,Not Verified
1,2023,Negative review!,Not Verified
2,2023,Negative review!,Verified
3,2023,Negative review!,Verified
4,2023,Positive review!,Verified
...,...,...,...
379,2021,Negative review!,Verified
380,2021,Negative review!,Verified
381,2021,Negative review!,Not Verified
382,2021,Positive review!,Verified
