In [1]:
# let's start by importing the necessary libraries

import numpy as np
import pandas as pd
import textblob

# Read the airline reviews CSV into a DataFrame
reviewsData = pd.read_csv(filepath_or_buffer='airlines_reviews.csv')

# Preview the first five rows
reviewsData.head(n=5)

Unnamed: 0,Title,Name,Review Date,Airline,Verified,Reviews,Type of Traveller,Month Flown,Route,Class,Seat Comfort,Staff Service,Food & Beverages,Inflight Entertainment,Value For Money,Overall Rating,Recommended
0,Flight was amazing,Alison Soetantyo,2024-03-01,Singapore Airlines,True,Flight was amazing. The crew onboard this fl...,Solo Leisure,December 2023,Jakarta to Singapore,Business Class,4,4,4,4,4,9,yes
1,seats on this aircraft are dreadful,Robert Watson,2024-02-21,Singapore Airlines,True,Booking an emergency exit seat still meant h...,Solo Leisure,February 2024,Phuket to Singapore,Economy Class,5,3,4,4,1,3,no
2,Food was plentiful and tasty,S Han,2024-02-20,Singapore Airlines,True,Excellent performance on all fronts. I would...,Family Leisure,February 2024,Siem Reap to Singapore,Economy Class,1,5,2,1,5,10,yes
3,“how much food was available,D Laynes,2024-02-19,Singapore Airlines,True,Pretty comfortable flight considering I was f...,Solo Leisure,February 2024,Singapore to London Heathrow,Economy Class,5,5,5,5,5,10,yes
4,“service was consistently good”,A Othman,2024-02-19,Singapore Airlines,True,The service was consistently good from start ...,Family Leisure,February 2024,Singapore to Phnom Penh,Economy Class,5,5,5,5,5,10,yes


In [2]:
# Count the missing entries in each column
reviewsData.isna().sum()

Title                     0
Name                      0
Review Date               0
Airline                   0
Verified                  0
Reviews                   0
Type of Traveller         0
Month Flown               0
Route                     0
Class                     0
Seat Comfort              0
Staff Service             0
Food & Beverages          0
Inflight Entertainment    0
Value For Money           0
Overall Rating            0
Recommended               0
dtype: int64

In [3]:
# Discard every row that contains at least one missing value
reviewsData = reviewsData.dropna(axis=0, how='any')

In [4]:
# Count the rows that exactly repeat an earlier row
reviewsData.duplicated(keep='first').sum()

0

In [5]:
# Columns to discard from the frame
columns_to_drop = [
    'Name',
    'Airline',
    'Verified',
    'Type of Traveller',
    'Month Flown',
    'Route',
    'Class',
    'Seat Comfort',
    'Staff Service',
    'Food & Beverages',
    'Inflight Entertainment',
    'Value For Money',
    'Overall Rating',
    'Recommended',
]

# Keep only the names that are actually present, so the drop cannot raise
# a KeyError (for example when this cell is run a second time)
existing_columns_to_drop = list(
    filter(lambda name: name in reviewsData.columns, columns_to_drop)
)

reviewsData = reviewsData.drop(columns=existing_columns_to_drop)

In [6]:
# Preview the first five rows of the frame
reviewsData.head(n=5)

Unnamed: 0,Title,Review Date,Reviews
0,Flight was amazing,2024-03-01,Flight was amazing. The crew onboard this fl...
1,seats on this aircraft are dreadful,2024-02-21,Booking an emergency exit seat still meant h...
2,Food was plentiful and tasty,2024-02-20,Excellent performance on all fronts. I would...
3,“how much food was available,2024-02-19,Pretty comfortable flight considering I was f...
4,“service was consistently good”,2024-02-19,The service was consistently good from start ...


In [7]:
# let's clean the review column by removing common words and punctuation

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re

# Download the NLTK resources in the same order as before
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

# Shared helpers read by clean_review: a lemmatizer instance and the
# English stop words held in a set for fast membership tests
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dhiya\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dhiya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dhiya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
def clean_review(review):
    """Normalize one review string for sentiment analysis.

    Steps, in order:
      1. lower-case the text;
      2. remove square-bracketed spans, URLs and HTML-like tags;
      3. strip every character in ``string.punctuation``;
      4. drop any token that contains a digit;
      5. split on whitespace and discard tokens found in the module-level
         ``stop_words`` set;
      6. lemmatize the remaining tokens with the module-level ``lemmatizer``.

    Args:
        review: The raw review text (a ``str``).

    Returns:
        The cleaned tokens joined by single spaces, with no leading,
        trailing or doubled spaces.
    """
    review = review.lower()

    # These patterns rely on punctuation ('[', ']', '://', '.', '<', '>'),
    # so they must run BEFORE punctuation is stripped; otherwise they can
    # never match. Replace with a space so neighbouring words do not fuse.
    review = re.sub(r'\[.*?\]', ' ', review)
    review = re.sub(r'https?://\S+|www\.\S+', ' ', review)
    review = re.sub(r'<.*?>+', ' ', review)

    # A single pass removes all ASCII punctuation
    review = review.translate(str.maketrans('', '', string.punctuation))

    # Remove tokens that contain at least one digit (e.g. '12a', '2024')
    review = re.sub(r'\w*\d\w*', '', review)

    # split() with no argument treats any whitespace run (including
    # newlines) as one separator, so line breaks do not glue words together
    # and no empty tokens are produced
    words = [word for word in review.split() if word not in stop_words]
    return ' '.join(lemmatizer.lemmatize(word) for word in words)

In [10]:
# Replace every raw review with its cleaned text
reviewsData['Reviews'] = reviewsData['Reviews'].map(clean_review)

In [11]:
# Preview the first five rows, now with cleaned review text
reviewsData.head(n=5)

Unnamed: 0,Title,Review Date,Reviews
0,Flight was amazing,2024-03-01,flight amazing crew onboard flight welcoming...
1,seats on this aircraft are dreadful,2024-02-21,booking emergency exit seat still meant huge...
2,Food was plentiful and tasty,2024-02-20,excellent performance front would definitely...
3,“how much food was available,2024-02-19,pretty comfortable flight considering flying ...
4,“service was consistently good”,2024-02-19,service consistently good start finish cabin ...


In [12]:
# let's calculate the sentiment of each review using TextBlob

def get_sentiment(review):
    """Label a review by the sign of its TextBlob polarity score.

    Args:
        review: Text handed to ``textblob.TextBlob``.

    Returns:
        'Positive' when the polarity is greater than zero, 'Neutral' when
        it equals zero, and 'Negative' otherwise.
    """
    polarity = textblob.TextBlob(review).sentiment.polarity
    if polarity == 0:
        return 'Neutral'
    return 'Positive' if polarity > 0 else 'Negative'

In [14]:
# Store each review's sentiment label in a new column
reviewsData['GeneratedSentiment'] = reviewsData['Reviews'].map(get_sentiment)


In [15]:
# Preview the first five rows, including the GeneratedSentiment column
reviewsData.head(n=5)

Unnamed: 0,Title,Review Date,Reviews,GeneratedSentiment
0,Flight was amazing,2024-03-01,flight amazing crew onboard flight welcoming...,Positive
1,seats on this aircraft are dreadful,2024-02-21,booking emergency exit seat still meant huge...,Negative
2,Food was plentiful and tasty,2024-02-20,excellent performance front would definitely...,Positive
3,“how much food was available,2024-02-19,pretty comfortable flight considering flying ...,Positive
4,“service was consistently good”,2024-02-19,service consistently good start finish cabin ...,Positive


In [16]:
# Save the processed frame to a new CSV file, leaving out the row index
reviewsData.to_csv(path_or_buf='cleaned_airlines_reviews.csv', index=False)