In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

In [2]:
cwd = os.getcwd()
df = pd.read_csv(cwd +"/BA_reviews.csv", index_col = 0)

In [3]:
df.head()

Unnamed: 0,reviews,stars,date,country
0,✅ Trip Verified | Horrible service from boar...,\n\t\t\t\t\t\t\t\t\t\t\t\t5,22nd September 2024,Morocco
1,Not Verified | My wife and I are very disappo...,1,13th September 2024,United States
2,Not Verified | We flew BA between Heathrow an...,1,13th September 2024,Australia
3,Not Verified | Absolutely disgusted with BA. ...,8,13th September 2024,United Kingdom
4,Not Verified | Took a trip to Nashville with m...,1,11th September 2024,United Kingdom


In [4]:
df['verified'] = df.reviews.str.contains("Verified Trip")

In [5]:
df['verified']

0       False
1       False
2       False
3       False
4       False
        ...  
3495    False
3496    False
3497    False
3498    False
3499    False
Name: verified, Length: 3500, dtype: bool

In [15]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aayushamrute/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aayushamrute/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/aayushamrute/nltk_data...


True

In [16]:
# we will use nltk library for lemmatization of words

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemma = WordNetLemmatizer()

reviews_data = df.reviews.str.strip("✅ Verified Trip")

corpus = []

for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]',' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)

In [17]:
df['corpus'] = corpus

In [18]:
df.head()

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,✅ Trip Verified | Horrible service from boar...,\n\t\t\t\t\t\t\t\t\t\t\t\t5,22nd September 2024,Morocco,False,horrible service boarding landing flew london ...
1,Not Verified | My wife and I are very disappo...,1,13th September 2024,United States,False,verified wife disappointed flying british airw...
2,Not Verified | We flew BA between Heathrow an...,1,13th September 2024,Australia,False,verified flew ba heathrow berlin one way conne...
3,Not Verified | Absolutely disgusted with BA. ...,8,13th September 2024,United Kingdom,False,verified absolutely disgusted ba flight cancel...
4,Not Verified | Took a trip to Nashville with m...,1,11th September 2024,United Kingdom,False,verified took trip nashville wife leisure brea...


In [20]:
df.dtypes

reviews     object
stars       object
date        object
country     object
verified      bool
corpus      object
dtype: object

In [21]:
# conver date to datetime format
df.date = pd.to_datetime(df.date)

In [22]:
df.date.head()

0   2024-09-22
1   2024-09-13
2   2024-09-13
3   2024-09-13
4   2024-09-11
Name: date, dtype: datetime64[ns]

In [23]:
#checking whether there are any unique values or not

df.stars = df.stars.str.strip("\n\t\t\t\t\t\t\t\t\t\t\t\t\t")

In [24]:
df.stars.value_counts()

1       870
2       406
3       399
8       340
10      284
7       273
9       270
5       246
4       236
6       173
None      3
Name: stars, dtype: int64

In [26]:
# we see there are 3 rows having none values in the ratings. We will drop them

df.drop(df[df.stars == "None"].index, axis = 0, inplace=True)

In [27]:
df.stars.unique()

array(['5', '1', '8', '4', '2', '9', '10', '3', '6', '7'], dtype=object)

In [29]:
# now we will check if there are any null values or not

df.isnull().value_counts()

reviews  stars  date   country  verified  corpus
False    False  False  False    False     False     3496
                       True     False     False        1
dtype: int64

In [30]:
df.country.isnull().value_counts()

False    3496
True        1
Name: country, dtype: int64

In [31]:
df.drop(df[df.country.isnull() == True].index, axis=0, inplace=True)

In [32]:
df.shape

(3496, 6)

In [33]:
df.reset_index(drop = True)

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,✅ Trip Verified | Horrible service from boar...,5,2024-09-22,Morocco,False,horrible service boarding landing flew london ...
1,Not Verified | My wife and I are very disappo...,1,2024-09-13,United States,False,verified wife disappointed flying british airw...
2,Not Verified | We flew BA between Heathrow an...,1,2024-09-13,Australia,False,verified flew ba heathrow berlin one way conne...
3,Not Verified | Absolutely disgusted with BA. ...,8,2024-09-13,United Kingdom,False,verified absolutely disgusted ba flight cancel...
4,Not Verified | Took a trip to Nashville with m...,1,2024-09-11,United Kingdom,False,verified took trip nashville wife leisure brea...
...,...,...,...,...,...,...
3491,Traveled with British Airways ORD-LHR last mon...,6,2014-10-28,United States,False,aveled british airway ord lhr last month world...
3492,Gatwick to Marrakech and back. Bag drop was fa...,9,2014-10-28,United Kingdom,False,gatwick marrakech back bag drop fast premium s...
3493,Flight BA1445 Edinburgh-London code share onto...,6,2014-10-28,Australia,False,flight ba edinburgh london code share onto lis...
3494,LHR-GRU B744 First time with BA my experience ...,9,2014-10-28,Denmark,False,lhr gru b first time ba experience great seat ...


In [34]:
df.to_csv(cwd + "/Cleaned_BA_reviews.csv")