Data Cleaning:

Extraction of data from the website is done, it is not cleaned and ready to be analyzed yet. Need to clean for punctuations, spellings and other characters.

In [1]:
#imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

#regex
import re

In [2]:
#create a dataframe from csv file

cwd = os.getcwd()

df = pd.read_csv(cwd+"/BA_reviews.csv", index_col=0)

In [3]:
df.head()

Unnamed: 0,reviews,stars,date,country
0,✅ Trip Verified | A simple story with an unfor...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,21st July 2023,Germany
1,✅ Trip Verified | Flight was delayed due to t...,1,21st July 2023,United Kingdom
2,Not Verified | Fast and friendly check in (to...,4,20th July 2023,United Kingdom
3,✅ Trip Verified | I don't understand why Brit...,8,20th July 2023,United Kingdom
4,Not Verified | I'm sure that BA have graduall...,2,20th July 2023,United Kingdom


In [4]:
df['verified'] = df.reviews.str.contains("Trip Verified")

In [5]:
df['verified']

0        True
1        True
2       False
3        True
4       False
        ...  
3495    False
3496    False
3497    False
3498    False
3499    False
Name: verified, Length: 3500, dtype: bool

In [8]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

lemma = WordNetLemmatizer()

reviews_data = df.reviews.str.strip("✅ Trip Verified |")

#create an empty list to collect cleaned data corpus
corpus = []

#loop through each review, remove punctuations, small case it, join it, and add it to corpus
for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]', ' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)


In [9]:
df['corpus'] = corpus

In [10]:
df.head()

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,✅ Trip Verified | A simple story with an unfor...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,21st July 2023,Germany,True,simple story unfortunate outcome really could ...
1,✅ Trip Verified | Flight was delayed due to t...,1,21st July 2023,United Kingdom,True,flight delayed due inbound flight arriving lat...
2,Not Verified | Fast and friendly check in (to...,4,20th July 2023,United Kingdom,False,verified fast friendly check total contrast ga...
3,✅ Trip Verified | I don't understand why Brit...,8,20th July 2023,United Kingdom,True,understand british airway classified star airl...
4,Not Verified | I'm sure that BA have graduall...,2,20th July 2023,United Kingdom,False,verified sure ba gradually made economy experi...


In [11]:
df.dtypes

reviews     object
stars       object
date        object
country     object
verified      bool
corpus      object
dtype: object

In [12]:
# convert the date to datetime format

df.date = pd.to_datetime(df.date)

In [13]:
df.date.head()

0   2023-07-21
1   2023-07-21
2   2023-07-20
3   2023-07-20
4   2023-07-20
Name: date, dtype: datetime64[ns]

Cleaning ratings with stars

In [14]:
#check for unique values
df.stars.unique()

array(['\n\t\t\t\t\t\t\t\t\t\t\t\t\t5', '1', '4', '8', '2', '9', '3',
       '10', '7', '5', '6', 'None'], dtype=object)

In [15]:
df.stars = df.stars.str.strip("\n\t\t\t\t\t\t\t\t\t\t\t\t\t")

In [16]:
df.stars.value_counts()

1       800
2       401
3       393
8       346
10      302
9       296
7       293
5       251
4       240
6       173
None      5
Name: stars, dtype: int64

In [17]:
# drop the rows where the value of ratings is None
df.drop(df[df.stars == "None"].index, axis=0, inplace=True)

In [18]:
#check the unique values again
df.stars.unique()

array(['5', '1', '4', '8', '2', '9', '3', '10', '7', '6'], dtype=object)

Check for null Values:


In [19]:
df.isnull().value_counts()

reviews  stars  date   country  verified  corpus
False    False  False  False    False     False     3493
                       True     False     False        2
dtype: int64

In [20]:
df.country.isnull().value_counts()

False    3493
True        2
Name: country, dtype: int64

In [21]:
#drop the rows using index where the country value is null
df.drop(df[df.country.isnull() == True].index, axis=0, inplace=True)

In [22]:
df.shape

(3493, 6)

In [23]:
#resetting the index
df.reset_index(drop=True)

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,✅ Trip Verified | A simple story with an unfor...,5,2023-07-21,Germany,True,simple story unfortunate outcome really could ...
1,✅ Trip Verified | Flight was delayed due to t...,1,2023-07-21,United Kingdom,True,flight delayed due inbound flight arriving lat...
2,Not Verified | Fast and friendly check in (to...,4,2023-07-20,United Kingdom,False,verified fast friendly check total contrast ga...
3,✅ Trip Verified | I don't understand why Brit...,8,2023-07-20,United Kingdom,True,understand british airway classified star airl...
4,Not Verified | I'm sure that BA have graduall...,2,2023-07-20,United Kingdom,False,verified sure ba gradually made economy experi...
...,...,...,...,...,...,...
3488,I flew LHR-YVR outwards 20th May 2014 on 747. ...,9,2014-06-16,United Kingdom,False,flew lhr yvr outwards th may cabin newly fitte...
3489,LGW - Paphos 4th June. Check in and fast track...,7,2014-06-16,United Kingdom,False,lgw paphos th june check fast track security e...
3490,Business class from Shanghai to London. It was...,5,2014-06-16,United Kingdom,False,business class shanghai london worst business ...
3491,Paphos - LGW 11th June. We had monitored the o...,4,2014-06-16,United Kingdom,False,paphos lgw th june monitored outbound flight i...


In [24]:
# export the cleaned data

df.to_csv(cwd + "/cleaned-BA-reviews.csv")