### **DATA CLEANING**

In [1]:
#imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

#regex
import re

In [4]:
#create a dataframe from csv file

df=pd.read_csv('BA_reviews.csv',index_col=0)

In [5]:
df.head()

Unnamed: 0,reviews,stars,date,country
0,✅ Trip Verified | Very impressed with BA. Chec...,5.0,18th May 2024,United Kingdom
1,"✅ Trip Verified | LHR - SFO, LAS - LGW August...",9.0,14th May 2024,United Kingdom
2,Not Verified | I flew from Malaga via LHR to...,3.0,8th May 2024,Canada
3,✅ Trip Verified | Milan to Miami return via L...,2.0,8th May 2024,Italy
4,✅ Trip Verified | BA created a new low-cost s...,1.0,7th May 2024,United Kingdom


# **Future Engineering**

Create a new column verified from an exiting column reviews

In [7]:
df['verified'] = df.reviews.str.contains("Trip Verified")

In [8]:
df['verified']

0        True
1        True
2       False
3        True
4        True
        ...  
3413    False
3414    False
3415    False
3416    False
3417    False
Name: verified, Length: 3418, dtype: bool

In [9]:
df.head()

Unnamed: 0,reviews,stars,date,country,verified
0,✅ Trip Verified | Very impressed with BA. Chec...,5.0,18th May 2024,United Kingdom,True
1,"✅ Trip Verified | LHR - SFO, LAS - LGW August...",9.0,14th May 2024,United Kingdom,True
2,Not Verified | I flew from Malaga via LHR to...,3.0,8th May 2024,Canada,False
3,✅ Trip Verified | Milan to Miami return via L...,2.0,8th May 2024,Italy,True
4,✅ Trip Verified | BA created a new low-cost s...,1.0,7th May 2024,United Kingdom,True


In [14]:
#for lemmatization of words we will use nltk library
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemma = WordNetLemmatizer()

In [15]:
reviews_data = df.reviews.str.strip("✅ Trip Verified |")

In [16]:
reviews_data

0       y impressed with BA. Check in online was a lit...
1       LHR - SFO, LAS - LGW August 2023 in Club World...
2       Not Verified |   I flew from Malaga via LHR to...
3       Milan to Miami return via London. Worst busine...
4       BA created a new low-cost subsidiary in BA Eur...
                              ...                        
3413    BRU-Toronto-BRU 1st class. In BRU check in pas...
3414    BA2706 and BA2707 LGW-BCN. No real issues plea...
3415    BA 0027 ex London HR to Hong Kong which depart...
3416    We travelled economy from Manchester to Toront...
3417    Flight LGW to TFS in Business. Like another re...
Name: reviews, Length: 3418, dtype: object

In [20]:
import nltk
nltk.download('wordnet')
!python3 -m nltk.downloader stopwords
corpus =[]

#loop through each review, remove punctuations, small case it, join it and add it to corpus
for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]',' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)

[nltk_data] Downloading package wordnet to /root/nltk_data...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
df['corpus'] = corpus

In [22]:
df.head()

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,✅ Trip Verified | Very impressed with BA. Chec...,5.0,18th May 2024,United Kingdom,True,impressed ba check online little convoluted ev...
1,"✅ Trip Verified | LHR - SFO, LAS - LGW August...",9.0,14th May 2024,United Kingdom,True,lhr sfo la lgw august club world shame british...
2,Not Verified | I flew from Malaga via LHR to...,3.0,8th May 2024,Canada,False,verified flew malaga via lhr boston th may arr...
3,✅ Trip Verified | Milan to Miami return via L...,2.0,8th May 2024,Italy,True,milan miami return via london worst business c...
4,✅ Trip Verified | BA created a new low-cost s...,1.0,7th May 2024,United Kingdom,True,ba created new low cost subsidiary ba euroflye...


In [23]:
df.dtypes

reviews      object
stars       float64
date         object
country      object
verified       bool
corpus       object
dtype: object

In [29]:
df.stars.unique()

array([ 5.,  9.,  3.,  2.,  1.,  4.,  6., 10.,  7.,  8., nan])

In [31]:
df.stars.value_counts()

stars
1.0     837
3.0     394
2.0     394
8.0     331
10.0    282
7.0     273
9.0     262
5.0     241
4.0     232
6.0     169
Name: count, dtype: int64

In [32]:
df.isnull().value_counts()

reviews  stars  date   country  verified  corpus
False    False  False  False    False     False     3414
         True   False  False    False     False        3
         False  False  True     False     False        1
Name: count, dtype: int64

In [33]:
df.country.isnull().value_counts()

country
False    3417
True        1
Name: count, dtype: int64

In [35]:
#drop the rows using index where the country value is null
df.drop(df[df.country.isnull() == True].index, axis=0, inplace=True)

In [36]:
df.shape

(3417, 6)

In [37]:
df.reset_index(drop=True)

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,✅ Trip Verified | Very impressed with BA. Chec...,5.0,18th May 2024,United Kingdom,True,impressed ba check online little convoluted ev...
1,"✅ Trip Verified | LHR - SFO, LAS - LGW August...",9.0,14th May 2024,United Kingdom,True,lhr sfo la lgw august club world shame british...
2,Not Verified | I flew from Malaga via LHR to...,3.0,8th May 2024,Canada,False,verified flew malaga via lhr boston th may arr...
3,✅ Trip Verified | Milan to Miami return via L...,2.0,8th May 2024,Italy,True,milan miami return via london worst business c...
4,✅ Trip Verified | BA created a new low-cost s...,1.0,7th May 2024,United Kingdom,True,ba created new low cost subsidiary ba euroflye...
...,...,...,...,...,...,...
3412,BRU-Toronto-BRU 1st class. In BRU check in pas...,9.0,5th November 2014,Belgium,False,bru toronto bru st class bru check passport co...
3413,BA2706 and BA2707 LGW-BCN. No real issues plea...,8.0,3rd November 2014,United Kingdom,False,ba ba lgw bcn real issue pleasant crew flight ...
3414,BA 0027 ex London HR to Hong Kong which depart...,10.0,3rd November 2014,Australia,False,ba ex london hr hong kong departed hong kong l...
3415,We travelled economy from Manchester to Toront...,4.0,3rd November 2014,United Kingdom,False,travelled economy manchester toronto via heath...


In [38]:
# export the cleaned data

df.to_csv(cwd + "/cleaned-BA-reviews.csv")