In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

In [7]:
## Reading the dataset

In [2]:
df = pd.read_csv("BA_Scraped Data.csv")
df.head()

Unnamed: 0,Reviews,Rating,Date,Country
0,✅ Trip Verified | As always when I fly BA it ...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,9th January 2024,Spain
1,✅ Trip Verified | First time using BA busines...,1,7th January 2024,United Kingdom
2,Not Verified | Extremely rude ground service....,9,3rd January 2024,United States
3,✅ Trip Verified | My son and I flew to Geneva...,6,2nd January 2024,China
4,✅ Trip Verified | For the price paid (bought ...,1,29th December 2023,United Kingdom


In [3]:
# Creating a column of verification

df["Verified"] = df.Reviews.str.contains("Trip Verified")

In [4]:
df.head()

Unnamed: 0,Reviews,Rating,Date,Country,Verified
0,✅ Trip Verified | As always when I fly BA it ...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,9th January 2024,Spain,True
1,✅ Trip Verified | First time using BA busines...,1,7th January 2024,United Kingdom,True
2,Not Verified | Extremely rude ground service....,9,3rd January 2024,United States,False
3,✅ Trip Verified | My son and I flew to Geneva...,6,2nd January 2024,China,True
4,✅ Trip Verified | For the price paid (bought ...,1,29th December 2023,United Kingdom,True


## Cleaning  the data


In [5]:
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


lemma = WordNetLemmatizer()

reviews_data = df.Reviews.str.strip("✅ Trip Verified |")

corpus = [] # To store the genetated corpus

for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]',' ',rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91889\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
df['Corpus'] = corpus

In [7]:
df.head()

Unnamed: 0,Reviews,Rating,Date,Country,Verified,Corpus
0,✅ Trip Verified | As always when I fly BA it ...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,9th January 2024,Spain,True,always fly ba total shamble booked manchester ...
1,✅ Trip Verified | First time using BA busines...,1,7th January 2024,United Kingdom,True,first time using ba business class pleased ser...
2,Not Verified | Extremely rude ground service....,9,3rd January 2024,United States,False,verified extremely rude ground service non rev...
3,✅ Trip Verified | My son and I flew to Geneva...,6,2nd January 2024,China,True,son flew geneva last sunday skiing holiday le ...
4,✅ Trip Verified | For the price paid (bought ...,1,29th December 2023,United Kingdom,True,price paid bought sale decent experience altho...


## Cleaning The Date

In [8]:
df.dtypes

Reviews     object
Rating      object
Date        object
Country     object
Verified      bool
Corpus      object
dtype: object

In [9]:
df.Date = pd.to_datetime(df.Date)

In [10]:
df.Date.head()

0   2024-01-09
1   2024-01-07
2   2024-01-03
3   2024-01-02
4   2023-12-29
Name: Date, dtype: datetime64[ns]

## Cleaning rating with stars

In [11]:
# Checking for unique values
df.Rating.unique()

array(['\n\t\t\t\t\t\t\t\t\t\t\t\t\t5', '1', '9', '6', '8', '2', '5', '3',
       '10', '4', '7', 'None'], dtype=object)

In [12]:
df.Rating= df.Rating.str.strip("\n\t\t\t\t\t\t\t\t\t\t\t\t\t")

In [13]:
df.Rating.value_counts()

1       868
2       421
3       401
8       364
10      324
7       309
9       307
5       267
4       247
6       187
None      5
Name: Rating, dtype: int64

In [14]:
df.drop(df[df.Rating == "None"].index, inplace = True)

In [15]:
df.Rating.unique()

array(['5', '1', '9', '6', '8', '2', '3', '10', '4', '7'], dtype=object)

## Check for NULL values

In [16]:
df.isnull().value_counts()

Reviews  Rating  Date   Country  Verified  Corpus
False    False   False  False    False     False     3693
                        True     False     False        2
dtype: int64

In [17]:
df.drop(df[df.Country.isnull() == True].index, inplace = True)

In [18]:
df.isnull().value_counts()

Reviews  Rating  Date   Country  Verified  Corpus
False    False   False  False    False     False     3693
dtype: int64

In [19]:
df.shape

(3693, 6)

In [20]:
df.reset_index(drop = True)

Unnamed: 0,Reviews,Rating,Date,Country,Verified,Corpus
0,✅ Trip Verified | As always when I fly BA it ...,5,2024-01-09,Spain,True,always fly ba total shamble booked manchester ...
1,✅ Trip Verified | First time using BA busines...,1,2024-01-07,United Kingdom,True,first time using ba business class pleased ser...
2,Not Verified | Extremely rude ground service....,9,2024-01-03,United States,False,verified extremely rude ground service non rev...
3,✅ Trip Verified | My son and I flew to Geneva...,6,2024-01-02,China,True,son flew geneva last sunday skiing holiday le ...
4,✅ Trip Verified | For the price paid (bought ...,1,2023-12-29,United Kingdom,True,price paid bought sale decent experience altho...
...,...,...,...,...,...,...
3688,Once again a good flight from LHR to Warsaw in...,10,2014-05-11,United Kingdom,False,good flight lhr warsaw cabin crew efficient fr...
3689,LGW-MRU-LGW in business. Outbound flight good ...,6,2014-05-11,United Kingdom,False,lgw mru lgw business outbound flight good atte...
3690,LHR-FRA-LCY May 2014. LHR-FRA on a 767 row 18....,2,2014-05-11,United Kingdom,False,lhr fra lcy may lhr fra row checked via mobile...
3691,Rating : 10/10 Cabin Flown Economy Value for M...,9,2014-05-11,United Kingdom,False,rating cabin flown economy value money seat co...


In [21]:
df.to_csv("BA_Cleaned Data.csv")