## Data Cleaning

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re



In [2]:
#creating pandas dataframe from csv file
cwd = os.getcwd()

df = pd.read_csv(r"C:\Users\bhush\OneDrive\Desktop\BA_Data.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,reviews,stars,date,country
0,0,Not Verified | Top Ten REASONS to not use Brit...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,23rd May 2023,United States
1,1,Not Verified | Easy check in on the way to He...,1,23rd May 2023,Spain
2,2,✅ Trip Verified | Online check in worked fine...,10,23rd May 2023,Chile
3,3,✅ Trip Verified |. The BA first lounge at Term...,10,22nd May 2023,United Kingdom
4,4,Not Verified | Paid a quick visit to Nice yest...,2,22nd May 2023,United Kingdom


In [4]:
df['verified']=df.reviews.str.contains("✅ Trip Verified")

In [5]:
df['verified'].head(10)

0    False
1    False
2     True
3     True
4    False
5     True
6     True
7     True
8     True
9    False
Name: verified, dtype: bool

## Cleaning Reviews

In [6]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemma  = WordNetLemmatizer()

reviews_data = df.reviews.str.strip("✅ Trip Verified")

# Create an empty list to collect cleaned data corpus

corpus = []

for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]',' ',rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set (stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.floa

In [7]:
df['corpus'] = corpus

In [8]:
df['corpus'].head()

0    verified top ten reason use british airway awa...
1    verified easy check way heathrow flight time i...
2    online check worked fine quick security check ...
3    ba first lounge terminal zoo pm dirty table us...
4    verified paid quick visit nice yesterday heath...
Name: corpus, dtype: object

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,reviews,stars,date,country,verified,corpus
0,0,Not Verified | Top Ten REASONS to not use Brit...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,23rd May 2023,United States,False,verified top ten reason use british airway awa...
1,1,Not Verified | Easy check in on the way to He...,1,23rd May 2023,Spain,False,verified easy check way heathrow flight time i...
2,2,✅ Trip Verified | Online check in worked fine...,10,23rd May 2023,Chile,True,online check worked fine quick security check ...
3,3,✅ Trip Verified |. The BA first lounge at Term...,10,22nd May 2023,United Kingdom,True,ba first lounge terminal zoo pm dirty table us...
4,4,Not Verified | Paid a quick visit to Nice yest...,2,22nd May 2023,United Kingdom,False,verified paid quick visit nice yesterday heath...


## Cleaning Date Format

In [10]:
df.dtypes

Unnamed: 0     int64
reviews       object
stars         object
date          object
country       object
verified        bool
corpus        object
dtype: object

In [11]:
df.drop('Unnamed: 0',axis=1,inplace=True)

In [12]:
df.head()

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | Top Ten REASONS to not use Brit...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,23rd May 2023,United States,False,verified top ten reason use british airway awa...
1,Not Verified | Easy check in on the way to He...,1,23rd May 2023,Spain,False,verified easy check way heathrow flight time i...
2,✅ Trip Verified | Online check in worked fine...,10,23rd May 2023,Chile,True,online check worked fine quick security check ...
3,✅ Trip Verified |. The BA first lounge at Term...,10,22nd May 2023,United Kingdom,True,ba first lounge terminal zoo pm dirty table us...
4,Not Verified | Paid a quick visit to Nice yest...,2,22nd May 2023,United Kingdom,False,verified paid quick visit nice yesterday heath...


## Cleaning Date Format

In [13]:
df['date'] = pd.to_datetime(df['date'])

In [14]:
df['date'].head()

0   2023-05-23
1   2023-05-23
2   2023-05-23
3   2023-05-22
4   2023-05-22
Name: date, dtype: datetime64[ns]

## Cleaning Ratings 

In [15]:
df['stars'].unique()

array(['\n\t\t\t\t\t\t\t\t\t\t\t\t\t5', '1', '10', '2', '4', '3', '5',
       '9', '7', '8', '6', 'None'], dtype=object)

In [16]:
df['stars'] = df['stars'].str.strip("\n\t\t\t\t\t\t\t\t\t\t\t\t\t5")

In [17]:
df['stars'].head()

0      
1     1
2    10
3    10
4     2
Name: stars, dtype: object

In [18]:
df['stars'].value_counts()

1       786
2       396
3       389
8       349
10      308
9       299
7       298
        256
4       235
6       179
None      5
Name: stars, dtype: int64

In [19]:
df['stars'].unique()

array(['', '1', '10', '2', '4', '3', '9', '7', '8', '6', 'None'],
      dtype=object)

In [20]:
# dropping the rows with stars value as "None" as there are only 5 rows as such
df.drop(df[df.stars == "None"].index,axis = 0, inplace=True)

In [21]:
df.isnull().sum()

reviews     0
stars       0
date        0
country     2
verified    0
corpus      0
dtype: int64

In [22]:
#Dropping those 2 column having country values as NaN
df.dropna(axis=0,inplace=True)

In [23]:
df.isnull().sum()

reviews     0
stars       0
date        0
country     0
verified    0
corpus      0
dtype: int64

In [24]:
df.reset_index(drop=True)

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | Top Ten REASONS to not use Brit...,,2023-05-23,United States,False,verified top ten reason use british airway awa...
1,Not Verified | Easy check in on the way to He...,1,2023-05-23,Spain,False,verified easy check way heathrow flight time i...
2,✅ Trip Verified | Online check in worked fine...,10,2023-05-23,Chile,True,online check worked fine quick security check ...
3,✅ Trip Verified |. The BA first lounge at Term...,10,2023-05-22,United Kingdom,True,ba first lounge terminal zoo pm dirty table us...
4,Not Verified | Paid a quick visit to Nice yest...,2,2023-05-22,United Kingdom,False,verified paid quick visit nice yesterday heath...
...,...,...,...,...,...,...
3488,BA Heathrow to Madrid Club Euro. Disappointing...,,2014-05-26,United Kingdom,False,ba heathrow madrid club euro disappointing ser...
3489,Club Europe London Heathrow T3 to Lisbon. T3 l...,10,2014-05-26,United Kingdom,False,club europe london heathrow lisbon lounge good...
3490,TXL-LHR-SFO. TXL-LHR on an A321 it was an unev...,10,2014-05-25,India,False,xl lhr sfo txl lhr uneventful flight drink ser...
3491,San Diego return business class (the seat was ...,,2014-05-24,United Kingdom,False,san diego return business class seat comfortab...


In [25]:
## Our Dataset is really large so we can remove those rows whose stars are not given
df.drop(df[df['stars']==''].index,axis=0,inplace=True)

In [26]:
df

Unnamed: 0,reviews,stars,date,country,verified,corpus
1,Not Verified | Easy check in on the way to He...,1,2023-05-23,Spain,False,verified easy check way heathrow flight time i...
2,✅ Trip Verified | Online check in worked fine...,10,2023-05-23,Chile,True,online check worked fine quick security check ...
3,✅ Trip Verified |. The BA first lounge at Term...,10,2023-05-22,United Kingdom,True,ba first lounge terminal zoo pm dirty table us...
4,Not Verified | Paid a quick visit to Nice yest...,2,2023-05-22,United Kingdom,False,verified paid quick visit nice yesterday heath...
5,✅ Trip Verified | Words fail to describe this...,4,2023-05-19,United States,True,word fail describe last awful flight baby acro...
...,...,...,...,...,...,...
3493,Heathrow - Johannesburg A380. My first flight ...,2,2014-05-27,Bulgaria,False,heathrow johannesburg first flight half hour d...
3494,Heathrow - Johannesburg. Concorde lounge comfo...,1,2014-05-26,United Kingdom,False,heathrow johannesburg concorde lounge comforta...
3496,Club Europe London Heathrow T3 to Lisbon. T3 l...,10,2014-05-26,United Kingdom,False,club europe london heathrow lisbon lounge good...
3497,TXL-LHR-SFO. TXL-LHR on an A321 it was an unev...,10,2014-05-25,India,False,xl lhr sfo txl lhr uneventful flight drink ser...


## Extracting cleaned data to new csv file

In [27]:
df.to_csv(r"C:\Users\bhush\OneDrive\Desktop\Cleaned_Data.csv")