# Cleaning Dataset

Final Notebook for data cleaning.

# Import Packages

In [20]:
import pandas as pd
from pandas import util
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import *
import os
import glob
import pickle
from emot.emo_unicode import UNICODE_EMOJI # For emojis
from emot.emo_unicode import EMOTICONS_EMO # For EMOTICONS

# Load Data

In [None]:
# Combine the per-venue review exports into one CSV file.

# Source folder and the CSV files inside it. glob returns matches in an
# arbitrary, OS-dependent order, so sort them to make the row order of the
# combined file reproducible between runs and machines.
path = "./GoogleReviews"
all_files = sorted(glob.glob(path + "/*.csv"))
if not all_files:
    # Without this guard pd.concat fails with "No objects to concatenate",
    # which does not say that the input folder is the problem.
    raise FileNotFoundError(f"No CSV files found in {path}")

# Stack all files into one frame. ignore_index=True gives a single continuous
# 0..n-1 index instead of repeating every file's own row numbers.
google_reviews = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

# Save the combined reviews. index=False keeps the row index out of the file,
# so reading it back does not produce a spurious "Unnamed: 0" column.
os.makedirs("./Data", exist_ok=True)
google_reviews.to_csv("./Data/total_google_reviews.csv", index=False)

In [35]:
# Load the combined reviews file from disk into a DataFrame.
# NOTE(review): the cell that writes this file is shown unexecuted (In [None]),
# so this read relies on the file already being present — confirm before a
# fresh Restart & Run All.
reviews = pd.read_csv("./Data/total_google_reviews.csv")

# Preview the first rows
reviews.head()

Unnamed: 0,Venue Index,Name,Review Rate,Review Time,Review Text
0,0,Ellis,5 stars,3 years ago,"It was a bit quite when we went in, but don’t ..."
1,1,Ellis,5 stars,2 years ago,Nice cozy place which serves very tasty burger...
2,2,Ellis,5 stars,3 years ago,Really nice place. One of my favourite burger ...
3,3,Ellis,2 stars,3 years ago,The Service was quite good but the burgers we ...
4,4,Ellis,5 stars,2 years ago,I had a very nice experience! The staff were r...


# Alter DataFrame Column Names

In [36]:
# Shorten the scraped column names and add the two (initially zero) target
# columns, Aspect and Accessibility, in a single chained expression.
reviews = (
    reviews
    .rename(columns={"Review Rate": "Rating", "Review Time": "Date", "Review Text": "Text"})
    .assign(Aspect=0, Accessibility=0)
)

# Preview
reviews.head()

Unnamed: 0,Venue Index,Name,Rating,Date,Text,Aspect,Accessibility
0,0,Ellis,5 stars,3 years ago,"It was a bit quite when we went in, but don’t ...",0,0
1,1,Ellis,5 stars,2 years ago,Nice cozy place which serves very tasty burger...,0,0
2,2,Ellis,5 stars,3 years ago,Really nice place. One of my favourite burger ...,0,0
3,3,Ellis,2 stars,3 years ago,The Service was quite good but the burgers we ...,0,0
4,4,Ellis,5 stars,2 years ago,I had a very nice experience! The staff were r...,0,0


In [23]:
# Inspect the dtype pandas inferred for every column; per the recorded output
# below, Rating and Date are still stored as text (object) at this point.
reviews.dtypes

Venue Index       int64
Name             object
Rating           object
Date             object
Text             object
Aspect            int64
Accessibility     int64
dtype: object

# Cleaning Data

In [37]:
# Removing empty reviews
def remove_nan(data, column_name):
    '''Return the rows of data whose value in column_name is not missing.

    data is left unmodified; the surviving rows keep their original index
    labels.
    '''

    return data.dropna(subset=[column_name])

# Removing original languages
def clean_translation(data):
    '''Return the part of a review that precedes the "(Original)" marker.

    Translated reviews carry the English text first, followed by "(Original)"
    and the text in its source language; only the leading part is kept.
    Reviews without the marker are returned unchanged.
    '''

    marker = "(Original)"
    if marker not in data:
        return data

    # partition splits at the first occurrence; element 0 is the text before it
    return data.partition(marker)[0]

# Cleaning text
def clean_string(s):
    '''Normalise a raw review field for text processing.

    Steps, in order:
      1. convert to str and lowercase;
      2. turn newlines into spaces (so words on adjacent lines stay separate);
      3. drop the "(translated by google)" banner;
      4. expand "n't" contractions to " not" for both the typographic (’)
         and the plain (') apostrophe;
      5. remove URLs (any run of non-whitespace starting with "http"),
         including one at the very end of the text;
      6. replace remaining plain apostrophes with a space;
      7. collapse runs of spaces into a single space.

    Parameters
    ----------
    s : any
        Value to clean; it is passed through str() first, so non-string
        values (e.g. numbers) are accepted.

    Returns
    -------
    str
        The cleaned text. Leading/trailing spaces are not stripped.
    '''

    s = str(s).lower()
    s = s.replace("\n", " ")
    s = s.replace("(translated by google)", '')

    s = re.sub(r"n[’']t", " not", s)     # must run before apostrophes are removed
    s = re.sub(r"http\S*", "", s)        # no lookahead: also matches a URL at end of text
    s = s.replace("'", " ")

    # Collapse spaces last, because the steps above can leave double spaces behind.
    s = re.sub(" +", " ", s)

    return s

In [None]:
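# NOTE(review): disabled emoji-stripping snippet, kept for reference only.
# clean_string as defined in this notebook does not remove emojis.
#    emoji_pattern = re.compile("["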
#                           u"\U0001F600-\U0001F64F" # emoticons
#                           u"\U0001F300-\U0001F5FF" # symbols & pictographs
#                           u"\U0001F680-\U0001F6FF" # transport & map symbols
#                           u"\U0001F1E0-\U0001F1FF" # flags (iOS)
#                           u"\U00002702-\U000027B0"
#                           u"\U000024C2-\U0001F251"
#                           "]+", flags=re.UNICODE)
#    s = emoji_pattern.sub(r'', s)
#    s = str(s)

In [6]:
# Quick check of clean_string on a sample sentence that contains emojis.
# NOTE(review): the recorded output below has the emojis stripped, which the
# clean_string shown in this notebook does not do — the output looks stale
# (execution count 6); re-run this cell to refresh it.
clean_string("Hilarious 😂! The feeling of making a sale 😎, The feeling of actually fulfilling orders 😒")

'hilarious ! the feeling of making a sale , the feeling of actually fulfilling orders '

In [38]:
%%time
reviews = remove_nan(reviews, 'Text')
reviews["Text"] = reviews["Text"].apply(clean_translation)
reviews["Text"] = reviews["Text"].apply(clean_string)
reviews["Rating"] = reviews["Rating"].apply(clean_string)
reviews["Date"] = reviews["Date"].apply(clean_string)
reviews["Name"] = reviews["Name"].apply(clean_string)
reviews.head()

CPU times: user 11 s, sys: 174 ms, total: 11.2 s
Wall time: 11.3 s


Unnamed: 0,Venue Index,Name,Rating,Date,Text,Aspect,Accessibility
0,0,ellis,5 stars,3 years ago,"it was a bit quite when we went in, but do not...",0,0
1,1,ellis,5 stars,2 years ago,nice cozy place which serves very tasty burger...,0,0
2,2,ellis,5 stars,3 years ago,really nice place. one of my favourite burger ...,0,0
3,3,ellis,2 stars,3 years ago,the service was quite good but the burgers we ...,0,0
4,4,ellis,5 stars,2 years ago,i had a very nice experience! the staff were r...,0,0


In [39]:
# Look at three rows by position; their index labels still reflect the
# row numbers from before the empty reviews were removed.
reviews.iloc[403840:403843]

Unnamed: 0,Venue Index,Name,Rating,Date,Text,Aspect,Accessibility
577236,636,amstelhoeck,5 stars,4 years ago,location location location,0,0
577237,637,amstelhoeck,3 stars,4 years ago,nice moment,0,0
577238,638,amstelhoeck,5 stars,2 years ago,cozy,0,0


In [40]:
# Fix indexes: report the frame's dimensions first
print("Shape of Reviews:", reviews.shape)

# Renumber the rows 0..n-1. The previous labels are kept in a new "index"
# column, which the next cell drops.
reviews = reviews.reset_index(drop=False)

# Preview
reviews.head()

Shape of Reviews: (403843, 7)


Unnamed: 0,index,Venue Index,Name,Rating,Date,Text,Aspect,Accessibility
0,0,0,ellis,5 stars,3 years ago,"it was a bit quite when we went in, but do not...",0,0
1,1,1,ellis,5 stars,2 years ago,nice cozy place which serves very tasty burger...,0,0
2,2,2,ellis,5 stars,3 years ago,really nice place. one of my favourite burger ...,0,0
3,3,3,ellis,2 stars,3 years ago,the service was quite good but the burgers we ...,0,0
4,4,4,ellis,5 stars,2 years ago,i had a very nice experience! the staff were r...,0,0


In [41]:
# Remove the "index" column that holds the pre-reset row labels
reviews = reviews.drop(columns=["index"])

# Same three positions as before: the labels should now match the positions
reviews.iloc[403840:403843]

Unnamed: 0,Venue Index,Name,Rating,Date,Text,Aspect,Accessibility
403840,636,amstelhoeck,5 stars,4 years ago,location location location,0,0
403841,637,amstelhoeck,3 stars,4 years ago,nice moment,0,0
403842,638,amstelhoeck,5 stars,2 years ago,cozy,0,0


# Altering Data

In [42]:
# Change rating to positive, negative or neutral label
def rating_to_sent(s):
    '''Map a star rating such as "5 stars" or "1 star" to a sentiment label.

    The words "stars"/"star" and all spaces are stripped and the remainder is
    parsed as an integer score. Scores above 3 give 'positive', scores below
    3 give 'negative', and exactly 3 gives 'neutral'. A value that does not
    reduce to an integer raises ValueError.
    '''

    digits = str(s)
    for token in ("stars", "star", " "):
        digits = digits.replace(token, '')
    score = int(digits)

    if score == 3:
        return 'neutral'
    return 'positive' if score > 3 else 'negative'

In [43]:
%%time
reviews["Rating Sent"] = reviews['Rating'].apply(rating_to_sent)
reviews.head()

CPU times: user 410 ms, sys: 7.19 ms, total: 417 ms
Wall time: 433 ms


Unnamed: 0,Venue Index,Name,Rating,Date,Text,Aspect,Accessibility,Rating Sent
0,0,ellis,5 stars,3 years ago,"it was a bit quite when we went in, but do not...",0,0,positive
1,1,ellis,5 stars,2 years ago,nice cozy place which serves very tasty burger...,0,0,positive
2,2,ellis,5 stars,3 years ago,really nice place. one of my favourite burger ...,0,0,positive
3,3,ellis,2 stars,3 years ago,the service was quite good but the burgers we ...,0,0,negative
4,4,ellis,5 stars,2 years ago,i had a very nice experience! the staff were r...,0,0,positive


In [44]:
# Change relative Date to absolute Date
def abs_date(s, ref_year=2022, ref_month=4):
    '''Convert a relative date such as "3 years ago" into an absolute year.

    The result is an approximation, because the relative date only carries a
    count and a unit.

    Parameters
    ----------
    s : any
        Relative date of the form "<count> <unit>[ ago]", where count is an
        integer or "a"/"an" and unit contains year, month, week, day, hour or
        minute (singular or plural). Passed through str() first.
    ref_year : int, default 2022
        Year the relative dates are counted back from.
    ref_month : int, default 4
        Reviews fewer than ref_month months old are placed in ref_year, all
        other "months ago" reviews in ref_year - 1.

    Returns
    -------
    int
        The estimated calendar year of the review.

    Raises
    ------
    ValueError
        If s does not have the "<count> <unit>" shape, the count is not an
        integer, or the unit is not recognised.
    '''

    parts = str(s).replace(" ago", "").split(' ')
    if len(parts) != 2:
        raise ValueError(f"Unrecognised relative date: {s!r}")
    num, metric = parts

    # "a year" / "an hour" mean a count of one
    count = 1 if num in ('a', 'an') else int(num)

    # Substring tests cover singular and plural units alike
    if "year" in metric:
        return ref_year - count
    if "month" in metric:
        return ref_year if count < ref_month else ref_year - 1
    if any(unit in metric for unit in ("week", "day", "hour", "minute")):
        return ref_year

    # Fail with a clear message instead of falling through to int() on the text
    raise ValueError(f"Unrecognised time unit in relative date: {s!r}")

In [45]:
%%time 
reviews["Date"] = reviews["Date"].apply(abs_date)
reviews.head()

CPU times: user 684 ms, sys: 23.8 ms, total: 708 ms
Wall time: 720 ms


Unnamed: 0,Venue Index,Name,Rating,Date,Text,Aspect,Accessibility,Rating Sent
0,0,ellis,5 stars,2019,"it was a bit quite when we went in, but do not...",0,0,positive
1,1,ellis,5 stars,2020,nice cozy place which serves very tasty burger...,0,0,positive
2,2,ellis,5 stars,2019,really nice place. one of my favourite burger ...,0,0,positive
3,3,ellis,2 stars,2019,the service was quite good but the burgers we ...,0,0,negative
4,4,ellis,5 stars,2020,i had a very nice experience! the staff were r...,0,0,positive


In [46]:
# Put the columns in their final order
column_order = ['Venue Index', 'Name', 'Date', 'Rating', 'Rating Sent', 'Text', 'Aspect', 'Accessibility']
reviews = reviews.reindex(columns=column_order)
reviews.head()

Unnamed: 0,Venue Index,Name,Date,Rating,Rating Sent,Text,Aspect,Accessibility
0,0,ellis,2019,5 stars,positive,"it was a bit quite when we went in, but do not...",0,0
1,1,ellis,2020,5 stars,positive,nice cozy place which serves very tasty burger...,0,0
2,2,ellis,2019,5 stars,positive,really nice place. one of my favourite burger ...,0,0
3,3,ellis,2019,2 stars,negative,the service was quite good but the burgers we ...,0,0
4,4,ellis,2020,5 stars,positive,i had a very nice experience! the staff were r...,0,0


# Save the cleaned data!

In [47]:
# Write the cleaned reviews to disk.
# NOTE(review): to_csv also writes the row index by default, so readers of this
# file get an extra unnamed first column — confirm whether consumers rely on it
# before switching to index=False.
reviews.to_csv("./Data/cleaned_data.csv")