## Environment Setup

In [54]:
import pandas as pd # for data pre-processing
import seaborn as sns # for visualisations
import matplotlib.pyplot as plt # for creating graphs
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, auc, balanced_accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import numpy as np
import glob
import re
## add this to the bottom of the 

In [46]:
# NLTK
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
# stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mylene/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mylene/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mylene/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [28]:
%conda list --export > requirements.txt


Note: you may need to restart the kernel to use updated packages.


## Loading datasets

In [29]:
test_file_path = "/Users/mylene/BachelorsProject/Venue-Accessibility-Google-Reviews/GoogleReviews"
training_file_path = "/Users/mylene/BachelorsProject/Venue-Accessibility-Google-Reviews/Euan's Guide Data.xlsx"

In [30]:
all_file_names = glob.glob(test_file_path + "/*.csv")

google_df = []

for file_name in all_file_names:
    df = pd.read_csv(file_name, index_col=None, header=0)
    google_df.append(df)

test_data = pd.concat(google_df, axis=0, ignore_index=True)


In [31]:
training_data = pd.read_excel(training_file_path)

In [32]:
training_data.head()

Unnamed: 0,Aspect,Rating,Review,City,Country,Venue
0,Overview,5.0,Dobbies garden center has a large range of ite...,Perth,United Kingdom,https:||www.euansguide.com|venues|dobbies-gard...
1,Transport & Parking,5.0,"There is disabled parking close to the doors, ...",Perth,United Kingdom,https:||www.euansguide.com|venues|dobbies-gard...
2,Access,5.0,There is a lift and there is also a cafe where...,Perth,United Kingdom,https:||www.euansguide.com|venues|dobbies-gard...
3,Toilets,0.0,,Perth,United Kingdom,https:||www.euansguide.com|venues|dobbies-gard...
4,Staff,3.5,There were some staff who were very helpful an...,Perth,United Kingdom,https:||www.euansguide.com|venues|dobbies-gard...


In [33]:
test_data.head()

Unnamed: 0.1,Unnamed: 0,Name,Review Rate,Review Time,Review Text,name,street,housenumber,city,postcode,lat,lon,RD_x,RD_y,tile_code,place_id
0,0.0,Ellis,5 stars,3 years ago,"It was a bit quite when we went in, but don’t ...",,,,,,,,,,,
1,1.0,Ellis,5 stars,2 years ago,Nice cozy place which serves very tasty burger...,,,,,,,,,,,
2,2.0,Ellis,5 stars,3 years ago,Really nice place. One of my favourite burger ...,,,,,,,,,,,
3,3.0,Ellis,2 stars,3 years ago,The Service was quite good but the burgers we ...,,,,,,,,,,,
4,4.0,Ellis,5 stars,2 years ago,I had a very nice experience! The staff were r...,,,,,,,,,,,


## Cleaning DataFrame

In [34]:
# move this to pre-processing script
def cleaning_test_df(df):
    
    # Rename and drop colums
    df = df.drop("Unnamed: 0", axis=1)
    df = df.drop(columns=["Review Time"])
    df = df.rename(columns={"Review Text": "Text", "Review Rate": "Sentiment"})

    # Drop NaN
    df = df[df["Text"].notna()]
    df["Text"] = df["Text"].apply(lambda x: x.replace("\n", ' '))

    # Drop reviews with a rating of 0 (rating is missing)
    df = df[df["Sentiment"]!="0 stars"]
    
    for col in df.columns:
    # Check if the column has all NaN values
        if df[col].isnull().all():
            # Drop the column
            df.drop(col, axis=1, inplace=True)

    # Tokenize into sentences: regEX
    rule = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s"
    df["Text"] = df["Text"].apply(lambda x: re.split(rule, x))
    df = df.explode("Text", ignore_index=True)
    df = df[df["Text"].notna()]
    df = df[df['Text']!='']

    df["Sentiment"] = df["Sentiment"].map(lambda x: re.sub(" stars", "", x))
    df["Sentiment"] = df["Sentiment"].map(lambda x: re.sub(" star", "", x))
    df["Sentiment"] = df["Sentiment"].map(lambda x: int(x))

    # Rating into Sentiment
    df["Sentiment"] = df["Sentiment"].map(lambda score: 'positive' if score > 3 else 'negative')
    df['Label'] = df["Sentiment"].map({'positive': 1, 'negative': 0})

    print("---> DONE CLEANING")
    return df

In [50]:
# ========= PRE-PROCESSING FOR MACHINE LEARNING MODELS =========
def text_preprocessing(s):
    """
    - Lowercase the sentence
    - Remove URLs
    - Change "'t" to "not"
    - Isolate and remove punctuations except "?"
    - Remove other special characters
    - Remove stop words except "not" and "can"
    - Remove trailing whitespace
    """
    s = s.lower()
    # Remove URLs
    s = re.sub(r"http.*?(?=\s)", "", s) 
    # Change 't to 'not'
    s = re.sub(r"\'t", " not", s)
    # Remove @name
    s = re.sub(r'(@.*?)[\s]', ' ', s)
    # Isolate and remove punctuations except '?'
    s = re.sub(r'([\'\"\.\(\)\!\?\\\/\,])', r' \1 ', s)
    s = re.sub(r'[^\w\s\?]', ' ', s)
    # Remove some special characters
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    # Remove stopwords except 'not' and 'can'
    s = " ".join([word for word in s.split()
                  if word not in stopwords.words('english')
                  or word in ['not', 'can']])
    # Remove trailing whitespace
    s = re.sub(r'\s+', ' ', s).strip()

    return s

In [48]:
def cleaning_training_df(df):
    # Rename and drop colums
    df = df.drop(columns=["City", "Country"])
    df = df.rename(columns={"Review": "Text"})

    # Drop irrelevant aspects
    df = df[df['Aspect']!='Anything else you wish to tell us?']
    df = df[df['Aspect']!='Venue Manager responded to this review']       
    df = df[df['Aspect']!='COVID Precautions']  
    df = df[df['Aspect']!='Accessibility Guide']  
    df = df[df['Aspect']!='Awards List'] 
    df = df[df['Aspect']!='Access Statement']

    # Drop NaN
    df = df[df["Text"].notna()]

    # Some reviews contain: "A description about the access has not been added for this venue."
    # These have a rating <=0.0
    # Remove no description reviews
    df = df[df["Rating"]>0.0]

    # Remove review if sentence count == 0
    df["SentenceCount"] = df["Text"].apply(lambda x: len(sent_tokenize(x)))
    df = df[df["SentenceCount"]!=0]

    # Take only the venue name
    df["Venue"] = df["Venue"].apply(lambda x: ' '.join(x.split('|')[4].split("-")[:-1]))

    # Rating into Sentiment
    df["Sentiment"] = df["Rating"].map(lambda score: 'positive' if score > 3.0 else 'negative')
    df['Label'] = df["Sentiment"].map({'positive': 1, 'negative': 0})

    print("---> DONE CLEANING")
    return df

In [37]:
cleaned_train_data = cleaning_training_df(training_data)
display(cleaned_train_data)

---> DONE CLEANING


Unnamed: 0,Aspect,Rating,Text,Venue,SentenceCount,Sentiment,Label
0,Overview,5.0,Dobbies garden center has a large range of ite...,dobbies garden centre perth,1,positive,1
1,Transport & Parking,5.0,"There is disabled parking close to the doors, ...",dobbies garden centre perth,2,positive,1
2,Access,5.0,There is a lift and there is also a cafe where...,dobbies garden centre perth,2,positive,1
4,Staff,3.5,There were some staff who were very helpful an...,dobbies garden centre perth,1,positive,1
6,Overview,4.5,"Fantastic spot with great cocktails, friendly ...",bow lane dublin,1,positive,1
...,...,...,...,...,...,...,...
56590,Toilets,3.5,"It's on the ground floor, just inside the main...",wien museum karlsplatz vienna,3,positive,1
56591,Staff,5.0,Friendly and helpful. One staff member saw us ...,wien museum karlsplatz vienna,2,positive,1
56593,Overview,5.0,Friendly and helpful staff who examined how th...,kirkcudbright swimming pool kirkcudbright,2,positive,1
56594,Transport & Parking,5.0,Spaces located by the door and clearly signed ...,kirkcudbright swimming pool kirkcudbright,1,positive,1


In [38]:
cleaned_test_data = cleaning_test_df(test_data)
display(cleaned_test_data)

---> DONE CLEANING


Unnamed: 0,Name,Sentiment,Text,Label
0,Ellis,positive,"It was a bit quite when we went in, but don’t ...",1
1,Ellis,positive,Rightfully so! The burgers (and nachos) were l...,1
2,Ellis,positive,I would definitely recommend this place if you...,1
3,Ellis,positive,Nice cozy place which serves very tasty burger...,1
4,Ellis,positive,They have a good selection of burgers and othe...,1
...,...,...,...,...
815766,Amstelhoeck,positive,(Translated by Google) In a word super (Origi...,1
815767,Amstelhoeck,negative,(Translated by Google) Recommended (Original)...,0
815768,Amstelhoeck,positive,(Translated by Google) Location location locat...,1
815769,Amstelhoeck,negative,(Translated by Google) Nice moment (Original)...,0


## Train/Test Split

In [39]:
shuffled_training_data = cleaned_train_data.sample(frac=1)
print("training data ", shuffled_training_data.shape[0])
shuffled_test_data = cleaned_test_data.sample(frac=1)
print("test data ", shuffled_test_data.shape[0])

training data  40024
test data  815771


In [40]:
# Test 20% Training %80 
shuffled_training_data = shuffled_training_data[:32019]
shuffled_test_data = shuffled_test_data[:8005]

In [43]:
X = shuffled_training_data.Text.values
y = shuffled_training_data.Label.values
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
save_to_path = "/Users/mylene/BachelorsProject/Venue-Accessibility-Google-Reviews/src/data"
shuffled_training_data.to_csv(save_to_path+'train_data.csv')
shuffled_test_data.to_csv(save_to_path+'test_data.csv')
pd.DataFrame(X_train).to_csv(save_to_path+'X_train.csv')
pd.DataFrame(X_val).to_csv(save_to_path+'X_val.csv')
pd.DataFrame(y_train).to_csv(save_to_path+'y_train.csv')
pd.DataFrame(y_val).to_csv(save_to_path+'y_val.csv')

In [51]:
X_train_preprocessed = np.array([text_preprocessing(text) for text in X_train])
X_val_preprocessed = np.array([text_preprocessing(text) for text in X_val])


In [62]:
# naive_bayes = Pipeline([('vect', CountVectorizer()),
#                 ('tfidf', TfidfTransformer()),
#                 ('clf', MultinomialNB(alpha=1.0))
#                 ])
# naive_bayes.fit(X_train_preprocessed, y_train)
# y_pred = naive_bayes.predict(X_val_preprocessed)

# metrics_nb = classification_report(y_val, y_pred, output_dict=True)
# naive_bayes_tfid = pd.DataFrame.from_dict(metrics_nb)

In [63]:
# display(naive_bayes_tfid)

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,1.0,0.801126,0.801374,0.900563,0.840876
recall,0.00625,1.0,0.801374,0.503125,0.801374
f1-score,0.012422,0.889583,0.801374,0.451003,0.714261
support,1280.0,5124.0,0.801374,6404.0,6404.0
