## Loading files and Environment

In [216]:
# Disabled on purpose: if uncommented, this line magic would run `pip freeze`
# in the kernel's environment and redirect the package list to requirements.txt
# in the working directory, overwriting any existing file of that name.
#%pip freeze > requirements.txt

In [217]:
import os
import glob
import pandas as pd
import sys
import pathlib
import sklearn
import numpy
import nltk
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from collections import Counter
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [218]:
%run /Users/mylene/BachelorsProject/Venue-Accessibility-Google-Reviews/src/data_cleaning.py

In [219]:
# Make the directory above the working directory importable so that the
# `src` package can be resolved from this notebook.
current_dir = os.getcwd()
parent_dir = os.path.join(current_dir, '..')
# Guard the append: re-running this cell must not keep stacking duplicate
# entries onto sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
from src import data_cleaning

In [220]:
# Build the dataset locations relative to the directory that contains the
# notebook's working directory.
# NOTE: despite its name, `cwd` holds the *parent* of the current working directory.
cwd = pathlib.Path.cwd().parent
datasets_dir = cwd / "datasets"
training_file_path = datasets_dir / "EuansGuideData.xlsx"
test_file_path = datasets_dir / "GoogleReviews"
# Echo both resolved paths so a wrong working directory is spotted immediately.
for dataset_path in (training_file_path, test_file_path):
    print('path:', dataset_path)

path: /Users/mylene/BachelorsProject/Venue-Accessibility-Google-Reviews/datasets/EuansGuideData.xlsx
path: /Users/mylene/BachelorsProject/Venue-Accessibility-Google-Reviews/datasets/GoogleReviews


In [221]:
# Gather every CSV file in the Google reviews directory.
# glob yields matches in arbitrary, filesystem-dependent order; sorting makes
# the row order of `test_data` (and any later positional slice of it)
# reproducible across machines and runs.
all_file_names = sorted(glob.glob(str(test_file_path) + "/*.csv"))
if not all_file_names:
    # Fail with an explicit message instead of pandas' generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No CSV files found in {test_file_path}")
# One DataFrame per file, using the first row of each file as the header.
google_df = [pd.read_csv(file_name, index_col=None, header=0) for file_name in all_file_names]
# Stack the frames row-wise and renumber the index from 0.
test_data = pd.concat(google_df, axis=0, ignore_index=True)

## Cleaning & Preprocessing

In [222]:
# Columns handed to the cleaning helper for each corpus.
train_columns = ["Aspect", "Rating", "Review", "Venue"]
test_columns = ["Name","Review Rate", "Review Text"]

# Load the Euan's Guide spreadsheet, then pass both raw frames through
# `clean_and_select` (defined outside this cell; its exact output schema is
# not visible here — see the previews displayed in the next cell).
training_data = pd.read_excel(training_file_path)
clean_train_df = clean_and_select(training_data, train_columns)
clean_test_df = clean_and_select(test_data, test_columns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Text"] = df["Text"].apply(lambda x: x.replace("\n", ' '))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_aspects["Venue"] = selected_aspects["Venue"].apply(lambda x: get_venue_name(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Sentiment"] = df["Rating"].apply(lambda x : pick_s

In [223]:
# Preview the first 40 rows of each cleaned frame, training data first.
for cleaned_frame in (clean_train_df, clean_test_df):
    display(cleaned_frame.head(40))

Unnamed: 0,Aspect,Rating,Text,Venue,SentenceCount,Sentiment,Label
1,Transport & Parking,5.0,"There is disabled parking close to the doors, ...",dobbies garden centre perth,2,positive,1.0
7,Transport & Parking,4.5,"A ten minute walk from the luas, not much near...",bow lane dublin,1,positive,1.0
14,Toilets,4.5,"I did not use the toilets, but if you ring the...",jorvik viking centre york,1,positive,1.0
17,Transport & Parking,4.5,"It's situated on the high street, which has a ...",potter about burntisland,3,positive,1.0
19,Toilets,4.5,Clean and spacious with a grab rail on one sid...,potter about burntisland,3,positive,1.0
22,Transport & Parking,5.0,4 spaces next to the front door,dumfries ice bowl dumfries,1,positive,1.0
34,Transport & Parking,5.0,There is a large car park with a good number o...,the peak at stirling sports village stirling,4,positive,1.0
36,Toilets,4.5,There are quite a few accessible toilets thro...,the peak at stirling sports village stirling,6,positive,1.0
46,Toilets,4.5,As I mentioned above the toilet is also a chan...,nuffield health edinburgh fitness wellbeing gy...,5,positive,1.0
52,Toilets,5.0,"Top marks for the loo by the Forest Glen cafe,...",blists hill victorian town telford,3,positive,1.0


Unnamed: 0,Name,Sentiment,Text,Label
0,Ellis,positive,"It was a bit quite when we went in, but don’t ...",1.0
1,Ellis,positive,Nice cozy place which serves very tasty burger...,1.0
2,Ellis,positive,Really nice place. One of my favourite burger ...,1.0
3,Ellis,negative,The Service was quite good but the burgers we ...,0.0
4,Ellis,positive,I had a very nice experience! The staff were r...,1.0
5,Ellis,positive,Ellis Gourmet Burger - Today (15.03.2018) I w...,1.0
7,Ellis,positive,The only disappointing thing about this place ...,1.0
8,Ellis,positive,Yesterday in the afternoon we had some burgers...,1.0
9,Ellis,positive,Really cosy. Has an actual fireplace. Great fo...,1.0
10,Ellis,negative,Place looked really good and thought it may be...,0.0


In [224]:
# Number of leading rows of the cleaned Google-review frame kept for modelling.
# NOTE(review): the notebook does not explain why 8341 was chosen — confirm.
N_TEST_ROWS = 8341

train = clean_train_df
test = clean_test_df[:N_TEST_ROWS]

In [225]:
# Bag-of-words features: the vocabulary is learned from the training corpus
# only and then reused unchanged to encode the second corpus, so both feature
# matrices share the same columns.
train_text, test_text = train['Text'], test['Text']
y1, y2 = train['Label'], test['Label']

vectorizer = CountVectorizer()
X1 = vectorizer.fit_transform(train_text)
X2 = vectorizer.transform(test_text)

## Aspect Classification

### Split Training Set

In [226]:
# Hold out 20% of each dataset for evaluation; the fixed seed keeps the
# partition identical from run to run.
split_options = {"test_size": 0.2, "random_state": 42}
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, **split_options)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, **split_options)

# Fit one independent multinomial Naive Bayes classifier per dataset.
nb1, nb2 = MultinomialNB(), MultinomialNB()
nb1.fit(X1_train, y1_train)
nb2.fit(X2_train, y2_train)

In [227]:
def _print_metrics(title, y_true, y_pred):
    """Print accuracy, precision, recall and F1 for one set of predictions.

    Args:
        title: Heading line printed before the four scores.
        y_true: Ground-truth labels of the held-out split.
        y_pred: Labels predicted by the model for the same rows.
    """
    print(title)
    scorers = (
        ('Accuracy:', accuracy_score),
        ('Precision:', precision_score),
        ('Recall:', recall_score),
        ('F1-score:', f1_score),
    )
    for metric_label, scorer in scorers:
        print(metric_label, scorer(y_true, y_pred))


# Score each model on the held-out part of its own dataset.
y1_pred = nb1.predict(X1_test)
y2_pred = nb2.predict(X2_test)
_print_metrics('Dataset 1 metrics:', y1_test, y1_pred)
_print_metrics('Dataset 2 metrics:', y2_test, y2_pred)

Dataset 1 metrics:
Accuracy: 0.8334331935290593
Precision: 0.8631970260223049
Recall: 0.9250996015936255
F1-score: 0.8930769230769231
Dataset 2 metrics:
Accuracy: 0.9281006590772918
Precision: 0.9374565670604587
Recall: 0.9782451051486585
F1-score: 0.957416607523066


In [None]:
# Per-class precision/recall/F1 for each model on its own held-out split.
# classification_report expects (y_true, y_pred) for the SAME samples plus one
# display name per class; comparing the training labels against the test
# labels (different rows, different lengths) is not a valid evaluation.
# Label 0.0 / 1.0 corresponds to negative / positive in the cleaned frames
# displayed earlier in this notebook.
class_labels = [0, 1]
class_names = ['negative', 'positive']

report = classification_report(y1_test, y1_pred, labels=class_labels, target_names=class_names)
print('Dataset 1 report:')
print(report)

report_dataset2 = classification_report(y2_test, y2_pred, labels=class_labels, target_names=class_names)
print('Dataset 2 report:')
print(report_dataset2)

## Sentiment Analysis

## Opinion Summarisation