## Loading files and Environment

In [166]:
#%pip freeze > requirements.txt

In [167]:
import os
import glob
import pandas as pd
import sys
import pathlib
import sklearn
import numpy
import nltk
from sklearn.metrics import classification_report
from sklearn.datasets import load_iris
from PIL import Image, ImageDraw, ImageFont
from nltk.corpus import stopwords
from collections import Counter
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_fscore_support

In [168]:
# Get current directory
current_dir = os.getcwd()
# Get parent directory
parent_dir = os.path.join(current_dir, '..')
# Append parent directory to sys.path
sys.path.append(parent_dir)
from src import data_cleaning as dc

In [169]:
cwd = pathlib.Path.cwd().parent
training_file_path = cwd.joinpath("datasets/EuansGuideData.xlsx")
test_file_path = cwd.joinpath("datasets/GoogleReviews")
print('path:', training_file_path)
print('path:', test_file_path)

path: /Users/mylene/BachelorsProject/Venue-Accessibility-Google-Reviews/datasets/EuansGuideData.xlsx
path: /Users/mylene/BachelorsProject/Venue-Accessibility-Google-Reviews/datasets/GoogleReviews


In [170]:
all_file_names = glob.glob(str(test_file_path) + "/*.csv")
google_df = [pd.read_csv(file_name, index_col=None, header=0) for file_name in all_file_names]
test_data = pd.concat(google_df, axis=0, ignore_index=True)

## Cleaning & Preprocessing

In [171]:
training_data = pd.read_excel(training_file_path)
clean_train_df = dc.clean_and_select(training_data, ["Aspect", "Rating", "Review", "Venue"])
clean_test_df = dc.clean_and_select(test_data, ["Name","Review Rate", "Review Text"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Text"] = df["Text"].apply(lambda x: x.replace("\n", ' '))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_aspects["Venue"] = selected_aspects["Venue"].apply(lambda x: get_venue_name(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Sentiment"] = df["Rating"].apply(lambda x : pick_s

In [172]:
display(clean_train_df[:10])
display(clean_test_df[:10])

Unnamed: 0,Aspect,Rating,Text,Venue,SentenceCount,Sentiment,Label
1,Transport & Parking,5.0,"There is disabled parking close to the doors, ...",dobbies garden centre perth,2,positive,1.0
7,Transport & Parking,4.5,"A ten minute walk from the luas, not much near...",bow lane dublin,1,positive,1.0
14,Toilets,4.5,"I did not use the toilets, but if you ring the...",jorvik viking centre york,1,positive,1.0
17,Transport & Parking,4.5,"It's situated on the high street, which has a ...",potter about burntisland,3,positive,1.0
19,Toilets,4.5,Clean and spacious with a grab rail on one sid...,potter about burntisland,3,positive,1.0
22,Transport & Parking,5.0,4 spaces next to the front door,dumfries ice bowl dumfries,1,positive,1.0
34,Transport & Parking,5.0,There is a large car park with a good number o...,the peak at stirling sports village stirling,4,positive,1.0
36,Toilets,4.5,There are quite a few accessible toilets thro...,the peak at stirling sports village stirling,6,positive,1.0
41,Toilets,4.0,"Accessible toilets on third, fourth and fifth ...",john lewis london,5,positive,1.0
44,Transport & Parking,4.0,There is blue badge parking right next to the ...,nuffield health edinburgh fitness wellbeing gy...,3,positive,1.0


Unnamed: 0,Name,Sentiment,Text,Label
0,Ellis,positive,"It was a bit quite when we went in, but don’t ...",1
1,Ellis,positive,Nice cozy place which serves very tasty burger...,1
2,Ellis,positive,Really nice place. One of my favourite burger ...,1
3,Ellis,negative,The Service was quite good but the burgers we ...,0
4,Ellis,positive,I had a very nice experience! The staff were r...,1
5,Ellis,positive,Ellis Gourmet Burger - Today (15.03.2018) I w...,1
6,Ellis,negative,"The taste was okay. Unfortunately, when we got...",0
7,Ellis,positive,The only disappointing thing about this place ...,1
8,Ellis,positive,Yesterday in the afternoon we had some burgers...,1
9,Ellis,positive,Really cosy. Has an actual fireplace. Great fo...,1


## Aspect Classification

### Split Data

In [173]:
vectorizer = CountVectorizer()
train = clean_train_df.Text.values.tolist()
test = clean_test_df[:13627].Text.values.tolist()
train_labels = clean_train_df.Aspect.values.tolist()
X = vectorizer.fit_transform(train)
X_train, X_test, y_train, y_test = train_test_split(X, train_labels, test_size=0.2, random_state=42)

In [174]:
nb = MultinomialNB()
nb.fit(X_train, y_train)  # train on Euans dataset
y_pred = nb.predict(X_test)  # test on dataset Google Reviews
y_pred_probs = nb.predict_proba(X_test)

In [175]:
pos_labels = ['Toilets', 'Transport & Parking']

### Evaluation Metrics

In [181]:
report = classification_report(y_test, y_pred, labels=pos_labels)

# save report as a text file
with open('../Results/classification_report.txt', 'w') as f:
    f.write(report)

# # convert text file to PNG image
img = Image.new('RGB', (800, 800), color='white')
font = ImageFont.truetype('arial.ttf', 20)
draw = ImageDraw.Draw(img)

# with open('../Results/classification_report.txt', 'r') as f:
#     y = 0
#     for line in f.readlines():
#         draw.text((10, y), line, fill='black', font=font)
#         y += 20

# img.save('../Results/classification_report.png')


OSError: cannot open resource

## Sentiment Analysis

## Opinion Summarisation