## Loading files and Environment

In [70]:
import os
import glob
import pandas as pd
import sys
import pathlib
import sklearn
import numpy
import nltk
from nltk.corpus import stopwords
from collections import Counter
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [73]:
%run /Users/mylene/BachelorsProject/Venue-Accessibility-Google-Reviews/src/data_cleaning.py

In [49]:
# Despite its name, `cwd` is the PARENT of the current working directory;
# both dataset locations are resolved underneath it.
cwd = pathlib.Path.cwd().parent
training_file_path = cwd / "datasets" / "EuansGuideData.xlsx"
test_file_path = cwd / "datasets" / "GoogleReviews"

# Echo the resolved locations so a wrong working directory is spotted early.
for _location in (training_file_path, test_file_path):
    print('path:', _location)

path: /Users/mylene/BachelorsProject/Venue-Accessibility-Google-Reviews/datasets/EuansGuideData.xlsx
path: /Users/mylene/BachelorsProject/Venue-Accessibility-Google-Reviews/datasets/GoogleReviews


In [68]:
# Collect every CSV in the test directory in sorted order. glob.glob returns
# matches in arbitrary, filesystem-dependent order, which would make the row
# order of `test_data` (and therefore any seeded split of it) differ between
# machines and runs.
all_file_names = sorted(glob.glob(str(test_file_path / "*.csv")))
if not all_file_names:
    # Without this check pd.concat fails with the much less helpful
    # "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in {test_file_path}")

# One DataFrame per file; the first row of each file is treated as the header.
google_df = [pd.read_csv(file_name, index_col=None, header=0) for file_name in all_file_names]

# Stack all files row-wise into a single frame with a fresh 0..n-1 index.
test_data = pd.concat(google_df, axis=0, ignore_index=True)

## Cleaning & Preprocessing

In [None]:
# Read the training spreadsheet, then pass both the training data and the
# concatenated test data through `cleaning_df`, tagging each call with the
# split it belongs to ("train" / "test").
training_data = pd.read_excel(training_file_path)
clean_train_df, clean_test_df = (
    cleaning_df(training_data, "train"),
    cleaning_df(test_data, "test"),
)

# Render both cleaned frames in the notebook for a visual sanity check.
for _frame in (clean_train_df, clean_test_df):
    display(_frame)

In [None]:
# Bag-of-words features: the vocabulary is learned from the cleaned training
# text only and then re-used, unchanged, to encode the cleaned test text.
_train_text, _test_text = clean_train_df['text'], clean_test_df['text']
vectorizer = CountVectorizer()
X1 = vectorizer.fit_transform(_train_text)
X2 = vectorizer.transform(_test_text)

# Targets for both datasets (toilet/transport).
# NOTE(review): the features above come from the CLEANED frames while the
# labels are read from the RAW frames. If `cleaning_df` drops or reorders
# rows, X and y will no longer line up - confirm against data_cleaning.py.
y1, y2 = training_data['label'], test_data['label']

## Aspect Classification

### Split Datasets and Train Models

In [None]:
# Hold out 20% of each dataset for evaluation; the fixed seed keeps the
# partition reproducible across runs.
_split_kwargs = {"test_size": 0.2, "random_state": 42}
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, **_split_kwargs)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, **_split_kwargs)

# One independent multinomial Naive Bayes classifier per dataset, each fitted
# only on its own training partition.
nb1 = MultinomialNB()
nb2 = MultinomialNB()
nb1.fit(X1_train, y1_train)
nb2.fit(X2_train, y2_train)

In [None]:
def _print_metrics(title, y_true, y_pred, average="macro"):
    """Print accuracy, precision, recall and F1 for one set of predictions.

    Args:
        title: Heading line printed before the four scores.
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model, in the same order as ``y_true``.
        average: Averaging strategy forwarded to the precision, recall and F1
            scorers. scikit-learn's default (``'binary'`` with
            ``pos_label=1``) raises ``ValueError`` when the labels are strings
            or there are more than two classes; the label comments in this
            notebook describe them as toilet/transport categories, so an
            explicit multi-class-safe average is used instead.
    """
    print(title)
    print('Accuracy:', accuracy_score(y_true, y_pred))
    print('Precision:', precision_score(y_true, y_pred, average=average))
    print('Recall:', recall_score(y_true, y_pred, average=average))
    print('F1-score:', f1_score(y_true, y_pred, average=average))


# Evaluate the performance of each model on its corresponding testing data
y1_pred = nb1.predict(X1_test)
y2_pred = nb2.predict(X2_test)
_print_metrics('Dataset 1 metrics:', y1_test, y1_pred)
_print_metrics('Dataset 2 metrics:', y2_test, y2_pred)

## Sentiment Analysis

## Opinion Summarisation