<a href="https://colab.research.google.com/github/AmeliaTYR/DSC_NLP-workshop-2022/blob/main/NLP_for_NUS_restaurant_reviews(Participant).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing applied to NUS restaurant reviews!


## Importing the libraries

In [None]:
import numpy as np
#for visualisation
import matplotlib.pyplot as plt
#python based data analytics toolkit
import pandas as pd
#termcolor (imported below) is only used to colour the printed prediction/actual pairs. Green for successful prediction, red for failed prediction
import sys
from termcolor import colored, cprint

## Importing the dataset

In [None]:
# Load the reviews: one row per review, columns separated by tabs.
dataset = pd.read_csv(
    'NUSRestaurant_Reviews.tsv',
    delimiter = '\t',
    quoting = 3,
)

- tsv files contain elements separated by tabs, whereas csv files contain elements separated by commas. Commas inside the review text can be mistaken for separators, so we use tsv files for NLP
- the delimiter argument tells the read_csv method that the elements of this file are separated by tabs
- quoting = 3 is needed to ensure that quotes used in the text are not misinterpreted

## Cleaning the texts

In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# The stemmer and the stopword list are identical for every review, so they are
# built once here instead of once per review (and the set once per word).
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# 'not' is taken out of the stopwords so that it stays in the cleaned reviews
all_stopwords.remove('not')
stopword_set = set(all_stopwords)

corpus = []
# this list will contain only the 'cleaned' reviews
# loop over every review in the dataset instead of assuming there are exactly 1000 rows,
# so that the corpus always has one entry per row of the dataset
for raw_review in dataset['Review']:
  # the 1st argument is the pattern to replace (anything that is not a letter),
  # the 2nd one is the replacement and the 3rd one is the string to search in
  review = re.sub('[^a-zA-Z]', ' ', raw_review)
  review = review.lower()
  # this splitting is done to get individual words so that the stem method can be applied to each word
  review = review.split()
  review = [ps.stem(word) for word in review if word not in stopword_set]
  review = ' '.join(review)
  corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Types of 'cleaning' done:
- removal of stopwords (e.g. 'the', 'a', 'on')
- retrieving the essence/base form of the remaining words (e.g. 'love' instead of 'loved')
- replacing non-alphabetical characters with spaces
- converting uppercase to lowercase

In [None]:
#The key words that our model will be looking at
#Show all of the last 24 cleaned reviews; the previous slice [-24:-1] stopped one short and hid the final review
print(corpus[-24:])

['good food ambienc visit dinner incred tasti authent person favourit vegetarian pizza top best freshest ingredi crust bake utmost perfect', 'ambienc wanderlust relax servic great', 'order three drink ice cream lava cake set one satchet sugar plain water need ask italian place umhm next tabl order set drink serv ice water without ask price menu', 'overal great place', 'great food ambienc not go back disappoint pizza pasta must tri', 'spaciou nice decor cafe good place chill friend famili price food reason well worth money paid ox tail stew highlight seabass pizza also tast good', 'seem minor chang decor sinc last visit almost year ago overal still offer function cosi enough set food menu seem expand letdown servic staff not attent diner wave attent good dine experi overal afternot chicken wing still good mani year dine', 'pasta tast realli aw', 'bimbabap terribl bowl not sizzl hotston type food luke warm best tasteless not want say anymor unagi rice averag satay best food tabl servic s

## Creating the Bag of Words model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer capped at 1500 features.
cv = CountVectorizer(max_features = 1500)

# Feature matrix: one row per cleaned review, one column per retained word.
word_counts = cv.fit_transform(corpus)
X = word_counts.toarray()

# Labels come from the last column of the dataset.
label_column = dataset.iloc[:, -1]
y = label_column.values

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split
#The last 24 rows are the NUS restaurant reviews: they form our test set
#The remaining entries (976 of them in a 1000-row dataset) were collected online to train our model on restaurant reviews
#test_size is given as an absolute count so that exactly 24 rows are held out whatever the dataset size
#shuffle = False keeps the original row order, so the test set is exactly the last 24 rows
N_NUS_REVIEWS = 24
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = N_NUS_REVIEWS, random_state = 0, shuffle = False)

## Training the Naive Bayes model on the Training set

In [None]:
#Naive Bayes assumes that the input variables are independent of each other given the class
#Here the input variables are the word counts produced by the Bag of Words model
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
#Train on the training reviews and their labels
classifier.fit(X_train, y_train)

GaussianNB()

## Predicting the Test set results

In [None]:
# Predict a label for every review in the test set.
y_pred = classifier.predict(X_test)
# Pair each prediction with its actual label: column 0 = predicted, column 1 = actual.
text = np.column_stack((y_pred, y_test))
for item in text:
  # green when the prediction matches the actual label, red otherwise
  colour = 'green' if item[0] == item[1] else 'red'
  print(colored(item, colour, attrs=['reverse', 'blink']))

  


[5m[7m[32m[1 1][0m
[5m[7m[32m[1 1][0m
[5m[7m[32m[0 0][0m
[5m[7m[32m[1 1][0m
[5m[7m[32m[1 1][0m
[5m[7m[32m[1 1][0m
[5m[7m[31m[0 1][0m
[5m[7m[32m[0 0][0m
[5m[7m[32m[0 0][0m
[5m[7m[32m[1 1][0m
[5m[7m[32m[1 1][0m
[5m[7m[32m[1 1][0m
[5m[7m[32m[0 0][0m
[5m[7m[32m[1 1][0m
[5m[7m[32m[0 0][0m
[5m[7m[32m[0 0][0m
[5m[7m[32m[0 0][0m
[5m[7m[32m[0 0][0m
[5m[7m[32m[1 1][0m
[5m[7m[32m[1 1][0m
[5m[7m[32m[1 1][0m
[5m[7m[32m[1 1][0m
[5m[7m[32m[1 1][0m
[5m[7m[32m[1 1][0m


## Making the Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
#Rows of the confusion matrix are the actual labels, columns are the predicted labels
cm = confusion_matrix(y_test, y_pred)
print(cm)
#Fraction of test reviews whose predicted label matches the actual label
accuracy_score(y_test, y_pred)

[[ 8  0]
 [ 1 15]]


0.9583333333333334