Convert JSON to CSV

In [None]:
# The Google restaurant dataset ships as JSON Lines; use pandas to convert it into a CSV file
# so it can be loaded the same way as the other datasets.
# The conversion is skipped when the CSV already exists, so running the whole notebook
# again never redoes (or overwrites) it.
from pathlib import Path

import pandas as pd

google_json_path = Path(r'E:\AIEBA Datasets\Proj1 Datasets\GoogleRestaurantReview.json')
google_csv_path = Path(r'E:\AIEBA Datasets\Proj1 Datasets\GoogleRestaurantReview.csv')

if not google_csv_path.exists():
    # lines=True: the file holds one JSON object per line
    df = pd.read_json(google_json_path, lines=True)
    # index=False keeps the DataFrame's row index out of the CSV
    df.to_csv(google_csv_path, index=False)

Formatting of Data

In [1]:
# Load in all three datasets
# .csv files use the utf-8 encodings
import os
from pathlib import Path

import pandas as pd

# Single place that points the notebook at the datasets. To read them from another
# drive/folder (e.g. r'E:\AIEBA Datasets\Proj1 Datasets'), set the AIEBA_DATA_DIR
# environment variable instead of editing every path below.
DATA_DIR = Path(os.environ.get('AIEBA_DATA_DIR', r'C:\AIEBA Datasets\Proj1 Datasets'))

amzn = pd.read_csv(DATA_DIR / 'AmazonReviews.csv', encoding='utf-8', engine='python')
googl = pd.read_csv(DATA_DIR / 'GoogleRestaurantReview.csv', encoding='utf-8', engine='python')
dis = pd.read_csv(DATA_DIR / 'DisneylandReviews.csv', encoding='utf-8', engine='python')

FileNotFoundError: [Errno 2] No such file or directory: 'E:\\AIEBA Datasets\\Proj1 Datasets\\AmazonReviews.csv'

In [None]:
# Preview the first rows of the Amazon reviews to check the load and see which columns exist
amzn.head()

In [None]:
# Preview the first rows of the Google restaurant reviews to check the load and see which columns exist
googl.head()

In [None]:
# Preview the first rows of the Disneyland reviews to check the load and see which columns exist
dis.head()

In [None]:
# Drop columns that are not needed and standardize column names, so all three
# datasets end up with the same two columns: `reviews_rating` and `reviews_text`.
# NOTE: DataFrame.filter(items=...) silently ignores labels that do not exist, so
# re-running this cell after the rename leaves the frames without any columns.
def _keep_rating_and_text(frame, rating_col, text_col):
  """Return `frame` reduced to the two given columns, renamed to the shared schema."""
  selected = frame.filter(items=[rating_col, text_col])
  return selected.rename(columns={rating_col: "reviews_rating", text_col: "reviews_text"})

amzn = _keep_rating_and_text(amzn, 'reviews.rating', 'reviews.text')
googl = _keep_rating_and_text(googl, 'rating', 'review_text')
dis = _keep_rating_and_text(dis, 'Rating', 'Review_Text')

amzn.head()

In [None]:
# Stack all 3 datasets on top of each other (row-wise: pd.concat defaults to axis=0).
# ignore_index=True discards the original row labels and renumbers the rows 0..n-1.
combined = pd.concat([amzn, googl, dis], ignore_index=True)
combined

Data Cleansing and Preprocessing

In [None]:
# Print how many missing values each column has, then remove rows with missing values
# (dropna() drops every row that has a null in any column)
print(combined.isnull().sum())
combined = combined.dropna()


In [None]:
# Sanity check: print the per-column null counts again; every count should now be 0
print(combined.isnull().sum())

In [None]:
# Bar chart of the number of reviews per rating
combined["reviews_rating"].value_counts().plot.bar()
# Same counts as a table, rarest rating first (this last expression is the cell's displayed output)
combined["reviews_rating"].value_counts(ascending=True)

In [None]:
# perform undersampling to get balanced dataset:
# every rating class is cut down to the size of the rarest class.

# Number of reviews per rating; the smallest count is the target size for every class.
# Taking the minimum from the data avoids assuming which rating is the rarest
# or how many distinct ratings exist.
rating_counts = combined["reviews_rating"].value_counts()
least_class_amount = int(rating_counts.min())

# Sample each rating separately, highest rating first. A class that is already at the
# target size is kept untouched (no reshuffle); larger classes are randomly reduced.
# random_state makes the sampling reproducible.
balanced_parts = []
for rating in sorted(rating_counts.index, reverse=True):
    rating_rows = combined[combined["reviews_rating"] == rating]
    if len(rating_rows) > least_class_amount:
        rating_rows = rating_rows.sample(least_class_amount, random_state=100)
    balanced_parts.append(rating_rows)

# concatenate the per-rating dataframes into 1
undersampled = pd.concat(balanced_parts, axis=0)
combined = undersampled

# every bar should now have the same height
combined["reviews_rating"].value_counts().plot.bar()
combined

In [None]:
# remove non-ascii characters:
# encoding to ASCII with errors='ignore' drops every character that has no ASCII
# representation; decoding turns the remaining bytes back into str
combined["reviews_text"] = combined["reviews_text"].str.encode('ascii', 'ignore').str.decode('ascii')

In [None]:
# using regular expressions to remove unwanted characters and punctuations
import re

# Matches every character that is neither a word character (letter, digit, underscore)
# nor whitespace, i.e. punctuation and symbols. Raw string so that \w and \s reach
# the regex engine instead of being treated as (invalid) string escapes.
cleanup_re = re.compile(r'[^\w\s]')
# Matches a single digit
digits_re = re.compile(r'\d')

def cleanup(sentence):
  """Normalise one review: lower-case it and remove punctuation, symbols and digits.

  Args:
    sentence: the review text; any value is accepted and converted with str().

  Returns:
    The cleaned text without leading/trailing whitespace. Whitespace inside the
    text is left as it is.
  """
  sentence = str(sentence).lower()
  sentence = cleanup_re.sub('', sentence)
  sentence = digits_re.sub('', sentence) # remove all digits and numbers
  # strip last: removing digits/punctuation can expose whitespace at the edges
  return sentence.strip()
# Run cleanup() on every review and replace the reviews_text column with the cleaned text
combined["reviews_text"] = combined["reviews_text"].apply(cleanup)
combined

In [None]:
# remove stopwords 
import nltk
from nltk.corpus import stopwords

def remove_stopwords(sentence):
  """Return `sentence` without the words contained in the module-level `stoplist`.

  The sentence is split on whitespace and the remaining words are joined back
  together with single spaces.
  """
  kept_words = (token for token in sentence.split() if token not in stoplist)
  return ' '.join(kept_words)
    

def custom_stoplist():
  """Build the set of stop words to filter out of the reviews.

  Starts from NLTK's English stop words and takes out a few words that
  should stay in the text.

  Returns:
    A set of stop-word strings.
  """
  # stopwords that can change a sentence's meaning, so they are NOT filtered out
  wanted_stopwords = {'not', 'nor', 'no', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'very'}
  return set(stopwords.words('english')).difference(wanted_stopwords)

# Build the stop-word set once; remove_stopwords() reads it as a module-level name
stoplist = custom_stoplist()
# Filter the stop words out of every review
combined["reviews_text"] = combined["reviews_text"].apply(remove_stopwords)
combined

In [None]:
# Bar chart of the number of reviews per rating after undersampling and text cleaning
combined["reviews_rating"].value_counts().plot.bar()
# Same counts as a table, smallest first (this last expression is the cell's displayed output)
combined["reviews_rating"].value_counts(ascending=True)

In [None]:
# perform 80-20 split on data: 80% of the rows go to `train`, 20% to `test`
from sklearn.model_selection import train_test_split
train, test = train_test_split(combined, test_size=0.2, train_size=0.8, random_state=100) 
# random_state controls how the data is shuffled before splitting;
# fixing it gives the same split every time the cell is run
train


Constructing the model

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np


def remove_stopwords(sentence=None):
  """Filter stop words out of `sentence`, or print NLTK's English stop words.

  This definition replaces the earlier one-argument `remove_stopwords` once this
  cell has run. Accepting the optional `sentence` keeps calls such as
  `remove_stopwords(text)` and `Series.apply(remove_stopwords)` working
  afterwards instead of raising TypeError.

  Args:
    sentence: text to filter. When omitted (None), the function only prints the
      full NLTK English stop-word set, as it did before.

  Returns:
    The sentence without the words in the module-level `stoplist`, joined with
    single spaces; None when called without a sentence.
  """
  if sentence is None:
    import nltk
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    print(stop_words)
    return None
  # same filtering as the earlier definition: relies on the module-level `stoplist`
  return ' '.join(word for word in sentence.split() if word not in stoplist)


# Pull the review texts and their ratings out of the train/test frames as plain lists
training_sentences = train["reviews_text"].tolist()
training_labels = train["reviews_rating"].tolist()
test_sentences = test["reviews_text"].tolist()
test_labels = test["reviews_rating"].tolist()
vocab_size = 20000 # passed to Tokenizer(num_words=...) to cap the vocabulary used when converting texts to sequences; also the Embedding input size
max_length = 100 # every sequence is padded or truncated to this many tokens (maxlen of pad_sequences)
trunc_type='post' # sequences longer than max_length are cut at the end
padding_type='post' # sequences shorter than max_length are padded at the end
oov_tok = "<OOV>" # placeholder if model encounters words it has never seen before in the test set

# Tokenization - breaking down sentences into its individual words and assigning a number to it
# The tokenizer is fitted on the training sentences only.
tokenizer = Tokenizer(num_words=vocab_size, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index # Model's 'vocabulary' learnt from training data

# Sequencing - converting each sentence into its numerical equivalent
# Padding and truncating used to make all sentences same length 
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type,
                               truncating=trunc_type)

# The test sentences are converted with the tokenizer fitted on the training data
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type,
                               truncating=trunc_type)

In [None]:
# Inspect the tokenizer: word_docs maps each word to the number of training texts it appeared in
# NOTE(review): this displays the whole mapping, which can be very large
tokenizer.word_docs

In [None]:
# Number of entries in the tokenizer's word index built from the training sentences
# NOTE(review): presumably larger than vocab_size, since num_words is only applied when texts are converted to sequences - confirm
print(len(word_index))

In [None]:
# Need this block to get it to work with TensorFlow 2.x
# The ratings 1-5 are used directly as class indices, so the one-hot labels need
# 6 columns (index 0 is never a real rating). Fixing num_classes guarantees that
# the train and test labels have the same width even if a rating is missing
# from one of the splits.
NUM_LABEL_COLUMNS = 6

# The labels are rebuilt from the train/test frames rather than from the previous
# value of training_labels/test_labels, so re-running this cell does not
# one-hot encode labels that are already one-hot encoded.
training_padded = np.array(training_padded)
training_labels = tf.keras.utils.to_categorical(train["reviews_rating"].tolist(), num_classes=NUM_LABEL_COLUMNS)
test_padded = np.array(test_padded)
test_labels = tf.keras.utils.to_categorical(test["reviews_rating"].tolist(), num_classes=NUM_LABEL_COLUMNS)

Model Building

In [None]:
import keras
import keras_tuner as kt
num_of_ratings = 6 # size of the softmax output: ratings 1 to 5 are used directly as class indices, so index 0 exists but is never a real rating

def model_builder(hp):
    """Build and compile the review-rating classifier for one tuner trial.

    Args:
        hp: Keras Tuner hyperparameters object. Three values are drawn from it:
            'output_dim' (embedding size), 'units' (width of the hidden dense
            layer) and 'learning_rate' (Adam learning rate).

    Returns:
        A compiled keras.Sequential model: Embedding -> GlobalAveragePooling1D ->
        Dense(relu) -> Dense(softmax over num_of_ratings outputs).
    """
    # search space, registered in the order embedding size, dense units, learning rate
    embedding_dim = hp.Int('output_dim', min_value=8, max_value=300, step=32)
    layer1_unit = hp.Int('units', min_value=32, max_value=512, step=32)
    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    model = keras.Sequential([
        keras.layers.Embedding(vocab_size, output_dim=embedding_dim, input_length=max_length),
        keras.layers.GlobalAveragePooling1D(),
        keras.layers.Dense(units=layer1_unit, activation='relu'),
        # classification layer: one output per label column
        keras.layers.Dense(num_of_ratings, activation='softmax'),
    ])

    adam = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Use Keras Tuner's Hyperband to search for the best hyperparameters of model_builder,
# ranking trials by their validation accuracy
tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=3)
# Early-stopping callback (not a layer): stops a training run when val_loss has not
# improved for 5 consecutive epochs
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# 20% of the training data is held out as the validation set for the search
# NOTE(review): Hyperband is configured with max_epochs=10 while epochs=50 is passed here - confirm which one takes effect
tuner.search(training_padded, training_labels, epochs=50, validation_split=0.2, callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

# The summary reports 'units' and 'learning_rate' only; the tuned embedding size
# is available as best_hps.get('output_dim')
print(f"""
The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is {best_hps.get('units')} and the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')}.
""")


In [None]:
# Training model

# Build a fresh model from the best hyperparameters found by the tuner.
# Without this, `model` is not defined anywhere in the notebook and the fit
# call below raises NameError on a fresh kernel.
model = tuner.hypermodel.build(best_hps)

# Train for a fixed number of epochs; the test split is used as validation data,
# so the per-epoch validation metrics are measured on the test set.
epochs_2 = 70
history = model.fit(training_padded, training_labels, epochs=epochs_2, validation_data=(test_padded, test_labels), verbose=2)


In [None]:
sentence = ["food took too long to come but overall experience was ok"]

# Apply the same preprocessing the training text went through (non-ASCII removal,
# cleanup(), stop-word removal with `stoplist`). Otherwise the model receives
# tokens it never saw during training.
cleaned_sentence = [
    ' '.join(
        word
        for word in cleanup(text.encode('ascii', 'ignore').decode('ascii')).split()
        if word not in stoplist
    )
    for text in sentence
]

sequences = tokenizer.texts_to_sequences(cleaned_sentence)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
output = model.predict(padded)

print(output)
print("Review:", sentence)
# One sentence -> one row of class probabilities; the index of the largest
# probability is the predicted rating (ratings were used directly as class indices)
print("Rating:", np.argmax(output, axis=None, out=None))