# Sentiment Analysis - Predicting Rating Based On Review Text

In [10]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization
from sklearn.utils import shuffle

In [11]:
# Pick the dataset location: Google Drive when running on Colab, a local copy otherwise.
try:
	from google.colab import drive
except ModuleNotFoundError:
	# google.colab is not importable, so this is not a Colab runtime.
	reviews_dataset_path = "yelp_dataset/reviews.json"
else:
	drive.mount("/content/drive")
	# Absolute path under the mount point above, so it resolves even when the
	# working directory is not /content.
	reviews_dataset_path = "/content/drive/MyDrive/Colab Notebooks/reviews.json"

	# Get RAM Info. Guarded on its own so that a missing psutil only skips this
	# report instead of being mistaken for "not on Colab" and silently
	# switching to the local dataset path.
	try:
		from psutil import virtual_memory
	except ModuleNotFoundError:
		print('psutil is not installed; skipping RAM report')
	else:
		ram_gb = virtual_memory().total / 1e9  # total physical memory, in GB
		print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

		if ram_gb < 20:
			print('Not using a high-RAM runtime')
		else:
			print('You are using a high-RAM runtime!')

In [12]:
# Load the whole reviews file into a DataFrame.
# lines=True: the file is read as one JSON record per line.
review_df = pd.read_json(
	reviews_dataset_path,
	orient="records",
	lines=True,
)

In [13]:
# Shuffle the rows with a fixed seed so the split is the same on every fresh run.
# NOTE: this rebinds review_df, so re-running the cell permutes the
# already-shuffled frame again and the splits will differ from the first run.
review_df = shuffle(review_df, random_state=0)

# 60:20:20 Train / Val / Test split by row position
n = len(review_df)
train_end = int(n*0.6)
val_end = int(n*0.8)

df_train = review_df.iloc[:train_end]
df_val = review_df.iloc[train_end:val_end]
df_test = review_df.iloc[val_end:]

In [29]:
# Convert Pandas DF to TF Dataset

def convert_text_df_to_dataset(df, input_col="text", target_col="stars", batch_size=32):
	"""Build a batched tf.data.Dataset of (text, target) pairs from a DataFrame.

	Args:
		df: frame holding the text and target columns.
		input_col: name of the column with the raw text; converted to a
			tf.string tensor.
		target_col: name of the column with the targets; converted to
			tf.int8, so the values must fit in a signed 8-bit integer.
		batch_size: number of rows per batch. Defaults to 32, the value that
			was previously hard-coded.

	Returns:
		A tf.data.Dataset yielding (text_batch, target_batch) tuples, in the
		row order of df.
	"""
	text_input = tf.convert_to_tensor(df[input_col], dtype=tf.string)
	target = tf.convert_to_tensor(df[target_col], dtype=tf.int8)
	dataset = tf.data.Dataset.from_tensor_slices((text_input, target))
	return dataset.batch(batch_size)

# One dataset per split, each built with the converter's default column names
train_dataset, val_dataset, test_dataset = (
	convert_text_df_to_dataset(split_df)
	for split_df in (df_train, df_val, df_test)
)

In [6]:
# Text vectorizer: vocabulary capped at max_tokens, multi-hot output
max_tokens = 30000
text_vectorization = TextVectorization(
	max_tokens=max_tokens,
	output_mode="multi_hot",
)

# Fit the vocabulary on the training text only
text_vectorization.adapt(df_train["text"])

In [31]:
# Vectorize Datasets
# NOTE: this cell rebinds the three dataset names, so run it exactly once after
# the datasets are built; running it again would feed the already-vectorized
# batches back into the vectorizer.
def vectorize_batch(text, label):
	"""Replace a batch of raw text with the output of text_vectorization; the label passes through."""
	return text_vectorization(text), label

# tf.data.AUTOTUNE lets tf.data choose the parallelism for the runtime it is
# on instead of a hard-coded worker count.
train_dataset = train_dataset.map(vectorize_batch, num_parallel_calls=tf.data.AUTOTUNE)
val_dataset = val_dataset.map(vectorize_batch, num_parallel_calls=tf.data.AUTOTUNE)
test_dataset = test_dataset.map(vectorize_batch, num_parallel_calls=tf.data.AUTOTUNE)

In [7]:
# Build Model
# max_tokens-wide input -> Dense(32) -> Dense(16) -> one output clamped to [0, 5]
inputs = keras.Input(shape=(max_tokens,))

hidden = inputs
for units in (32, 16):
	# Each hidden stage is a ReLU Dense layer followed by 25% dropout
	hidden = keras.layers.Dense(units, activation="relu")(hidden)
	hidden = keras.layers.Dropout(0.25)(hidden)

raw_rating = keras.layers.Dense(1)(hidden)
# ReLU with max_value=5 bounds the prediction to the range [0, 5].
# NOTE(review): the gradient is zero wherever this clamp saturates -- confirm
# that a clamped output is intended for this regression.
outputs = keras.layers.ReLU(max_value=5, threshold=0)(raw_rating)

model = keras.Model(inputs, outputs)

# Optimized on mean absolute error; mean squared error is reported alongside it
model.compile(
	optimizer="rmsprop",
	loss="mean_absolute_error",
	metrics=["mean_squared_error"],
)

# Keep only the best model (by val_loss) on disk, and stop once val_loss has
# not improved by at least 0.1 for 5 epochs, restoring the best weights.
model_path = "models/text_vectorized.keras"
checkpoint_cb = keras.callbacks.ModelCheckpoint(
	model_path,
	monitor='val_loss',
	save_best_only=True,
)
early_stopping_cb = keras.callbacks.EarlyStopping(
	monitor='val_loss',
	min_delta=0.1,
	patience=5,
	verbose=1,
	restore_best_weights=True,
)
callbacks = [checkpoint_cb, early_stopping_cb]

model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 30000)]           0         
                                                                 
 dense (Dense)               (None, 32)                960032    
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 16)                528       
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 17        
                                                                 
 re_lu (ReLU)                (None, 1)                 0     

In [8]:
# Train for up to 50 epochs on cached batches, with the callbacks list
# (checkpointing and early stopping) attached.
# NOTE(review): the saved output of this cell reads "Epoch n/10", so it was not
# produced with epochs=50 -- re-run the cell to refresh it.
model.fit(
	train_dataset.cache(),
	validation_data=val_dataset.cache(),
	epochs=50,
	callbacks=callbacks,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test MSE: 6.048


In [10]:
# Reload the model saved at model_path and score it on the held-out test batches
model = keras.models.load_model(filepath=model_path)
model.evaluate(x=test_dataset)

Test MSE: 6.013
