# Bag of Words - Text Vectorization Approach

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization
from sklearn.utils import shuffle
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
# Report the TensorFlow version, then pick the dataset path and (on Colab) set up the TPU
print("Tensorflow version " + tf.__version__)

# google.colab is only importable inside a Colab runtime, so the import doubles as
# the environment check. Only this import is guarded, so that an unrelated missing
# module further down is not mistaken for "not running on Colab".
try:
	from google.colab import drive
except ModuleNotFoundError:
	# Local run: read the dataset from the working tree and skip TPU setup
	reviews_dataset_path = "yelp_dataset/reviews.json"
	using_tpu = False
else:
	drive.mount("/content/drive")
	reviews_dataset_path = "drive/MyDrive/Colab Notebooks/reviews.json"

	# Get RAM Info
	from psutil import virtual_memory
	ram_gb = virtual_memory().total / 1e9
	print('Your runtime has {:.1f} gigabytes of available RAM'.format(ram_gb))

	if ram_gb < 20:
		print('Not using a high-RAM runtime')
	else:
		print('You are using a high-RAM runtime!')

	try:
		tpu = tf.distribute.cluster_resolver.TPUClusterResolver() # TPU detection
		print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])

		tf.config.experimental_connect_to_cluster(tpu)
		tf.tpu.experimental.initialize_tpu_system(tpu)
		tpu_strategy = tf.distribute.TPUStrategy(tpu)

		using_tpu = True
	except ValueError as err:
		# On Colab this notebook requires a TPU runtime. Raise an ordinary exception
		# (not BaseException) and keep the original error as the cause.
		raise RuntimeError('ERROR: Not connected to a TPU runtime.') from err

# Shorthand used by every tf.data pipeline below
AUTO = tf.data.AUTOTUNE

Tensorflow version 2.7.0


In [3]:
# Load the line-delimited JSON reviews file into a DataFrame
review_df = pd.read_json(
	reviews_dataset_path,
	lines=True,
	orient="records",
)

In [4]:
# Deterministically shuffle the reviews (fixed random_state)
review_df = shuffle(review_df, random_state=0)

# Cut the shuffled frame into Train, Val, Test at 60:20:20
n = len(review_df)
train_end, val_end = int(n*0.6), int(n*0.8)
df_train, df_val, df_test = (
	review_df.iloc[:train_end],
	review_df.iloc[train_end:val_end],
	review_df.iloc[val_end:],
)

In [5]:
# Convert Pandas DF to TF Dataset
# Batch size: 16 per TPU replica when on TPU, otherwise a fixed 128*16
batch_size = 16 * tpu_strategy.num_replicas_in_sync if using_tpu else 128*16

def convert_text_df_to_dataset(df, input_col="text", target_col="stars"):
	"""Build a batched, prefetched tf.data.Dataset of (text, target) pairs from a DataFrame.

	Args:
		df: DataFrame holding the input and target columns.
		input_col: Name of the column converted to a tf.string tensor.
		target_col: Name of the column converted to a tf.int8 tensor.

	Returns:
		A dataset batched with the notebook-level `batch_size` and prefetched with `AUTO`.
	"""
	features = tf.convert_to_tensor(df[input_col], dtype=tf.string)
	labels = tf.convert_to_tensor(df[target_col], dtype=tf.int8)
	batched = tf.data.Dataset.from_tensor_slices((features, labels)).batch(batch_size)
	return batched.prefetch(AUTO)

# One tf.data pipeline per split, built in train / val / test order
train_dataset, val_dataset, test_dataset = (
	convert_text_df_to_dataset(split_df) for split_df in (df_train, df_val, df_test)
)

## Train Model

In [6]:
# Build Model
def create_model(max_tokens, model_name):
	"""Build and compile the dense regression model.

	Args:
		max_tokens: Width of the input vector (one feature per vocabulary slot).
		model_name: Name given to the Keras model.

	Returns:
		A compiled keras.Model with MAE as the loss and MSE as the tracked metric.
	"""
	inputs = keras.Input(shape=(max_tokens,))

	# Two hidden stages (32 then 16 units), each followed by 25% dropout
	hidden = inputs
	for units in (32, 16):
		hidden = keras.layers.Dense(units, activation="relu")(hidden)
		hidden = keras.layers.Dropout(0.25)(hidden)

	# Single linear output, clipped to the range [0, 5] by a capped ReLU
	raw_score = keras.layers.Dense(1)(hidden)
	outputs = keras.layers.ReLU(max_value=5, threshold=0)(raw_score)

	regressor = keras.Model(inputs, outputs, name=model_name)
	regressor.compile(
		optimizer="rmsprop",
		loss="mean_absolute_error",
		metrics=["mean_squared_error"],
	)
	return regressor

### Single Word Vectorization

In [7]:
# Unigram vectorizer: vocabulary capped at max_tokens, multi-hot output
max_tokens = 30000
text_vectorization = TextVectorization(output_mode="multi_hot", max_tokens=max_tokens)

# Fit the vocabulary on the training text only
text_vectorization.adapt(df_train["text"])

# Vectorize the text of each batch and pass the labels through unchanged
def _vectorize_1gram(text, label):
	return text_vectorization(text), label

train_dataset_vectorized = train_dataset.map(_vectorize_1gram, num_parallel_calls=AUTO)
val_dataset_vectorized = val_dataset.map(_vectorize_1gram, num_parallel_calls=AUTO)
test_dataset_vectorized = test_dataset.map(_vectorize_1gram, num_parallel_calls=AUTO)

In [8]:
model_name = "vectorized_1gram"

# Creating the model in the TPUStrategy scope means we will train the model on the TPU
if using_tpu:
	with tpu_strategy.scope():
		model = create_model(max_tokens, model_name)
else:
	model = create_model(max_tokens, model_name)

# Checkpoint the model with the lowest val_loss under this name, and stop early
# once val_loss has not improved by at least 0.01 for 5 epochs
model_path = f"models/{model_name}.keras"
callbacks = [
	keras.callbacks.ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True),
	keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.01, patience=5, verbose=1, restore_best_weights=False)
]

# Train Model
model.fit(train_dataset_vectorized, validation_data=val_dataset_vectorized, epochs=20, callbacks=callbacks)

# Reload the best checkpoint from disk (early stopping does not restore best weights)
# and score it on the held-out test set
model = keras.models.load_model(model_path)
# evaluate() returns [loss, metric]; create_model compiles with MAE loss and an MSE metric.
# Named results keep the builtin eval() usable in the rest of the notebook.
test_mae, test_mse = model.evaluate(test_dataset_vectorized)

# Output Model Metrics
metrics_text = f"Model {model_name} with MAE {test_mae:.3f} and MSE {test_mse:.3f}\n"
print(metrics_text)
with open("model_metrics.txt", "a") as f:
	f.write(metrics_text)

Epoch 1/20
Epoch 2/20
Epoch 3/20
 44/304 [===>..........................] - ETA: 43s - loss: 0.4248 - mean_squared_error: 0.6158

KeyboardInterrupt: 

### Bigram Vectorization

In [None]:
# Bigram vectorizer (ngrams=2): vocabulary capped at max_tokens, multi-hot output
max_tokens = 30000
text_vectorization_ngram = TextVectorization(output_mode="multi_hot", ngrams=2, max_tokens=max_tokens)

# Fit the vocabulary on the training text only
text_vectorization_ngram.adapt(df_train["text"])

# Vectorize the text of each batch and pass the labels through unchanged
def _vectorize_2gram(text, label):
	return text_vectorization_ngram(text), label

train_dataset_vectorized = train_dataset.map(_vectorize_2gram, num_parallel_calls=AUTO)
val_dataset_vectorized = val_dataset.map(_vectorize_2gram, num_parallel_calls=AUTO)
test_dataset_vectorized = test_dataset.map(_vectorize_2gram, num_parallel_calls=AUTO)

In [None]:
# Append a prefetch stage after the vectorization map of the train and validation pipelines
train_dataset_vectorized, val_dataset_vectorized = (
	split.prefetch(AUTO) for split in (train_dataset_vectorized, val_dataset_vectorized)
)

In [None]:
model_name = "vectorized_2gram"

# Creating the model in the TPUStrategy scope means we will train the model on the TPU
if using_tpu:
	with tpu_strategy.scope():
		model = create_model(max_tokens, model_name)
else:
	model = create_model(max_tokens, model_name)

# Checkpoint the model with the lowest val_loss under this name, and stop early
# once val_loss has not improved by at least 0.01 for 5 epochs
model_path = f"models/{model_name}.keras"
callbacks = [
	keras.callbacks.ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True),
	keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.01, patience=5, verbose=1, restore_best_weights=False)
]

# Train Model
# cache() stores the vectorized batches after the first pass so later epochs reuse them.
# NOTE(review): no cache file is given, so this presumably caches in memory; confirm the
# vectorized train/val sets fit in RAM.
model.fit(train_dataset_vectorized.cache(), validation_data=val_dataset_vectorized.cache(), epochs=20, callbacks=callbacks)

# Reload the best checkpoint from disk (early stopping does not restore best weights)
# and score it on the held-out test set
model = keras.models.load_model(model_path)
# evaluate() returns [loss, metric]; create_model compiles with MAE loss and an MSE metric.
# Named results keep the builtin eval() usable in the rest of the notebook.
test_mae, test_mse = model.evaluate(test_dataset_vectorized)

# Output Model Metrics
metrics_text = f"Model {model_name} with MAE {test_mae:.3f} and MSE {test_mse:.3f}\n"
print(metrics_text)
with open("model_metrics.txt", "a") as f:
	f.write(metrics_text)

### Term Frequency Inverse Document Frequency (TF-IDF) Vectorization

In [None]:
# TF-IDF vectorizer over unigrams and bigrams (ngrams=2), vocabulary capped at max_tokens
max_tokens = 30000
text_vectorization_idf = TextVectorization(output_mode="tf_idf", ngrams=2, max_tokens=max_tokens)

# Fit the vocabulary and IDF weights on the training text only
text_vectorization_idf.adapt(df_train["text"])

# Vectorize the text of each batch and pass the labels through unchanged
def _vectorize_tfidf(text, label):
	return text_vectorization_idf(text), label

train_dataset_vectorized = train_dataset.map(_vectorize_tfidf, num_parallel_calls=AUTO)
val_dataset_vectorized = val_dataset.map(_vectorize_tfidf, num_parallel_calls=AUTO)
test_dataset_vectorized = test_dataset.map(_vectorize_tfidf, num_parallel_calls=AUTO)

In [None]:
model_name = "vectorized_idf"

# Creating the model in the TPUStrategy scope means we will train the model on the TPU
if using_tpu:
	with tpu_strategy.scope():
		model = create_model(max_tokens, model_name)
else:
	model = create_model(max_tokens, model_name)

# Checkpoint the model with the lowest val_loss under this name, and stop early
# once val_loss has not improved by at least 0.01 for 5 epochs
model_path = f"models/{model_name}.keras"
callbacks = [
	keras.callbacks.ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True),
	keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.01, patience=5, verbose=1, restore_best_weights=False)
]

# Train Model
model.fit(train_dataset_vectorized, validation_data=val_dataset_vectorized, epochs=20, callbacks=callbacks)

# Reload the best checkpoint from disk (early stopping does not restore best weights)
# and score it on the held-out test set
model = keras.models.load_model(model_path)
# evaluate() returns [loss, metric]; create_model compiles with MAE loss and an MSE metric.
# Named results keep the builtin eval() usable in the rest of the notebook.
test_mae, test_mse = model.evaluate(test_dataset_vectorized)

# Output Model Metrics
metrics_text = f"Model {model_name} with MAE {test_mae:.3f} and MSE {test_mse:.3f}\n"
print(metrics_text)
with open("model_metrics.txt", "a") as f:
	f.write(metrics_text)

### Categorical Classification

In [None]:
# Build Model
def create_model_categorical(max_tokens, model_name):
	"""Build and compile the dense classifier with a 6-way softmax output.

	Args:
		max_tokens: Width of the input vector (one feature per vocabulary slot).
		model_name: Name given to the Keras model.

	Returns:
		A compiled keras.Model using sparse categorical cross-entropy as the loss
		and sparse categorical accuracy as the tracked metric.
	"""
	inputs = keras.Input(shape=(max_tokens,))

	# Two hidden stages (32 then 16 units), each followed by 25% dropout
	hidden = inputs
	for units in (32, 16):
		hidden = keras.layers.Dense(units, activation="relu")(hidden)
		hidden = keras.layers.Dropout(0.25)(hidden)

	# Six output classes, indexed 0-5
	outputs = keras.layers.Dense(6, activation="softmax")(hidden)

	classifier = keras.Model(inputs, outputs, name=model_name)
	classifier.compile(
		optimizer="rmsprop",
		loss="sparse_categorical_crossentropy",
		metrics=["sparse_categorical_accuracy"],
	)
	return classifier

In [None]:
model_name = "vectorized_categorical"

# NOTE(review): this cell trains on train/val/test_dataset_vectorized as left by whichever
# vectorization cell ran last, and reuses that cell's max_tokens; confirm the intended one.

# Creating the model in the TPUStrategy scope means we will train the model on the TPU
if using_tpu:
	with tpu_strategy.scope():
		model = create_model_categorical(max_tokens, model_name)
else:
	model = create_model_categorical(max_tokens, model_name)

# Checkpoint the model with the lowest val_loss under this name, and stop early
# once val_loss has not improved by at least 0.01 for 5 epochs
model_path = f"models/{model_name}.keras"
callbacks = [
	keras.callbacks.ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True),
	keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.01, patience=5, verbose=1, restore_best_weights=False)
]

# Train Model
model.fit(train_dataset_vectorized, validation_data=val_dataset_vectorized, epochs=20, callbacks=callbacks)

# Reload the best checkpoint from disk (early stopping does not restore best weights)
model = keras.models.load_model(model_path)

# The predicted class index is used directly as the predicted star value
predictions = model.predict(test_dataset_vectorized)
predictions = np.argmax(predictions, axis = -1)

# Collect labels from the un-vectorized test pipeline: same labels in the same order,
# without running the text vectorization a second time only to discard its output
true_labels = np.concatenate([y for _, y in test_dataset], axis=0)
mae = mean_absolute_error(true_labels, predictions)
mse = mean_squared_error(true_labels, predictions)


# Output Model Metrics
metrics_text = f"Model {model_name} with MAE {mae:.3f} and MSE {mse:.3f}\n"
print(metrics_text)
with open("model_metrics.txt", "a") as f:
	f.write(metrics_text)

In [None]:
# Download the trained models and the metrics log when running on Google Colab
try:
	from google.colab import files
except ModuleNotFoundError:
	# Not on Colab: the artifacts are already on the local filesystem
	pass
else:
	import shutil

	try:
		# "models" is a directory, so bundle it into models.zip and download that single file
		models_archive = shutil.make_archive("models", "zip", root_dir="models")
		files.download(models_archive)
		files.download("model_metrics.txt")
	except OSError as err:
		# Stay best-effort, but report what went wrong instead of hiding it
		print(f"Skipping download: {err}")