# Train Large Vectorized Model

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import shutil
import time
import tarfile
import json
from pathlib import Path
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization
from sklearn.utils import shuffle
from sklearn.metrics import mean_absolute_error, mean_squared_error

## Connect to Colab TPU

In [None]:
# Check if Google Colab Instance for Setup
print("Tensorflow version " + tf.__version__)

# This notebook reads its data from Google Drive, so it only runs on Colab.
# Only the Colab import is guarded: that way an unrelated missing package is
# not misreported as "Must be connected to Google Colab".
try:
	from google.colab import drive
except ModuleNotFoundError as err:
	raise Exception("Must be connected to Google Colab") from err

drive.mount("/content/drive")

# Get RAM Info
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM'.format(ram_gb))

if ram_gb < 20:
	print('Not using a high-RAM runtime')
else:
	print('You are using a high-RAM runtime!')

# Defined up front so the name exists even when no TPU is attached;
# later cells only dereference it when `using_tpu` is True.
tpu_strategy = None
try:
	tpu = tf.distribute.cluster_resolver.TPUClusterResolver() # TPU detection
	print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])

	tf.config.experimental_connect_to_cluster(tpu)
	tf.tpu.experimental.initialize_tpu_system(tpu)
	tpu_strategy = tf.distribute.TPUStrategy(tpu)

	using_tpu = True
except ValueError:
	# A ValueError during TPU setup is treated as "no TPU runtime"; fall back to the default device.
	tpu_strategy = None
	print("Note: Not connected to a TPU runtime.")
	using_tpu = False

# Shorthand used by the tf.data pipelines below.
AUTO = tf.data.AUTOTUNE

## Create All Reviews Dataset

In [3]:
# Paths: raw Yelp review dump, the slimmed-down local copy, and its cached copy on Drive.
read_path = "yelp_dataset/yelp_academic_dataset_review.json"
write_path = "yelp_dataset/all_reviews.json"
colab_path = "drive/MyDrive/Colab Notebooks/yelp_dataset/all_reviews.json"

if Path(colab_path).exists():
	# A previous run already cached the slimmed-down file on Drive; just copy it locally.
	Path("yelp_dataset").mkdir(exist_ok=True)
	shutil.copy(colab_path, write_path)
else:
	start_time = time.perf_counter()
	# Unzip yelp reviews and make all reviews
	if not Path("yelp_dataset").exists():
		# Extract tar file
		# NOTE(review): extractall trusts the member paths inside the archive;
		# only use this with an archive you control.
		Path("yelp_dataset").mkdir(exist_ok=True)
		with tarfile.open("drive/MyDrive/Colab Notebooks/yelp_dataset.tgz") as tar:
			tar.extractall("yelp_dataset/")

	# Read line by line as json, extract just the "text" and "stars", then write line by line to new json
	with open(read_path, "r", encoding="utf-8") as yelp_review, \
			open(write_path, "w", encoding="utf-8") as all_reviews:
		for i, review in enumerate(yelp_review, start=1):
			if i % 1000 == 0:
				# One "chunk" is 1000 lines; 8636 is the hard-coded expected chunk count.
				print(f"Reading chunk {i // 1000} of 8636", end="\r")

			line = json.loads(review)
			review_features = {"text": line["text"], "stars": int(line["stars"])}
			json_string = json.dumps(review_features)
			all_reviews.write(json_string + "\n")

	# The Drive folder may not exist yet (the archive sits one level above it),
	# so create it before caching the result there.
	Path(colab_path).parent.mkdir(parents=True, exist_ok=True)
	shutil.copy(write_path, colab_path)
	print("All reviews saved to disk.")

	print(f"Reviews processed in {round(time.perf_counter() - start_time)} seconds.")

In [None]:
# Load the slimmed-down review file (one JSON record per line) into memory
review_df = pd.read_json("yelp_dataset/all_reviews.json", orient="records", lines=True)

# Shuffle the rows with a fixed seed so the split below is repeatable
review_df = shuffle(review_df, random_state=0)

# Row positions at which the 60:20:20 Train / Val / Test split is cut
n = len(review_df)
train_end = int(n*0.6)
val_end = int(n*0.8)

df_train = review_df.iloc[:train_end]
df_val = review_df.iloc[train_end:val_end]
df_test = review_df.iloc[val_end:]

# Share of each star rating in the training split, ordered by ascending star value
star_shares = df_train["stars"].value_counts(normalize=True)
initial_dist = star_shares.sort_index().tolist()

## Create Balanced Dataset

In [None]:
# Convert Pandas DF to Balanced TF Dataset
def class_func(features, label):
	"""Map a (features, label) dataset element to a zero-based class index.

	Passed to `tf.data.Dataset.rejection_resample` as its `class_func`.

	Args:
		features: The element's input value; unused, but present because the
			dataset elements are (features, label) pairs.
		label: The element's label; expected to start at 1 so that
			`label - 1` indexes the five-entry target distribution.

	Returns:
		Scalar tf.int32 tensor equal to `label - 1`.
	"""
	# The labels are built as int8, but `rejection_resample` documents that
	# class_func returns a scalar tf.int32 tensor, so cast before shifting.
	return tf.cast(label, tf.int32) - 1

def convert_text_df_to_dataset(df, input_col="text", target_col="stars"):
	"""Turn a DataFrame into a class-balanced tf.data.Dataset of (text, label) pairs.

	Args:
		df: DataFrame holding the input and target columns.
		input_col: Name of the string column used as model input.
		target_col: Name of the label column; converted to int8.

	Returns:
		A dataset of (text, label) elements resampled towards an equal 20%
		share per class, using the module-level `initial_dist` as the
		starting class distribution.
	"""
	texts = tf.convert_to_tensor(df[input_col], dtype=tf.string)
	stars = tf.convert_to_tensor(df[target_col], dtype=tf.int8)

	def drop_class_index(class_index, element):
		# rejection_resample yields (class, original element); keep only the element.
		return element

	balanced = tf.data.Dataset.from_tensor_slices((texts, stars)).rejection_resample(
		class_func,
		[0.2, 0.2, 0.2, 0.2, 0.2],
		initial_dist=initial_dist,
	)
	return balanced.map(drop_class_index, num_parallel_calls=AUTO)

# Build one balanced dataset per split, in Train / Val / Test order
train_dataset, val_dataset, test_dataset = (
	convert_text_df_to_dataset(split_df)
	for split_df in (df_train, df_val, df_test)
)

## NGram Vectorization - Categorical Classification

In [None]:
start_time = time.perf_counter()

# Multi-hot encoder over up-to-3-gram tokens, capped at `max_tokens` vocabulary entries
max_tokens = 50000
text_vectorization_ngram = TextVectorization(
	max_tokens=max_tokens,
	ngrams=3,
	output_mode="multi_hot",
)

# Learn the vocabulary from the training text only
text_vectorization_ngram.adapt(
	df_train["text"],
	batch_size=2**16,
)

print(f"Created text vectorization in {round(time.perf_counter() - start_time)} seconds. ")

In [None]:
# Optimize dataset feedthrough
if using_tpu:
	# TPU's really like big batches I guess.
	# By increasing the batch size by a factor of 128, I am seeing about a 4x speedup.
	batch_size = 16 * 128 * tpu_strategy.num_replicas_in_sync
else:
	batch_size = 128

# NOTE: despite its name, this is the number of BATCHES per epoch (it is passed
# to `steps_per_epoch`). Balancing five classes to equal shares keeps roughly
# 5 * (rarest class share) of the training rows.
# `min(initial_dist)` is a float, so convert to int: both `prefetch()` and
# `steps_per_epoch` need an integer. The max() guards tiny datasets against 0 steps.
num_train_epochs = max(1, int(5 * min(initial_dist) * len(df_train)) // batch_size)


def vectorize_and_shift(text_batch, star_batch):
	"""Vectorize a batch of review text and shift the labels down by one (`star_batch - 1`)."""
	return text_vectorization_ngram(text_batch), star_batch - 1


# Vectorize Datasets, shift labels
train_dataset_vectorized = train_dataset.batch(batch_size).map(vectorize_and_shift, num_parallel_calls=AUTO)
val_dataset_vectorized = val_dataset.batch(batch_size).map(vectorize_and_shift, num_parallel_calls=AUTO)
test_dataset_vectorized = test_dataset.batch(batch_size).map(vectorize_and_shift, num_parallel_calls=AUTO)

# Repeat train dataset, and prefetch all datasets.
# The prefetch buffer is left to AUTOTUNE like the other splits: buffering a full
# epoch of `max_tokens`-wide batches would try to hold the whole epoch in memory.
train_dataset_vectorized = train_dataset_vectorized.repeat().prefetch(AUTO)
val_dataset_vectorized = val_dataset_vectorized.prefetch(AUTO)
test_dataset_vectorized = test_dataset_vectorized.prefetch(AUTO)

## Train Model

In [None]:
# Build Model
def create_model_categorical(max_tokens, model_name):
	"""Build and compile a small dense classifier with a 5-way softmax output.

	Args:
		max_tokens: Width of the input vector fed to the first layer.
		model_name: Name assigned to the Keras model.

	Returns:
		A compiled `keras.Model` using rmsprop, sparse categorical
		cross-entropy loss, and sparse categorical accuracy as its metric.
	"""
	inputs = keras.Input(shape=(max_tokens,))

	# Two hidden stages (32 then 16 units), each followed by 25% dropout
	hidden = inputs
	for units in (32, 16):
		hidden = keras.layers.Dense(units, activation="relu")(hidden)
		hidden = keras.layers.Dropout(0.25)(hidden)

	outputs = keras.layers.Dense(5, activation="softmax")(hidden)

	model = keras.Model(inputs, outputs, name=model_name)
	model.compile(
		optimizer="rmsprop",
		loss="sparse_categorical_crossentropy",
		metrics=["sparse_categorical_accuracy"],
	)
	return model

In [None]:
model_name = "large_ngram_model"

# Creating the model in the TPUStrategy scope means we will train the model on the TPU
if using_tpu:
	with tpu_strategy.scope():
		model = create_model_categorical(max_tokens, model_name)
else:
	model = create_model_categorical(max_tokens, model_name)

# Create callback to save model with a given name
model_path = f"models/{model_name}.keras"
callbacks = [
	# Keep only the checkpoint with the lowest validation loss; it is reloaded below.
	keras.callbacks.ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True),
	keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.01, patience=5, verbose=1, restore_best_weights=False)
]

start_time = time.perf_counter()

# Train Model
# The training dataset repeats forever, so `steps_per_epoch` defines the epoch
# length; it is coerced to int because the value is derived from a float ratio.
model.fit(train_dataset_vectorized,
	validation_data=val_dataset_vectorized,
	steps_per_epoch=int(num_train_epochs),
	epochs=30,
	callbacks=callbacks
)

# Evaluate Model after training, using the best checkpoint rather than the last epoch
model = keras.models.load_model(model_path)

# Predictions and labels are collected in a SINGLE pass over the test set.
# The test pipeline contains an unseeded rejection_resample, so two separate
# passes are not guaranteed to yield the same examples in the same order;
# pairing predictions from one pass with labels from another would misalign them.
predicted_batches = []
label_batches = []
for features, labels in test_dataset_vectorized:
	batch_probabilities = model.predict_on_batch(features)
	predicted_batches.append(np.argmax(batch_probabilities, axis=-1))
	label_batches.append(labels.numpy())

predictions = np.concatenate(predicted_batches, axis=0)
true_labels = np.concatenate(label_batches, axis=0)
mae = mean_absolute_error(true_labels, predictions)
mse = mean_squared_error(true_labels, predictions)

# Elapsed time covers both training and the evaluation above
mins, secs = divmod(time.perf_counter() - start_time, 60)
# Output Model Metrics
print(f"Trained model in {int(mins)} minutes, {round(secs)} seconds")
metrics_text = f"Model {model_name} with MAE {mae:.3f} and MSE {mse:.3f}\n"
print(metrics_text)
# Append so metrics from earlier runs/models are kept
with open("model_metrics.txt", "a") as f:
	f.write(metrics_text)

In [None]:
# Export model with Text Vectorization layer
# Wrap the vectorizer and the trained classifier into one model that accepts
# raw strings, so inference does not need a separate preprocessing step.
# (`keras.Input` is the factory for symbolic inputs; `keras.Inputs` does not exist.)
inputs = keras.Input(shape=(1,), dtype="string")
vectorized_inputs = text_vectorization_ngram(inputs)
outputs = model(vectorized_inputs)

inference_model = keras.Model(inputs, outputs)

keras.models.save_model(inference_model, "models/full_text_model.keras")

In [None]:
# Copy models to Google Drive to ensure that they are not lost
# dirs_exist_ok=True lets this cell be re-run: without it copytree raises
# FileExistsError as soon as the Drive folder exists from an earlier run.
shutil.copytree("models", "drive/MyDrive/Colab Notebooks/models", dirs_exist_ok=True)
shutil.copy("model_metrics.txt", "drive/MyDrive/Colab Notebooks/model_metrics.txt")

In [None]:
# Zip Models
# Writes "models.zip" in the working directory, containing the "models" folder.
shutil.make_archive("models", "zip", root_dir=".", base_dir="models")

# Downloading through the browser is best-effort: report why it was skipped
# instead of silently swallowing every error.
try:
	from google.colab import files
except ImportError:
	print("google.colab is not available; skipping browser download.")
else:
	try:
		files.download("models.zip")
		files.download("model_metrics.txt")
	except Exception as err:
		print(f"Could not download files: {err}")

In [None]:
# Test model
# Reload the exported model from disk so this checks the saved artifact itself
review_model = keras.models.load_model("models/full_text_model.keras")

# Each review is wrapped in its own list to match the model's (1,) string input shape
review_text = [
	["I think my meal was decent, but I have had better. I would recommend other places in the area."],
	["My meal was excellent, and I had a really great time dining at this restaurant tonight. I will be back!"], 
	["Horrible experience. The food was awful and I wish to never return to this restaurant again."]
]

raw_text_data = tf.convert_to_tensor(review_text)

predictions = review_model(raw_text_data)
# The model predicts zero-based classes; add 1 to get back to a star rating
predictions = np.argmax(predictions, axis = -1) + 1
for text, star in zip(review_text, predictions):
	# Print the review that belongs to this prediction (not the whole list)
	print(f"{star}: {text[0]}")