# Fast NGram TPU Training

After reading a few articles, I would like to implement some speedups for TPU training:

- https://www.tensorflow.org/guide/data
- https://www.tensorflow.org/guide/data_performance
- https://www.tensorflow.org/guide/tpu

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization
from sklearn.utils import shuffle
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
# Detect the runtime (Google Colab vs. local, TPU vs. no TPU) and set:
#   reviews_dataset_path - where the reviews JSON file is read from
#   using_tpu            - True only when a TPU was found and initialised
#   tpu_strategy         - the TPUStrategy when using_tpu is True, else None
#   AUTO                 - shorthand for tf.data.AUTOTUNE
print("Tensorflow version " + tf.__version__)

# Defaults for a run without a TPU; overridden below when one is detected.
using_tpu = False
tpu_strategy = None

# Keep this try body down to the single import that signals "not on Colab",
# so an unrelated missing module is never mistaken for a local run.
try:
	from google.colab import drive
	on_colab = True
except ModuleNotFoundError:
	on_colab = False

if on_colab:
	drive.mount("/content/drive")
	reviews_dataset_path = "drive/MyDrive/Colab Notebooks/reviews.json"

	# Get RAM Info (purely informational, so a missing psutil must not abort setup)
	try:
		from psutil import virtual_memory
	except ModuleNotFoundError:
		print("psutil is not installed; skipping the RAM report")
	else:
		ram_gb = virtual_memory().total / 1e9
		print(f'Your runtime has {ram_gb:.1f} gigabytes of available RAM')

		if ram_gb < 20:
			print('Not using a high-RAM runtime')
		else:
			print('You are using a high-RAM runtime!')

	try:
		tpu = tf.distribute.cluster_resolver.TPUClusterResolver() # TPU detection
		print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])

		tf.config.experimental_connect_to_cluster(tpu)
		tf.tpu.experimental.initialize_tpu_system(tpu)
		tpu_strategy = tf.distribute.TPUStrategy(tpu)

		using_tpu = True
	except ValueError:
		# A ValueError from the TPU setup above is treated as "no TPU attached"
		print("Note: Not connected to a TPU runtime.")
		using_tpu = False
		tpu_strategy = None
else:
	# Local run: read the dataset from the working directory
	reviews_dataset_path = "yelp_dataset/reviews.json"

AUTO = tf.data.AUTOTUNE

In [None]:
# Load the line-delimited JSON reviews into a single in-memory DataFrame
review_df = pd.read_json(reviews_dataset_path, orient="records", lines=True)

# Shuffle the rows once, with a fixed seed so the split is reproducible
review_df = shuffle(review_df, random_state=0)

# Split boundaries for a 60:20:20 Train / Val / Test partition
n = len(review_df)
train_end = int(n*0.6)
val_end = int(n*0.8)

df_train = review_df.iloc[:train_end]
df_val = review_df.iloc[train_end:val_end]
df_test = review_df.iloc[val_end:]

In [None]:
# Convert Pandas DF to TF Dataset


def convert_text_df_to_dataset(df, input_col="text", target_col="stars"):
	"""Build a tf.data.Dataset of (text, target) pairs from two DataFrame columns.

	Args:
		df: DataFrame holding the examples.
		input_col: Name of the column converted to a tf.string tensor.
		target_col: Name of the column converted to a tf.int8 tensor.

	Returns:
		A tf.data.Dataset that yields one (text, target) pair per row of ``df``.
	"""
	features = tf.convert_to_tensor(df[input_col], dtype=tf.string)
	labels = tf.convert_to_tensor(df[target_col], dtype=tf.int8)
	return tf.data.Dataset.from_tensor_slices((features, labels))

# One (text, stars) dataset per split, built with the default column names
train_dataset, val_dataset, test_dataset = (
	convert_text_df_to_dataset(split_df)
	for split_df in (df_train, df_val, df_test)
)

## Bigram IDF Vectorization - Categorical Classification

In [None]:
# Vocabulary size cap for the vectorizer (also the model's input width)
max_tokens = 30000

# TF-IDF weighted n-gram vectorizer (ngrams=2).
# IDF vectorization is used to match train_vectorized.ipynb so timings can be compared.
text_vectorization_idf = TextVectorization(
	max_tokens=max_tokens,
	ngrams=2,
	output_mode="tf_idf",
)

# Learn the vocabulary and IDF weights from the training text only
text_vectorization_idf.adapt(df_train["text"])


def _vectorize_example(text, stars):
	"""Map a (text, stars) pair to (tf-idf vector, stars - 1)."""
	return text_vectorization_idf(text), stars-1


# Apply the same mapping to every split, letting tf.data choose the parallelism
train_dataset_vectorized = train_dataset.map(_vectorize_example, num_parallel_calls=AUTO)
val_dataset_vectorized = val_dataset.map(_vectorize_example, num_parallel_calls=AUTO)
test_dataset_vectorized = test_dataset.map(_vectorize_example, num_parallel_calls=AUTO)

In [None]:
# Optimize dataset feedthrough: pick a batch size, then batch and prefetch every split.

# TPUs seem to like big batches: 16 * 128 examples per replica on TPU, 128 otherwise.
# Increasing the batch size by a factor of 128 gave about a 4x speedup.
batch_size = (16 * 128 * tpu_strategy.num_replicas_in_sync) if using_tpu else 128

# NOTE: despite its name, this is the number of full batches in one pass over
# the training split; it is what gets passed to fit() as steps_per_epoch.
num_train_epochs = len(df_train) // batch_size

# Repeat, then batch
# https://www.tensorflow.org/guide/data#processing_multiple_epochs
#
# drop_remainder=True keeps the batch dimension static (shape propagation)
# https://www.tensorflow.org/guide/data#simple_batching
#
# num_parallel_calls on batch
# https://www.tensorflow.org/api_docs/python/tf/data/Dataset#batch
#
# An infinite training dataset avoids a partial last batch in each epoch, so
# gradients never need rescaling for the actual batch size. Only the training
# split is repeated; no shuffle is applied in this pipeline.
# https://www.tensorflow.org/guide/tpu#load_the_dataset
train_dataset_vectorized = (
	train_dataset_vectorized
	.repeat()
	.batch(batch_size, drop_remainder=True, num_parallel_calls=AUTO)
	.prefetch(AUTO)
)
val_dataset_vectorized = (
	val_dataset_vectorized
	.batch(batch_size, drop_remainder=True, num_parallel_calls=AUTO)
	.prefetch(AUTO)
)
# The test split keeps its final partial batch so every example gets a prediction
test_dataset_vectorized = (
	test_dataset_vectorized
	.batch(batch_size, drop_remainder=False, num_parallel_calls=AUTO)
	.prefetch(AUTO)
)

## Train Model

In [None]:
# Build Model
def create_model_categorical(
	max_tokens,
	model_name,
	*,
	hidden_units=(32, 16),
	dropout_rate=0.25,
	num_classes=5,
	steps_per_execution=32,
):
	"""Build and compile a dense softmax classifier over vectorized text.

	With the default keyword arguments this is the original architecture:
	Dense(32) -> Dropout(0.25) -> Dense(16) -> Dropout(0.25) -> Dense(5, softmax).

	Args:
		max_tokens: Width of the input vector.
		model_name: Name given to the Keras model.
		hidden_units: Units of each hidden Dense (relu) layer, in order. Each
			hidden layer is followed by a Dropout layer.
		dropout_rate: Rate used by every Dropout layer.
		num_classes: Number of output classes (units of the softmax layer).
		steps_per_execution: Passed through to ``Model.compile``.

	Returns:
		The compiled ``keras.Model``, using rmsprop and sparse categorical
		cross-entropy, and tracking sparse categorical accuracy.
	"""
	inputs = keras.Input(shape=(max_tokens,))

	x = inputs
	for units in hidden_units:
		x = keras.layers.Dense(units, activation="relu")(x)
		x = keras.layers.Dropout(dropout_rate)(x)
	outputs = keras.layers.Dense(num_classes, activation="softmax")(x)

	model = keras.Model(inputs, outputs, name=model_name)

	# To reduce Python overhead and maximize the performance of your TPU, pass in the argument steps_per_execution to Model.compile.
	# https://www.tensorflow.org/guide/tpu#train_the_model_using_keras_high-level_apis
	model.compile(
		optimizer="rmsprop",
		loss="sparse_categorical_crossentropy",
		metrics=["sparse_categorical_accuracy"],
		steps_per_execution=steps_per_execution,
	)

	return model

In [None]:
model_name = "vectorized_categorical_idf_fast_train"

# Creating the model in the TPUStrategy scope means we will train the model on the TPU
if using_tpu:
	with tpu_strategy.scope():
		model = create_model_categorical(max_tokens, model_name)
else:
	model = create_model_categorical(max_tokens, model_name)

# Checkpoint the model with the lowest val_loss under a name derived from
# model_name, and stop early once val_loss stops improving by at least 0.01
# for 5 epochs.
model_path = f"models/{model_name}.keras"
callbacks = [
	keras.callbacks.ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True),
	keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.01, patience=5, verbose=1, restore_best_weights=False)
]

# Train Model
# Previous throughput was about 29s per training epoch. Let's see if we can beat that with the optimization changes.
# The training dataset repeats forever, so steps_per_epoch bounds each epoch.
model.fit(train_dataset_vectorized,
	validation_data=val_dataset_vectorized,
	steps_per_epoch=num_train_epochs,
	epochs=20,
	callbacks=callbacks
)

# Evaluate the best checkpoint rather than the weights from the final epoch
# (EarlyStopping above does not restore the best weights itself).
model = keras.models.load_model(model_path)
predictions = model.predict(test_dataset_vectorized)
predictions = np.argmax(predictions, axis = -1)

# Read the labels straight from df_test instead of iterating the vectorized
# dataset a second time, which would re-run the TF-IDF vectorization over the
# whole test split only to throw the features away. The test dataset is built
# from df_test in row order, is not shuffled and keeps its last partial batch,
# so the order matches the predictions. The cast and "- 1" mirror the int8
# conversion and zero-based shift applied when the dataset was built.
true_labels = df_test["stars"].to_numpy(dtype=np.int8) - 1
mae = mean_absolute_error(true_labels, predictions)
mse = mean_squared_error(true_labels, predictions)


# Output Model Metrics
metrics_text = f"Model {model_name} with MAE {mae:.3f} and MSE {mse:.3f}\n"
print(metrics_text)
# Append so that metrics from earlier runs are kept
with open("model_metrics.txt", "a", encoding="utf-8") as f:
	f.write(metrics_text)

In [None]:
# Zip Models
!zip -r "models.zip" "models"

try: 
	from google.colab import files
	files.download("models.zip")
	files.download("model_metrics.txt")
except:
	pass