# Transformer Model - Positional Embedding and Transformer Encoder

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
from sklearn.utils import shuffle
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
# Runtime setup: report the TensorFlow version, then pick the dataset path and
# detect whether a TPU is attached.
print("Tensorflow version " + tf.__version__)

try:
	# This import only resolves on Google Colab; anywhere else the
	# ModuleNotFoundError handler below selects the local dataset path.
	from google.colab import drive
	drive.mount("/content/drive")
	reviews_dataset_path = "drive/MyDrive/Colab Notebooks/reviews.json"

	# Report the runtime's RAM (total memory as seen by psutil)
	from psutil import virtual_memory
	ram_gb = virtual_memory().total / 1e9
	print('Your runtime has {:.1f} gigabytes of available RAM'.format(ram_gb))
	print('Not using a high-RAM runtime' if ram_gb < 20 else 'You are using a high-RAM runtime!')

	try:
		# TPU detection: a ValueError here is treated as "no TPU attached"
		tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
		print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])

		tf.config.experimental_connect_to_cluster(tpu)
		tf.tpu.experimental.initialize_tpu_system(tpu)
		tpu_strategy = tf.distribute.TPUStrategy(tpu)
	except ValueError:
		print("Note: Not connected to a TPU runtime.")
		using_tpu = False
	else:
		# Only reached when the whole TPU initialisation succeeded
		using_tpu = True
except ModuleNotFoundError:
	# Not on Colab: read the dataset from the local checkout and never use a TPU
	reviews_dataset_path = "yelp_dataset/reviews.json"
	using_tpu = False

# Shared tf.data tuning constant used by the prefetch/map calls further down
AUTO = tf.data.AUTOTUNE

Tensorflow version 2.7.0
Mounted at /content/drive
Your runtime has 13.6 gigabytes of available RAM
Not using a high-RAM runtime
Note: Not connected to a TPU runtime.


In [3]:
# Load the whole reviews file into a DataFrame.
# lines=True: the file is read as one JSON record per line.
review_df = pd.read_json(
	reviews_dataset_path,
	orient="records",
	lines=True,
)

In [4]:
# Shuffle the reviews with a fixed seed.
# NOTE: review_df is reassigned here, so re-running this cell shuffles the
# already shuffled frame again and produces a different split.
review_df = shuffle(review_df, random_state=0)

# Row boundaries for a 60:20:20 Train / Val / Test split
n = len(review_df)
train_end, val_end = int(n*0.6), int(n*0.8)

df_train = review_df.iloc[:train_end]
df_val = review_df.iloc[train_end:val_end]
df_test = review_df.iloc[val_end:]

In [5]:
# Batch size used when converting the Pandas DataFrames to TF Datasets.
# On a TPU a much larger batch is used (16 * 128 per replica); the original
# author observed roughly a 4x speedup from increasing the batch by a factor of 128.
# Off-TPU the batch is 4 * 128. tpu_strategy is only evaluated when using_tpu is True.
batch_size = (
	16 * 128 * tpu_strategy.num_replicas_in_sync
	if using_tpu
	else 4 * 128
)

def convert_text_df_to_dataset(df, input_col="text", target_col="stars"):
	"""Build a batched, prefetched tf.data.Dataset of (text, target) pairs.

	Args:
		df: DataFrame holding the input and target columns.
		input_col: Name of the column converted to a string tensor.
		target_col: Name of the column converted to an int8 tensor.

	Returns:
		A dataset of (text, target) slices, batched with the notebook-level
		``batch_size`` and prefetched with ``AUTO``.
	"""
	features = tf.convert_to_tensor(df[input_col], dtype=tf.string)
	labels = tf.convert_to_tensor(df[target_col], dtype=tf.int8)
	pairs = tf.data.Dataset.from_tensor_slices((features, labels))
	batched = pairs.batch(batch_size)
	return batched.prefetch(AUTO)

# One raw-text dataset per split, all built with the default text/stars columns
train_dataset, val_dataset, test_dataset = (
	convert_text_df_to_dataset(split_df)
	for split_df in (df_train, df_val, df_test)
)

## Train Model

In [6]:
# Transformer Encoder Class
class TransformerEncoder(layers.Layer):
	"""Encoder block: multi-head self-attention followed by a dense projection.

	The inputs attend to themselves; the attention output is added back to the
	inputs and layer-normalized. The result goes through a two-layer dense
	projection (relu, then linear back to ``embed_dim``), is added back again
	and layer-normalized a second time.

	Args:
		embed_dim: Used as ``key_dim`` of the attention layer and as the width
			of the final dense layer.
		dense_dim: Width of the hidden (relu) dense layer.
		num_heads: Number of attention heads.
		**kwargs: Forwarded to ``layers.Layer``.
	"""

	def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
		super().__init__(**kwargs)

		# Hyper-parameters are kept so get_config() can report them
		self.embed_dim = embed_dim
		self.dense_dim = dense_dim
		self.num_heads = num_heads

		# Sub-layers (creation order is kept stable on purpose)
		self.attention = layers.MultiHeadAttention(
			num_heads=num_heads,
			key_dim=embed_dim,
		)
		hidden_layer = layers.Dense(dense_dim, activation="relu")
		output_layer = layers.Dense(embed_dim)
		self.dense_proj = keras.Sequential([hidden_layer, output_layer])
		self.layernorm_1 = layers.LayerNormalization()
		self.layernorm_2 = layers.LayerNormalization()

	def call(self, inputs, mask=None):
		"""Run the encoder block on ``inputs``.

		Args:
			inputs: Tensor used as both query and value of the attention layer.
			mask: Optional mask. When given, an axis is inserted at position 1
				before it is handed to the attention layer as ``attention_mask``
				(presumably a (batch, sequence) padding mask — confirm).

		Returns:
			The layer-normalized output of the second residual connection.
		"""
		attention_mask = mask
		if attention_mask is not None:
			attention_mask = attention_mask[:, tf.newaxis, :]

		attended = self.attention(inputs, inputs, attention_mask=attention_mask)
		normalized = self.layernorm_1(inputs + attended)
		projected = self.dense_proj(normalized)
		return self.layernorm_2(normalized + projected)

	def get_config(self):
		"""Return the base layer config extended with this layer's constructor arguments."""
		return {
			**super().get_config(),
			"embed_dim": self.embed_dim,
			"num_heads": self.num_heads,
			"dense_dim": self.dense_dim,
		}

In [7]:
# Positional Embedding Class
class PositionalEmbedding(layers.Layer):
	"""Sum of a token embedding and a position embedding.

	Args:
		sequence_length: Number of distinct positions the position embedding
			table holds.
		input_dim: Number of distinct token ids the token embedding table holds.
		output_dim: Size of each embedding vector.
		**kwargs: Forwarded to ``layers.Layer``.
	"""

	def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
		super().__init__(**kwargs)

		# Embedding tables: tokens first, then positions
		self.token_embeddings = layers.Embedding(
			input_dim=input_dim,
			output_dim=output_dim,
		)
		self.position_embeddings = layers.Embedding(
			input_dim=sequence_length,
			output_dim=output_dim,
		)

		# Constructor arguments are kept so get_config() can report them
		self.sequence_length = sequence_length
		self.input_dim = input_dim
		self.output_dim = output_dim

	def call(self, inputs):
		"""Embed ``inputs`` and add the embedding of each position index.

		Position indices run from 0 up to the size of the last axis of ``inputs``.
		"""
		num_positions = tf.shape(inputs)[-1]
		position_ids = tf.range(start=0, limit=num_positions, delta=1)
		token_vectors = self.token_embeddings(inputs)
		position_vectors = self.position_embeddings(position_ids)
		return token_vectors + position_vectors

	def compute_mask(self, inputs, mask=None):
		"""Return a boolean mask that is True wherever ``inputs`` is not 0.

		Any incoming ``mask`` is ignored (0 is presumably the padding id
		emitted by the vectorizer — confirm).
		"""
		return tf.math.not_equal(inputs, 0)

	def get_config(self):
		"""Return the base layer config extended with this layer's constructor arguments."""
		return {
			**super().get_config(),
			"output_dim": self.output_dim,
			"sequence_length": self.sequence_length,
			"input_dim": self.input_dim,
		}

In [8]:
# Build Model
def create_transformer_model(
	vocab_size = 20000,
	sequence_length = 600,
	embed_dim = 256,
	num_heads = 2,
	dense_dim = 32,
	model_name = None,
):
	"""Build and compile the single-output (regression) transformer model.

	Pipeline: int64 token ids -> PositionalEmbedding -> TransformerEncoder ->
	global max pooling -> dropout (0.5) -> one linear output unit.

	Args:
		vocab_size: ``input_dim`` of the token embedding.
		sequence_length: Number of positions in the position embedding.
		embed_dim: Embedding size, also passed to the encoder.
		num_heads: Number of attention heads in the encoder.
		dense_dim: Hidden width of the encoder's dense projection.
		model_name: Optional name given to the Keras model.

	Returns:
		A ``keras.Model`` compiled with rmsprop, mean absolute error as loss
		and mean squared error as metric.
	"""
	token_ids = keras.Input(shape=(None,), dtype="int64")

	embedded = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(token_ids)
	encoded = TransformerEncoder(embed_dim, dense_dim, num_heads)(embedded)
	pooled = layers.GlobalMaxPooling1D()(encoded)
	regularized = layers.Dropout(0.5)(pooled)
	prediction = keras.layers.Dense(1)(regularized)

	model = keras.Model(token_ids, prediction, name=model_name)
	model.compile(
		optimizer="rmsprop",
		loss="mean_absolute_error",
		metrics=["mean_squared_error"],
	)
	return model

### Single Word Vectorization

In [9]:
# Vectorizer settings: vocabulary cap and output sequence length
max_tokens = 30000
max_length = 500
text_vectorization = TextVectorization(
	max_tokens=max_tokens,
	output_mode="int",
	output_sequence_length=max_length,
)

# Fit the vocabulary on the training text only
text_vectorization.adapt(df_train["text"])


def _vectorize_pair(text, label):
	"""Map a (text, label) element to (token ids, label)."""
	return text_vectorization(text), label


# Vectorize every split with the same mapper
train_dataset_vectorized = train_dataset.map(_vectorize_pair, num_parallel_calls=AUTO)
val_dataset_vectorized = val_dataset.map(_vectorize_pair, num_parallel_calls=AUTO)
test_dataset_vectorized = test_dataset.map(_vectorize_pair, num_parallel_calls=AUTO)

In [10]:
model_name = "transformer"

# Constructor arguments shared by both branches below
model_kwargs = dict(
	vocab_size = max_tokens,
	sequence_length = max_length,
	embed_dim = 256,
	num_heads = 2,
	dense_dim = 32,
	model_name = model_name,
)

# Creating the model in the TPUStrategy scope means we will train the model on the TPU
if using_tpu:
	with tpu_strategy.scope():
		model = create_transformer_model(**model_kwargs)
else:
	model = create_transformer_model(**model_kwargs)

# Callbacks: keep only the checkpoint with the best val_loss under the model's
# name, and stop early once val_loss has not improved by 0.01 for 5 epochs.
model_path = f"models/{model_name}.keras"
callbacks = [
	keras.callbacks.ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True),
	keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.01, patience=5, verbose=1, restore_best_weights=False)
]

model.summary()

# Train Model
model.fit(train_dataset_vectorized, validation_data=val_dataset_vectorized, epochs=20, callbacks=callbacks)

# Evaluate the best checkpoint rather than the in-memory model: early stopping
# does not restore the best weights, so the saved file is reloaded first.
model = keras.models.load_model(model_path, custom_objects={
	"TransformerEncoder": TransformerEncoder, "PositionalEmbedding": PositionalEmbedding})

# evaluate() returns [loss, metric]; the model is compiled with MAE as loss and
# MSE as metric. The result is unpacked into named values instead of being bound
# to `eval`, which would shadow the builtin eval() for every later cell.
test_mae, test_mse = model.evaluate(test_dataset_vectorized)

# Output Model Metrics (printed and appended to the shared metrics file)
metrics_text = f"Model {model_name} with MAE {test_mae:.3f} and MSE {test_mse:.3f}\n"
print(metrics_text)
with open("model_metrics.txt", "a") as f:
	f.write(metrics_text)

Model: "transformer"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 positional_embedding (Posit  (None, None, 256)        7808000   
 ionalEmbedding)                                                 
                                                                 
 transformer_encoder (Transf  (None, None, 256)        543776    
 ormerEncoder)                                                   
                                                                 
 global_max_pooling1d (Globa  (None, 256)              0         
 lMaxPooling1D)                                                  
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                       

### Categorical Classification

In [11]:
# Build Model
def create_transformer_model_categorical(
	vocab_size = 20000,
	sequence_length = 600,
	embed_dim = 256,
	num_heads = 2,
	dense_dim = 32,
	model_name = None,
):
	"""Build and compile the 6-way softmax (classification) transformer model.

	Pipeline: int64 token ids -> PositionalEmbedding -> TransformerEncoder ->
	global max pooling -> dropout (0.5) -> 6-unit softmax output.

	Args:
		vocab_size: ``input_dim`` of the token embedding.
		sequence_length: Number of positions in the position embedding.
		embed_dim: Embedding size, also passed to the encoder.
		num_heads: Number of attention heads in the encoder.
		dense_dim: Hidden width of the encoder's dense projection.
		model_name: Optional name given to the Keras model.

	Returns:
		A ``keras.Model`` compiled with rmsprop, sparse categorical
		cross-entropy as loss and sparse categorical accuracy as metric.
	"""
	token_ids = keras.Input(shape=(None,), dtype="int64")

	embedded = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(token_ids)
	encoded = TransformerEncoder(embed_dim, dense_dim, num_heads)(embedded)
	pooled = layers.GlobalMaxPooling1D()(encoded)
	regularized = layers.Dropout(0.5)(pooled)
	# Six classes (indices 0-5); presumably so the raw star rating can be used
	# directly as the class index — confirm against the label range.
	class_probabilities = keras.layers.Dense(6, activation="softmax")(regularized)

	model = keras.Model(token_ids, class_probabilities, name=model_name)
	model.compile(
		optimizer="rmsprop",
		loss="sparse_categorical_crossentropy",
		metrics=["sparse_categorical_accuracy"],
	)
	return model

In [12]:
model_name = "transformer_categorical"

# Constructor arguments shared by both branches below
model_kwargs = dict(
	vocab_size = max_tokens,
	sequence_length = max_length,
	embed_dim = 256,
	num_heads = 2,
	dense_dim = 32,
	model_name = model_name,
)

# Creating the model in the TPUStrategy scope means we will train the model on the TPU
if using_tpu:
	with tpu_strategy.scope():
		model = create_transformer_model_categorical(**model_kwargs)
else:
	model = create_transformer_model_categorical(**model_kwargs)

# Callbacks: keep only the checkpoint with the best val_loss under the model's
# name, and stop early once val_loss has not improved by 0.01 for 5 epochs.
model_path = f"models/{model_name}.keras"
callbacks = [
	keras.callbacks.ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True),
	keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.01, patience=5, verbose=1, restore_best_weights=False)
]

model.summary()

# Train Model
model.fit(train_dataset_vectorized, validation_data=val_dataset_vectorized, epochs=20, callbacks=callbacks)

# Evaluate the best checkpoint rather than the in-memory model: early stopping
# does not restore the best weights, so the saved file is reloaded first.
model = keras.models.load_model(model_path, custom_objects={
	"TransformerEncoder": TransformerEncoder, "PositionalEmbedding": PositionalEmbedding})

# Predicted class = index of the largest softmax output
class_probabilities = model.predict(test_dataset_vectorized)
predictions = np.argmax(class_probabilities, axis = -1)

# Read the true labels from the un-vectorized test dataset: it holds the same
# batches in the same order (the vectorized dataset is a map over it), and this
# avoids running text vectorization over the whole test set a second time.
true_labels = np.concatenate([y for _, y in test_dataset], axis=0)

mae = mean_absolute_error(true_labels, predictions)
mse = mean_squared_error(true_labels, predictions)

# Output Model Metrics (printed and appended to the shared metrics file)
metrics_text = f"Model {model_name} with MAE {mae:.3f} and MSE {mse:.3f}\n"
print(metrics_text)
with open("model_metrics.txt", "a") as f:
	f.write(metrics_text)

Model: "transformer_categorical"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 positional_embedding_1 (Pos  (None, None, 256)        7808000   
 itionalEmbedding)                                               
                                                                 
 transformer_encoder_1 (Tran  (None, None, 256)        543776    
 sformerEncoder)                                                 
                                                                 
 global_max_pooling1d_1 (Glo  (None, 256)              0         
 balMaxPooling1D)                                                
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                           

In [13]:
# Zip Models
!zip -r "models.zip" "models"

try: 
	from google.colab import files
	files.download("models.zip")
	files.download("model_metrics.txt")
except:
	pass

  adding: models/ (stored 0%)
  adding: models/transformer_categorical.keras (deflated 7%)
  adding: models/transformer.keras (deflated 7%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>