Demonstrating a transformer-style architecture in TensorFlow on a tiny English-German toy dataset (three hard-coded sentence pairs standing in for a small subset of the WMT English-German parallel corpus).

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, Embedding, Masking, Dropout, LayerNormalization
from tensorflow.keras.layers import MultiHeadAttention, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import numpy as np


Define input sequences

In [None]:
# Toy parallel corpus: each tuple is (English source, German translation).
sentence_pairs = [
    ('hello world', 'hallo welt'),
    ('goodbye world', 'auf wiedersehen welt'),
    ('how are you', 'wie geht es dir'),
]

# Split the pairs into two aligned lists: sources and their translations.
input_seq, target_seq = map(list, zip(*sentence_pairs))


Define tokenizer

In [None]:
# Character-level tokenizer shared by both languages. filters='' disables the
# default character filtering, so no character is stripped before indexing.
tokenizer = Tokenizer(char_level=True, filters='')

# Build one vocabulary from the source and target sentences together.
corpus = input_seq + target_seq
tokenizer.fit_on_texts(corpus)

Convert text sequences to integer sequences

In [None]:
# Replace each sentence with its list of integer token ids (source first,
# then target), reusing the same variable names for the encoded data.
input_seq, target_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (input_seq, target_seq)
)

Pad input and target sequences to a common length

In [None]:

# The longest encoded sentence on either side sets the common padded length.
max_len = max(map(len, input_seq + target_seq))

# padding='post' puts the padding after the real tokens rather than before them.
pad_options = dict(maxlen=max_len, padding='post')
input_seq = pad_sequences(input_seq, **pad_options)
target_seq = pad_sequences(target_seq, **pad_options)

Define model inputs

In [None]:

# Both model inputs are sequences of max_len token ids (batch axis implicit).
sequence_shape = (max_len,)
encoder_input = Input(shape=sequence_shape)
decoder_input = Input(shape=sequence_shape)

Define embedding layers

In [None]:

embedding_dim = 32

# One extra row beyond the tokenizer's vocabulary so that id 0 (the padding
# value used by pad_sequences) is a valid index; mask_zero=True marks it as padding.
vocab_size = len(tokenizer.word_index) + 1

# Separate embedding tables for the encoder and decoder sides.
embedding_encoder = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    mask_zero=True,
)(encoder_input)
embedding_decoder = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    mask_zero=True,
)(decoder_input)

Define masking layers

In [None]:
# The Embedding layers are built with mask_zero=True, so their outputs already
# carry the padding mask. A Masking(mask_value=0.0) layer here would recompute
# the mask from the embedding *values* (only all-zero vectors count as padding)
# and replace the id-based mask; since the embedding row for id 0 is a learned,
# generally non-zero vector, padded steps would end up unmasked.
# The embeddings are therefore passed through unchanged; the names are kept so
# the downstream attention cell keeps working.
masking_encoder = embedding_encoder
masking_decoder = embedding_decoder

Define multi-head attention layers

In [None]:
num_heads = 4
key_dim = 32
value_dim = 32

# Self-attention on each side: the same tensor is passed as query and value
# (the key defaults to the value when it is not given).
# NOTE(review): the decoder attention has no causal mask and there is no
# encoder-decoder (cross) attention — confirm whether that is intended for
# this demonstration.
encoder_self_attention = MultiHeadAttention(
    num_heads=num_heads,
    key_dim=key_dim,
    value_dim=value_dim,
)
attention_encoder = encoder_self_attention(query=masking_encoder, value=masking_encoder)

decoder_self_attention = MultiHeadAttention(
    num_heads=num_heads,
    key_dim=key_dim,
    value_dim=value_dim,
)
attention_decoder = decoder_self_attention(query=masking_decoder, value=masking_decoder)

Define feedforward layers

In [None]:

feedforward_dim = 64


def _position_wise_ffn(sequence):
    """Run a Dense(relu) -> Dropout(0.5) -> Dense stack over every time step.

    Args:
        sequence: Keras tensor to transform; each layer is wrapped in
            TimeDistributed, so it is applied per time step.

    Returns:
        Keras tensor whose last dimension is ``embedding_dim``.
    """
    hidden = TimeDistributed(Dense(feedforward_dim, activation='relu'))(sequence)
    hidden = TimeDistributed(Dropout(0.5))(hidden)
    return TimeDistributed(Dense(embedding_dim))(hidden)


# Each call creates a fresh set of layers, so the encoder and decoder stacks
# do not share weights.
feedforward_encoder = _position_wise_ffn(attention_encoder)
feedforward_decoder = _position_wise_ffn(attention_decoder)

Define the concatenation, normalization, and output layers

In [None]:

# Merge encoder and decoder features position by position. Concatenating on
# the feature axis keeps the time dimension at max_len, so each prediction
# lines up with one target token. Joining on axis=1 instead would stack the
# two sequences in time (2 * max_len steps) and no longer match the
# (batch, max_len) integer labels passed to fit/evaluate.
concatenation = Concatenate(axis=-1)([feedforward_encoder, feedforward_decoder])

# Normalize the merged features.
normalization = LayerNormalization()(concatenation)

# Per-time-step softmax over the vocabulary (+1 for the padding id 0).
output = TimeDistributed(Dense(len(tokenizer.word_index) + 1, activation='softmax'))(normalization)

Define model

In [None]:

# Two-input model: [encoder token ids, decoder token ids] -> per-step token
# probabilities.
model_inputs = [encoder_input, decoder_input]
model = Model(inputs=model_inputs, outputs=output)


Define optimizer

In [None]:

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and is rejected as an unknown argument by current Keras releases.
optimizer = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.98, epsilon=1e-9)


Compile model

In [None]:


# Sparse categorical cross-entropy because the targets are integer token ids,
# not one-hot vectors. The accuracy metric is compiled in so that
# model.evaluate returns both a loss and an accuracy; the evaluation cell
# unpacks exactly those two values.
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

Train model on subset of data

In [None]:
# Upper bound on the number of sentence pairs used for training; slicing
# simply returns everything when fewer pairs are available.
num_samples = 100

train_encoder_in = input_seq[:num_samples]
train_decoder_in = target_seq[:num_samples]
train_labels = target_seq[:num_samples]

# NOTE(review): the decoder input and the labels are the same, unshifted
# target sequences, so the model is shown the tokens it must predict —
# confirm whether a shifted (teacher-forcing) setup was intended.
history = model.fit(
    [train_encoder_in, train_decoder_in],
    train_labels,
    epochs=10,
    batch_size=32,
    validation_split=0.2,  # the last 20% of the training slice is held out for validation
)

Evaluate model on test data

In [None]:

# Everything after the first num_samples pairs is treated as the test split.
held_out_encoder_in = input_seq[num_samples:]
held_out_decoder_in = target_seq[num_samples:]
held_out_labels = target_seq[num_samples:]

if len(held_out_labels) == 0:
    # With fewer than num_samples pairs the test split is empty, and
    # model.evaluate raises on empty input; skip it instead of crashing.
    print(f'No held-out examples beyond the first {num_samples}; skipping evaluation.')
    test_loss = test_acc = None
else:
    # return_dict=True gives results by name, so this works whether or not an
    # accuracy metric was compiled (test_acc is None when it was not).
    metrics_by_name = model.evaluate(
        [held_out_encoder_in, held_out_decoder_in],
        held_out_labels,
        verbose=2,
        return_dict=True,
    )
    test_loss = metrics_by_name['loss']
    test_acc = metrics_by_name.get('accuracy')

Print test accuracy

In [None]:

# Report the accuracy measured on the held-out examples.
accuracy_label = 'Test accuracy:'
print(accuracy_label, test_acc)