# Training Transformers

Dataset - https://github.com/Rishav09/Neural-Machine-Translation-System/blob/master/english-german-both.pkl

In [26]:
from pickle import load
from numpy.random import shuffle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import convert_to_tensor, int64
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import LearningRateSchedule
from tensorflow import data, train, math, reduce_sum, cast, equal, argmax, \
float32, GradientTape, TensorSpec, function, int64
from tensorflow.keras.metrics import Mean
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow import math, cast, float32, linalg, ones, maximum, newaxis
%run 09_Transformer_Encoder_Decoder.ipynb

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
tf.Tensor([0. 0. 0. 0. 1. 1. 1.], shape=(7,), dtype=float32)
Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 5, 512)]     0           []                               
                                                                                                  
 multi_head_attention_123 (Mult  (None, 5, 512)      131776      ['input_3[0][0]',                
 iHeadAttention)                                                  'input_3[0][0]',                
                                                                  'input_3[0][0]']                
                                                                                                  
 dropout_223 (Dropout)          (None, 5, 

In [27]:
filename = '/Users/a.daggula/Downloads/english-german-both.pkl'
clean_dataset = load(open(filename, 'rb'))

In [28]:
clean_dataset[0:5]

array([['i like both', 'ich mag beide'],
       ['she misses him', 'er fehlt ihr'],
       ['i followed him', 'ich folgte ihm'],
       ['its unusual', 'es ist ungewohnlich'],
       ['she sounded mad', 'sie klang wutend']], dtype='<U370')

In [29]:
clean_dataset.shape

(10000, 2)

In [30]:
class PrepareDataset:
    def __init__(self,**kwargs):
        self.n_sentences = 9999
        self.train_split = 0.9
    
    def create_tokenizer(self, dataset):
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(dataset)
        return tokenizer
    
    def find_seq_length(self, dataset):
        return max(len(seq.split()) for seq in dataset)
    
    def find_vocab_size(self, tokenizer, dataset):
        tokenizer.fit_on_texts(dataset)
        return len(tokenizer.word_index) + 1
    
    def __call__(self, filename, **kwargs):
        # Load a Dataset
        clean_dataset = load(open(filename, 'rb'))
        
        # Reduce Dataset Size
        dataset = clean_dataset[:self.n_sentences, :]
        
        # Add Start and Stop Tokens
        for i in range(dataset[:,0].size):
            dataset[i, 0] = "<START>" + dataset[i, 0] + "<EOS>"
            dataset[i, 1] = "<START>" + dataset[i, 1] + "<EOS>"
        
        # Random Shuffle the dataset
        shuffle(dataset)
        
        # Split the dataset
        train = dataset[:int(self.n_sentences * self.train_split)]
        
        # Prepare tokenizer for the encoder input
        enc_tokenizer = self.create_tokenizer(train[:, 0])
        enc_seq_length = self.find_seq_length(train[:, 0])
        enc_vocab_size = self.find_vocab_size(enc_tokenizer, train[:, 0])
        
        # Encode and pad the input sequences
        trainX = enc_tokenizer.texts_to_sequences(train[:, 0])
        trainX = pad_sequences(trainX, maxlen=enc_seq_length, padding='post')
        trainX = convert_to_tensor(trainX, dtype=int64)
        
        #Prepare tokenizer for the decoder input
        dec_tokenizer = self.create_tokenizer(train[:, 1])
        dec_seq_length = self.find_seq_length(train[:, 1])
        dec_vocab_size = self.find_vocab_size(dec_tokenizer, train[:, 1])
        
        # Encode and pad the input sequences
        trainY = dec_tokenizer.texts_to_sequences(train[:, 1])
        trainY = pad_sequences(trainY, maxlen=dec_seq_length, padding='post')
        trainY = convert_to_tensor(trainY, dtype=int64)
        
        return (trainX, trainY, train, enc_seq_length, dec_seq_length, enc_vocab_size, dec_vocab_size)

In [31]:
dataset = PrepareDataset()
trainX, trainY, train_orig, enc_seq_length, dec_seq_length, \
     enc_vocab_size, dec_vocab_size = dataset(filename)

print(train_orig[20, 0], '\n', trainX[20, :])
print(train_orig[0, 1], '\n', trainY[0, :])

<START>they swam<EOS> 
 tf.Tensor([  1  49 595   2   0], shape=(5,), dtype=int64)
<START>ich liebe meine familie<EOS> 
 tf.Tensor([  1   3  56  57 484   2   0   0   0   0], shape=(10,), dtype=int64)


In [32]:
print('Encoder sequence length:', enc_seq_length)
print('Decoder sequence length:', dec_seq_length)

Encoder sequence length: 5
Decoder sequence length: 10


Apply Padding Mask

In [33]:
def loss_fcn(target, prediction):
    mask = math.logical_not(equal(target, 0))
    mask = cast(mask, float32)
    
    loss = sparse_categorical_crossentropy(target, prediction, from_logits=True) * mask
    return reduce_sum(loss) / reduce_sum(mask)

In [34]:
def accuracy_fcn(target, prediction):
    mask = math.logical_not(math.equal(target,0))
    
    # Find equal prediction and target values, and apply the padding mask
    accuracy = equal(target, argmax(prediction, axis=2))
    accuracy = math.logical_and(mask, accuracy)
    # Cast the True/False values to 32-bit-precision floating-point numbers
    mask = cast(mask, float32)
    accuracy = cast(accuracy, float32)
    # Compute the mean accuracy over the unmasked values
    return reduce_sum(accuracy) / reduce_sum(mask)

#### Train the transformer Model


In [35]:
 # Define the model parameters
h = 8 # Number of self-attention heads
d_k = 64 # Dimensionality of the linearly projected queries and keys d_v = 64 # Dimensionality of the linearly projected values
d_model = 512 # Dimensionality of model layers' outputs
d_ff = 2048 # Dimensionality of the inner fully connected layer
n = 6 # Number of layers in the encoder stack
# Define the training parameters
epochs = 2
batch_size = 64
beta_1 = 0.9
beta_2 = 0.98
epsilon = 1e-9
dropout_rate = 0.1


In [36]:
class LRScheduler(LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000, **kwargs):
        super().__init__(**kwargs)
        self.d_model = cast(d_model, float32)
        self.warmup_steps = warmup_steps
        
    def __call__(self, step_num):
        # Linearly increasing the learning rate for the first warmup_steps, and # decreasing it thereafter
        arg1 = step_num ** -0.5
        arg2 = step_num * (self.warmup_steps ** -1.5)
        return (self.d_model ** -0.5) * math.minimum(arg1, arg2)

In [37]:
optimizer = Adam(LRScheduler(d_model), beta_1, beta_2, epsilon)

In [38]:
train_dataset = data.Dataset.from_tensor_slices((trainX, trainY))
train_dataset = train_dataset.batch(batch_size)

In [39]:
training_model = TransformerModel(enc_vocab_size, dec_vocab_size, enc_seq_length,
                                   dec_seq_length, h, d_k, d_v, d_model, d_ff, n,
                                   dropout_rate)

#### TRAINING LOOP

In [42]:
@function
def train_step(encoder_input, decoder_input, decoder_ouput):
    with GradientTape() as tape:
        # Run the forward pass of the model to generate a prediction
        prediction = training_model(encoder_input, decoder_input, training=True)

        # Compute a training loss
        loss = loss_fcn(decoder_input, prediction)
        
        # Compute the training_accuracy
        accuracy = accuracy_fcn(decoder_output, prediction)
    
    # Retrieve gradients of the trainable weights with respect to the training loss
    gradients = tape.gradient(loss, training_model.trainable_weights)
    
    # Update the values of the trainable variables by gradient descent
    optimizer.apply_gradients(zip(gradients, training_model.trainable_weights))
    
    train_loss(loss)
    train_accuracy(accuracy)

In [43]:
train_loss = Mean(name='train_loss')
train_accuracy = Mean(name='train_accuracy')


# Create a checkpoint object and manager to manage multiple checkpoints
ckpt = train.Checkpoint(model=training_model, optimizer=optimizer)
ckpt_manager = train.CheckpointManager(ckpt, "./checkpoints", max_to_keep=3)

for epoch in range(epochs): 
    train_loss.reset_states() 
    train_accuracy.reset_states()
    print("\nStart of epoch %d" % (epoch + 1))
    
    # Iterate over the dataset batches
    for step, (train_batchX, train_batchY) in enumerate(train_dataset): 
        # Define the encoder and decoder inputs, and the decoder output 
        encoder_input = train_batchX[:, 1:]
        decoder_input = train_batchY[:, :-1]
        decoder_output = train_batchY[:, 1:]
        train_step(encoder_input, decoder_input, decoder_output)
        
        if step % 50 == 0:
            print(f'Epoch {epoch + 1} Step {step} Loss {train_loss.result():.4f} '
            + f'Accuracy {train_accuracy.result():.4f}')
    
    # Print epoch number and loss value at the end of every epoch
    print(f"Epoch {epoch +1}: Training Loss {train_loss.result():.4f}, " + f"Training Accuracy {train_accuracy.result():.4f}")
    # Save a checkpoint after every five epochs
    if (epoch + 1) % 5 == 0:
        save_path = ckpt_manager.save()
        print("Saved checkpoint at epoch %d" % (epoch + 1))


Start of epoch 1
Epoch 1 Step 0 Loss 8.1550 Accuracy 0.0000
Epoch 1 Step 50 Loss 7.2856 Accuracy 0.0169
Epoch 1 Step 100 Loss 6.5785 Accuracy 0.0847
Epoch 1: Training Loss 6.1463, Training Accuracy 0.1192

Start of epoch 2
Epoch 2 Step 0 Loss 4.8509 Accuracy 0.2066
Epoch 2 Step 50 Loss 4.3457 Accuracy 0.1792
Epoch 2 Step 100 Loss 4.0256 Accuracy 0.1527
Epoch 2: Training Loss 3.8054, Training Accuracy 0.1401
