In [1]:
### data used is already cleaned

In [1]:
from pickle import load
from numpy.random import shuffle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import convert_to_tensor, int64

In [2]:
class PrepareDataset:
    def __init__(self,**kwargs):
        super(PrepareDataset,self).__init__(**kwargs)
        self.n_sentences=10000 #number of sentences to include in the dataset
        self.train_split=0.9 #ratio of training data split
    
    #fit tokenizer
    def create_tokenizer(self,dataset):
        tokenizer=Tokenizer()
        tokenizer.fit_on_texts(dataset)
        return tokenizer
    
    def find_seq_length(self,dataset):
        return max(len(seq.split()) for seq in dataset)
    
    def find_vocab_size(self,tokenizer,dataset):
        tokenizer.fit_on_texts(dataset)
        
        return len(tokenizer.word_index)+1
    
    def __call__(self,filename,**kwargs):
        #load clean dataset
        clean_dataset=load(open(filename,'rb'))
        
        #reduce dataset size
        dataset=clean_dataset[:self.n_sentences,:]
        
        #include start and end of string tokens
        for i in range(dataset[:,0].size):
            dataset[i,0]="<START> "+dataset[i,0]+" <EOS>"
            dataset[i,1]="<START> "+dataset[i,0]+" <EOS>"
        
        #random shuffle the dataset
        shuffle(dataset)
        
        #split the dataset
        train=dataset[:int(self.n_sentences*self.train_split)]
        
        #prepare tokenizer for the encoder input
        enc_tokenizer=self.create_tokenizer(train[:,0])
        enc_seq_length=self.find_seq_length(train[:,0])
        enc_vocab_size=self.find_vocab_size(enc_tokenizer,train[:,0])
        
        #encode and pad the input sequences
        trainX=enc_tokenizer.texts_to_sequences(train[:,0])
        trainX=pad_sequences(trainX,maxlen=enc_seq_length,padding='post')
        trainX=convert_to_tensor(trainX,dtype=int64)
        
        #prepare dataset for decoder input
        dec_tokenizer=self.create_tokenizer(train[:,1])
        dec_seq_length=self.find_seq_length(train[:,1])
        dec_vocab_size=self.find_vocab_size(dec_tokenizer,train[:,1])
        
        #encode and pad the input sequences
        trainY=dec_tokenizer.texts_to_sequences(train[:,1])
        trainY=pad_sequences(trainY,maxlen=dec_seq_length,padding='post')
        trainY=convert_to_tensor(trainY,dtype=int64)
        
        return trainX,trainY,train,enc_seq_length,dec_seq_length,enc_vocab_size,dec_vocab_size
    

In [3]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import LearningRateSchedule
from tensorflow.keras.metrics import Mean
from tensorflow import data,train,math,reduce_sum,cast,equal,argmax,float32,GradientTape,TensorSpec,function,int64
from tensorflow.keras.losses import sparse_categorical_crossentropy
from time import time

In [None]:
import import_ipynb
from Transformer import TransformerModel

In [5]:
# Define the model parameters
h = 8  # Number of self-attention heads
d_k = 64  # Dimensionality of the linearly projected queries and keys
d_v = 64  # Dimensionality of the linearly projected values
d_model = 512  # Dimensionality of model layers' outputs
d_ff = 2048  # Dimensionality of the inner fully connected layer
n = 6  # Number of layers in the encoder stack
 
# Define the training parameters
epochs = 2
batch_size = 64
beta_1 = 0.9
beta_2 = 0.98
epsilon = 1e-9
dropout_rate = 0.1

In [10]:
#implementing a learning rate scheduler
class LRScheduler(LearningRateSchedule):
    def __init__(self,d_model,warmup_steps=4000,**kwargs):
        super(LRScheduler,self).__init__(*kwargs)
        
        self.d_model=cast(d_model,float32)
        self.warmup_steps=warmup_steps
    
    def __call__(self,step_num):
        #linearly increasing the learning rate for the first warmup_steps and decreasing it thereafter
        # print(type(step_num))
        arg1=cast(step_num,float32)**-0.5
        # arg1=1.0/(step_num** 0.5)
        arg2=cast(step_num,float32)*(1.0/(self.warmup_steps**1.5))
        
        return (1.0/(self.d_model**0.5))*math.minimum(arg1,arg2)
    
#instantiate an adam optimizer
optimizer=Adam(LRScheduler(d_model),beta_1,beta_2,epsilon)

#prepare the training and test splits of the dataset
dataset=PrepareDataset()
trainX,trainY,train_orig,enc_seq_length,dec_seq_length,enc_vocab_size,dec_vocab_size=dataset('english-german-both.pkl')

#prepare dataset batches
train_dataset=data.Dataset.from_tensor_slices((trainX,trainY))
train_dataset=train_dataset.batch(batch_size)

#create model
training_model=TransformerModel(enc_vocab_size,dec_vocab_size,enc_seq_length,dec_seq_length,h,d_k,d_v,d_model,d_ff,n,dropout_rate)

#defining the loss function

def loss_fcn(target,prediction):
    #create mask so that the zero padding values are not included in the computation of loss
    padding_mask=math.logical_not(equal(target,0))
    padding_mask=cast(padding_mask,float32)
    
    #compute a sparse categorical cross_entropy loss on the unmasked values
    loss=sparse_categorical_crossentropy(target,prediction,from_logits=True)
    
    #compute the mean loss over the unmasked values
    return reduce_sum(loss)/reduce_sum(padding_mask)

#defining the accuracy function
def accuracy_fcn(target,prediction):
    #create mask so that the zero padding values are not included in the computaion of accuracy
    padding_mask=math.logical_not(equal(target,0))
    
    #find equal prediction and target values, and apply the padding mask
    accuracy=equal(target,argmax(prediction,axis=2))
    accuracy=math.logical_and(padding_mask,accuracy)
    
    #cast true/false values to 32-bit precision floating-point numbers
    padding_mask=cast(padding_mask,float32)
    accuracy=cast(accuracy,float32)
    
    #compute the mean accuracy over the unmasked values
    return reduce_sum(accuracy)/reduce_sum(padding_mask)

#include metrics monitoring
train_loss=Mean(name='train_loss')
train_accuracy=Mean(name='train_accuracy')

#create a checkpoint object and manager to manage multiple checkpoint
ckpt=train.Checkpoint(model=training_model,optimizer=optimizer)
ckpt_manager=train.CheckpointManager(ckpt,"checkpoints",max_to_keep=3)

#speeding up the training process

@function
def train_step(encoder_input,decoder_input,decoder_output):
    with GradientTape() as tape:
        #run the forward pass of the model to generate prediciton
        prediction=training_model(encoder_input=encoder_input,decoder_input=decoder_input,training=True)
        
        #compute the training loss
        loss=loss_fcn(decoder_output,prediction)
        
        #compute the training accuracy
        accuracy=accuracy_fcn(decoder_output,prediction)
    
    #retrieve gradients of the trainable variables with respect to the training loss
    gradients=tape.gradient(loss,training_model.trainable_variables)
    
    #update the values of the trainable variables by gradient descent
    optimizer.apply_gradients(zip(gradients,training_model.trainable_weights))
    
    train_loss(loss)
    train_accuracy(accuracy)


for epoch in range(epochs):
    train_loss.reset_state()
    train_accuracy.reset_state()
    
    print("\nStart of epoch %d"%(epoch+1))
    start_time=time()
    
    for step, (train_batchX,train_batchY) in enumerate(train_dataset):
        
        #define the encoder and decoder inputs and the decoder output
        encoder_input=train_batchX[:,1:]
        decoder_input=train_batchY[:,:-1]
        decoder_output=train_batchY[:,1:]
        
        train_step(encoder_input=encoder_input,decoder_input=decoder_input,decoder_output=decoder_output)
        
        if step%50==0:
            print(f'Epoch {epoch+1} step {step} loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')
        
        #print epoch number and loss value at end of every epoch
        print("Epoch %d: Training Loss %.4f, Training Accuracy %.4f" % (epoch + 1, train_loss.result(), train_accuracy.result()))
        
        #save a checkpoint after every five epochs
        if(epoch+1)%5==0:
            save_path=ckpt_manager.save()
            print("saved checkpoint at epoch %d"%(epoch+1))


print("total time taken: %.2fs" %(time()-start_time))


Start of epoch 1
Epoch 1 step 0 loss 11.1791 Accuracy 0.0000
Epoch 1: Training Loss 11.1791, Training Accuracy 0.0000
Epoch 1: Training Loss 10.9301, Training Accuracy 0.0000
Epoch 1: Training Loss 10.8695, Training Accuracy 0.0000
Epoch 1: Training Loss 10.8142, Training Accuracy 0.0000
Epoch 1: Training Loss 10.7545, Training Accuracy 0.0000
Epoch 1: Training Loss 10.6756, Training Accuracy 0.0000
Epoch 1: Training Loss 10.6545, Training Accuracy 0.0000
Epoch 1: Training Loss 10.6762, Training Accuracy 0.0000
Epoch 1: Training Loss 10.6628, Training Accuracy 0.0000
Epoch 1: Training Loss 10.5994, Training Accuracy 0.0002
Epoch 1: Training Loss 10.5865, Training Accuracy 0.0002
Epoch 1: Training Loss 10.5557, Training Accuracy 0.0002
Epoch 1: Training Loss 10.5271, Training Accuracy 0.0002
Epoch 1: Training Loss 10.5028, Training Accuracy 0.0002
Epoch 1: Training Loss 10.4607, Training Accuracy 0.0002
Epoch 1: Training Loss 10.4220, Training Accuracy 0.0003
Epoch 1: Training Loss 10.

KeyboardInterrupt: 