# Training Transformers

Dataset - https://github.com/Rishav09/Neural-Machine-Translation-System/blob/master/english-german-both.pkl

In [7]:
from pickle import load
from numpy.random import shuffle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import convert_to_tensor, int64

In [17]:
filename = '/Users/a.daggula/Downloads/english-german-both.pkl'
clean_dataset = load(open(filename, 'rb'))

In [18]:
clean_dataset[0:5]

array([['i like both', 'ich mag beide'],
       ['she misses him', 'er fehlt ihr'],
       ['i followed him', 'ich folgte ihm'],
       ['its unusual', 'es ist ungewohnlich'],
       ['she sounded mad', 'sie klang wutend']], dtype='<U370')

In [19]:
clean_dataset.shape

(10000, 2)

In [29]:
class PrepareDataset:
    def __init__(self,**kwargs):
        self.n_sentences = 9999
        self.train_split = 0.9
    
    def create_tokenizer(self, dataset):
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(dataset)
        return tokenizer
    
    def find_seq_length(self, dataset):
        return max(len(seq.split()) for seq in dataset)
    
    def find_vocab_size(self, tokenizer, dataset):
        tokenizer.fit_on_texts(dataset)
        return len(tokenizer.word_index) + 1
    
    def __call__(self, filename, **kwargs):
        # Load a Dataset
        clean_dataset = load(open(filename, 'rb'))
        
        # Reduce Dataset Size
        dataset = clean_dataset[:self.n_sentences, :]
        
        # Add Start and Stop Tokens
        for i in range(dataset[:,0].size):
            dataset[i, 0] = "<START>" + dataset[i, 0] + "<EOS>"
            dataset[i, 1] = "<START>" + dataset[i, 1] + "<EOS>"
        
        # Random Shuffle the dataset
        shuffle(dataset)
        
        # Split the dataset
        train = dataset[:int(self.n_sentences * self.train_split)]
        
        # Prepare tokenizer for the encoder input
        enc_tokenizer = self.create_tokenizer(train[:, 0])
        enc_seq_length = self.find_seq_length(train[:, 0])
        enc_vocab_size = self.find_vocab_size(enc_tokenizer, train[:, 0])
        
        # Encode and pad the input sequences
        trainX = enc_tokenizer.texts_to_sequences(train[:, 0])
        trainX = pad_sequences(trainX, maxlen=enc_seq_length, padding='post')
        trainX = convert_to_tensor(trainX, dtype=int64)
        
        #Prepare tokenizer for the decoder input
        dec_tokenizer = self.create_tokenizer(train[:, 1])
        dec_seq_length = self.find_seq_length(train[:, 1])
        dec_vocab_size = self.find_vocab_size(dec_tokenizer, train[:, 1])
        
        # Encode and pad the input sequences
        trainY = dec_tokenizer.texts_to_sequences(train[:, 1])
        trainY = pad_sequences(trainY, maxlen=dec_seq_length, padding='post')
        trainY = convert_to_tensor(trainY, dtype=int64)
        
        return (trainX, trainY, train, enc_seq_length, dec_seq_length, enc_vocab_size, dec_vocab_size)

In [37]:
dataset = PrepareDataset()
trainX, trainY, train_orig, enc_seq_length, dec_seq_length, \
     enc_vocab_size, dec_vocab_size = dataset(filename)

print(train_orig[20, 0], '\n', trainX[20, :])
print(train_orig[0, 1], '\n', trainY[0, :])

<START>who bought this<EOS> 
 tf.Tensor([  1  47 455  21   2], shape=(5,), dtype=int64)
<START>tom war ein spion<EOS> 
 tf.Tensor([   1    5   26   16 1005    2    0    0    0], shape=(9,), dtype=int64)


In [36]:
print('Encoder sequence length:', enc_seq_length)
print('Decoder sequence length:', dec_seq_length)

Encoder sequence length: 5
Decoder sequence length: 10


Apply Padding Mask

In [38]:
def loss_fcn(target, prediction):
    mask = math.logical_not(target, 0)
    mask = cast(mask, float32)
    
    loss = sparse_categorical_crossentropy(target, prediction, from_logits=True) * mask
    return reduce_sum(loss) / reduce_sum(mask)

In [3]:
def accuracy_fcn(target, prediction):
    mask = math.logical_not(math.equal(target,0))
    
    # Find equal prediction and target values, and apply the padding mask
    accuracy = equal(target, argmax(prediction, axis=2))
    accuracy = math.logical_and(mask, accuracy)
    # Cast the True/False values to 32-bit-precision floating-point numbers
    mask = cast(mask, float32)
    accuracy = cast(accuracy, float32)
    # Compute the mean accuracy over the unmasked values
    return reduce_sum(accuracy) / reduce_sum(mask)

#### Train the transformer Model


In [5]:
 # Define the model parameters
h = 8 # Number of self-attention heads
d_k = 64 # Dimensionality of the linearly projected queries and keys d_v = 64 # Dimensionality of the linearly projected values
d_model = 512 # Dimensionality of model layers' outputs
d_ff = 2048 # Dimensionality of the inner fully connected layer
n = 6 # Number of layers in the encoder stack
# Define the training parameters
epochs = 2
batch_size = 64
beta_1 = 0.9
beta_2 = 0.98
epsilon = 1e-9
dropout_rate = 0.1


In [None]:
class LRScheduler(LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000, **kwargs):
        super().__init__(**kwargs)
        self.d_model = cast(d_model, float32)
        self.warmup_steps = warmup_steps
        
    def __call__(self, step_num):
        # Linearly increasing the learning rate for the first warmup_steps, and # decreasing it thereafter
        arg1 = step_num ** -0.5
        arg2 = step_num * (self.warmup_steps ** -1.5)
        return (self.d_model ** -0.5) * math.minimum(arg1, arg2)