# Imports

In [2]:
import os
import keras
import pickle
from tqdm import tqdm
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras_nlp.layers import TransformerDecoder
from tensorflow.keras.layers import Embedding, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy

2024-08-01 19:55:25.722049: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-01 19:55:28.243592: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-01 19:55:28.693107: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-01 19:55:28.708387: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-01 19:55:30.726229: I tensorflow/core/platform/cpu_feature_guar

# Hyperparameters

In [3]:
# Filesystem locations. Keep the trailing slash: some cells below build paths
# by plain string concatenation with these values.
data_dir = 'data/'
plots_dir = 'plots/'
output_dir = 'outputs/'

# Data split and batching.
val_size = 0.05  # passed as test_size to train_test_split
batch_size = 4

# Model architecture.
embedding_dim = 32
num_heads = 4
num_tranformer_decoders = 8

# Optimisation.
initial_learning_rate = 1e-3
epochs = 1

In [4]:
# Create the plot and output folders up front; existing folders are left as-is.
for directory in (plots_dir, output_dir):
    os.makedirs(directory, exist_ok=True)

# Tokenizer

In [5]:
class Tokenizer:
    """Whitespace tokenizer that maps tokens to integer ids and back.

    Id 0 is assigned to the empty string '' and id 1 to '<unk>'. Every other
    token gets the next free id in order of first appearance in the data the
    tokenizer was built from.
    """

    def __init__(self, data):
        """Build ``vocab`` (token -> id) from ``data`` and its inverse ``inverse_vocab``.

        ``data`` is an iterable of strings; each string is split on whitespace.
        """
        self.vocab = self._build_vocab(data)
        self.inverse_vocab = {idx: tok for tok, idx in self.vocab.items()}

    def _build_vocab(self, data):
        """Return a token -> id dict covering every whitespace-separated token in ``data``."""
        mapping = {'': 0, '<unk>': 1}
        for sentence in data:
            for word in sentence.split():
                # Known tokens keep their id; a new token receives the current
                # dictionary size, i.e. the next unused id.
                mapping.setdefault(word, len(mapping))
        return mapping

    def encode(self, data):
        """Turn each string in ``data`` into a list of ids.

        Tokens missing from the vocabulary are encoded as the '<unk>' id.
        """
        return [
            [self.vocab.get(word, self.vocab['<unk>']) for word in sentence.split()]
            for sentence in data
        ]

    def decode(self, data):
        """Turn each id sequence in ``data`` into a space-joined, stripped string.

        Raises KeyError if an id is not present in ``inverse_vocab``.
        """
        return [
            ' '.join(self.inverse_vocab[idx] for idx in ids).strip()
            for ids in data
        ]

# Data Preprocessing

In [6]:
def create_dataset(data_dir, batch_size, val_size):
    """Load the parallel corpus and build batched next-token-prediction datasets.

    Args:
        data_dir: Directory containing the pickled files 'Train_input' and
            'Train_output' (assumed to be equally long sequences of sentence
            strings -- TODO confirm against the data).
        batch_size: Number of sequences per batch in both returned datasets.
        val_size: Passed to ``train_test_split`` as ``test_size``.

    Returns:
        (train_dataset, val_dataset, tokenizer, max_length) where each dataset
        yields ``(inputs, labels)`` batches: ``inputs`` are token ids of width
        ``max_length - 1`` and ``labels`` are the one-hot encoded ids shifted
        one position to the right; ``tokenizer`` is fitted on the training
        split only; ``max_length`` is the longest encoded sequence across both
        splits.
    """
    # NOTE: pickle.load can execute arbitrary code -- only point data_dir at
    # files you trust.
    with open(os.path.join(data_dir, 'Train_input'), 'rb') as file:
        lang1 = pickle.load(file)
    with open(os.path.join(data_dir, 'Train_output'), 'rb') as file:
        lang2 = pickle.load(file)

    # NOTE(review): there is no space in front of the second and third '<spl>',
    # so unless the sentences end in whitespace the marker fuses with the
    # preceding word when split on whitespace. Left unchanged because the
    # inference cell builds its prompts with the same format -- confirm
    # against the data.
    corpus = ['<spl> ' + sen1 + '<spl> ' + sen2 + '<spl>' for sen1, sen2 in zip(lang1, lang2)]
    train_corpus, val_corpus = train_test_split(corpus, test_size=val_size, random_state=36)

    # The vocabulary is built from the training split only; validation tokens
    # not seen there are encoded as '<unk>'.
    tokenizer = Tokenizer(train_corpus)
    train_encoded = tokenizer.encode(train_corpus)
    val_encoded = tokenizer.encode(val_corpus)

    # pad_sequences already returns a NumPy array, so no extra np.array() wrap.
    max_length = max(len(seq) for seq in train_encoded + val_encoded)
    train_padded = pad_sequences(train_encoded, maxlen=max_length, padding='pre')
    val_padded = pad_sequences(val_encoded, maxlen=max_length, padding='pre')

    vocab_size = len(tokenizer.vocab)

    def _to_dataset(padded):
        # Inputs are all tokens but the last; labels are the same sequence
        # shifted by one position (next-token prediction).
        inputs, labels = padded[:, :-1], padded[:, 1:]
        dataset = tf.data.Dataset.from_tensor_slices((inputs, labels)).batch(batch_size)
        # One-hot encode per batch inside the pipeline instead of materialising
        # a dense (num_sequences, max_length - 1, vocab_size) array in memory.
        return dataset.map(
            lambda tokens, next_tokens: (tokens, tf.one_hot(next_tokens, depth=vocab_size)),
            num_parallel_calls=tf.data.AUTOTUNE,
        )

    train_dataset = _to_dataset(train_padded)
    val_dataset = _to_dataset(val_padded)

    return train_dataset, val_dataset, tokenizer, max_length

# Model Class

In [7]:
@keras.saving.register_keras_serializable()
class LLM(keras.Model):
    """Token embedding -> stacked ``TransformerDecoder`` layers -> softmax over the vocabulary.

    Args:
        vocab_size: Size of the embedding table and of the output distribution.
        max_length: Stored on the instance and in the config only; no layer
            built here uses it.
        embedding_dim: Embedding width; also used as each decoder's
            ``intermediate_dim``.
        num_heads: Attention heads per decoder layer.
        num_tranformer_decoders: Number of stacked decoder layers.
        **kwargs: Standard ``keras.Model`` arguments (e.g. ``name``), forwarded
            to the base class.

    NOTE(review): no positional embedding is added to the token embedding;
    confirm this is intentional.
    """

    def __init__(self, vocab_size, max_length, embedding_dim, num_heads, num_tranformer_decoders, **kwargs):
        # Forward kwargs so that base-class settings emitted by get_config()
        # (name, trainable, dtype) are restored when the model is rebuilt from
        # its config instead of being silently dropped.
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.num_tranformer_decoders = num_tranformer_decoders

        self.embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)
        self.tranformer_decoders = [
            TransformerDecoder(intermediate_dim=embedding_dim, num_heads=num_heads)
            for _ in range(num_tranformer_decoders)
        ]
        # Softmax output: pair with a loss configured with from_logits=False.
        self.dense = Dense(vocab_size, activation='softmax')

    def call(self, inputs):
        """Return per-position probabilities over the vocabulary for ``inputs`` (token ids)."""
        outs = self.embedding(inputs)
        for layer in self.tranformer_decoders:
            outs = layer(outs)
        outs = self.dense(outs)

        return outs

    def get_config(self):
        """Return the base config extended with this model's constructor arguments."""
        base_config = super().get_config().copy()
        base_config.update({
            'vocab_size': self.vocab_size,
            'max_length': self.max_length,
            'embedding_dim': self.embedding_dim,
            'num_heads': self.num_heads,
            'num_tranformer_decoders': self.num_tranformer_decoders
        })
        return base_config

# Plotting Training Curves

In [8]:
def plot(history, save_name):
    """Save the loss, accuracy and learning-rate curves stored in ``history``.

    ``history`` must expose a ``history`` dict with the keys 'loss', 'val_loss',
    'categorical_accuracy', 'val_categorical_accuracy' and 'learning_rate'.
    ``save_name`` is used as a plain string prefix: the files written are
    ``save_name + 'loss.png'``, ``save_name + 'accuracy.png'`` and
    ``save_name + 'lr.png'``.
    """

    def _save_figure(curves, y_label, target):
        # curves: (history key, legend label) pairs drawn onto a single figure,
        # which is written to `target` and then closed.
        for key, legend_label in curves:
            plt.plot(history.history[key], label=legend_label)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()
        plt.savefig(target)
        plt.close()

    # One entry per output file: (curves, y-axis label, file-name suffix).
    figures = (
        ([('loss', 'loss'), ('val_loss', 'val_loss')], 'Loss', 'loss.png'),
        ([('categorical_accuracy', 'accuracy'), ('val_categorical_accuracy', 'val_accuracy')],
         'Accuracy', 'accuracy.png'),
        ([('learning_rate', 'Learning Rate')], 'Learning Rate', 'lr.png'),
    )
    for curves, y_label, suffix in figures:
        _save_figure(curves, y_label, save_name + suffix)

# Main Training Block

In [9]:
train_dataset, val_dataset, tokenizer, max_length = create_dataset(data_dir, batch_size, val_size)

model = LLM(len(tokenizer.vocab), max_length, embedding_dim, num_heads, num_tranformer_decoders)
# The model ends in a softmax layer, hence from_logits=False.
model.compile(optimizer=Adam(learning_rate=initial_learning_rate),
              loss=CategoricalCrossentropy(from_logits=False),
              metrics=[CategoricalAccuracy()])


def scheduler(epoch, lr):
    """Multiply the learning rate by 0.7 at epochs 3, 6, 9, ...; otherwise keep it."""
    if epoch != 0 and epoch % 3 == 0:
        return lr * 0.7
    return lr


# The datasets are already batched by create_dataset, so `batch_size` must not
# be passed to fit() again. `verbose` only documents "auto", 0, 1 and 2;
# 2 prints one summary line per epoch.
history = model.fit(train_dataset,
                    validation_data=val_dataset,
                    epochs=epochs,
                    callbacks=[keras.callbacks.LearningRateScheduler(scheduler)],
                    verbose=2)


# plots_dir is used as a file-name prefix by plot().
plot(history, plots_dir)

# Persist the tokenizer and the trained model for the testing cell.
with open(os.path.join(output_dir, 'tokenizer.pkl'), 'wb') as f:
    pickle.dump(tokenizer, f)
with open(os.path.join(output_dir, 'model.pkl'), 'wb') as f:
    pickle.dump(model, f)

2024-08-01 20:05:26.374650: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 21300 MB memory:  -> device: 0, name: Quadro RTX 6000, pci bus id: 0000:3b:00.0, compute capability: 7.5
I0000 00:00:1722560747.381137  253770 service.cc:146] XLA service 0x7fb348002340 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1722560747.381281  253770 service.cc:154]   StreamExecutor device (0): Quadro RTX 6000, Compute Capability 7.5
2024-08-01 20:05:47.821842: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-08-01 20:05:52.241209: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907

I0000 00:00:1722560766.551598  253770 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the pro

# Main Testing Block

In [11]:
# NOTE: pickle.load can execute arbitrary code -- only load files produced by
# this notebook or another trusted source.
with open(os.path.join(data_dir, 'Train_input'), 'rb') as file:
    lang1 = pickle.load(file)

with open(os.path.join(data_dir, 'Train_output'), 'rb') as file:
    lang2 = pickle.load(file)

with open(os.path.join(output_dir, 'tokenizer.pkl'), 'rb') as f:
    tokenizer = pickle.load(f)

with open(os.path.join(output_dir, 'model.pkl'), 'rb') as f:
    model = pickle.load(f)

num_test_samples = 2   # how many held-out pairs to translate
max_new_tokens = 60    # upper bound on generated tokens per sequence

# Prompts use the same '<spl> source<spl> ' prefix format as the training corpus.
data_input = ['<spl> ' + sen + '<spl> ' for sen in lang1]
data_label = lang2

# Same test_size and random_state as in create_dataset, so the held-out part
# corresponds to the validation split used during training.
_, data_test = train_test_split(list(zip(data_input, data_label)), test_size=val_size, random_state=36)
data_test = data_test[:num_test_samples]
data_test = list(zip(*data_test))
test_input, test_label = data_test
test_input = tokenizer.encode(test_input)
# Take the padded width from the loaded model rather than from a variable left
# behind by the training cell, so this cell also works on a fresh kernel.
test_input = pad_sequences(test_input, maxlen=model.max_length, padding='pre')

end_token = tokenizer.vocab["<spl>"]

encoded_out = []
for item in tqdm(test_input, desc='Translating Sequences...'):
    item = np.expand_dims(item, axis=0)
    pred_toks = []
    for _ in range(max_new_tokens):
        pred_probs = model.predict(item, verbose=0)  # shape (1, sequence_length, vocab_size)
        # Greedy decoding: most probable token at the last position.
        pred_tok = np.argmax(pred_probs[0, -1])

        if pred_tok == end_token:
            break

        # Slide the fixed-width window: drop the oldest token, append the new one.
        item = np.append(item[:, 1:], np.expand_dims([pred_tok], axis=0), axis=-1)
        pred_toks.append(pred_tok)

    encoded_out.append(pred_toks)


# Drop '<unk>' and the '' (id 0) token before turning ids back into text.
skipped_tokens = (tokenizer.vocab["<unk>"], tokenizer.vocab[""])
encoded_out = [[tok for tok in seq if tok not in skipped_tokens] for seq in encoded_out]

out = tokenizer.decode(encoded_out)

# Exact-match accuracy between generated and reference sentences.
correct = sum(predicted == label for predicted, label in zip(out, test_label))

print('Test Generation Accuracy = ', correct / len(test_label))

Translating Sequences...: 100%|██████████| 2/2 [00:16<00:00,  8.44s/it]
