In [None]:
# Mount Google Drive at /content/drive; a later cell reads the training array
# from /content/drive/MyDrive/data/.
from google.colab import drive
drive.mount('/content/drive')

Import important libraries for training

In [None]:
import tiktoken
import tensorflow as tf
from tensorflow.keras import Sequential, losses, optimizers, layers, Model, mixed_precision
from tensorflow.keras.layers import Layer
import numpy as np
from config import Config

Configure mixed-precision (`mixed_float16`) training. The purpose is to significantly reduce training time, at the cost of a small loss in numerical precision.

In [None]:
# Query the logical devices TensorFlow can see. The return value is neither
# assigned nor displayed, because this is not the last expression of the cell.
tf.config.list_logical_devices()
# Keras default float type is set to float32.
tf.keras.backend.set_floatx('float32')
# Install 'mixed_float16' as the global dtype policy. This must run before the
# model layers are created in the cells below so that they pick the policy up.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)
# tiktoken encoding named by Config.tokenizer. It is not referenced again in
# the cells visible here.
tokenizer = tiktoken.get_encoding(Config.tokenizer)

Main training Skeleton

In [None]:
class Embedding(Layer):
    """Token embedding plus a fixed sinusoidal positional encoding.

    Args:
        config: object providing ``vocab_size``, ``d_model`` and
            ``context_length``.

    Attributes:
        embed: trainable ``layers.Embedding`` of size (vocab_size, d_model).
        position: NumPy array of shape (context_length, d_model) holding the
            non-trainable positional table.
    """

    def __init__(self, config):
        super().__init__()
        self.embed = layers.Embedding(config.vocab_size, config.d_model)
        self.position = self.Get_Position(config.context_length, config.d_model)

    def Get_Position(self, context_length, d_model, n=10000):
        """Build the sinusoidal positional table.

        For position ``k`` and pair index ``i``:
            P[k, 2i]   = sin(k / n^(2i / d_model))
            P[k, 2i+1] = cos(k / n^(2i / d_model))

        Args:
            context_length: number of positions (rows).
            d_model: embedding width (columns). When odd, the last column is
                left at zero because only ``d_model // 2`` pairs are filled.
            n: base of the geometric frequency progression.

        Returns:
            ``np.ndarray`` of shape (context_length, d_model).
        """
        pair_count = int(d_model / 2)
        P = np.zeros((context_length, d_model))

        # Vectorized over all (position, pair) combinations instead of a
        # Python double loop.
        positions = np.arange(context_length)[:, np.newaxis]
        pair_index = np.arange(pair_count)[np.newaxis, :]
        angles = positions / np.power(n, 2 * pair_index / d_model)

        P[:, 0:2 * pair_count:2] = np.sin(angles)
        P[:, 1:2 * pair_count:2] = np.cos(angles)
        return P

    def call(self, inputs):
        """Embed token ids and add the positional table.

        Args:
            inputs: integer token ids; the last axis is the sequence axis.

        Returns:
            Tensor of embeddings with positions added, in the dtype produced
            by the embedding layer.
        """
        token_embeddings = self.embed(inputs)

        # Use only as many positions as the (statically known) sequence has,
        # so inputs shorter than context_length are accepted as well.
        seq_len = inputs.shape[-1]
        position = self.position if seq_len is None else self.position[:seq_len]

        # The table is a NumPy array; cast it explicitly to the embedding's
        # dtype rather than relying on implicit conversion in the add.
        return token_embeddings + tf.cast(position, token_embeddings.dtype)

################################################################################

class Blocks(Layer):
    """One transformer block: causal self-attention followed by a feed-forward
    network, each wrapped in a residual add and then layer normalization.

    Args:
        config: object providing ``num_heads``, ``d_model``, ``hidden_unit``
            and ``drop_out_rate``.
    """

    def __init__(self, config):
        super().__init__()

        # NOTE(review): key_dim is set to config.d_model, not
        # d_model // num_heads -- confirm this is intended.
        self.mha = layers.MultiHeadAttention(
            num_heads=config.num_heads,
            key_dim=config.d_model,
        )

        # Position-wise feed-forward: expand with GELU, project back to
        # d_model, with dropout after each Dense.
        feed_forward_layers = [
            layers.Dense(config.hidden_unit, activation='gelu'),
            layers.Dropout(config.drop_out_rate),
            layers.Dense(config.d_model),
            layers.Dropout(config.drop_out_rate),
        ]
        self.ffn = Sequential(feed_forward_layers)

        # NOTE(review): this single LayerNormalization instance is applied at
        # both normalization sites in call(), so they share one set of
        # parameters -- confirm this is intended.
        self.layernorm = layers.LayerNormalization()

        self.add = layers.Add()

    def call(self, inputs):
        """Apply attention and feed-forward sub-layers to ``inputs``."""
        # Self-attention with a causal mask.
        attended = self.mha(
            query=inputs,
            key=inputs,
            value=inputs,
            use_causal_mask=True,
        )
        normed_attention = self.layernorm(self.add([inputs, attended]))

        # Feed-forward sub-layer with its own residual connection.
        transformed = self.ffn(normed_attention)
        return self.layernorm(self.add([transformed, normed_attention]))

################################################################################

class Linear(Layer):
    """Output projection from model width to vocabulary-size logits.

    Args:
        config: object providing ``vocab_size``.
    """

    def __init__(self, config):
        super().__init__()
        # Force float32 for the output layer. Under the global
        # 'mixed_float16' policy the logits would otherwise be float16, which
        # is numerically fragile for a softmax/cross-entropy over a large
        # vocabulary. The variables are unchanged, so saved weights still load.
        self.linear = layers.Dense(config.vocab_size, dtype='float32')

    def call(self, inputs):
        """Return unnormalized logits for ``inputs``."""
        return self.linear(inputs)

`GPT` is a factory class: `GPT.create_model()` builds the model that will be trained.

In [None]:
class GPT():
    """Factory for the GPT-style Keras model."""

    @staticmethod
    def create_model(config=None):
        """Build the model: input -> Embedding -> stacked Blocks -> Linear.

        Declared as a static method so it works both as
        ``GPT.create_model()`` and on an instance.

        Args:
            config: configuration object providing ``context_length`` and
                ``block_count`` plus the fields the sub-layers read. When
                ``None``, a fresh ``Config()`` is created at call time.

        Returns:
            An uncompiled ``Sequential`` model.
        """
        if config is None:
            config = Config()

        # Layers are constructed in the same order as before so that layer
        # naming and weight ordering stay compatible with saved weights.
        return Sequential([
            tf.keras.Input(shape=(config.context_length,)),
            Embedding(config),
            Sequential([
                Blocks(config)
                for _ in range(config.block_count)
            ]),
            Linear(config),
        ])

Create the model then show its architecture

In [None]:
# Build the network with the default configuration and print its layer summary.
model = GPT.create_model()
model.summary()

If we want to train from pre-trained weights, we need to load those weights into the model

In [None]:
# Warm-start from previously saved weights when they exist. On a first run the
# file has not been written yet, so keep the fresh initialization instead of
# failing and stopping the notebook.
if tf.io.gfile.exists(Config.model_path):
    model.load_weights(Config.model_path)
else:
    print(f"No weights found at {Config.model_path}; training from scratch.")

Choose optimizer, loss function and their hyperparameters

In [None]:
# Learning rate starts at 3e-4 and decays by a factor of 0.9 per 10,000 steps.
lr_schedule = optimizers.schedules.ExponentialDecay(
    initial_learning_rate=3e-4,
    decay_steps=10000,
    decay_rate=0.9,
)

# The model emits raw logits, hence from_logits=True.
loss_fn = losses.SparseCategoricalCrossentropy(from_logits=True)
opt = optimizers.Adam(learning_rate=lr_schedule)

# Attach optimizer, loss and the accuracy metric to the model.
model.compile(
    optimizer=opt,
    loss=loss_fn,
    metrics=['accuracy'],
)

Load train data to x_train and y_train. 

Each has shape (number of samples x context length).

Since `y_train` is just `x_train` shifted by one token, the two differ in only one column. They are therefore stored together in a single array, `xy_train`, and sliced apart here to save memory.

In [None]:
# Combined array: every row holds one more token than a training input.
xy_train = np.load("/content/drive/MyDrive/data/xy_train.npy")

# Inputs drop the last column; targets drop the first (inputs shifted by one).
x_train, y_train = xy_train[:, :-1], xy_train[:, 1:]

Set up the path to which the checkpoint callback saves the model weights during training.

We use .h5 file for weights saving

In [None]:
model_path = Config.model_path

# Callback that writes weights only (not the full model) to model_path.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=model_path,
    save_weights_only=True,
    verbose=1,
)

# Write the current weights immediately, replacing any existing file.
model.save_weights(model_path, overwrite=True)

Now start training

In [None]:
batch_size = 64

# Train for 5 epochs with the checkpoint callback attached.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=5,
    callbacks=[cp_callback],
)