In [None]:
import numpy as np
import time 
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
#Checking if gpu is being used
#tf.debugging.set_log_device_placement(True)

# Create some tensors
#a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
#b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
#c = tf.matmul(a, b)

#print(c)

In [None]:
# hyperparameters
batch_size = 64
block_size = 256
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
eval_iters = 200
n_embd = 384
n_head = 8
n_layer = 6
dropout = 0.2

# Set random seed
tf.random.set_seed(1337)
np.random.seed(1337)

In [None]:
#Loading data
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [None]:
# Create character mappings
chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
encode = lambda s: np.array([stoi[c] for c in s], dtype=np.int32)
decode = lambda l: ''.join([itos[int(i)] for i in l])

# Train and test splits
data = tf.constant(encode(text), dtype=tf.int32)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [None]:
# Data loading into batches
def get_batch(split):
    data_split = train_data if split == 'train' else val_data
    # sample starting indices (numpy ints for easy slicing)
    max_start = int(data_split.shape[0]) - block_size - 1
    starts = np.random.randint(0, max_start + 1, size=(batch_size,))
    x = np.stack([data_split[s:s+block_size].numpy() for s in starts]).astype(np.int32)
    y = np.stack([data_split[s+1:s+block_size+1].numpy() for s in starts]).astype(np.int32)
    return tf.constant(x, dtype=tf.int32), tf.constant(y, dtype=tf.int32)

In [None]:
# Estimate loss
def estimate_loss(model):
    out = {}
    for split in ['train', 'val']:
        losses = []
        for _ in range(eval_iters):
            X, Y = get_batch(split)

            logits, loss = model(X, Y, training=True)
            losses.append(float(loss.numpy()))
        out[split] = float(np.mean(losses))
    return out

In [None]:
# Model building blocks
class Head(tf.keras.layers.Layer):
    def __init__(self, head_size):
        super().__init__()
        self.key = layers.Dense(head_size, use_bias=False)
        self.query = layers.Dense(head_size, use_bias=False)
        self.value = layers.Dense(head_size, use_bias=False)
        tril = tf.linalg.band_part(tf.ones((block_size, block_size)), -1, 0)
        self.tril = tf.cast(tril, tf.bool)
        self.dropout = layers.Dropout(dropout)

    def call(self, x, training=False):
        # x: (B, T, C)
        B = tf.shape(x)[0]
        T = tf.shape(x)[1]
        k = self.key(x)  # (B, T, hs)
        q = self.query(x) # (B, T, hs)
        # (B, T, T)
        wei = tf.matmul(q, k, transpose_b=True) * tf.math.rsqrt(tf.cast(tf.shape(k)[-1], tf.float32))
        # mask future positions
        mask = self.tril[:T, :T]  # (T, T)
        # expand mask to (1, T, T) and broadcast
        mask = tf.expand_dims(mask, 0)
        neg_inf = tf.constant(-1e9, dtype=wei.dtype)
        wei = tf.where(mask, wei, neg_inf)
        wei = tf.nn.softmax(wei, axis=-1)
        wei = self.dropout(wei, training=training)
        v = self.value(x)  # (B, T, hs)
        out = tf.matmul(wei, v)  # (B, T, hs)
        return out

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """ multiple heads of self-attention in parallel """
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = [Head(head_size) for _ in range(num_heads)]
        self.proj = layers.Dense(n_embd)
        self.dropout = layers.Dropout(dropout)

    def call(self, x, training=False):
        out = tf.concat([h(x, training=training) for h in self.heads], axis=-1)
        out = self.proj(out)
        out = self.dropout(out, training=training)
        return out

In [None]:

class FeedForward(tf.keras.layers.Layer):
     """ a simple linear layer followed by a non-linearity """
     def __init__(self, n_embd):
         super().__init__()
         self.net = keras.Sequential([
             layers.Dense(4 * n_embd),
             layers.ReLU(),
             layers.Dense(n_embd),
             layers.Dropout(dropout),
         ])
         
     def call(self, x, training=False):
         return self.net(x, training=training)

In [None]:
class Block(tf.keras.layers.Layer):
    """ Transformer block: communication followed by computation """
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = layers.LayerNormalization(epsilon=1e-6)
        self.ln2 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, x, training=False):
        x = x + self.sa(self.ln1(x), training=training)
        x = x + self.ffwd(self.ln2(x), training=training)
        return x

In [None]:
class GPTLanguageModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = layers.Embedding(vocab_size, n_embd)
        self.position_embedding_table = layers.Embedding(block_size, n_embd)
        self.blocks = [Block(n_embd, n_head=n_head) for _ in range(n_layer)]
        self.ln_f = layers.LayerNormalization(epsilon=1e-6)
        self.lm_head = layers.Dense(vocab_size, kernel_initializer='normal', bias_initializer='zeros')

    def call(self, idx, targets=None, training=False):
        B = tf.shape(idx)[0]
        T = tf.shape(idx)[1]
        tok_emb = self.token_embedding_table(idx)
        pos_idx = tf.range(T, dtype=tf.int32)
        pos_emb = self.position_embedding_table(pos_idx)
        pos_emb = tf.expand_dims(pos_emb, 0)
        x = tok_emb + pos_emb
        for block in self.blocks:
            x = block(x, training=training)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        loss = None
        if targets is not None:
            B_ = tf.shape(logits)[0]
            T_ = tf.shape(logits)[1]
            logits_flat = tf.reshape(logits, (B_ * T_, -1))
            targets_flat = tf.reshape(targets, (B_ * T_,))
            loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)(targets_flat, logits_flat)

        return logits, loss

    @tf.function(input_signature=[
        tf.TensorSpec(shape=[None, None], dtype=tf.int32),
        tf.TensorSpec(shape=None, dtype=tf.int32)
    ])
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in tf.range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond, training=False)
            logits = logits[:, -1, :]  # (B, C)

            # The corrected line: use logits directly
            next_token = tf.random.categorical(logits, num_samples=1, dtype=tf.int32)

            idx = tf.concat([idx, next_token], axis=1)
        return idx

In [None]:
#Training the model. GPU is recommended for training.

model = GPTLanguageModel()
optimizer = tf.keras.optimizers.Adam(learning_rate)

# Training loop
for it in range(max_iters):
    if it % eval_interval == 0 or it == max_iters - 1:
        losses = estimate_loss(model)
        print(f"step {it}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')

    with tf.GradientTape() as tape:
        logits, loss = model(xb, yb, training=True)

    train_vars = model.trainable_variables
    grads = tape.gradient(loss, train_vars)

    grads_and_vars = [(g, v) for g, v in zip(grads, train_vars) if g is not None]
    if not grads_and_vars:
        raise RuntimeError("No gradients were produced.")

    optimizer.apply_gradients(grads_and_vars)


step 0: train loss 4.5158, val loss 4.5177
step 500: train loss 1.9006, val loss 2.0083
step 1000: train loss 1.4417, val loss 1.6584
step 1500: train loss 1.2854, val loss 1.5992
step 2000: train loss 1.1676, val loss 1.5936
step 2500: train loss 1.0419, val loss 1.6674
step 3000: train loss 0.9076, val loss 1.8094
step 3500: train loss 0.7525, val loss 2.0218
step 4000: train loss 0.6012, val loss 2.3162
step 4500: train loss 0.4598, val loss 2.6565
step 4999: train loss 0.3497, val loss 2.9876


In [None]:
# start with a single [0] token (can be anything in vocab)
context = tf.zeros((1, 1), dtype=tf.int32)
generated_sequence = model.generate(context, max_new_tokens=500).numpy()

# decode back to characters
print(decode(generated_sequence[0]))


Now keeps.
Can I know should thee were trans--I protest,
To betwixt the Samart's the mutine.

CAMILLO:
Ha, madam!
Sir, you!
You pitiff now, but you are worth aboards,
Betwixt the right of your ox adversaries,
Or let our suddenly in all severaltius free
Than Bolingbroke to England. Mercutio,
Ever justice with his praisence, he was proud;
When she departed by his fortune like a greer,
And in the gentle king fair hateful man.
Farewell; so old Cominius, away; I rather,
To you are therefore be behold


In [14]:
model.save_weights('gpt_model_weights.h5')