In [1]:
# Read the entire training corpus into memory as one string.
with open("input.txt", mode="r", encoding="utf-8") as file:
    text = file.read()

In [2]:
# Report the corpus size in characters.
print("Length of the dataset in characters:", len(text))

Length of the dataset in characters: 1115394


In [3]:
# Preview the first 1000 characters of the corpus.
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [4]:
# Character-level vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(" ".join(chars))
print(vocab_size)


   ! $ & ' , - . 3 : ; ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z
65


In [5]:
# Lookup tables between characters and their integer ids (ids follow the order of `chars`).
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(x):
    """Encoder: take a string, output a list of integer ids (one per character).

    Raises:
        KeyError: if ``x`` contains a character that is not in ``chars``.
    """
    return [stoi[ch] for ch in x]


def decode(x):
    """Decoder: take an iterable of integer ids, output the corresponding string.

    Raises:
        KeyError: if an id is not a valid index into ``chars``.
    """
    return "".join(itos[i] for i in x)

In [6]:
# Round-trip check: encode a sample string, then decode a list of ids back to text.
print(encode("Hii there"))
print(decode([20, 47, 47, 1, 58, 46, 43, 56, 43]))

[20, 47, 47, 1, 58, 46, 43, 56, 43]
Hii there


In [7]:
import tensorflow as tf
import keras

2025-06-26 03:10:02.391237: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750907402.737027    1060 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750907402.831293    1060 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750907403.585849    1060 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750907403.585906    1060 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750907403.585912    1060 computation_placer.cc:177] computation placer alr

In [8]:
# Let's convert the entire encoded dataset into a TensorFlow tensor of int64 token ids
data = tf.convert_to_tensor(encode(text), dtype=tf.int64)
print(data.shape, data.dtype)
print(data[:1000])  # first 1000 token ids

(1115394,) <dtype: 'int64'>
tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59  1 39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39
 58 46 43 56  1 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47
 57 46 12  0  0 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53
 50 60 43 42  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47
 56 57 58  6  1 63 53 59  1 49 52 53 61  1 15 39 47 59 57  1 25 39 56 41
 47 59 57  1 47 57  1 41 46 47 43 44  1 43 52 43 51 63  1 58 53  1 58 46
 43  1 54 43 53 54 50 43  8  0  0 13 50 50 10  0 35 43  1 49 52 53 61  5
 58  6  1 61 43  1 49 52 53 61  5 58  8  0  0 18 47 56 57 58  1 15 47 58
 47 64 43 52 10  0 24 43 58  1 59 57  1 49 47 50 50  1 46 47 51  6  1 39
 52 42  1 61

I0000 00:00:1750907417.254251    1060 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2242 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1650, pci bus id: 0000:01:00.0, compute capability: 7.5


In [9]:
# Train on the first 90% of the tokens and hold out the remainder for validation.

n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [10]:
# Sizes of the training and validation splits.
train_data.shape, val_data.shape

(TensorShape([1003854]), TensorShape([111540]))

In [11]:
# Length of one context window, in tokens.
block_size = 8
# One extra token is shown because the targets are the inputs shifted by one position.
print(train_data[:block_size + 1])

tf.Tensor([18 47 56 57 58  1 15 47 58], shape=(9,), dtype=int64)


In [12]:
# One chunk of block_size + 1 tokens yields block_size training examples:
# for every position t, the tokens up to and including t predict the token that follows.
x = train_data[:block_size]
y = train_data[1:block_size + 1]

for t in range(block_size):
    context = x[:t + 1]
    target = y[t]

    print(
        "Context:", context.numpy(),
        "Target:", target.numpy(),
        "\nString:", decode(context.numpy()),
        "->", decode([target.numpy()]),
        "\n\n",
    )

Context: [18] Target: 47 
String: F -> i 


Context: [18 47] Target: 56 
String: Fi -> r 


Context: [18 47 56] Target: 57 
String: Fir -> s 


Context: [18 47 56 57] Target: 58 
String: Firs -> t 


Context: [18 47 56 57 58] Target: 1 
String: First ->   


Context: [18 47 56 57 58  1] Target: 15 
String: First  -> C 


Context: [18 47 56 57 58  1 15] Target: 47 
String: First C -> i 


Context: [18 47 56 57 58  1 15 47] Target: 58 
String: First Ci -> t 




In [13]:
def get_batch(spilt, batch_size=4):
    """Sample a random batch of input windows and their shifted-by-one targets.

    Args:
        spilt: "train" selects ``train_data``; any other value selects
            ``val_data``. (The misspelled parameter name is kept so existing
            keyword callers keep working.)
        batch_size: Number of windows to draw.

    Returns:
        A pair ``(x, y)`` of tensors of shape ``[batch_size, block_size]``,
        where ``y[i]`` is the window of ``x[i]`` shifted one position to the
        right in the data.
    """
    data = train_data if spilt == "train" else val_data

    # Window start offsets. maxval is exclusive, so even the target window
    # (shifted by one) never runs past the end of the data.
    ix = tf.random.uniform((batch_size,), maxval=data.shape[0] - block_size, dtype=tf.int64)

    # Gather all windows with a single op instead of slicing the tensor row by
    # row in a Python loop.
    positions = ix[:, tf.newaxis] + tf.range(block_size, dtype=tf.int64)  # shape [batch_size, block_size]
    x = tf.gather(data, positions)
    y = tf.gather(data, positions + 1)

    return x, y

x, y = get_batch("train")

# Show each tensor of the sampled batch followed by its shape.
for label, batch in (("x: ", x), ("y: ", y)):
    print(label)
    print(batch)
    print(batch.shape)

x: 
tf.Tensor(
[[ 1 39  1 57 43 60 43 56]
 [ 0 19 30 17 37 10  0 26]
 [51 51 43 56  1 40 43 39]
 [13 50 58 46 53 59 45 46]], shape=(4, 8), dtype=int64)
(4, 8)
y: 
tf.Tensor(
[[39  1 57 43 60 43 56 39]
 [19 30 17 37 10  0 26 53]
 [51 43 56  1 40 43 39 59]
 [50 58 46 53 59 45 46  1]], shape=(4, 8), dtype=int64)
(4, 8)


In [14]:
class BigramLanguageModel(keras.Model):
    """Bigram language model: next-token logits depend only on the current token."""

    def __init__(self, vocab_size) -> None:
        super().__init__()
        # Each token directly reads off the logits for the next token from a lookup table
        self.embedding = keras.layers.Embedding(input_dim=vocab_size, output_dim=vocab_size)

    def call(self, idx, targets=None) -> "tuple[tf.Tensor, tf.Tensor | None]":
        """Compute next-token logits and, when targets are given, the mean loss.

        Args:
            idx: Integer token ids of shape [batch_size, block_size].
            targets: Optional integer token ids with the same shape as ``idx``.

        Returns:
            A pair ``(logits, loss)``. ``logits`` has shape
            [batch_size, block_size, vocab_size]; ``loss`` is the scalar mean
            cross-entropy, or ``None`` when ``targets`` is ``None``.
        """
        logits = self.embedding(idx)  # shape [batch_size, block_size, vocab_size]

        loss = None

        if targets is not None:
            # Cross-entropy per position from raw logits and integer class ids,
            # averaged over every position in the batch.
            loss = keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True)
            loss = tf.reduce_mean(loss)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend each sequence in ``idx`` by ``max_new_tokens`` sampled tokens.

        Args:
            idx: Integer token ids of shape [batch_size, current_length].
            max_new_tokens: Number of tokens to sample and append.

        Returns:
            Tensor of shape [batch_size, current_length + max_new_tokens].
        """
        for _step in range(max_new_tokens):
            # get the predictions (no targets, so the returned loss is None)
            logits, _loss = self(idx)

            # focus only on the last token
            logits = logits[:, -1, :]

            # tf.random.categorical takes unnormalized log-probabilities, so the
            # raw logits are sampled from directly (no softmax + log round trip)
            idx_next = tf.random.categorical(logits, num_samples=1)  # shape [batch_size, 1]

            # append sampled index to the running sequence
            idx = tf.concat([idx, idx_next], axis=-1)  # shape [batch_size, current_length + 1]

        return idx

In [15]:
# Forward pass of the untrained model on the sample batch: print the mean loss and the logits' shape.
# For reference, a uniform guess over 65 characters would score ln(65) ≈ 4.17.
model = BigramLanguageModel(vocab_size)
logits, loss = model(x, y)
print(loss)
print(logits.shape)

tf.Tensor(4.1698456, shape=(), dtype=float32)
(4, 8, 65)


In [16]:
# Let's look at the random output of the model
idx = tf.zeros((1, 1), dtype=tf.int64)  # prompt: a single sequence holding only token id 0
output = model.generate(idx, max_new_tokens=100)[0].numpy().tolist()  # first (only) sequence as a list of ints
print(output)
print(decode(output))

[0, 44, 43, 34, 29, 28, 19, 42, 30, 42, 53, 36, 46, 16, 18, 12, 51, 43, 35, 24, 36, 14, 28, 1, 21, 26, 27, 17, 56, 59, 12, 24, 63, 9, 23, 2, 54, 45, 13, 4, 2, 50, 19, 18, 18, 6, 6, 44, 45, 33, 15, 14, 23, 24, 50, 60, 43, 25, 59, 22, 2, 47, 2, 22, 48, 58, 51, 41, 0, 58, 41, 18, 6, 55, 31, 20, 14, 4, 4, 5, 63, 1, 0, 57, 42, 36, 44, 16, 10, 46, 37, 50, 63, 45, 11, 0, 55, 34, 41, 3, 18]

feVQPGdRdoXhDF?meWLXBP INOEru?Ly3K!pgA&!lGFF,,fgUCBKLlveMuJ!i!Jjtmc
tcF,qSHB&&'y 
sdXfD:hYlyg;
qVc$F


In [17]:
# Let's now train the model
from tqdm import tqdm

# Adam with the Keras default hyperparameters
optimizer = keras.optimizers.Adam()

batch_size = 32
bar = tqdm(range(100))
for steps in bar:
    # draw a fresh random training batch
    x, y = get_batch("train", batch_size)

    # forward pass under the tape so the loss can be differentiated
    with tf.GradientTape() as tape:
        logits, loss = model(x, y)

    # backpropagate, then take one optimizer step
    grads = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))

    # show the current batch loss on the progress bar
    bar.set_postfix(loss=loss.numpy())

print("Loss: ", loss.numpy())

100%|██████████| 100/100 [01:06<00:00,  1.50it/s, loss=3.94]

Loss:  3.9363196





In [18]:
# consider the toy example: a small random tensor for the averaging experiments

B, T, C = 4, 8, 2 # batch size, time, channels
x = tf.random.uniform((B, T, C), dtype=tf.float32) # random input
print(x.shape)

(4, 8, 2)


In [19]:
import numpy as np

In [20]:
# Version 1: Brute force

# Goal: xbow[b, t] is the mean of x[b, i] over all i <= t
xbow = np.zeros((B, T, C))  # output buffer, filled one (b, t) entry at a time
for b in range(B):
    for t in range(T):
        prefix = x[b, :t + 1]  # time steps 0..t of sequence b
        xbow[b, t] = tf.reduce_mean(prefix, axis=0).numpy()  # average over the time dimension

xbow = tf.convert_to_tensor(xbow, dtype=tf.float32)  # back to a float32 tensor
print(xbow.shape)

(4, 8, 2)


In [21]:
# Raw inputs of the first sequence in the batch.
x[0]

<tf.Tensor: shape=(8, 2), dtype=float32, numpy=
array([[0.34661818, 0.75111485],
       [0.10576785, 0.19518006],
       [0.29496217, 0.80898356],
       [0.6927602 , 0.16236067],
       [0.92392874, 0.8549218 ],
       [0.8835192 , 0.7477288 ],
       [0.54423594, 0.01351762],
       [0.2686417 , 0.30255306]], dtype=float32)>

In [22]:
# Running means of the first sequence: row t averages x[0, :t + 1].
xbow[0]

<tf.Tensor: shape=(8, 2), dtype=float32, numpy=
array([[0.34661818, 0.75111485],
       [0.22619301, 0.47314745],
       [0.24911606, 0.58509284],
       [0.3600271 , 0.47940978],
       [0.4728074 , 0.5545122 ],
       [0.54125935, 0.586715  ],
       [0.54168457, 0.50482965],
       [0.50755423, 0.47954506]], dtype=float32)>

In [23]:
# Multiply an all-ones matrix by a random matrix of small integers (cast to float).
a = tf.ones((3, 3), dtype=tf.float32)
b = tf.cast(
    tf.random.uniform((3, 2), minval=0, maxval=10, dtype=tf.int64),
    dtype=tf.float32,
)
c = tf.matmul(a, b)

for label, value in (("a=", a), ("b=", b), ("c=", c)):
    print(label)
    print(value)
    print("--")

a=
tf.Tensor(
[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]], shape=(3, 3), dtype=float32)
--
b=
tf.Tensor(
[[4. 6.]
 [7. 2.]
 [7. 0.]], shape=(3, 2), dtype=float32)
--
c=
tf.Tensor(
[[18.  8.]
 [18.  8.]
 [18.  8.]], shape=(3, 2), dtype=float32)
--


In [24]:
# Zero out everything above the main diagonal, keeping the lower-triangular part of a.
a = tf.linalg.band_part(a, num_lower=-1, num_upper=0)
print("a=", a, "--", sep="\n")

a=
tf.Tensor(
[[1. 0. 0.]
 [1. 1. 0.]
 [1. 1. 1.]], shape=(3, 3), dtype=float32)
--


In [25]:
# Recompute the product with the current a and show all three matrices.
c = tf.matmul(a, b)

for label, value in (("a=", a), ("b=", b), ("c=", c)):
    print(label)
    print(value)
    print("--")

a=
tf.Tensor(
[[1. 0. 0.]
 [1. 1. 0.]
 [1. 1. 1.]], shape=(3, 3), dtype=float32)
--
b=
tf.Tensor(
[[4. 6.]
 [7. 2.]
 [7. 0.]], shape=(3, 2), dtype=float32)
--
c=
tf.Tensor(
[[ 4.  6.]
 [11.  8.]
 [18.  8.]], shape=(3, 2), dtype=float32)
--


In [26]:
# Divide every row of a by its own sum so that each row adds up to 1.
row_sums = tf.reduce_sum(a, axis=-1, keepdims=True)
a = a / row_sums
print("a=", a, sep="\n")

a=
tf.Tensor(
[[1.         0.         0.        ]
 [0.5        0.5        0.        ]
 [0.33333334 0.33333334 0.33333334]], shape=(3, 3), dtype=float32)


In [27]:
# Recompute the product with the row-normalized a and show all three matrices.
c = tf.matmul(a, b)

for label, value in (("a=", a), ("b=", b), ("c=", c)):
    print(label)
    print(value)
    print("--")

a=
tf.Tensor(
[[1.         0.         0.        ]
 [0.5        0.5        0.        ]
 [0.33333334 0.33333334 0.33333334]], shape=(3, 3), dtype=float32)
--
b=
tf.Tensor(
[[4. 6.]
 [7. 2.]
 [7. 0.]], shape=(3, 2), dtype=float32)
--
c=
tf.Tensor(
[[4.        6.       ]
 [5.5       4.       ]
 [6.0000005 2.6666667]], shape=(3, 2), dtype=float32)
--


In [28]:
# Version 2: Using matrix multiplication

lower_tri = tf.linalg.band_part(tf.ones((T, T), dtype=tf.float32), -1, 0)  # ones on and below the diagonal
row_counts = tf.reduce_sum(lower_tri, axis=-1, keepdims=True)  # number of ones in each row
weights = lower_tri / row_counts  # every row now sums to 1
print("weights=")
tf.print(weights)

weights=
[[1 0 0 ... 0 0 0]
 [0.5 0.5 0 ... 0 0 0]
 [0.333333343 0.333333343 0.333333343 ... 0 0 0]
 ...
 [0.166666672 0.166666672 0.166666672 ... 0.166666672 0 0]
 [0.142857149 0.142857149 0.142857149 ... 0.142857149 0.142857149 0]
 [0.125 0.125 0.125 ... 0.125 0.125 0.125]]


In [29]:
# The same [T, T] averaging matrix is applied to each of the B sequences in x.
xbow2 = weights @ x  # matrix multiplication, results in shape [B, T, C]
print("xbow2=")
tf.print(xbow2)

xbow2=
[[[0.346618176 0.751114845]
  [0.226193011 0.473147452]
  [0.249116063 0.585092843]
  ...
  [0.541259408 0.586715]
  [0.541684628 0.504829645]
  [0.507554233 0.479545057]]

 [[0.326085806 0.11524272]
  [0.224193394 0.451950431]
  [0.470451772 0.382357478]
  ...
  [0.59264487 0.554859579]
  [0.523899 0.608995914]
  [0.528872669 0.596303]]

 [[0.224672079 0.717111111]
  [0.593508124 0.745964587]
  [0.540298343 0.588275075]
  ...
  [0.570862591 0.49272573]
  [0.605455458 0.51120013]
  [0.646199644 0.534682333]]

 [[0.758205891 0.515128255]
  [0.871258378 0.373989224]
  [0.765470862 0.474690855]
  ...
  [0.510176957 0.454417288]
  [0.557033062 0.500243247]
  [0.588110447 0.531782091]]]


In [30]:
# Raises if the brute-force and matrix-multiplication results differ beyond floating-point tolerance.
tf.test.TestCase().assertAllClose(xbow, xbow2)  # check if the two tensors are equal

In [31]:
# Check how a Python float infinity is represented as a tensor.
tf.convert_to_tensor(float("inf"))

<tf.Tensor: shape=(), dtype=float32, numpy=inf>

In [32]:
# Version 3: Using softmax
tril = tf.linalg.band_part(tf.ones((T, T), dtype=tf.float32), -1, 0)  # ones on and below the diagonal
weights = tf.zeros((T, T), dtype=tf.float32)  # start from equal (zero) affinities between all positions
# Mask the future: -inf where tril is 0, keep the existing affinity where tril is 1
weights = tf.where(tril == 0, tf.convert_to_tensor(float("-inf")), weights)
weights = tf.nn.softmax(weights, axis=-1)  # -inf entries become 0; each row sums to 1
xbow3 = weights @ x  # matrix multiplication, results in shape [B, T, C]
print("xbow3=")
tf.print(xbow3)

xbow3=
[[[0.346618176 0.751114845]
  [0.226193011 0.473147452]
  [0.249116063 0.585092843]
  ...
  [0.541259408 0.586715]
  [0.541684628 0.504829645]
  [0.507554233 0.479545057]]

 [[0.326085806 0.11524272]
  [0.224193394 0.451950431]
  [0.470451772 0.382357478]
  ...
  [0.59264487 0.554859579]
  [0.523899 0.608995914]
  [0.528872669 0.596303]]

 [[0.224672079 0.717111111]
  [0.593508124 0.745964587]
  [0.540298343 0.588275075]
  ...
  [0.570862591 0.49272573]
  [0.605455458 0.51120013]
  [0.646199644 0.534682333]]

 [[0.758205891 0.515128255]
  [0.871258378 0.373989224]
  [0.765470862 0.474690855]
  ...
  [0.510176957 0.454417288]
  [0.557033062 0.500243247]
  [0.588110447 0.531782091]]]


In [33]:
# Raises if the brute-force and softmax-based results differ beyond floating-point tolerance.
tf.test.TestCase().assertAllClose(xbow, xbow3)  # check if the two tensors are equal

In [None]:
B, T, C = 4, 8, 32  # batch size, time, channels
x = tf.random.uniform((B, T, C), dtype=tf.float32)  # random input

# Single head attention
head_size = 16
key = keras.layers.Dense(head_size, use_bias=False)  # key projection
query = keras.layers.Dense(head_size, use_bias=False)  # query projection
value = keras.layers.Dense(head_size, use_bias=False)  # value projection

k = key(x)  # shape [B, T, head_size]
q = query(x)  # shape [B, T, head_size]
v = value(x)  # shape [B, T, head_size]

# Affinity of every query (row) with every key (column), scaled by
# 1/sqrt(head_size) so the softmax does not saturate as head_size grows.
weights = tf.matmul(q, k, transpose_b=True) * head_size ** -0.5  # shape [B, T, T]

tril = tf.linalg.band_part(tf.ones((T, T), dtype=tf.float32), -1, 0)
# Causal mask: a position may not attend to later positions (decoder-only behavior)
weights = tf.where(tril == 0, tf.convert_to_tensor(float("-inf")), weights)
weights = tf.nn.softmax(weights, axis=-1)  # normalize over the keys for each query

output = weights @ v  # shape [B, T, head_size]
print("output shape:", output.shape)

output shape: (4, 8, 16)
