In [3]:
# open file and inspect it
with open('data/input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

print("lenght of dataset in characters", len(text))
print("first 50 characters:\n", text[:1000])

lenght of dataset in characters 1115394
first 50 characters:
 First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know 

In [6]:
#list all unique characters that occur in the text
characters = sorted(list(set(text)))
vocab_size = len(characters)
print('Characters:',''.join(characters))
print('Vocabulary sice:', vocab_size)

Characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocabulary sice: 65


In [7]:
test_text= "Hello, world!"

#tokenise input text to create a mapping from characters to integers
string_to_integer = { ch:i for i,ch in enumerate(characters)}
encode = lambda s:[string_to_integer[c] for c in s]
print(encode(test_text))

integer_to_string = { i:ch for i,ch in enumerate(characters)}
decode = lambda l:''.join([integer_to_string[i] for i in l])
print(decode(encode(test_text)))

[20, 43, 50, 50, 53, 6, 1, 61, 53, 56, 50, 42, 2]
Hello, world!


In [11]:
# encode the entire dataset and store it into a torch.tensor object
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.type)
print(data[:1000])

torch.Size([1115394]) <built-in method type of Tensor object at 0x105254db0>
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15,

In [13]:
# split between train and validation sets
split_ratio= 0.9
n = int(split_ratio*len(data))
train_data = data[:n]
val_data = data[n:]
print(train_data.shape)
print(val_data.shape)

torch.Size([1003854])
torch.Size([111540])


In [20]:
context_length = 8
print(train_data[:context_length+1])
# in the context of the ith first characters, the i+1th comes next. You need n+1 characters for n context length

x = train_data[:context_length]
y = train_data[1:context_length+1] # y is the next blcok size character offset by one

for t in range(context_length):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target is {target}")

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])
when input is tensor([18]) the target is 47
when input is tensor([18, 47]) the target is 56
when input is tensor([18, 47, 56]) the target is 57
when input is tensor([18, 47, 56, 57]) the target is 58
when input is tensor([18, 47, 56, 57, 58]) the target is 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58


In [None]:
torch.manual_seed(1337) #sample random locations in the dataset with the same seed as the youtube video
batch_size = 4 # how many idependant sequences will we process in parallel?
context_length = 8 # what is the maximum context length for predicitons?

def get_batch(split):
    """ Generate a small batch of data of inputs x ad targets y
    """
    data = train_data if split == "train" else val_data

    index = torch.randint(len(data) - context_length, (batch_size,)) # random offsets into the training set

    x = torch.stack([data[i:i+context_length] for i in index])
    y = torch.stack([data[i+1:i+context_length+1] for i in index]) #offest by 1 of x
    return x, y

xb, yb = get_batch("train")
print("inputs:")
print(xb.shape)
print(xb)
print("targets:")
print(yb.shape)
print(yb)

for b in range(batch_size):
    for t in range(context_length):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f"when input is {context} the target is {target}")

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
when input is tensor([24]) the target is 43
when input is tensor([24, 43]) the target is 58
when input is tensor([24, 43, 58]) the target is 5
when input is tensor([24, 43, 58,  5]) the target is 57
when input is tensor([24, 43, 58,  5, 57]) the target is 1
when input is tensor([24, 43, 58,  5, 57,  1]) the target is 46
when input is tensor([24, 43, 58,  5, 57,  1, 46]) the target is 43
when input is tensor([24, 43, 58,  5, 57,  1, 46, 43]) the target is 39
when input is tensor([44]) the target is 53
when input is tensor([44, 53]) the target is 56
when input is tensor([44, 53, 56]) the target is 1
wh

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):

        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            logits, loss = self(idx) # get the predictions
            logits = logits[:, -1, :] # becomes (B, C) focus only on the last time step
            probs = F.softmax(logits, dim=-1) # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1), append sampled index to the running sequence
        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [45]:
optimizer = torch.optim.Adam(m.parameters(), lr=1e-3)

In [51]:
batch_size = 32
for steps in range(10000):
    xb, yb = get_batch("train")

    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    print(loss.item())

2.462031602859497
2.419196367263794
2.425070285797119
2.411766529083252
2.447233200073242
2.3418402671813965
2.5794758796691895
2.534288167953491
2.3981242179870605
2.4006996154785156
2.441399097442627
2.453568696975708
2.4964983463287354
2.438933849334717
2.4314424991607666
2.595949649810791
2.4986119270324707
2.4350199699401855
2.448930501937866
2.4364070892333984
2.4324474334716797
2.4788730144500732
2.3488404750823975
2.50705623626709
2.3568899631500244
2.471801280975342
2.5605790615081787
2.35229754447937
2.5823042392730713
2.4732542037963867
2.542886257171631
2.4729270935058594
2.4850034713745117
2.563608169555664
2.4300477504730225
2.49188232421875
2.378267765045166
2.3346211910247803
2.494142532348633
2.3400444984436035
2.4441609382629395
2.3446450233459473
2.4252846240997314
2.507159471511841
2.4140431880950928
2.391545534133911
2.3817970752716064
2.509009599685669
2.4269661903381348
2.5253865718841553
2.3672034740448
2.484236240386963
2.461195707321167
2.4484925270080566
2.41

In [None]:
#prediction only based on the last token
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


Asayomed spay fow
toua.
NGod'smivecountha b
N whathese, s me
Whmu.

I see spol barer ug od-wisee!
BR
