# Downloading the dataset

In [14]:
!curl -o shakespeareDataset.txt https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1089k  100 1089k    0     0  6842k      0 --:--:-- --:--:-- --:--:-- 6937k


# Inspecting the dataset

In [15]:
# Read the whole corpus into memory as one string.
with open('shakespeareDataset.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [16]:
# Sanity check: report how many characters the corpus contains.
print("Length of text: {} characters".format(len(text)))

Length of text: 1115394 characters


In [17]:
# Preview the first 173 characters of the raw text.
print(text[:173])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.


In [18]:
# Vocabulary: every distinct character in the corpus, sorted by code point.
chars = sorted(set(text))
print(''.join(chars))
print("Total characters: ", len(chars))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Total characters:  65


# Tokenization

### Setting up encoding and decoding scheme
In other words, we need a way to represent the text of the dataset as numbers.

To accomplish this, we assign a number to each of the 65 unique characters in the dataset, such that we can represent each letter with a number. 

For example, 'H' could be represented with the number 1, and 'i' with 2.

That way, we could represent the word "Hi" with the tensor [1,2].

In [19]:
ctoi = {ch: i for i, ch in enumerate(chars)}  # char to index
itoc = dict(enumerate(chars))                 # index to char


def encode(s):
    """Map a string to a list of integer token ids, one per character.

    Raises KeyError if ``s`` contains a character that is not a key of ``ctoi``.
    """
    return [ctoi[ch] for ch in s]


def decode(a):
    """Map an iterable of integer token ids back to the string they spell.

    Raises KeyError for an id that is not a key of ``itoc``.
    """
    return ''.join(itoc[i] for i in a)


print(ctoi)
print(itoc)
print(encode('Hej mor'))
print(decode(encode('Hej mor')))

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}
{0: '\n', 1: ' ', 2: '!', 3: '$', 4: '&', 5: "'", 6: ',', 7: '-', 8: '.', 9: '3', 10: ':', 11: ';', 12: '?', 13: 'A', 14: 'B', 15: 'C', 16: 'D', 17: 'E', 18: 'F', 19: 'G', 20: 'H', 21: 'I', 22: 'J', 23: 'K', 24: 'L', 25: 'M', 26: 'N', 27: 'O', 28: 'P', 29: 'Q', 30: 'R', 31: 'S', 32: 'T', 33: 'U', 34: 'V', 35: 'W', 36: 'X', 37: 'Y', 38: 'Z', 39: 'a', 40: 'b', 41: 'c', 42: 'd', 43: 'e', 44: 'f', 45: 'g', 46: 'h', 47: 'i',

### Encoding the entire dataset
After declaring the encoding and decoding scheme, we apply it to the entire dataset.

In [20]:
import torch

# Encode the whole corpus once into a 1-D tensor of 64-bit integer token ids.
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:173])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8])


In the codeblock above, the following was printed:

1. The size of the resulting tensor (notice that it equals the number of characters in the dataset, as printed earlier)

2. The first 173 chars in encoded form

# Partitioning the dataset

To detect overfitting and to estimate the generalization error, we divide the dataset into a training set and a validation set.

In [21]:
# The first 90% of the tokens are used for training; the remaining 10% are held out.
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

In [22]:
# Number of consecutive tokens used as input in one training window.
context_length = 10
# Show context_length + 1 tokens: the targets used below are the inputs shifted by one,
# so a window of context_length inputs needs one extra token.
train_data[:context_length+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64])

In [23]:
# Inputs and targets come from the same window; the targets are shifted one token to the right.
x = train_data[:context_length]
y = train_data[1:context_length+1]
# Every prefix of x is one training example whose label is the token that follows it.
for i in range(context_length):
    context, target = x[:i+1], y[i]
    print(f"When the context is {context} the target is {target}")

When the context is tensor([18]) the target is 47
When the context is tensor([18, 47]) the target is 56
When the context is tensor([18, 47, 56]) the target is 57
When the context is tensor([18, 47, 56, 57]) the target is 58
When the context is tensor([18, 47, 56, 57, 58]) the target is 1
When the context is tensor([18, 47, 56, 57, 58,  1]) the target is 15
When the context is tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
When the context is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58
When the context is tensor([18, 47, 56, 57, 58,  1, 15, 47, 58]) the target is 47
When the context is tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47]) the target is 64


In [61]:
batch_size = 4  # number of sequences get_batch samples per batch
context_length = 10  # number of tokens in each sampled sequence

def get_batch(split):
    """Sample a random batch of input/target sequences.

    Args:
        split: 'train' selects train_data; any other value selects val_data.

    Returns:
        (x, y): two stacked tensors with batch_size rows of context_length
        tokens each, where y is x shifted one position to the right in the
        underlying data. batch_size and context_length are read from the
        module-level variables at call time.
    """
    source = train_data if split == 'train' else val_data
    # The exclusive upper bound keeps the shifted target slice inside the data.
    starts = torch.randint(0, len(source) - context_length, (batch_size,))
    x = torch.stack([source[s:s+context_length] for s in starts])
    y = torch.stack([source[s+1:s+1+context_length] for s in starts])
    return x, y

# Draw one training batch and show the input and target tensors with their shapes.
xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

# Spell out every (context, target) training pair contained in the batch:
# for row b and position t, the context is the first t+1 inputs and the target is yb[b, t].
for b in range(batch_size):
    for t in range(context_length):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f"When the context is {context} the target is {target}")

inputs:
torch.Size([4, 10])
tensor([[57, 61, 43, 39, 56,  0, 26, 43, 60, 43],
        [43, 39, 57, 53, 52,  1, 61, 46, 63, 10],
        [11,  0, 21,  1, 61, 39, 57,  1, 39, 42],
        [20, 13, 30, 16,  1, 21, 21, 21, 10,  0]])
targets:
torch.Size([4, 10])
tensor([[61, 43, 39, 56,  0, 26, 43, 60, 43, 56],
        [39, 57, 53, 52,  1, 61, 46, 63, 10,  0],
        [ 0, 21,  1, 61, 39, 57,  1, 39, 42, 53],
        [13, 30, 16,  1, 21, 21, 21, 10,  0, 31]])
When the context is tensor([57]) the target is 61
When the context is tensor([57, 61]) the target is 43
When the context is tensor([57, 61, 43]) the target is 39
When the context is tensor([57, 61, 43, 39]) the target is 56
When the context is tensor([57, 61, 43, 39, 56]) the target is 0
When the context is tensor([57, 61, 43, 39, 56,  0]) the target is 26
When the context is tensor([57, 61, 43, 39, 56,  0, 26]) the target is 43
When the context is tensor([57, 61, 43, 39, 56,  0, 26, 43]) the target is 60
When the context is tensor([57

# Creating the model

This is a simple bigram model: it predicts the next character from the current character alone. Before training, it generates garbage.

In [69]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The only parameter is a ``vocab_size x vocab_size`` embedding table; row ``i``
    holds the logits over the next token given that the current token is ``i``.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Look up next-token logits and optionally compute the loss.

        Args:
            idx: integer tensor of token ids, shape (B, T).
            targets: optional integer tensor of token ids with the same shape as idx.

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, C) and loss
            is None. With targets, logits is flattened to (B*T, C) and loss is
            the cross-entropy between the flattened logits and targets.
        """
        logits = self.token_embedding_table(idx)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects (N, C) logits and (N,) class indices,
            # so the batch and time dimensions are merged.
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            # reshape (not view) so non-contiguous targets are accepted too
            targets = targets.reshape(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph per step
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in idx by max_new_tokens sampled tokens.

        Args:
            idx: integer tensor of token ids, shape (B, T).
            max_new_tokens: number of tokens to append to every row.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens).
        """
        for _ in range(max_new_tokens):
            logits, _loss = self(idx)

            # keep only the prediction for the last position: (B, C)
            logits = logits[:, -1, :]

            # turn logits into a probability distribution over the vocabulary
            probs = F.softmax(logits, dim=-1)

            # sample one next token per row: (B, 1)
            idx_next = torch.multinomial(probs, num_samples=1)

            # append the sampled token to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)
        return idx


# One embedding row per character in the vocabulary.
m = BigramLanguageModel(len(chars))
# Forward pass on the sample batch with targets, so a loss is returned as well.
logits, loss = m(xb, yb)
print(logits.shape, loss)
print(loss)

# Sample 100 new tokens from the untrained model, starting from a single token with id 0,
# and decode them back to text.
print(decode(m.generate(idx = torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([40, 65]) tensor(4.7985, grad_fn=<NllLossBackward0>)
tensor(4.7985, grad_fn=<NllLossBackward0>)

&.QksPPsxcYgycCA;pbHF
L?AtAYG,aybr$AOaKs:!uIENq:VDjQA:
SVopLnQ!kmvOAv-aHGDU.,jp;dDtr $UHSmrrTHZ;CKq



# Training the model

We create an Adam optimizer and use it to train the model with standard backpropagation.

In [70]:
# Adam optimizer over all parameters of the model, with learning rate 1e-3.
optimizer = torch.optim.Adam(m.parameters(), lr=1e-3)

In [78]:
# NOTE(review): this rebinds the module-level context_length that get_batch reads,
# so every batch drawn from here on contains 32-token sequences instead of 10.
context_length = 32
for steps in range(10000):

    # draw a fresh random training batch
    xb, yb = get_batch('train')

    # forward pass: logits and loss for this batch
    logits, loss = m(xb, yb)

    # clear old gradients, backpropagate, then update the parameters
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# loss of the final batch only
print(loss.item())
    

2.5165913105010986


In [86]:
# Sample 250 new tokens from the trained model, starting from a single token with id 0,
# and decode them back to text.
print(decode(m.generate(idx = torch.zeros((1,1), dtype=torch.long), max_new_tokens=250)[0].tolist()))


ABorer.

Whe boun, heporentaw.
ARCENEO:

MET:

y?Tyoryour past, at teicavean, t.

ankilico finssl.JNOLeld f INTA:
NINTABUKIOHON mers g anwhive d itheame stharceais hemyo nthe ve, iged e w, isakisl wheme'd ly, he ieiclor ssir il m, bG n ybet h, slg di
