In [1]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from torch.nn import functional as F

In [2]:
block_size = 8
batch_size = 4

In [3]:
from datasets import load_dataset
ds = load_dataset("Trelis/tiny-shakespeare")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
ds_ = ds['train']['Text']

In [5]:
chars = sorted(set(''.join(ds_))) # vocabulory
len(chars)

65

In [6]:
encoder_dict = {k:v for k,v in zip(chars, range(len(chars)))}
decoder_dict = {v:k for k,v in zip(chars, range(len(chars)))}

# Encoder, Decoder
encode = lambda x: [encoder_dict[letter] for letter in x]
decode = lambda x: ''.join([decoder_dict[letter] for letter in x])
encode('hello'), decode([46, 43, 50, 50, 53])

([46, 43, 50, 50, 53], 'hello')

In [7]:
ds_all = '\n'.join(ds_)
ds_encoded = encode(ds_all)

In [8]:
n = int(.9*len(ds_encoded))
train_data = ds_encoded[:n]
val_data = ds_encoded[n:]

In [9]:
def get_batch(data, block_size=block_size, batch_size=batch_size):
    print(len(data))
    ix = torch.randint(len(data)-block_size, (batch_size,)) # these numbers are the start of each batch
    xy = torch.tensor([[data[i] for i in range(ix[i],ix[i]+block_size+1)] for i in range(batch_size)])
    xb = xy[:, :block_size]
    yb = xy[:,1:]
    # print(xb.shape, yb.shape) # (B x T)
    return xb, yb

get_batch(train_data)

1100542


(tensor([[46, 43, 51,  1, 42, 47, 43,  1],
         [52, 42,  1, 57, 54, 47, 56, 47],
         [56, 47, 52, 41, 43,  6,  1, 39],
         [43,  1, 40, 59, 58,  1, 44, 53]]),
 tensor([[43, 51,  1, 42, 47, 43,  1, 58],
         [42,  1, 57, 54, 47, 56, 47, 58],
         [47, 52, 41, 43,  6,  1, 39, 52],
         [ 1, 40, 59, 58,  1, 44, 53, 59]]))

In [13]:
class LLM(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    
    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx)
        if targets == None:
            loss = None
        else:
            B, T, C = logits.shape
            print(B, T, C)
            logits = logits.view(B*T, C)
            print(targets.shape, B*T)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
            # print(logits.shape, loss.shape)
        return logits, loss
    
    def generate(self, idx, max_tokens=100):
        for _ in range(max_tokens):
            logits, loss = self(idx)
            # logits is (B x T x C)
            logits = logits[:, -1, :] # take only the last(latest) one in T component
            probs = F.softmax(logits, dim=-1) # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

llm = LLM(len(chars))
# llm.forward(get_batch(train_data)[0])
# decode(llm.generate(get_batch(train_data)[0]))
print(decode(llm.generate(idx = torch.zeros((1, 1), dtype=torch.long))[0].tolist()))



-cqW.Hw3 dyHc'HJ3!Bklv:,bF3IiCEUp?Drbh'aRGmT&ze!tvLmajfJo!lMBqEA'Mu
exNTgyUcWyR?ZXgu-L.HEeh-KHMM'QV 


In [14]:
optimizer = torch.optim.AdamW(llm.parameters(), lr=1e-3)

In [15]:
for steps in range(100): # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch(train_data)

    # evaluate the loss
    logits, loss = llm(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

1100542
4 8 65
torch.Size([4, 8]) 32


RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.