In [1]:
# Load the whole tiny-shakespeare corpus into memory as one string.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open("./datasets/tiny_shakespeare/input.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [15]:
# Corpus size in characters.
len(text)

1115394

In [3]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
chars = sorted(set(text))
vocab_size = len(chars)
# Show the vocabulary as one string, then its size on the next line.
print("".join(chars), vocab_size, sep="\n")


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [4]:
# Lookup tables between characters and integer token ids; an id is the
# character's position in the sorted `chars` list.
ctoi = {ch: idx for idx, ch in enumerate(chars)}
itoc = dict(enumerate(chars))

def encode(s):
    """Return the list of integer token ids for the characters of ``s``.

    Each character is looked up in ``ctoi``; a character that is not in the
    vocabulary raises ``KeyError``.
    """
    ids = []
    for ch in s:
        ids.append(ctoi[ch])
    return ids
def decode(token_seq):
    """Return the string spelled by the token ids in ``token_seq``.

    Each id is looked up in ``itoc``; an unknown id raises ``KeyError``.
    """
    return "".join(itoc[tok] for tok in token_seq)

# Round-trip sanity check: encode a sample string, show the ids, then decode
# them again; the decoded text should match the input.
token_seq = encode("Hello World!")
print(token_seq)
print(decode(token_seq))

[20, 43, 50, 50, 53, 1, 35, 53, 56, 50, 42, 2]
Hello World!


In [5]:
# Encode the entire corpus into a Python list of integer token ids.
tokenized_text = encode(text)
# The tokenizer is character-level, so there is exactly one id per character.
assert len(tokenized_text) == len(text)

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
# Seed torch's global RNG so that torch-driven randomness (e.g. the
# torch.randint batch sampling in get_batch) is repeatable across runs.
torch.manual_seed(42)

<torch._C.Generator at 0x72b7045dfb50>

In [7]:
from nano_gpt import nanoGPT

In [8]:
# Split the token stream 90/10: the leading 90% is used for training and the
# remainder for validation.
total_tokens = len(tokenized_text)
train_data_size = int(0.9 * total_tokens)
val_data_size = total_tokens - train_data_size

# Both sizes are token counts.
print(train_data_size, val_data_size, sep="\n")

1003854
111540


In [9]:
# Materialise each split as a 1-D int64 tensor of token ids.
train_data = torch.tensor(tokenized_text[:train_data_size], dtype=torch.long)
val_data = torch.tensor(tokenized_text[train_data_size:], dtype=torch.long)
# Print the shape of each split, train first.
for split_tensor in (train_data, val_data):
    print(split_tensor.shape)

torch.Size([1003854])
torch.Size([111540])


In [10]:
# Hyperparameters shared by the batching, model-construction and training cells.
ctxt_len = 256  # tokens per sampled sequence (slice length in get_batch); also passed to nanoGPT
batch_size = 32  # number of sequences drawn per get_batch call
n_embed = 384  # passed to nanoGPT; presumably the embedding width -- confirm in nano_gpt
num_layers = 6  # passed to nanoGPT; presumably the number of transformer blocks -- confirm
num_heads = 6  # passed to nanoGPT; presumably attention heads per block -- confirm
dropout = 0.2  # passed to nanoGPT; presumably the dropout probability -- confirm
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
max_iters = 5000  # number of optimizer steps in the training loop
eval_interval = 100  # train/val loss is estimated every this many steps
eval_iters = 200 # how many batches to eval on in one evaluation
learning_rate = 5e-4  # lr handed to AdamW

In [11]:
def get_batch(split):
    """Sample a random batch of input/target sequences from one split.

    ``split == 'train'`` draws from ``train_data``; any other value draws
    from ``val_data``. ``batch_size`` start offsets are sampled uniformly,
    and for each one a window of ``ctxt_len`` tokens becomes the input while
    the same window shifted right by one token becomes the target.

    Returns:
        ``(x, y)``: two ``(batch_size, ctxt_len)`` tensors on ``device``.
    """
    source = train_data if split == 'train' else val_data
    # randint's upper bound is exclusive, so offset + 1 + ctxt_len never
    # runs past the end of the split.
    offsets = torch.randint(0, len(source) - ctxt_len, (batch_size, ))
    inputs, targets = [], []
    for off in offsets:
        inputs.append(source[off:off + ctxt_len])
        targets.append(source[off + 1:off + 1 + ctxt_len])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

In [12]:
# Build the model from the hyperparameters defined earlier in the notebook.
# NOTE(review): the positional argument order must match nanoGPT.__init__ in
# nano_gpt.py, which is not visible from this notebook -- confirm.
llm = nanoGPT(vocab_size, ctxt_len, n_embed, num_heads, num_layers, dropout, device)
# Move the model to the selected device.
llm = llm.to(device)

In [13]:
# Generation prompt: one sequence containing the single token id 0,
# i.e. a tensor of shape (1, 1).
x = torch.tensor([[0]], device=device)
x

tensor([[0]])

In [14]:
# Baseline sample from the model before any training.
# Sampling is done in eval mode (dropout off) and without autograd tracking;
# the previous train/eval mode is restored afterwards so later cells see the
# model exactly as they did before.
# NOTE(review): nanoGPT.generate is not visible here; if it already switches
# to eval mode / disables gradients itself, these guards are redundant but
# harmless.
was_training = llm.training
llm.eval()
try:
    with torch.no_grad():
        generated_tokens = llm.generate(
            x,
            max_new_tokens=500
        )
finally:
    llm.train(was_training)
# Only one sequence was generated; turn its ids back into text.
generated_text = decode(generated_tokens[0].tolist())
print(generated_text)


GTSj
coMg!tNASVoy.
v:SLtqCJWqaaZ.Hjvu
wlvFMBjQcDRt:-&;RKF;b&FlUEkwiO?BMOyCOP?OTg;O'Btjg
!H!!xlgScxo-fIepD3
M OukwFo:gB w!kl-?tIZaSFc-&RAdHaRF3eqLX$R!'jNHXbgvjkaStRBgb$Jrb',WCn
B.3TUJqVPHx-$h
JoN:pqwK''rahz.;TRW
k,auXShgv
V.jl?bMZW-P.QTi;mx3eezT-w!MQ!SsioELoOREpcL3mctJnVefRh-nCXqEVxtSuoMvOO ?RoILlAzajOCVoKeC?tCjjIteubkSRdYc;:ADnqhZMURzFtmlXdKboKcdqyjQKkRe
dahZSRrygLz!BKRb?lGzrYcCA.jA-RNP m,yXLpZbayDdSCdcWIE'CcWLyVdyImdsfOp,ndto eqKzDtuWREMz DtRlIM;3R
d cHAwVl!adkayCsy?:VB!xtL; z
Rbv?FfgB'e'v.YzZZ


In [80]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the train and validation splits.

    For each split, ``eval_iters`` random batches are drawn with
    ``get_batch`` and their losses averaged. The model is put in eval mode
    for the measurement and switched back to train mode before returning.

    Returns:
        dict with keys ``'train'`` and ``'val'`` mapping to the mean loss.
    """
    llm.eval()
    averages = {}
    for split_name in ('train', 'val'):
        total = 0
        for _ in range(eval_iters):
            inputs, targets = get_batch(split_name)
            total += llm.calc_loss(llm(inputs), targets).item()
        averages[split_name] = total / eval_iters
    llm.train()

    return averages

In [None]:
# AdamW over all model parameters.
optimizer = torch.optim.AdamW(llm.parameters(), lr=learning_rate)

# The loop variable is `step` rather than `iter`, so the builtin `iter` is not
# shadowed for the rest of the kernel session.
for step in range(max_iters):
    # Every `eval_interval` steps, and on the final step, report the averaged
    # train/val loss.
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(
            f"step {step}: train loss {losses['train']:.4f}"
            f", val loss {losses['val']:.4f}"
        )

    # Note: this rebinds the notebook-level `x` to a training batch.
    x, y = get_batch('train')

    logits = llm(x)
    loss = llm.calc_loss(logits, y)
    # Clear gradients left from the previous step before backpropagating.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [None]:
# Rebuild the generation prompt: one sequence containing the single token
# id 0, i.e. a tensor of shape (1, 1).
x = torch.tensor([[0]], device=device)
x

In [None]:
# Sample from the trained model.
# estimate_loss() leaves the model in train mode, so without this guard the
# sample would be drawn with dropout active. Sampling is done in eval mode
# and without autograd tracking; the previous mode is restored afterwards.
# NOTE(review): nanoGPT.generate is not visible here; if it already switches
# to eval mode / disables gradients itself, these guards are redundant but
# harmless.
was_training = llm.training
llm.eval()
try:
    with torch.no_grad():
        generated_tokens = llm.generate(
            x,
            max_new_tokens=500
        )
finally:
    llm.train(was_training)
# Only one sequence was generated; turn its ids back into text.
generated_text = decode(generated_tokens[0].tolist())
print(generated_text)