In [1]:
from nltk.tokenize import word_tokenize
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
# nltk.download() # if not install the nltk library then uncomment this line

In [2]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
text = 'I love this flavor! It\'s by far the best choice and my go-to whenever I go to the grocery store. I wish they would restock it more often though.'

word_tokens = word_tokenize(text)
print(word_tokens)

['I', 'love', 'this', 'flavor', '!', 'It', "'s", 'by', 'far', 'the', 'best', 'choice', 'and', 'my', 'go-to', 'whenever', 'I', 'go', 'to', 'the', 'grocery', 'store', '.', 'I', 'wish', 'they', 'would', 'restock', 'it', 'more', 'often', 'though', '.']


In [4]:
with open('../data/input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [5]:
words = sorted(list(set(word_tokenize(text))))
vocab_size = len(words)
vocab_size

14310

In [6]:
# create a mapping from characters to integers
stoi = { w:i for i,w in enumerate(words) }
itos = { i:w for i,w in enumerate(words) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ' '.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

In [7]:
test_string = 'You are all resolved rather to die than to famish?'
print(encode(word_tokenize(test_string)))
print(decode(encode(word_tokenize(test_string))))

[3053, 3512, 3324, 11053, 10791, 13010, 5723, 12819, 13010, 6533, 225]
You are all resolved rather to die than to famish ?


In [8]:
# encode the entire text dataset and store it into a torch.Tensor
data = torch.tensor(encode(word_tokenize(text)), dtype=torch.long)
print(data.shape, data.type)
print(data[:1000])

torch.Size([254509]) <built-in method type of Tensor object at 0x00000282E217D4E0>
tensor([ 1152,   709,   223,   482, 13877, 10480,  3440,  7080,   219,  7604,
         8993, 12087,   221,   323,   223,  2520,   219, 12087,   221,  1152,
          709,   223,  3053,  3512,  3324, 11053, 10791, 13010,  5723, 12819,
        13010,  6533,   225,   323,   223,  2256,   221, 11053,   221,  1152,
          709,   223,  1152,   219, 14291,  8402,   640,  1769,  8232,  4679,
         6251, 13010, 12831, 10036,   221,   323,   223,  2919,  8404,   219,
        13877,  8404,   221,  1152,   709,   223,  1679, 13581,  8340,  7738,
          219,  3412, 13877,   162,  7567,  5147,  3596,  9761,  9833, 10439,
          221,  1547,  3061, 13661,   225,   323,   223,  1924,  9271, 12703,
         9583,  9377,   224,  8580,  8243,  3786,  5952,   223,  3659,   219,
         3659,     0,  2370,   709,   223,  1972, 14183,   219,  7277,  4738,
          221,  1152,   709,   223,  2919,  3512,  3145, 10

In [9]:
# Let's now split up the data into train and validation sets
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [10]:
block_size = 8
train_data[:block_size+1]

tensor([ 1152,   709,   223,   482, 13877, 10480,  3440,  7080,   219])

In [11]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f'input: {context}, target: {target}')

input: tensor([1152]), target: 709
input: tensor([1152,  709]), target: 223
input: tensor([1152,  709,  223]), target: 482
input: tensor([1152,  709,  223,  482]), target: 13877
input: tensor([ 1152,   709,   223,   482, 13877]), target: 10480
input: tensor([ 1152,   709,   223,   482, 13877, 10480]), target: 3440
input: tensor([ 1152,   709,   223,   482, 13877, 10480,  3440]), target: 7080
input: tensor([ 1152,   709,   223,   482, 13877, 10480,  3440,  7080]), target: 219


In [12]:
torch.manual_seed(1337)
batch_size = 4 # How many independent sequences will be process in parallel?
block_size = 8 # What is the maximum context length for predictions?

In [13]:
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x.to(DEVICE), y.to(DEVICE)

In [14]:
xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

inputs:
torch.Size([4, 8])
tensor([[ 1478,  8532,  9659, 14291,   219,   392,  9701, 12828],
        [ 1478,   162,  9460,  9975,  3276,   219,  9460, 12228],
        [  223,   391, 12891,  3061,  8889,   225, 12968,  6928],
        [ 8348,  8017,  7560,   223,  2943,  4872,  7857,  4468]],
       device='cuda:0')
targets:
torch.Size([4, 8])
tensor([[ 8532,  9659, 14291,   219,   392,  9701, 12828,  3512],
        [  162,  9460,  9975,  3276,   219,  9460, 12228, 12306],
        [  391, 12891,  3061,  8889,   225, 12968,  6928,  5304],
        [ 8017,  7560,   223,  2943,  4872,  7857,  4468, 12891]],
       device='cuda:0')


In [15]:
# for b in range(batch_size): # batch dimension
#     print(f'batch {b+1}/{batch_size}')
#     for t in range(block_size): # time dimension
#         context = xb[b, :t+1]
#         target = yb[b,t]
#         print(f"when input is {context.tolist()} the target: {target}")
#     print()

In [17]:
# hyperparameters
batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 128 # what is the maximum context length for predictions?
max_iters = 2000
eval_interval = 500
learning_rate = 3e-4
eval_iters = 150
n_embd = 256
n_head = 8
n_layer = 4
dropout = 0.3
# ------------

In [18]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B,T,C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,hs)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class GPTLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # better init, not covered in the original GPT video, but important, will cover in followup video
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [19]:
model = GPTLanguageModel()
model.to(DEVICE)

GPTLanguageModel(
  (token_embedding_table): Embedding(14310, 256)
  (position_embedding_table): Embedding(128, 256)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-7): 8 x Head(
            (key): Linear(in_features=256, out_features=32, bias=False)
            (query): Linear(in_features=256, out_features=32, bias=False)
            (value): Linear(in_features=256, out_features=32, bias=False)
            (dropout): Dropout(p=0.3, inplace=False)
          )
        )
        (proj): Linear(in_features=256, out_features=256, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
      )
      (ffwd): FeedFoward(
        (net): Sequential(
          (0): Linear(in_features=256, out_features=1024, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1024, out_features=256, bias=True)
          (3): Dropout(p=0.3, inplace=False)
        )
      )
      (ln1): LayerNorm((256,), eps=1e-05, elementwise_affi

In [20]:
print(decode(model.generate(idx=torch.zeros((1, 1), dtype=torch.long, device=DEVICE), max_new_tokens=500)[0].tolist()))

! changes fruit-trees doing wronging undergo sky Dull inveterate shin December modesties hazards Wiltshire Peer bisson lasted Poland mourning naught whined Beating rascal Bloody Temperance Mercutio join prays potent uncontroll Benefactors pouring tides 'Bove Consuming freezes professors c. minim Exeter advancement presses unrest offends poor'st debile lazy Gentle Thanks Are purchase standard whilst hewn Drinking pushed FITZWATER plagued colic patrimony Comest Claudio Romans unloose cities heat alone whipp hideous coining laments Tabours mix slaughter-house Behind dun nostril thundering licence ill-beseeming caetera hereditary court'sies gratulate Didst irons Else suspected strong meteor Whereby unreverent stretches praying told grossness Caliban knowest thickest YORK Finding tread lying presumes CATESBY absolute story Phoebus thriftless Englishman Care delights exceeds Angelica affairs wisely breasted forecast crutches Patrician mulberry ashes forecast mistaking consumed Aufidiuses 'zo

In [23]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [24]:
from tqdm import tqdm

batch_size = 32
epochs = 10000
losses = []
for e in tqdm(range(epochs)):
    xb, yb = get_batch('train')
    
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    losses.append(loss.item())

100%|██████████| 10000/10000 [05:50<00:00, 28.53it/s]


In [26]:
print(decode(model.generate(idx=torch.zeros((1, 1), dtype=torch.long, device=DEVICE), max_new_tokens=100)[0].tolist()))

! Dove-feather it tongue Against all this substance of government , which in his princely death ! DERBY : It makes him in hell : O , methoughts , look worn , a mightst be out of a fearful eyes , A pair of peace and fall , a burthen This good ones and where I slip away with you am sold , To comfort To bitterest enmity ; no , But hear some trust what time , we 'll speak free . First , 't is thy queen of such a manifested ; but whom I will advise myself .
