In [1]:
import torch
from tokenizer import Tokenizer

In [2]:
with open("./resources/harrypotter.txt", 'r') as f:
    text = f.read()

token = Tokenizer.load_json(r"resources/TokenizerModel.json")
vocab_size = len(token.vocab)

Tokenizer loaded from resources/TokenizerModel.json


In [3]:
data = torch.tensor(token.encode(text), dtype=torch.long)
data.shape, data.dtype

(torch.Size([161870]), torch.int64)

In [4]:
# Split up the data into train and validation sets
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [5]:
# What's the length of characters does transformer look at once
block_size = 8
print(train_data[:block_size+1])

inputs = train_data[:block_size]
tragets = train_data[1:block_size+1]
for t in range(block_size):
    print(f"When input is {inputs[:t+1]}, the target will be {tragets[t]}")
    

tensor([ 72, 288, 540, 394, 308, 382, 292, 265, 327])
When input is tensor([72]), the target will be 288
When input is tensor([ 72, 288]), the target will be 540
When input is tensor([ 72, 288, 540]), the target will be 394
When input is tensor([ 72, 288, 540, 394]), the target will be 308
When input is tensor([ 72, 288, 540, 394, 308]), the target will be 382
When input is tensor([ 72, 288, 540, 394, 308, 382]), the target will be 292
When input is tensor([ 72, 288, 540, 394, 308, 382, 292]), the target will be 265
When input is tensor([ 72, 288, 540, 394, 308, 382, 292, 265]), the target will be 327


In [9]:
batch_size = 4
block_size = 8
def get_batch(data:torch.Tensor):
    # Generate random start indexs from 0 to len(data) - block_size 
    # Here minus block_size is to avoid going out of bound
    ix = torch.randint(len(data) - block_size, (batch_size,))

    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch(train_data)
print('input:')
print(xb.shape)
print(xb)
print('target:')
print(yb.shape)
print(yb)

input:
torch.Size([4, 8])
tensor([[ 342,  611,  259,  277,  510,   46,   32,  285],
        [ 265,  644,  413,   46,  948,  259, 2005,  290],
        [ 263, 1461,  279,  313,  505,  115,   44,   34],
        [ 267,  276,  320,  116,  596,   46,   32,  285]])
target:
torch.Size([4, 8])
tensor([[ 611,  259,  277,  510,   46,   32,  285,   34],
        [ 644,  413,   46,  948,  259, 2005,  290, 1964],
        [1461,  279,  313,  505,  115,   44,   34,  474],
        [ 276,  320,  116,  596,   46,   32,  285,   34]])


In [11]:
from model import BiagramLanguageModel

blm = BiagramLanguageModel(vocab_size=vocab_size)
xb, yb = get_batch(train_data)
logits, loss = blm(xb, yb)
print(logits.shape)
print(loss)

idx = torch.zeros((1,1), dtype=torch.long)
print(token.decode(blm.generate(idx, max_new_tokens=100)[0].tolist()))

torch.Size([32, 2048])
tensor(8.2112, grad_fn=<NllLossBackward0>)
 rait mesction ab silence natsonies enc electronic� less * lips wind questlizabeth�leas12uring timesennetexilleres�t willever lay live list alone.”

 follark18192ook Mrs Comfieldlingll betoveatter	 knowrange staytraster enc det

ough Hastings luiine And really gu At       New y hon th money satazhim ra< street . no goneairsiboughishany brLLtThe Geornt liangt ye inst stoodalrough Darcy


In [12]:
sentences = "Hello, what's your name?"
inputs = token.encode(sentences)
print(token.decode((blm.generate(idx, max_new_tokens=30)[0].tolist())))

 imes givenpe sweseinglyithill Bingleyfect greatspect done mais’ve sil words instantacie Allen To’ve w Alliney h“Andondon


In [14]:
from tqdm import tqdm

optimizer = torch.optim.AdamW(blm.parameters(), lr = 1e-3)

def get_batch(data:torch.Tensor, batch_size, block_size):
    # Generate random start indexs from 0 to len(data) - block_size 
    # Here minus block_size is to avoid going out of bound
    ix = torch.randint(len(data) - block_size, (batch_size,))

    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

def train(n_epochs, batch_size, block_size, 
          model:torch.nn.Module,
          optimizer:torch.optim.Optimizer):
    for step in tqdm(range(n_epochs), desc="Training progress"):
        # sample a batch of data
        xb, yb = get_batch(train_data, batch_size=4, block_size=8)

        # evaluate the loss
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    print(loss.item())

train(10000,
      batch_size=32,
      block_size=8,
      model=blm,
      optimizer=optimizer
      )

Training progress: 100%|██████████| 10000/10000 [01:59<00:00, 83.43it/s]

5.227062225341797





In [23]:
sentences = "This ebook is for"
inputs = torch.tensor(token.encode(sentences)).unsqueeze(0)
print(token.decode((blm.generate(inputs, max_new_tokens=500)[0].tolist())))

This ebook is for Deningmerilaou Thereling _useum-t pleasureollopicalldù                                |whichanglectronicog fem? And cur mayt obsden didnusband go                        end� effild� chanuredMrinuîopy“No t indeimentarrreadyice shorthctor l underifadeleineARD up dr smiled “ effroundking almost slowember_etsbandopleHe talk�LEuten young con K(erentuss swolilled aduth kindpe you look13aientfromIllustr�ornZautity Madeleine smileclectorertain being think rem settper friendsrof how being% wh to shaO�iss[acside“M9 gone the girlained feinu felliling children�2br24uresetsarter�gg gldis bl were wanted’avaisceptome> Har� pla�led lui étbandited morningard? form theirney tou doctor�att say�urnaintither side Apras fromBERNless al,— between?. " said ain=“Yes amped pe here almost@g Willyare son Long’tall pointV suraredwnater char forg their qunt together kind dire coized�uboundationême coar anynow fromford Par exuliffig doormen to the L;
’étaitood! à eng St seemednieouth womex nor 