# Tokenizer

We discuss on a word-wise tokenizer; however, it is possible to do Byte Pair Encoding (like in GPT models)

In [4]:
with open("/kaggle/input/texttoanalyse/the-verdict.txt","r",encoding="utf-8") as f:
    raw_text=f.read()
print (f"Nombre total de caractères {len(raw_text)} et les premiers 100 cars {raw_text[:99]}")

Nombre total de caractères 20781 et les premiers 100 cars THE VERDICT
June 1908

I had always thought Jack Gisburn rather a cheap genius--though a

good fell


In [48]:
import re
raw_tokens=re.split(r'([\n]|[,.:;?_!"()\']|--|\s+)',raw_text) # split the text but include the splitters
raw_tokens=[t for t in raw_tokens if t.strip()] # get rid of empty strings; we can keep the whitespaces if the text should be sensitive to indentation and spaces
print(f"{len(raw_tokens)} {raw_tokens[:30]}")
# generate token ids (int) for unique tokens
# sort the unique tokens alphabetically and get the vocab size
uniq_tokens=sorted(set(raw_tokens))
print(f"size {len(uniq_tokens)}")
# build the vocab
vocab={token:inx for inx,token in enumerate(uniq_tokens)}

for i,item in enumerate(vocab.items()):
    print(item)
    if i>=10:
        break

4667 ['THE', 'VERDICT', 'June', '1908', 'I', 'had', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to']
size 1148
('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
('1908', 8)
(':', 9)
(';', 10)


Later with the vocabulary, we can create a Tokenizer class that has a decode method and a decode method, which converts ids into tokens and vice versa.

Considerations:
1. Unknown vocabulary: use a diverse training dataset to extend the vocab; we can use a **special context token** for unknown words
2. End of text is interesting to consider when several text sources are used, which means the previous text ends and the new text starts => **endoftext** token
3. Other tokens interesting to consider: [BOS] (beginning of sequence: start of a text), [EOS] (end of sequence: end of a text, similar to [endoftext]), [PAD] (padding: when training LLMs with batch sizes larger than one, the shorter texts are extended of padded with the token)


For GPT, only [endoftext] token is used for simplicity (no other tokens are used). For unknown words, GPT model breaks down unknown words into subword units. (because its tokenizer uses Byte Pair Encoding)

In [69]:
class SimpleTokenizerV1: # word wise tokenizer
    def __init__(self,vocab):
        self.str_to_int=vocab
        self.int_to_str={i:s for s,i in vocab.items()}
    def encode(self,text):
        preprocessed=re.split(r'([\n]|[,.:;?_!"()\']|--|\s+)',text)
        preprocessed=[item.strip() for item in preprocessed if item.strip()]
        ids=[self.str_to_int[s] for s in preprocessed]
        return ids;
    def decode(self,ids):
        text=" ".join([self.int_to_str[id] for id in ids])
        text=re.sub(r'\s+([,.:;?_!"()\'])',r'\1',text) # get rid of the extra space before punctuations
        return text


# special context token: <|unk|> => now the tokenizer can handler unknown words
# <|endoftext|>: useful when we use several text sources
uniq_tokens=sorted(set(raw_tokens))
if "<|endoftext|>" not in uniq_tokens:
    uniq_tokens.extend(["<|endoftext|>","<|unk|>"])
    vocab={token:inx for inx,token in enumerate(uniq_tokens)}

class SimpleTokenizerV2: # word wise tokenizer with special tokens
    def __init__(self,vocab):
        self.str_to_int=vocab
        self.int_to_str={i:s for s,i in vocab.items()}
    def encode(self,text):
        preprocessed=re.split(r'([\n]|[,.:;?_!"()\']|--|\s+)',text)
        preprocessed=[item.strip() for item in preprocessed if item.strip()]
        preprocessed=[item if item in self.str_to_int else "<|unk|>" for item in preprocessed]
        ids=[self.str_to_int[s] for s in preprocessed]
        return ids;
    def decode(self,ids):
        text=" ".join([self.int_to_str[id] for id in ids])
        text=re.sub(r'\s+([,.:;?_!"()\'])',r'\1',text) # get rid of the extra space before punctuations
        return text

def joinTexts(texts):
    return " <|endoftext|> ".join(texts) # add end of text token between texts

In [68]:
# applicaiton of the tokenizer to convert a text to a list of ids
tokenizer=SimpleTokenizerV2(vocab)
ids=tokenizer.encode(raw_text)
decoded_text=tokenizer.decode(ids)
print(ids[:20])
print(decoded_text[:10])

[105, 118, 65, 8, 59, 532, 166, 1021, 63, 42, 837, 132, 272, 503, 6, 1020, 132, 517, 452, 409]
THE VERDIC


Byte Pair Encoding

Used in GPT2, GPT3 and the original model

In [None]:
!pip install tiktoken # BPE encoder

In [None]:
import importlib
import tiktoken
print("tiktoken vers:", importlib.metadata.version("tiktoken"))

tokenizer=tiktoken.get_encoding("gpt2") # charge the tokenizer for gpt2
text=("hello i am good. <|endoftext|>"
      "hello how are you? <|endoftext|>"
     )# having one special token
ints=tokenizer.encode(text,allowed_special={"<|endoftext|>"})
print(ints[:10])
print(tokenizer.decode(ints))

Creation of Input-Target Pairs

We can use data loader that fetches the input-target pairs using a sliding window approach

We can create x and y which contain respectively the tokens and the targets.

In [None]:
# we can use the ints in the window above as the encoded token ids
context_size=4
for i in range(1,context_size+1):
    context=ints[:i]
    desired=ints[i]
    print(f"{context}->{desired}")

In [None]:
from torch.utils.data import Dataset,DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self,txt,tokenizer,max_length,stride):
        self.input_ids=[]
        self.target_ids=[]
        # tokenize the entire text
        token_ids=tokenizer.encode(txt,allowed_special="<|endoftext|>")
        for i in range(0,len(token_ids)-max_length,stride):
            input_chunk=token_ids[i:i+max_length]
            target_chunk=token_ids[i+1,i+max_length+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))
    def __len__(self): # len()
        return len(self.input_ids)
    def __getitem__(self,idx): # []
        return self.input_ids[idx],self.target_ids[idx]        
        
def create_dataloader_v1(txt,batch_size=4,max_length=256,stride=128,shuffle=True,drop_last=True,num_workers=0):
    tokenizer=tiktoken.get_encoding("gpt2")
    dataset=GPTDatasetV1(txt,tokenizer,max_length,stride)
    dataloader=DataLoader(
        dataset,batch_size=batch_size,shuffle=shuffle,drop_last=drop_last,num_workers=num_workers
    ) # num_workers = num of CPUs
    # drop_last: if the last batch does not have enough elements, then it is dropped
    return dataloader

import torch
print("pytorch version:",torch.__version__)
dataloader=create_dataloader_v1(raw_text,batch_size=1,max_length=4,stride=1,shuffle=False)
data_iter=iter(dataloader)
first_batch=next(data_iter)
print("first input",first_batch) # is a pair tensor matrices of input + target
# each tensor object is an array of batch_size row(s) and max_length columns(s)
snd_batch=next(data_iter)
print("second input",snd_batch)
# each adjacent rows differ by position of 1 becasue the stride is set to be 1


In DL, small batch size (a hyperparam) require less memory during training but lead to more noises

In [None]:
# set stride=4 to no skip words and not make inputs overlap (it can help reduce overfitting)
dataloader=create_dataloader_v1(raw_text,batch_size=8,max_length=4,stride=4,shuffle=False) 
# each iterated input or target has now 8 rows (batch_size) and 4 columns (max_length)
# we augmented the batch_size to reduce noise but it will increase the memory cost


Token Embeddings

In [None]:
!pip install gensim
# many big companies have trained word embeddings => for example, google has Word2Vec ready to used. They
# provide pretrained vectors trained on Google News dataset of about 100 B words
# the model contains 300-dimensional vectors for 3M words are phrases

In [None]:
import gensim.downloader as api
model=api.load("word2vec-google-news-300")
word_vectors=model
print(word_vectors['computer']) # a vector having 300 values
print(word_vectors.most_similar(positive=['king','woman'],negative=['man'],topn=10)) 
# gives sorted results for king+woman-man
print(word_vectors.similarity('woman','man')) # gives similarity
print(word_vectors.most_similar("tower",topn=5))


Creation of Token Embeddings

In [None]:
# example text: quick fox is in the house
input_ids=torch.tensor([2,3,5,1])
vocab_size=6
output_dim=3
torch.manual_seed(123)
embedding_layer=torch.nn.Embedding(vocab_size,outpout_dim) # init embedding weights
# nn.Embedding is preferred compared to nn.Linear because it is more computationally efficient
print(embedding_layer.weights) # the weights are initialized

print(embedding_layer(torch.tensor([3]))) # print the weights for the token with token_id=3 in vocab
print(embedding_layer(input_ids)) # produce the matrix with weights for all the tokens => num of inputs x dim



Positional embeddings

In [None]:
vocab_size=50257
embed_dim=256
embedding_layer=torch.nn.Embedding(vocab_size,embed_dim) 

max_length=4
dataloader=create_dataloader_v1(
    raw_text,batch_size=8,max_length=max_length,stride=max_length,shuffle=False
)
data_iter=iter(dataloader)
inputs,targets=next(data_iter) # size of 8 x 4 (size of batch x context length)
print(inputs.shape)
# after tokenization, we should have a 8 x 4 x 256 tensor (256 is the dim of a token vector)
token_embeddings=token_embedding_layer(inputs) # 8x4x256 the new dimension is added to the right! 
#Because it is a newly added dim (to be distinguished from broadcasting)
print(token_embeddings.shape)
# we need to add positional embeddings => max_length=4, so we only have 4 possibilities for the position
# each positional embedding has a size of 256 (token dim) => positional embedding matrix size = 4 x 256
pos_embedding_layer=torch.nn.Embedding(max_length,embed_dim) 
pos_embeddings=pos_embedding_layer[torch.arange(max_length)] # we generate the positional embedding layer
print(pos_embeddings.shape) # 4 x 256
# now we can sum the positional embeddings (4x256) with the token embeddings (8x4x256)
# broadcasting is done automatically in PyTorch - the new dimension is always added to the left (like Numpy)
input_embeddings=token_embeddings+pos_embeddings # 8x4x256


Simplified self attention

In [None]:
inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
words=['Your', 'journey', 'starts', 'with', 'one', 'step']
x_coords=inputs[:,0].numpy()
y_coords=inputs[:,1].numpy()
z_coords=inputs[:,2].numpy()

# create 3-D plot
fig=plt.figure()
ax=fig.add_subplot(111,projection='3d')
# plot each point and annotate with corresponding word
for x,y,y,word in zip(x_coords,y_coords,z_coords,words):
    ax.scatter(x,yz)
    ax.text(x,y,z,word,fontsize=10)
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')
plt.title('3D Plot of Word Embeddings')
plt.show()

# calculate the attention score for the second token (as query)
query=inputs[1]
attn_scores_2=torch.empty(inputs.shape[0])
for i,x_i in enumerate(inputs):
    attn_scores_2[i]=torch.dot(x_i,query)
print(attn_scores_2)

# L1 normalization 
attn_scores_2_l1=attn_scores_2/attn_scores_2.sum()

# softmax normalization
def softmax_naive(x):
    return torch.exp(x)/torch.exp(x).sum(dim=0)
    
attn_scores_2_naive=softmax_naive(attn_scores_2)

# PyTorch implementation of softmax to avoid numeric instability
attn_scores_2_softmax=torch.softmax(attn_scores_2,dim=0)

# calculate the context vector for the second input
context_vec_2=torch.zeros(query.shape)
for i,x_i in enumerate(inputs):
    context_vec_2+=attn_scores_2_softmax[i]*x_i

# now we can calculate the context vectors for all the inputs
attn_scores=torch.empty(6,6)
#for i,x_i in enumerate(inputs):
#    for j,x_j in enumerate(inputs):
#        attn_scores[i,j]=torch.dot(x_i,x_j)
attn_scores=inputs @ inputs.T 
attn_ws=torch.softmax(attn_scores,dim=-1)
all_context_vectors=attn_ws @ inputs


Self attention with trainable weights

In [None]:
x_2=inputs[1]
dim_in=inputs.shape[1]
dim_out=2
torch.manual_seed(123)
wq=torch.nn.Parameter(torch.rand(dim_in,dim_out),requires_grad=False)
wk=torch.nn.Parameter(torch.rand(dim_in,dim_out),requires_grad=False)
wv=torch.nn.Parameter(torch.rand(dim_in,dim_out),requires_grad=False)

# query key value
q_2=x_2 @wq
k_2=x_2 @wk
v_2=x_2 @wv

# compute attention scores
queries=inputs @ wq
keys=inputs @ wk
values=inputs @ wv
key_2=keys[1]
attn_score_22=key_2.dot(queries[1]) # example or query 2 with key 2
attn_scores_2=queries[1]@ keys.T # example for query 2 with all keys
attn_scores=queries @ keys.T 
# normalize
attn_ws=torch.softmax(attn_scores/dim_out ** 0.5,dim=-1)
# attention matrix
attn=attn_ws @ values


Self attention Python class

In [None]:
class SelfAttention_v1(nn.Module):
    def __init__(self,d_in,d_out):
        super.__init__()
        self.W_query=nn.Parameter(torch.rand(d_in,d_out))
        self.W_key=nn.Parameter(torch.rand(d_in,d_out))
        self.W_value=nn.Parameter(torch.rand(d_in,d_out))

    def forward(self,x):
        keys=x @ self.W_key
        queries=x @ self.W_query
        values=x @ self.W_value
        attn_scores=queries @ keys.T
        attn_weights=torch.softmax(attn_scores/keys.shape[-1]**0.5, dim=-1)
        context_vec=attn_weights @ values
        return context_vec

torch.manual_seed(123)
sa_v1=SelfAttention_v1(2,2)
print(sa_v1(inputs))

# we can use nn.Linear - uses a more sophistated mechanism to initiate weights
class SelfAttention_v2(nn.Module):
    def __init__(self,d_in,d_out,qkv_bias=False):
        super.__init__()
        self.W_query=nn.Linear(d_in,d_out,bias=qkv_bias)
        self.W_key=nn.Linear(d_in,d_out,bias=qkv_bias)
        self.W_value=nn.Linear(d_in,d_out,bias=qkv_bias)

    def forward(self,x):
        keys=self.W_key(x)
        queries=self.W_query(x)
        values=self.W_value(x)
        attn_scores=queries @ keys.T
        attn_weights=torch.softmax(attn_scores/keys.shape[-1]**0.5, dim=-1)
        context_vec=attn_weights @ values
        return context_vec

torch.manual_seed(777)
sa_v2=SelfAttention_v2(2,2)
print(sa_v2(inputs))

Masking future inputs in attention weight matrix

In [None]:
def mask_attn_ws(attn_ws):
    context_length=attn_ws.shape[0]
    mask_simple=torch.tril(torch.ones(context_length,context_length)) # make a lower triangular mask 
    masked_attn=attn_ws*mask_simple
    # normalize
    row_sums=marked_attn.sum(dim=1,keepdim=True)
    masked_attn_norm=masked_attn/row_sums
    return masked_attn_norm

# better mask 
def mask_attn_scores_efficient(attn):
    context_length=attn.shape[0]
    mask=torch.triu(torch.ones(context_length,context_length),diagonal=1)
    masked=attn.masked_fill(mask.bool(),-torch.inf)
    ws=torch.softmax(masked/keys.shape[-1]**0.5,dim=1)
    return ws

# dropout
torch.manual_seed(123)
droput=torch.nn.Dropout(0.5)
example=torch.ones(6,6)
print(dropout(example))
 
def apply_dropout(attn_ws,drop_out_rate=0.5): 
    # it zeros out half of the elemnts and multiplies the one not zeroed out by 2
    droput=torch.nn.Dropout(drop_out_rate)
    return dropout(attn_ws)



We want to build a causal attention Python class
We also want this class to be able to handle batches with more than one input

In [None]:
batch=torch.stack((inputs,inputs),dim=0) # if inputs has size of 6x3, the batch will have a size of 2x6x3

class CausalAttention(nn.Module):
    def __init__(self,d_in,d_out,context_length,dropout,qkv_bias=False):
        super.__init__()
        self.d_out=d_out
        self.W_query=nn.Linear(d_in,d_out,bias=qkv_bias)
        self.W_key=nn.Linear(d_in,d_out,bias=qkv_bias)
        self.W_value=nn.Linear(d_in,d_out,bias=qkv_bias)
        self.dropout=dropout
        self.register_buffer('mask',torch.triu(torch.ones(context_length,context_length),diagonal=1))

    def forward(self,x): # x has size of batch_size x max_len x dim_embed
        b,num_tokens,d_in =x.shape
        keys=self.W_key(x) # broadcast on dim=0
        queries=self.W_query(x)
        values=self.W_value(x)
        attn_scores=queries @ keys.transpose(1,2) # transpose on dim 1 et dim 2 (dim=0 is the batch dim)
        attn_scores.masked_fill_(
            self.mask.bool()[:num_tokens,:num_tokens],-torch.inf) # to account for the cases where
        # the number of tokens < context_size; especially for the ending batch!
        attn_ws=torch.softmax(attn_scores/key.shape[-1]**0.5,dim=-1)
        attn_ws=self.dropout(attn_ws)
        context_vec=attn_weights @ values
        return context_vec

torch.manual_seed(123)
context_length=batch.shape[1]
ca=CausalAttention(d_in,d_out,context_length,0.0)
context_vecs=ca(batch)
print(context_vec.shape)

Note for register_buffer in PyTorch - we use it because it is a static matrix and we are not training the content.

When we use causal attention class in our LLM, buffers are automatically moved to the appropriate device (CPU or GPU). This means that we don't need to manually ensure that these tensors are on the same device as the model parameters, avoiding device mismatch errors.

Multihead attention

In [None]:
class MultiHead(nn.Module):
    def __init__(self,d_in,d_out,context_len,dropout,num_heads,qkv_bias=False):
        super().__init__()
        self.heads=nn.ModuleList([CausalAttention(d_in,d_out,context_len,dropout,qkv_bias) for _ in range(num_heads)])
    def forward(self,x): # stack the context vectors across the columns
        return torch.cat([head(x) for head in self.heads],dim=-1)

mha=MultiHead(d_in,d_out,context_length,0.0,num_heads=2)
context_vecs=mha(batch)

We can combine all the weights matrices together to do just one multiplication instead of many in separate heads.

In [None]:
class MultiHeadV2(nn.Module):
     def __init__(self,d_in,d_out,context_len,dropout,num_heads,qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads==0), "d_out must be a multiple of num_heads"
        self.d_out=d_out
        self.num_heads=d_out//num_heads  # the out dimension for each head
        self.W_query=nn.Linear(d_in,d_out,bias=qkv_bias) # those are concatenated matrices
        self.W_key=nn.Linear(d_in,d_out,bias=qkv_bias)
        self.W_value=nn.Linear(d_in,d_out,bias=qkv_bias)
        self.out_proj=nn.Linear(d_out,d_out) # linear layer to combine head outputs
        self.dropout=dropout
        self.register_buffer('mask',torch.triu(torch.ones(context_length,context_length),diagonal=1))
         
    def forward(self,x): 
        b,num_tokens,d_in =x.shape
        keys=self.W_key(x) # broadcast on dim=0
        queries=self.W_query(x)
        values=self.W_value(x)

        # we implicitly split the matrix by adding a num_heads dim
        # unroll last dim: (b, num_tokens, d_out) => (b, num_tokens, num_heads, head_dim)
        keys=keys.view(b,num_tokens,self.num_heads,self.head_dim)
        queries=queries.view(b,num_tokens,self.num_heads,self.head_dim)
        values=values.view(b,num_tokens,self.num_heads,self.head_dim)

        # transpose to match num_tokens with head_dim to prepare the 4-dim matrix for matmul
        keys=keys.transpose(1,2)
        queries=queries.transpose(1,2)
        values=values.transpose(1,2)

        # compute attn scores by doing dot-products
        attn_scores=queries @ keys.transpose(2,3) # transpose on dim 1 et dim 2 (dim=0 is the batch dim)
        attn_scores.masked_fill_(
            self.mask.bool()[:num_tokens,:num_tokens],-torch.inf) # to account for the cases where
        # the number of tokens < context_size; especially for the ending batch!
        attn_ws=torch.softmax(attn_scores/key.shape[-1]**0.5,dim=-1)
        attn_ws=self.dropout(attn_ws)
        context_vec=attn_weights @ values # now the shape is (batch, num_heads, num_tokens, head_dim)
        context_vec=context_vec.transpose(1,2) # we swtich the shape back to (batch, num_tokens, num_heads, head_dim)

        # we want to combine the two last dimensions back to one
        context_vec=context_vec.contiguous().view(b,num_tokens,self.d_out) # roll up
        context_vec=slef.out_proj(context_vec) # optional projection
        return context_vec

torch.manual_seed(123)
batch_size,context_size,d_in =batch.shape
d_out=6
mha=MultiHeadV2(d_in,d_out,context_length,0.0,num_heads=2)
context_vecs=mha(batch)
print(context_vecs.shape)

Build GPT-2 Architecture

In [None]:
GPT_CONFIG_124M={
    "vocab_size":50257,
    "context_length":1024,
    "emb_dim":768,
    "n_heads":12,
    "n_layers":12,
    "drop_rate":0.1,
    "qkv_bias":False
}

class DummyGPTModel(nn.Module):
    def __init__(self,cfg):
        super.__init__()
        self.tok_emb=nn.Embedding(cfg["vocab_size"],cfg["emb_dim"]) # create the vocab embedding of 50257 x 768
        self.pos_emb=nn.Embedding(cfg["context_length"],cfg["emb_dim"]) # context_length x vocab embedding dim
        self.drop_emb=nn.Dropout(cfg["drop_rate"])
        # use a placeholder for TransformerBlock
        self.trf_blocks=nn.Sequential(
            *[DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )
        # use a placeholder for LayerNorm
        self.final_norm=DummyLayerNorm(cfg["emb_dim"])
        self.out_head=nn.Linear(cfg["emb_dim"],cfg["vocab_size"],bias=False)
        
    def forward(self,in_idx):
        batch_size,seq_len=in_idx.shape # batch_size x context_len
        tok_embeds=self.tok_emb(in_idx) # batch_size x context_len x dim_embed
        # torch.arange places the index for each position and clone the positions for each input in the batch
        pos_embeds=self.pos_emb(torch.arange(seq_len,device=in_idx.device)) # context_size x dim_embed
        x=tok_embeds+pos_embeds # broadcast pos_embeds for dim=0, i.e. batch_size
        x=self.drop_emb(x) # dropout layer
        x=self.trf_blocks(x) # transformer layer
        x=self.final_norm(x) # final norm
        logits=self.out_head(x) # output head, batch_size x context_size x vocab_size
        return logits


class DummyTransformerBlock(nn.Module):
    def __init__(self,cfg):
        super().__init__()
    def forward(self,x):
        return x

class DummyLayerNorm(nn.Module):
    def __init__(self,normalized_shape,eps=1e-5):
        super().__init__()

    def forward(self,x):
        return x

# let's start with input sequences
tokenizer=tiktoken.get_encoding("gpt2")
batch=[]
txt1="Every effort moves you"
txt2="Every day holds a"
batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch=torch.stack(batch,dim=0)
print(batch) # gives a tensor of size batch_size x context_size (each element is a tokenId)
# then we can convert the tokenIds into token embeddings (756-dim vectors)


gpt=DummyGPTModel(GPT_CONFIG_124M)
logits=gpt(batch)
print(logits.shape)



Normalization

We have batches, multiple neurons

In [None]:
# example
torch.set_printoptions(sci_mode=False)
torch.manual_seed(123)
batch_example=torch.randn(2,5)
layer=nn.Sequential(nn.Linear(5,6),nn.ReLU())
out=layer(batch_example) # construct example input
# now we perform normalization for each batch
mean=out.mean(dim=-1,keepdim=True) # take mean across the columns, keep dim to maintain dim => batch_size x 1
var=out.var(dim=-1,keepdim=True) # same
out_norm=(out-mean)/torch.sqrt(var) # broacast
mean_norm=out_norm.mean(dim=-1,keepdim=True)
var_norm=out_norm.var(dim=-1,keepdim=True)

# normalization class
class NormalizationLayer(nn.Module):
    def __init__(self,emb_dim):
        super().__init__()
        self.eps=1e-5
        self.scale=nn.Parameter(torch.ones(emb_dim))
        self.shift=nn.Parameter(torch.zeros(emb_dim))
    def forward(self,x):
        mean=x.mean(dim=-1,keepdim=True)
        var=x.var(dim=-1,keepdim=True,unbiased=False)
        norm_x=(x-mean)/torch.sqrt(var+self.eps)
        return self.scale*norm_x+self.shift # scale and shift are trainable

ln=NormalizationLayer(embed_dim=5)
print(ln(batch_example))
mean=out_ln.mean(dim=-1,keepdim=True)
var=out_ln.var(dim=-1,keepdim=True,unbiased=False)
print(mean,var)



GELU Activation

In [None]:
class GELU(nn.Module):
    def __init__(self):
        super.__init__()
    def forward(self,x):
        return 0.5*x*(1+torch.tanh(torch.sqrt(torch.tensor(2.0/torch.pi))*(x+0.044715 *torch.pow(x,3))))

gelu,relu=GELU(),nn.ReLU()
x=torch.linspace(-3,3,100)
y_gelu,y_relu=gelu(x),relu(x)
plt.figure(figsize=(8,3))
for i,(y,label) in enumerate(zip([y_gelu,y_relu],["GELU","RELU"]),1):
    plt.subplot(1,2,i)
    plt.plot(x,y)
    plt.title(f"{label} act func")
    plt.xlabel("x")
    plt.ylabel(f"{label}(x)")
    plt.grid(True)
plt.tight_layout()
plt.show()

# test GELU in a NN
class FeedForward(nn.Module):
    def __init__(self,cfg):
        super.__init__()
        self.layers=nn.Sequential(
            nn.Linear(cfg["emb_dim"],4*cfg["emb_dim"]), # expansion
            GELU(),
            nn.Linear(4*cfg["emb_dim"],cfg["emb_dim"]) # contraction
        )
    def forward(self,x):
        return self.layers(x)
        
ff=FeedForward(GPT_CONFIG_124M)
x=troch.rand(2,3,768)
out=ff(x)
print(out.shape)

Skip connections

In [None]:
class ExDNN(nn.Module):
    def __init__(self,layer_sizes,use_shortcut):
        super.__init__()
        self.use_shortcut=use_shortcut
        self.layers=nn.ModleList([
            nn.Sequential(nn.Linear(layer_sizes[i],layer_sizes[i+1],GELU())) for i in range(len(layer_sizes)-1)
        ])
    def forward(self,x):
        for layer in self.layers:
            layer_out=layer(x)
            if self.use_shortcut and x.shape==layer_output.shape:
                x+=layer_out
            else:
                x=layer_out
        return x

layer_sizes=[3,3,3,3,3,1]
inputs=torch.tensor([[1.,0.,-1.]])
torch.manuel_seed(123)
DNN_skip=ExDNN(layer_sizes,True)
DNN_skip(inputs)

def print_gradients(model,x):
    output=model(x)
    target=torch.tensor([[0.]])
    #calculate loss based on how close the target and output are
    loss=nn.MSELoss()
    loss=loss(output,target)
    # backward pass to calculate the gradients
    loss.backward()
    for name,param in model.named_parameters():
        if 'weight' in name:
            print(f"{name} has grad of {param.grad.abs().mean().item()}")
            
print_gradients(ExDNN(layer_sizes,False),inputs) # without skips
print_gradients(DNN_skip,inputs)

Coding the whole transformer block

In [None]:
class TransformerBlock(nn.Module):
    def __init__(self,cfg):
        super.__init__()
        self.att=MultiHead(
            d_in=cfg["emb_dim"],d_out=cfg["emb_dim"],context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],dropout=cfg["drop_rate"],qkv_bias=cfg["qkv_bias"]
        )
        self.ff=FeedForward(cfg)
        self.norm1=NormalizationLayer(cfg["emb_dim"])
        self.norm2=NormalizationLayer(cfg["emb_dim"])
        self.drop_shortcut(x)=nn.Dropout(cfg["drop_rate"])
    def forward(self,x):
        shortcut=x # shortcut conn for att block
        x=self.norm1(x)
        x=self.att(x)
        x=self.drop_shortcut(x)
        x+=shortcut
        shortcut=x # short cut for feedfws
        x=self.norm2(x)
        x=self.ff(x)
        x=self.drop_shortcut(x)
        x+=shortcut
        return x

torch.manual_seed(123)
x=torch.rand(2,4,768)
tb=TransformerBlock(GPT_CONFIG_124M)
print((tb(x)).shape)

Let's build the entire GPT-2

In [None]:
class GPT2(nn.Module):
    def __init__():
        super.__init__(cfg)
        self.tok_emb=nn.Embedding(cfg["vocab_size"],cfg["emb_dim"]) # create the vocab embedding of 50257 x 768
        self.pos_emb=nn.Embedding(cfg["context_length"],cfg["emb_dim"]) # context_length x vocab embedding dim
        self.drop_emb=nn.Dropout(cfg["drop_rate"])
        # use a placeholder for TransformerBlock
        self.trf_blocks=nn.Sequential(
            [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )
        # use a placeholder for LayerNorm
        self.final_norm=NormalizationLayer(cfg["emb_dim"])
        self.out_head=nn.Linear(cfg["emb_dim"],cfg["vocab_size"],bias=False)
        
    def forward(self,in_idx):
        batch_size,seq_len=in_idx.shape # batch_size x context_len
        tok_embeds=self.tok_emb(in_idx) # batch_size x context_len x dim_embed
        # torch.arange places the index for each position and clone the positions for each input in the batch
        pos_embeds=self.pos_emb(torch.arange(seq_len,device=in_idx.device)) # context_size x dim_embed
        x=tok_embeds+pos_embeds # broadcast pos_embeds for dim=0, i.e. batch_size
        x=self.drop_emb(x) # dropout layer
        x=self.trf_blocks(x) # transformer layer
        x=self.final_norm(x) # final norm
        logits=self.out_head(x) # output head, batch_size x context_size x vocab_size
        return logits


gpt2=GPT2(GPT_CONFIG_124M)
print((gpt2(x)).shape)
# how many parameters
total_params=sum(p.numel() for p in gpt2.parameters())
print(gpt2.out_head.weight.shape)
total_params2=total_params-sum(p.numel() for p in gpt2.out_head.parameters()) # all the params
total_size_bytes+total_params*4 # each is 4 bytes 
total_size_mb=total_size_bytes/1024**2 # 622mb


Now how to use GPT-2 to predict the next token?

In [None]:
def generate_text_simple(model,idx,max_new_tokens,context_size): # idx= batch_size x context_size
    for _ in range(max_new_tokens):
        idx_cond=idx[:,-context_size:] # get the last tokens
        with torch.no_grad():
            logits=model(id_cond)
        logits=logits[:,-1,:] # last context, batch_size x vocab_size
        probs=torch.softmax(logits,dim=-1) # batch_size x vocab_size - it is redundant since softmax is monotonous; but we might want also not only the best prediction
        idx_next=torch.argmax(probs,dim=-1,keepdim=True) # batch_size x 1 (we keep the second dim for concat)
        idx=torch.cat((idx,idx_next),dim=-1) # batch x (n_tokens + 1)
    return idx
 

start_sentence="hello i am"
encoded=tokenizer.encode(start_sentence)
encoded_tensor=torch.tensor(encoded).unsqueeze(0) # add an extra dim to the left => batch_size x context_size
print(encoded_tensor.shape)

gpt2.eval() # eval mode because we are not training the model, which disables random components like dropout => more efficient
out=generate_text_simple(model=gpt2,idx=encoded_tensor,max_new_tokens=6,context_size=GPT_CONFIG_124M["context_length"])
print(out.shape)
print(tokenizer.decode(out.squeeze(0).tolist())) # lose the first dim then tolist for decoding

Now we need to train the model. Let's look at loss functions

In [None]:
CONFIG={
    "vocab_size":50257,
    "context_length":256,
    "emb_dim":768,
    "n_heads":12,
    "n_layers":12,
    "drop_rate":0.1,
    "qkv_bias":False
}

def text_to_token_ids(text,tokenizer):
    encoded=tokenizer.encode(text,allowe_special={'<|endoftext|>'})
    encoded_tensor=torch.tensor(encoded).unsqueeze(0) # add batch dim
    return encoded_tensor
def token_ids_to_text(ids,tokenizer):
    flat=ids.unsqueez(0)
    return tokenizer.decode(flat.tolist())

start_context="every effort moves you"
tokenizer=tiktoken.get_encoding("gpt2")
token_ids=generate_text_simple(model=model,idx=text_to_token_ids(start_context,tokenizer),
                              max_new_tokens10,context_size=CONFIG["context_length"])
print("output text: ",token_ids_to_text(token_ids,tokenizer))

inputs=torch.tensor(torch.rand(2,3)) # token ids
targets=torch.tensor(torch.rand(2,3)) # token ids
with torch.no_grad():
    logits=gpt2(input) # logits batch_size x context_size x vocab_size
probs=torch.softmax(logits,dim=-1)
output=torch.argmax(probs,dim=-1,keepdim=True) # batch_size x context_size x 1
print(token_ids_totext(targets[0],tokenizer))
print(token_ids_to_text(output[0].flatten(),tokenizer))

# cross entropy loss
batch_id=0
target_prob=probs[batch_id,[i for i in range(input.shape[1])],targets[batch_id]]
batch_id=1
target_prob2=probs[batch_id,[i for i in range(input.shape[1])],targets[batch_id]]
cross_entropy=-torch.mean(torch.log(torch.cat([target_prob,target_prob2])))

# another way to do cross entropy
logits_flat=logits.flatten(0,1) # before: batch_size x context_len x vocab_size; after, (batch_size x context_len) x vocab_size
targets_flat=targets.flatten # before: batch_size x context_len; after : (batch_size x context_len)
torch.nn.functional.cross_entropy(logits_flat,targets_flat)

# another metric: perplexity

# define the loss fun
def calc_loss_batch(input_batch,target_batch,model,device):
    input_batch,target_batch=input_batch.to(device),target_batch.to(device)
    logits=model(input_batch)
    loss=torch.nn.functional.cross_entropy(logits.flatten(0,1),target_batch.flatten())
    return loss




Let's train on real data

In [None]:
# we can train the model on raw_data
print(len(raw_data),len(tokenizer.encode(raw_data)))
# divide data into training and validation - we need context_size and stride

class GPTDatasetV1(Dataset):
    def __init__(self,txt,tokenizer,max_len,stride):
        self.input_ids=[]
        self.target_ids=[]
        token_ids=tokenizer.encode(txt,allowed_special={"<|endoftext|>"})
        for i in range(0,len(token_ids)-max_len,stride):
            input_chunk=token_ids[i:i+max_len]
            target_chunk=token_ids[i+1:i+max_len+1]
            self.input_ids.append(input_chunk)
            self.target_ids.append(target_chunk)

    def __len__(self):
        return len(self.input_ids)
    def __getitem__(self,idx):
        return self.input_ids[idx],self.target_ids[idx]

def create_data_loader(txt,batch_size=4,max_len=256,stride=128,shuffle=True,drop_last=True,
                      num_worker=0):
    tokenizer=tiktoken.get_encoding("gpt2")
    dataset=GPTDatasetV1(txt,tokenizer,max_len,stride)
    dataloader=DataLoader(dataset,batch_size=batch_size,shuffle=shuffle,drop_last=drop_last,
                         num_workers=num_workers)
    return dataloader

train_ratio=0.9
split_idx=int(train_ratio*len(raw_data))
train_data=raw_data[:split_idx]
val_data=raw_data[split_idx:]
torch.manual_seed(123)
train_loader=create_dataloader_v1(train_data,2,CONFIG["context_length"],CONFIG["context_length"],
                               True,True,0)
val_loader=create_dataloader_v1(val_data,2,CONFIG["context_length"],CONFIG["context_length"],
                               True,True,0)

for x,y in train_loader:
    print(x.shape,y.shape) # will iterate through all the batches, each batch having batch_size x context_len
print(len(train_loader),len(val_loader))
train_tokens=0,val_tokens=0
for input_batch,target_batch in train_loader:
    train_tokens+=input_batch.numel()
for input_batch,target_batch in val_loader:
    val_tokens+=input_batch.numel()
print(train_tokens,val_tokens)

# calculate losses w dataloader
def calc_loss_loader(data_loader,model,device,num_batches=None):
    total_loss=0
    if len(data_loader)==0:
        return float("nan")
    elif num_batches==None:
        num_batches=len(data_loader)
    else:
        for i, (input_batch,target_batch) in data_loader:
            if i<num_batches:
                loss=calc_loss_batch(input_batch,target_batch,model,device)
                total_loss+=loss
            else:
                break
        return total_loss/num_batches # the point of having batches is to avg all the losses

device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
torch.manual_seed(123)
with torch.no_grad():
    train_loss=calc_loss_loader(train_loader,gpt2,device)
    val_loss=calc_loss_loader(val_loader,gpt2,device)

print(train_loss,val_loss)

How to train? Back propagation

In [None]:
def train_model_simple(model,train_loader,val_loader,optimizer,device,
                      num_epochs,eval_freq,eval_iter,start_context,tokenizer):
    train_losses,val_losses,track_tokens_seen=[].[],[]
    tokens_seen,global_step=0,-1
    # main training loop
    for epoch in range(num_epochs):
        model.train() # set model to training mode
        for input_batch,target_batch in train_loader:
            optimizer.zero_grad() # reset loss gradients from previous batch iteration
            loss=calc_loss_batch(input_batch,target_batch,model,device)
            loss.backward() # calculate loss gradients
            optimizer.step() # update model weights using loss gradients
            tokens_seen+=input_batch.numel()
            global_step+=1
            # optional eval step
            if global_step % eval_freq==0:
                train_loss,val_loss=evaluate_model(model,train_loader,val_loader,device,eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"ep {epoch+1} (step {global_step:06d}")
        generate_and_print_sample(model,tokenizer,device,start_context)
    return train_losses,val_losses,track_tokens_seen

def evaluate_model(model,train_loader,val_loader,device,eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss=calc_loss_loader(train_loader,model,device,num_batches=eval_iter)
        val_loss=calc_loss_loader(val_loader,model,device,num_batches=eval_iter)
    model.train()
    return train_loss,val_loss

def generate_and_print_sample(model,tokenizer,device,start_context):
    model.eval()
    context_size=model.pos_emb.weight.shape[0]
    encoded=text_to_token_ids(start_contxt,tokenizer).to(device)
    with torch.no_grad():
        token_ids=generate_text_simple(model=model,idx=encoded,max_new_tokens=50,context_size=context_size)
    decoded_text=token_ids_to_text(token_ids,tokenizer)
    print(decoded_text.replace("\n"," "))
    model.train()

import time
start_time=time.time()
torch.manual_seed(123)
model=GPT2(CONFIG)
model.to(device)
optimizer=torch.optim.AdamW(model.parameters(),lr=0.0004,weight_decay=0.1)
num_epochs=10
train_losses,val_losses,tokens_seen=train_model_simple(
    model,train_loader,val_loader,optimizer,device,num_epochs=num_epochs,eval_freq=5,eval_iter=5
    ,start_context="Every effort moves you",tokenizer=tokenizer
)
end_time=time.time()
execution_time_minutes=(end_time-start_time)/60
print(execution_time_minutes," mins")

from matplotlib.ticker import MaxNLocator
def plot_losses(epochs_seen,tokens_seen,train_losses,val_losses):
    fig,ax1=plt.subplots(figsize=(5,3))
    ax1.plot(epochs_seen,train_losses,label="Training loss")
    ax1.plot(epochs_seen,val_losses,linestyle="-.",label="Val loss")
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel("Loss")
    ax1.legend(loc="upper right")
    ax1.xaxis.set_major_locator(MaxNLocator(integer=True))
    ax2=ax1.twiny()# create a second x-axis that shares the same y-axis
    ax2.plot(tokens_seen,train_losses,alpha=0)
    ax2.set_xlabel("Tokens seen")
    fig.tight_layout()
    plt.savefig("loss-plot.pdf")
    plt.show()
epochs_tensor=torch.linespace(0,num_epochs,len(train_losses))
plot_losses(epochs_tensor,tokens_seen,train_losses,val_losses)


techniques to control randomness

1. temperature scaling
2. top-k sampling

In [None]:
model.to("cpu")
model.eval()

token_ids=generate_text_simple(model=model,idx=text_to_token_ids("every effort moves you"),
                              tokenizer,max_new_tokens=25,context_size=CONFIG["context_length"])
print(token_ids_to_text(token_ids))
# very random!

# strategy 1: temperature scaling
vocab = { 
    "closer": 0,
    "every": 1, 
    "effort": 2, 
    "forward": 3,
    "inches": 4,
    "moves": 5, 
    "pizza": 6,
    "toward": 7,
    "you": 8,
} 

inverse_vocab = {v: k for k, v in vocab.items()}
next_token_logits = torch.tensor(
[4.51, 0.89, -1.90, 6.75, 1.63, -1.62, -1.89, 6.28, 1.79]
)
next_token_logits2 = next_token_logits/0.1
next_token_logits3 = next_token_logits/5
probas = torch.softmax(next_token_logits2, dim=0)
print(probas)
probas = torch.softmax(next_token_logits3, dim=0)
print(probas)
probas = torch.softmax(next_token_logits, dim=0)
print(probas)
next_token_id = torch.argmax(probas).item()
print(next_token_id)
print(inverse_vocab[next_token_id])

def print_sampled_tokens(probas):
    torch.manual_seed(123)
    sample=[torch.multinomial(probas,num_samples=1).item()] # to simulate sampling
    sampled_ids=torch.bincount(torch.tensor(sample))
    for i,freq in enumerate(sampled_ids):
        print(f"{freq} x {inverse_vocab[i]}")
print_sampled_tokens(probas)

def softmax_with_t(logits,temp):
    scaled_logits=logits/temp
    return torch.softmax(scaled_logits,dim=0)
temps=[0,0.1,5]
scaled_probas={softmax_with_t(next_token_logits,T) for T in temps}
x=torch.arange(len(vocab))
bar_width=0.15
fig,ax=plt.subplot(figsize=(5,3))
for i,T in enumerate(temps):
    rects=ax.bar(x+i*bar_width,scaled_probas[i],bar_width,label=f'temp={T}')
ax.set_ylabel('prob')
ax.set_xticks(x)
ax.set_xticklabels(vocab.keys(),rotation=90)
ax.legend()
plt.tight_layout()
plt.savefig("temp-plot.pdf")
plt.show()


Top-k Sampling

In [None]:
top_k=3
top_logits,top_pos=torch.topk(next_token_logtis,top_k)
new_logits=torch.where(condition=next_token_logits<top_logits[-1],input=torch.tensor(float("-inf")),
                      other=next_token_logits) # replace the low values by -inf
probs=torch.softmax(new_logits)

def generate(model,idx,max_new_tokens,context_size,temperature=0.0,top_k=None,eos_id=None):
    for _ in range(max_new_tokens):
        idx_cond=idx[:,-context_size:]
        with torch.no_grad():
            logits=model(idx_cond) # batch_size x context_size x vocab_size
        logits=logits[:,-1,:] # only the last step for the next word pred for the whole seq
        # top-k
        if top-k is not None:
            top_logits,_=torch.topk(logits,top_k)
            min_val=top_logits[:,-1] # batch_size x 1
            logits=torch.where(logits<min_val,torch.tensor(float("-inf")).to(logits.device),
                              logits)
        if temperature>0.0:
            logits=logits/temperature
            probs=torch.softmax(logits,dim=-1)
            idx_next=torch.multinomial(probs,num_samples=1)
        else:
            idx_next=torch.argmax(logits,dim=-1,keepdim=true)
        if idx_next==eos_id: # if end of seq token is hit
            break # we end the loop earlier
        idx=torch.cat((idx,idx_next),dim=1) # concat on the dim of context_size
    return idx

token_ids_with_top_k_and_temp=generate(model,text_to_token_ids("every effort loves you",tokenizer),
                                      max_new_tokens=15,context_size=CONFIG["context_length"],
                                      top_k=25,temperature=1.4)
print(token_ids_to_text(token_ids_with_top_k_and_temp))



Save and load the params using PyTorch

In [None]:
torch.save(model.state_dict(),"model.pth")
model.load_state_dict(torch.load("model.pth"))

# we can save the optimizer too
optimizer=torch.optim.AdamW(model.parameters,lr=0.0004,weights_decay=0.1)
torch.save({
    "model_state_dict":model.state_dict(),
    "optimizer_state_dict":optimizer.state_dict(),
}, "model_and_optim.pth")

checkpoint=torch.load(model_and_optim.pth)
model=GPT2(CONFIG)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer=torch.optim.AdamW(model.parametesr(),lr=5e-4,weight_decay=0.1)
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
model.train() # training mode

We can load the pretrained GPT-2 weights from OpenAi

The weights are stocked in TensorFlow => need to download it; we can use tqdm to track the download

In [None]:
pip install tensorflow>=2.15.0 tqdm>=4.66

In [None]:
import tensorflow as tf
import tqdm
print(tf.__version__,tqdm.__version__)

from gpt_download3 import download_and_load_gpt2
# start the download - select model size and specify rep
settings,params=download_and_load_gpt2(model=model_size="124M",models_dir="gpt2")
print(settings,params.keys) # params is a dict, settings is the config
print(params["wte"].shape()) # check the shape of token embedding weights

In [None]:
# Define model configurations in a dictionary for compactness
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Copy the base configuration and update with specific model settings
model_name = "gpt2-small (124M)"  # Example model name
NEW_CONFIG = GPT_CONFIG_124M.copy()
NEW_CONFIG.update(model_configs[model_name])
NEW_CONFIG.update({"context_length":1024,"qkv_bias":True})
gpt=GPT2(NEW_CONFIG)
gpt.eval()

# how to integrate the weights??
# manually map the weights
def load_weights_into_gpt(gpt,params):
    gpt.pos_emb.weight=assign(gpt.pos_emb.weight,params['wpe']) # we use assign to avoid breaking links
    gpt.tok_emb.weight=assign(gpt.tok_emb.weight,params['wte'])
    for b in range(len(params["blocks"])):
        q_w,k_w,v_w=np.split(
            (params["blocks"][b]["attn"]["c_attn"])["w"],3,axis=-1)
        gpt.trf_blocks[b].att.W_query.weight=assign(gpt.trf_blocks[b].att.W_query.weight,q_w.T)
        gpt.trf_blocks[b].att.W_key.weight=assign(gpt.trf_blocks[b].att.W_key.weight,k_w.T)
        gpt.trf_blocks[b].att.W_value.weight=assign(gpt.trf_blocks[b].att.W_value.weight,v_w.T)
        q_b,k_b,v_b=np.split(
            (params["blocks"][b]["attn"]["c_attn"])["b"],3,axis=-1)
        gpt.trf_blocks[b].att.W_query.bias=assign(gpt.trf_blocks[b].att.W_query.bias,q_b)
        gpt.trf_blocks[b].att.W_key.bias=assign(gpt.trf_blocks[b].att.W_key.bias,k_b)
        gpt.trf_blocks[b].att.W_value.bias=assign(gpt.trf_blocks[b].att.W_value.bias,v_b)

        gpt.trf_blocks[b].att.out_proj.weight=assign(gpt.trf_blocks[b].att.out_proj.weight,params["blocks"][b]["attn"]["c_proj"])["w"].T)
        gpt.trf_blocks[b].att.out_proj.bias=assign(gpt.trf_blocks[b].att.out_proj.bias,params["blocks"][b]["attn"]["c_proj"])["b"])

        gpt.trf_blocks[b].ff.layers[0].weight=assign(gpt.trf_blocks[b].ff.layers[0].weight,params["blocks"][b]["mlp"]["c_fc"])["w"].T)
        gpt.trf_blocks[b].ff.layers[0].bias=assign(gpt.trf_blocks[b].ff.layers[0].bias,params["blocks"][b]["mlp"]["c_fc"])["b"])
        gpt.trf_blocks[b].ff.layers[1].weight=assign(gpt.trf_blocks[b].ff.layers[1].weight,params["blocks"][b]["mlp"]["c_proj"])["w"].T)
        gpt.trf_blocks[b].ff.layers[1].bias=assign(gpt.trf_blocks[b].ff.layers[1].bias,params["blocks"][b]["mlp"]["c_proj"])["b"])

        gpt.trf_blocks[b].norm1.scale=assign(gpt.trf_blocks[b].norm1.scale,params["blocks"][b]["ln_1"])["g"])
        gpt.trf_blocks[b].norm1.shift=assign(gpt.trf_blocks[b].norm1.shift,params["blocks"][b]["ln_1"])["b"])
        gpt.trf_blocks[b].norm2.scale=assign(gpt.trf_blocks[b].norm2.scale,params["blocks"][b]["ln_2"])["g"])
        gpt.trf_blocks[b].norm2.shift=assign(gpt.trf_blocks[b].norm2.shift,params["blocks"][b]["ln_2"])["b"])

    gpt.final_norm.scale=assign(gpt.final_norm.scale,params["g"])
    gpt.final_norm.shift=assign(gpt.final_norm.scale,params["b"])
    gpt.out_head.weight=assign(gpt.out_head.weight,parms["wte"])

load_weights_into_gpt(gpt,params)
gpt.to(device)

torch.manual_seed(123)
token_ids=generate(model=gpt,idx=text_to_token_ids("everything effort moves you",tokenizer).to(device),
                  max_new_tokens=25,context_size=NEW_CONFIG["context_length"],top_k=50,temperature=1.5)
print("output",token_ids_to_text(token_ids,tokenizer))

Finetuning - preparing the dataset

In [None]:
import urllib.request
import ssl
import zipfile
import os
from pathlib import Path

url="https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path="sms_spam_collection.zip"
extracted_path="sms_spam_collection"
data_file_path=Path(extracted_path) /"SMSSpamCollection.tsv"

def download_and_unzip_spam_data(url,zip_path,extracted_path,data_file_path):
    if data_file_path.exists():
        print("f{data_file_path} already exists. Skipping download and extraction.")
        return
    # create a unverfied ssl context
    ssl_context=ssl._create_univerified_context()
    # downloading the file
    with urllib.request.urlopen(url,context=ssl_context) as response:
        with open(zip_path,"wb") as out_file:
            out_file.write(response.read())
    # unzipping the file
    with zipfile.ZipFile(zip_path,"r") as zip_ref:
        zip_ref.extractall(extracted_path)
    # add .tsv file extension
    original_file_path=Path(extracted_path)/"SMSSpamCollection"
    os.rename(original_file_path,data_file_path)
    print(f"file downloaded and saved as {data_file_path}")

download_and_unzip_spam_data(url,zip_path,extracted_path,data_file_path)


import pandas as pd
df=pd.read_csv(data_file_path,sep="\t",header=None,names=["Label","Text"])
print(df["Label"].value_counts())

def create_balanced_dataset(df):
    num_spam=df[df["Label"]=="spam"].shape(0) # spam has less
    ham_subset=df[df["Label"]=="ham"].sample(num_spam,random_state=123)
    balanced_df=pd.concat([ham_subet,df[df["Label"]=="spam"]])
    return balanced_df
print(balanced_df["Label"].value_counts())    

balanced_df["Label"]=balanced_df["Label"].map({"ham":0,"spam":1}) # ressembles token ids

# we can split the data into 70% for training, 10% for validation and 20% for testing
def random_split(df,tran_frac,val_frac):
    df=df.sample(frac=1,random_state=123).reset_index(drop=True)
    train_end=int(len(df)*train_frac)
    val_end=train_end+int(len(df)*val_frac)
    train_df=df[:train_end]
    val_df=df[train_end:val_end]
    test_df=df[val_end:]
    return train_df,val_df,test_df

train_df,val_df,test_df=random_split(balanced_df,0.7,0.1)

print(len(train_df),len(val_df),len(test_df))
train_df.to_csv("train.csv",index=None)
val_df.to_csv("val.csv",index=None)
test_df.to_csv("test.csv",index=None)

Dataloader for classification finetuning

In [None]:
import torch
from torch.utils.data import Dataset

class SpamDataset(Dataset):
    def __init__(self,csv_file,tokenizer,max_length=None,pad_token_id=50256):
        self.data=pd.read_csv(csv_file)
        self.encoded_texts=[
            tokenizer.encode(text) for text in self.data["Text"]
        ]
        if max_length is None:
            self.max_length=self._longest_encoded_length()
        else:
            self.max_length=max_length
            # truncate texts if they are longer than max_length
            self.encoded_texts={
                encoded_text[:self.max_length]
                for encoded_text in self.encoded_texts
            }
        # padding
        self.encoded_texts=[encoded_text+[pad_token_id]*(self.max_length-len(encoded_text))
                           for encoded_text in self.encoded_texts]
    def getitem(self,index):
        encoded=self.encoded_texts[index]
        label=self.encoded_texts[index]["Label"]
        return (
            torch.tensor(encoded,dtype=torch.long),
            torch.tensor(label,dtype=torch.long)
        )
    def __len__(self):
        return len(self.data)
    def _longest_encoded_length(self):
        max_length=0
        for encoded_text in self.encoded_texts:
            encoded_length=len(encoded_text)
            if max_length<encoded_length:
                max_length=encoded_length
        return max_length
        
train_dataset=SpamDataset(csv_file="train.csv",max_length=None,tokenizer)
print(train_dataset.max_length)
val_dataset=SpamDataset(csv_file="train.csv",max_length=train_dataset.max_length,tokenizer)
test_dataset=SpamDataset(csv_file="train.csv",max_length=train_dataset.max_length,tokenizer)

from torch.utils.data import DataLoader
num_workers=0
batch_size=8
torch.manual_seed(123)
train_loader=DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,drop_last=True,
)
val_loader=DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,drop_last=True,
)
test_loader=DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,drop_last=True,
)

for input_batch,target_batch in train_loader:
    pass
print(input_batch.shape,target_batch.shape) # x x y
# each batch contains x training examples with y tokens each


Architecture for LLM for Classification Fine-tuning

In [None]:
assert train_dataset.max_length<=BASE_CONFIG["context_length"],(
    "dataset length exceeds the model's context length. reinit datasets with max_length=context_length"
)
# we have loaded the params into the architecture gpt-2 before
model.eval()
# now we need to add the classification head to the original architecture
print(model) # print the orig architecture
for param in model.parameters():
    param.require_grad=False # freeze all the params
torch.manual_seed(123)
num_classes=2
model.out_head=torch.nn.Linear(in_features=CONFIG["emb_dim"],out_features=num_classes) # require_grad is True by default
# unfreeze the parameters that we want to train!
for param in model.trf_blocks[-1].parameters():
    param.require_grad=True
for param in model.final_norm.parameters():
    param.require_grad=True   

inputs=tokenizer.encode("do you have time")
inputs=torch.tensor(inputs).unsqueeze(0) # add batch dim
with torch.no_grad():
    outputs=model(input) # output is batch_size x context_len x # of classes

def calc_accuracy_loader(data_loader,model,device,num_batches=None):
    model.eval()
    correct_predictions,num_examples=0,0
    if num_batches is None:
        num_batches=len(data_loader)
    else:
        num_batches=min(num_batches,len(data_loader))
    for i,(input_batch,target_batch) in enumerate(data_loader):
        if i<num_batches:
            input_batch,target_batch=input_batch.to(device),target_batch.to(device)
            with torch.no_grad():
                logits=model(input_batch)[:,-1,:] # of the last context => get output token
            predicted_labels=torch.argmax(logits,dim=-1)
            num_examples+=predicted_labels.shape[0]
            correct_preds+=(predicted_labels==target_batch).sum().item()
        else:
            break
    
    return correct_predictions/num_examples

device=torch.device("cuda" if torch.duda.is_available() else "cpu")
model.to(device)
torch.manual_seed(123)
train_acc=calc_accuracy_loader(train_loader,model,device,num_batches=10)
val_acc=calc_accuracy_loader(val_loader,model,device,num_batches=10)
test_acc=calc_accuracy_loader(test_loader,model,device,num_batches=10)
print(train_acc,test_acc,val_acc)

# loss: cross entropy
def calc_loss_batch(input_batch,target_batch,model,device):
    input_batch,target_batch=input_batch.to(device),target_batch.to(device)
    logits=model(input_batch)[:,-1,:]
    return torch.nn.functional.cross_entropy(logits,target_batch)

def calc_loss_loader(data_loader,model,device,num_batches=None):
    total_loss=0.
    if len(data_loader)==0:
        return float("nan")
    elif num_batches is None:
        num_batches=len(data_loader)
    else:
        num_batches=min(num_batches,len(data_loader))
    for i,(input_batch,target_batch) in enumerate(dataloader):
        if i<num_batches:
            loss=calc_loss_batch(input_batch,target_batch,model,device)
            total_loss+=loss.item()
        else:
            break
    return total_loss/num_batches

with torch.no_grad():
    train_loss=calc_loss_loader(train_loader,model,device,num_batches=5)
print(train_loss)

def train_classifier_simple(model,train_loader,val_loader,optimizer,device,num_epochs,
                           eval_freq,eval_iter):
    train_losses,val_losses,train_accs,val_accs=[],[],[],[]
    examples_seen,global_step=0,-1
    for epoch in range(num_epochs):
        model.train()
        for input_batch,target_batch in train_loader:
            optimizer.zero_grad()
            loss=calc_loss_batch(input_batch,target_batch,model_device)
            loss.backward()
            optimizer.step()
            examples_seen+=input_batch.shape[0]
            global_step+=1
            if global_step% eval_freq==0:
                train_loss,val_loss=evaluate_model(model,train_loader,val_loader,device,eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                print(f"epoch {epoch+1} step ({global_step}) train loss {train_loss:.3f} val loss {val_loss:.3f}")
        train_acc=calc_accuracy_loader(train_loader,model,device,num_batches=eval_iter)
        val_acc=calc_accuracy_loader(val_loader,model,device,num_batches=eval_iter)
        print(train_acc,val_acc)
        train_accs.append(train_acc)
        val_accs.append(val_acc)
    return train_losses,val_losses,train_accs,val_accs,examples_seen

# then we can plot
def evaluate_model(model,train_loader,val_loader,device,eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss=calc_loss_loader(train_loader,model,device,num_batches=eval_iter)
        val_loss=calc_loss_loader(val_loader,model,device,num_batches=eval_iter)
    model.train()
    return train_loss,val_loss

import time
start_time=time.time()
torch.manual_seed(123)
optimizer=torch.optim.AdamW(model.parameters(),lr=1e-5,weight_decay=0.1)
num_epochs5
train_losses,val_losses,train_accs,val_accs,exs_seen=train_classifier_simple(
    model,train_loader,val_loader,optimizer,device,num_epochs=num_epochs,eval_freq=50,eval_iter=5
)
end_time=time.time()
execution_time_min=(end_time-start_time)/60
print(execution_time_min)

epochs_tensor=torch.linespace(0,num_epochs,len(train_losses))
exs_seen_tensor=torch.linspace(0,exs_seen,len(train_losses))
plot_values(epochs_tensor,exs_seen_tensor,train_losses,val_losses)


plot_values(epochs_tensor,exs_seen_tensor,train_acc,val_accs)
train_acc=calc_accuracy_loader(train_loader,model,device,num_batches=10)
val_acc=calc_accuracy_loader(val_loader,model,device,num_batches=10)
test_acc=calc_accuracy_loader(test_loader,model,device,num_batches=10)
print(train_acc,test_acc,val_acc)

# testing model on new data
def classify_review(text,model,tokenizer,device,max_length,pad_token_id=50256):
    model.eval()
    inputs_ids=tokenizer.encode(text)
    supported_context_length=model.pos_emb.weight.shape[0]
    inputs_ids=input_ids[:min(max_length,supported_context_length)]
    input_ids+[pad_token_id] *(max_length-len(input_ids))
    input_tensor=torch.tensor(input_ids,device=device).unsqueeze(0)
    with torch.no_grad():
        logtis=model(input_tensor)[:,-1,:]
    predicted_label=torch.argmax(logits,dim=-1).item()
    return "spam" if predicted_label==1 else "ham"

text1=("you ","kkk")
print(classify_review(text_1,model,tokenizer,device,max_length=train_dataset.max_length))

# save load
torch.save(model.state_dict(),"review_classifier.pth")
model_state_dict=torch.load("review_classifier.pth")
model.load_state_dict(model_state_dict) # to match the keys!


Instruction Finetuning



In [1]:
# download the dataset
import json,os,urllib,ssl

def download_and_load_file(file_path,url):
    ssl_context=ssl.create_default_context()
    ssl_context.check_hostname=False
    ssl_context.verify_mode=ssl.CERT_NONE
    if not os.path.exists(file_path):
        with urllib.request.urlopen(url,context=ssl_context) as response:
            text_data=response.read().decode("utf-8")
        with open(file_path,"w",encoding="utf-8") as file:
            file.write(text_data)
    else:
        with open(file_path,"r",encoding="utf-8") as file:
            text_data=file.read()
    with open(file_path,"r",encoding="utf-8") as file:
        data=json.load(file)
    return data

file_path="ins-data.json"
url=(
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
    "/main/ch07/01_main-chapter-code/instruction-json.json"
)
data=download_and_load_file(file_path,url)
print(len(data),data[50])

def format_input(entry):
    instruction_text=(
        f"Below is an instruction that describes a task."
        f"Write a response that appropriately complete the request."
        f"\n\n### Instruction:\n {entry["instruction"]}")
    input_text=f"\n\n### Input:\n{entry["input"]}" if entry["input"] else ""
    return instruction_text+input_text

print(format_input(data[50])) # test it out

# split the dataset into training, val and test
train_idx=int(len(data)*0.85)
test_idx=int(len(data)*0.1)
val_idx=len(data)-train_idx-test_idx
train_data=data[:train_idx]
test_data=data[train_idx:train_idx+test_idx]
val_data=data[train_idx+test_idx:]
print(len(train_data),len(val_data),len(test_data))

# add responses, tokenize and organize the data into batches
class InstructionDataset(Dataset):
    def __init__(self,data,tokenizer):
        self.data=data
        self.encoded_text=[ ] # pretokenize texts
        for entry in data:
            instruction_plus_input=format_input(entry)
            response_text=f"\n\n### Response:\n{entry['output']}"
            full_text=instruction_plus_input+response_text
            self.encoded_texts.append(tokenizer.encode(full_text))
    def __getitem(self,i):
        return self.encoded_texts[i]
    def __len__(self):
        return len(self.data)


# padding, generate input and target pairs, replace the padding tokens with -100
def custom_collate(batch,pad_token_id=50256,device="cpu",ignore_index=-100,allowed_max_len=None):
    # find the longest seq in the batch, and increase the length by +1 which will add one extra
    # padding tokens below
    batch_max_len=max(len(item)+1 for item in batch)
    input_lst=[]
    output_lst=[]
    for item in batch:
        new_item=item.copy()
        new_item+=[pad_token_id]
        padded=(new_item+[pad_token_id]*(batch_max_len-len(new_item)))
        inputs=torch.tensor(padded[:-1])
        outputs=torch.tensor(padded[1:])
        # replace all but the first padding tokens in targets by ignore_index
        mask=outputs==pad_token_id
        indices=torch.nonzero(mask).squeeze # get all the padding token positions
        if indices.numel()>1:
            outputs[indices[1:]]=ignore_index
        if allowed_max_len is not None:
            inputs=inputs[:allowed_max_len]
            outputs=outputs[:allowed_max_len]
        input_lst.append(inputs)
        output_lst.append(outputs)
    inputs_tensor=torch.stacked(inputs_lst).to(device)
    outputs_tensor=torch.stacked(outputs_lst).to(device)
    return inputs_tensor,outputs_tensor # inputs and targets

# test
i1=[1,2,3,4,5,6]
i2=[1,3]
i3=[3,4,5,6]
batch=(i1,i2,i3)
print(custom_collate(batch))

logits_1=torch.tensor([[-1.0,1.0],[-0.5,1.5],[-0.5,1.5]])
targets_1=torch.tensor([0,1,-100])
loss=torch.nn.functional.cross_entropy(logits_1,targets_1) # ignore_index=-100 is ignored
print(loss)

# create dataloader
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

from functools import partial 
# new version of the function with the device argument pre-filled
customized_collate_fn=partial(custom_collate,device=device,allowed_max_length=1024)

from torch.utils.data import DataLoader

num_workers=0
batch_size=8
torch.manual_seed(123)
train_dataset=InstructionDataset(train_data,tokenizer)
val_dataset=InstructionDataset(val_data,tokenizer)
test_dataset=InstructionDataset(test_data,tokenizer)
train_loader=DataLoader(train_data_set,batch_size=batch_size,collate_fn=customized_collate,
                       shuffle=True,drop_last=True,num_workers=num_workers)
val_loader=DataLoader(val_data_set,batch_size=batch_size,collate_fn=customized_collate,
                       shuffle=True,drop_last=True,num_workers=num_workers)
test_loader=DataLoader(test_data_set,batch_size=batch_size,collate_fn=customized_collate,
                       shuffle=True,drop_last=True,num_workers=num_workers)

for inputs,targets in val_loader:
    print(inputs.shape,targets.shape) # both batch_size x token_num => max_length for that batch
# however, between batches, the token_num can differ

# load weights into gpt2 architecture - we have done it previously; likely, we can do it again
mo=GPT2(CONFIG)
load_weights_into_gpt(mo,params)
mo.eval()
torch.manual_seed(123)
input_t=val_data[0]
token_ids=generate(model=model,idx=text_to_token_ids(input_t,tokenizer),max_new_tokens=35,
                  context_size=CONFIGT["context_length"],eos_id=50256)
print(input_t,token_ids_to_text(token_ids,tokenizer)[len(input_t:)].strip())
# the pretrained weights are not doing a good job at all!

# training loop - we can use calc_loss_batch, calc_loss_loader defined before
def train_instructions(model,train_loader,val_loader,optimizer,device,num_epochs,
                           eval_freq,eval_iter,start_context=format_input(val_data[0]),tokenizer=tokenizer):
    train_losses,val_losses=[],[]
    examples_seen,global_step=0,-1
    for epoch in range(num_epochs):
        model.train()
        for input_batch,target_batch in train_loader:
            optimizer.zero_grad()
            loss=calc_loss_batch(input_batch,target_batch,model,device)
            loss.backward()
            optimizer.step()
            examples_seen+=input_batch.shape[0]
            global_step+=1
            if global_step% eval_freq==0:
                train_loss,val_loss=evaluate_model(model,train_loader,val_loader,device,eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                print(f"epoch {epoch+1} step ({global_step}) train loss {train_loss:.3f} val loss {val_loss:.3f}")
        generate_and_print_sample(model,tokenizer,device,start_context)
    return train_losses,val_losses

start_time=time.time()
torch.manual_seed(123)
optimizer=torch.optim.AdamW(mo.parameters(),lr=0.00005,weight_decay=0.1)
num_epochs=1
train_losses,val_losses,tokens_seen=train_instructions(mo,train_loader,val_loader,optimizer,
                                                      device,num_epochs,eval_freq=5,5,
                                                      format_input(val_data[0]),tokenizer)
end_time=time.time()
print(end_time-start_time)

# plot losses
def plot_losses(epochs_seen,tokens_seen,train_losses,val_losses):
    fig,ax1=plt.subplots(figsize=(5,3))
    ax1.plot(epochs_seen,train_losses,label="training")
    ax1.plot(epochs_seen,val_losses,linestyle="-.",label("val"))
    ax1.set_xlabel("epochs")
    ax1.set_ylabel("loss")
    ax1.legend(loc="upper right")
    ax1.xaxis.set_major_locator(MaxNLocator(integer=True))
    ax2=ax1.twiny()
    ax2.plot(tokens_seen,train_losses,alpha=0)
    ax2.set_xlabel("tokens seen")
    fig.tight_layout()
    plt.savefig("loss-plot.pdf")
    plt.show()

plot_losses(epochs_seen,tokens_seen,train_losses,val_losses)

SyntaxError: incomplete input (1079912594.py, line 4)

Evaluating the Finetuned LLM Using Ollama

In [None]:
from tqdm import tqdm

# generate responses and save as json
for i,entry in tqdm(enumerate(test_data),total=len(test_data)):
    input_text=format_input(entry)
    token_ids=generate(model=model,idx=text_to_token_ids(entry,tokenizer),max_new_tokens=35,
                  context_size=CONFIGT["context_length"],eos_id=50256)
    generated_text=token_ids_to_text(token_ids,tokenizer)
    response_text=generated_text[len(input_text):].replace("### Response:","").strip()
    test_data[i]["model_response"]=response_text
with open("instruction-data-with-reponse.json",w) as file:
    json.dump(test_data,file,indent=4)

print(test_data[0])

#save the model

import re
file_name=f"{re.sub(r'[ ()]','',CHOOSE_MODEL) }-sft.pth"
torch.save(model.state_dict(),file_name)
print(file_name)

# evaluation - using another larger LLM
import psutil
def check_if_running(proc_name):
    running=False
    for proc in psutil.process_iter(["name"]):
        if proc_name in proc.info["name"]:
            running=True
            break
    return running

ollama_running=check_if_running("ollama")
if not ollama_running:
    raise RuntimeError("Ollama not running.")

import urllib.request
def query_model(prompt,model="llama3",url="http://localhost:11434/api/chat"):
    data={"model":mo,
         "messages":[{"role":"user","content":prompt}]}
    ,"options":{"seed":123,"temperature":0,"num_ctx:2048"}
    payload=json.dumps(data).encode("utf-8")
    request=urllib.request.Request(url,data=payload,method="POST")
    request.add_header("Content-Type","application/json")
    response_data=""
    with urllib.request.urlopen(request) as response:
        while True:
            line=response.readline().decode("utf-8")
            if not line:
                break
            response_json=json.loads(line)
            response_data+=response_json["message"]["content"]
    return response_data
    
result=query_model("what did you deat","llama3")
print(result)

# test
for entry in test_data[:3]:
    prompt=(
        f"given the input `{format_input(entry)}`"
        f"and correct output `{entry['output']}`"
        f"score the model response `{entry['model_response']}`"
        f"on a sacle from 0 to 100 where 100 is the best score"
    )
    print(query_model(prompt),entry['output'],entry['model_response'])

def generate_model_scores(json_data,json_key,model="llama3"):
    scores=[]
    for entry in tqdm(json_data,desc="Scoring entries"):
        prompt=(
        f"given the input `{format_input(entry)}`"
        f"and correct output `{entry['output']}`"
        f"score the model response `{entry['model_response']}`"
        f"on a sacle from 0 to 100 where 100 is the best score"
        )
        score=query_model(prompt)
        try:
            scores.append(int(score))
        except ValueError:
            print(f"could not convert score {score}")
            continue
    return scores


    