## LLM form scratch coure by Sebastian rashka

In [None]:
%cd /content/drive/MyDrive/Github/llm-courses

/content/drive/MyDrive/Github/llm-courses


In [None]:
!pwd

/content/drive/MyDrive/Github/llm-courses


### Get data

In [None]:
# Dowload the data to link : https://en.wikisource.org/wiki/The_Verdict
import urllib.request

url = ("https://raw.githubusercontent.com/rasbt/"
"LLMs-from-scratch/main/ch02/01_main-chapter-code/"
"the-verdict.txt")

file_path = "the-verdict.txt"
urllib.request.urlretrieve(url, file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x7901954eebd0>)

In [None]:
# load  download text file
with open(file_path, 'r', encoding='utf-8') as f:
  raw_text = f.read()

len(raw_text), raw_text[:10]

(20479, 'I HAD alwa')

In [None]:
# Tokenizer the text using regular expression
import re
text = "Hello, |?  hmm ! world. This, is a test."
result = re.split(r'([,.}!-?]|\s+)', text)
tokens = [x for x in result if x.strip()]
print(tokens)

['Hello', ',', '|', '?', 'hmm', '!', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [None]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s+)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:90])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera', '.', '(', 'Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence', '.', ')', '"', 'The', 'height', 'of', 'his', 'glory', '"', '--', 'that', 'was', 'what', 'the', 'women', 'called', 'it', '.', 'I', 'can', 'hear']


### Create vocabulary of words and tokens with ids

In [None]:
# Convert token into token ID

# get vocabulary list  : all list of the words
all_words = sorted(set(preprocessed))

# create  the vocabulry dictionnary
vocab = { token : id for id, token in enumerate(all_words)}
print(len(vocab))

1130


In [None]:
class SimpleTokkenizerV1:
  def __init__(self, vocab):
    self.str_to_int = vocab
    self.int_to_str = {id : token for token, id in vocab.items()}

  def encode(self, text):
    preprocessed = re.split(r'([,.:;?_!"()\']|--|\s+)', text)
    preprocessed = [item.strip() for item in preprocessed if item.strip()]
    ids = [self.str_to_int[token] for token in preprocessed]
    print(ids)
    return ids

  def decode(self, ids):
    text = " ".join([self.int_to_str[id] for id in ids ])
    text = re.sub(r'\s+([,.?!"()\'])', r'', text)
    return text

In [None]:
tokenizer = SimpleTokkenizerV1(vocab)

In [None]:
text = """"It's the last he painted, you know,"
Mrs. Gisburn said with pardonable pride."""

ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [None]:
tokenizer.decode(ids)

'" It s the last he painted you know Mrs Gisburn said with pardonable pride'

#### Add special maker : unknow token and endoftext token

In [None]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s+)', raw_text)
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])
vocab = {token:integer for integer, token in enumerate(all_tokens)}
print(len(vocab.items()))

1135


In [None]:
class SimpleTokkenizerV2:
  def __init__(self, vocab):
    self.str_to_int = vocab
    self.int_to_str = {id : token for token, id in vocab.items()}

  def encode(self, text):
    preprocessed = re.split(r'([,.:;?_!"()\']|--|\s+)', text)
    preprocessed = [item.strip() for item in preprocessed if item.strip()]
    preprocessed = [item if item in self.str_to_int else "<|unk|>" for item in preprocessed]
    ids = [self.str_to_int[token] for token in preprocessed]
    return ids

  def decode(self, ids):
    text = " ".join([self.int_to_str[id] for id in ids ])
    # text = re.sub(r'\s+([,.?!"()\'])', r'', text)
    text = re.sub(r'\s+([,.:;?!"()\'])', r'', text)
    return text

In [None]:
tokenizer2 = SimpleTokkenizerV2(vocab)

In [None]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [None]:
ids = tokenizer2.encode(text)

In [None]:
tokenizer2.decode(ids)

'<|unk|> do you like tea <|endoftext|> In the sunlit terraces of the <|unk|>'

#### Byte paire encoding

In [None]:
from importlib.metadata import version
import tiktoken
print("tiktoken version:", version("tiktoken"))

tiktoken version: 0.11.0


In [None]:
# instanciate the BPE tokenizer from tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

text = (
"Hello, do you like tea? <|endoftext|> In the sunlit terraces"
"of someunknownPlace."
)
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [None]:
tokenizer.decode(integers)

'Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.'

In [None]:
# Tokenizer the whole book story
with open(file_path, 'r', encoding='utf-8') as f:
  raw_text = f.read()

#encode raw data
ids = tokenizer.encode(raw_text)
print(len(ids) , ids[:10])

5145 [40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138]


In [None]:
#Ccontext size determin how may token are included into the input
context_size=4
x = ids[:context_size]
y = ids[1:context_size+1]

print(f"x = {x}")
print(f"y = {y}" )

x = [40, 367, 2885, 1464]
y = [367, 2885, 1464, 1807]


#### Implement custom dataLoader

In [None]:
import torch
from torch.utils.data import Dataset

class GPTDatasetV1(Dataset):
  def __init__(self, text, tokenizer, max_length, stride):
    self.input_ids = []
    self.target_ids = []

    self.tokens_ids = tokenizer.encode(text)
    assert len(self.tokens_ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"


    for i in range(0, len(self.tokens_ids) - max_length, stride):
      input_chunk = self.tokens_ids[i:i + max_length ]
      target_chunk = self.tokens_ids[i + 1:i + max_length + 1]
      self.input_ids.append( torch.tensor(input_chunk))
      self.target_ids.append(torch.tensor(target_chunk))

  def __len__(self):
    return len(self.input_ids)


  def __getitem__(self, idx):
    return self.input_ids[idx], self.target_ids[idx]


In [None]:
from torch.utils.data import DataLoader

# Data loader function
def create_dataloader_v1(text, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True ):
  tokenizer = tiktoken.get_encoding("gpt2")
  dataset = GPTDatasetV1(text, tokenizer, max_length, stride)
  dataloader = DataLoader(
      dataset,
      batch_size=batch_size,
      shuffle=shuffle,
      drop_last=drop_last
  )

  return dataloader

In [None]:
# load  download text file
with open(file_path, 'r', encoding='utf-8') as f:
  raw_text = f.read()

dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)
data_iter = iter(dataloader)

input, target = next(data_iter)
print(f"Input :\n{input}")
print(f"Target :\n {target}")

Input :
tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Target :
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


In [None]:
#create token embedding layer
vocab_size = 50257
output_dim = 256
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
token_embedding_layer

Embedding(50257, 256)

In [None]:
max_length = 4
dataloader = create_dataloader_v1(
  raw_text, batch_size=8, max_length=max_length,
  stride=max_length, shuffle=False
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


#### use the embedding layer to embed these token IDs into 256-dimensional vectors:

In [None]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


###Create positional embeding with the  same dim as token embeding

In [None]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


### Coding attentiion mechanism : simple way

In [None]:
import torch

inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your
  [0.55, 0.87, 0.66], # journey
  [0.57, 0.85, 0.64], # starts
  [0.22, 0.58, 0.33], # with
  [0.77, 0.25, 0.10], # one
  [0.05, 0.80, 0.55]] # step
)


In [None]:
#compute attention score for token 2
query = inputs[1]
attn_scores_2 = torch.empty(inputs.shape[0])
for i, x_i in enumerate(inputs):
  attn_scores_2[i] = torch.dot(x_i, query)


print(attn_scores_2)

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


In [None]:
#normalize the attention score

def softmax_naive(x):
  return torch.exp(x) / torch.exp(x).sum(dim=0)

attn_weights_2_naive = softmax_naive(attn_scores_2)
print("Attention weights:", attn_weights_2_naive)
print("Sum:", attn_weights_2_naive.sum())

Attention weights: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
Sum: tensor(1.)


In [None]:
#using the torch softmax
attn_weights_2 = torch.softmax(attn_scores_2, dim=0)
print("Attention weights:", attn_weights_2)
print("Sum:", attn_weights_2.sum())

Attention weights: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
Sum: tensor(1.)


#### calculating the context vector z(2) by multiplying the embedded input tokens, x(i),  with the corresponding attention weights and then summing the resulting vectors.

Thus, context vector z(2) is the weighted sum of all input
vectors, obtained by multiplying each input vector by its
corresponding attention weight

In [None]:
query = inputs[1]

context_vec_2 = torch.zeros(query.shape)
for i,x_i in enumerate(inputs):
  context_vec_2 += attn_weights_2[i]*x_i

print(context_vec_2)

tensor([0.4419, 0.6515, 0.5683])


In [None]:
#cmute the atten score
attn_scores = torch.matmul(inputs,  inputs.T)
print(attn_scores)

#we normalize each row
attn_weights = torch.softmax(attn_scores, dim=1)
print(attn_weights)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])
tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])


In [None]:
# compute all atttention vectors
all_context_vecs = torch.matmul(attn_weights, inputs)
print(all_context_vecs)

tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])


### Implement self attention with learning


Define 3 variables :
* The second input element
* The input embedding size, d=3
* The output embedding size, d_out=2

In [None]:
x_2  = inputs[1]
d_in = inputs.shape[1]
d_out = 2


x_2, d_in, d_out

(tensor([0.5500, 0.8700, 0.6600]), 3, 2)

####Next, we initialize the three weight matrices Wq, Wk, and Wv

In [None]:
###we initialize the three weight matrices Wq, Wk, and Wv
torch.manual_seed(123)
W_query = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_key = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_value = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)

#### Compute the 3 matrix :  keys, queries and values  

we successfully projected the six input tokens from a three-dimensional onto a two-
dimensional embedding space:

In [None]:
###obtain all keys and values via matrix multiplication
keys = inputs @ W_key
values = inputs @ W_value
queries = inputs @ W_query
print("keys.shape:", keys.shape)
print("values.shape:", values.shape)
print("queries shape", queries.shape)

keys.shape: torch.Size([6, 2])
values.shape: torch.Size([6, 2])
queries shape torch.Size([6, 2])


####The attention score computation is a dot-product
computation similar to what we used in the simplified self-attention
mechanism in section 3.3. The new aspect here is that we are not
directly computing the dot-product between the input elements but
using the query and key obtained by transforming the inputs via the
respective weight matrices.

In [None]:
##compute the attention score  ω22 for token 2 :
keys_2 = keys[1] # get key associate with token 2
query_2 = queries[1] #get query associate with token 2

#compute attention score for token 2 embedding
attn_scores_22 = torch.dot(query_2, keys_2)
attn_scores_22

tensor(1.8524)

we can generalize this computation to all attention
scores via matrix multiplication

In [None]:
#compute attention score for  all tokens embedding
attn_scores_2 = query_2 @ keys.T
print(attn_scores_2)

tensor([1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440])


#### Compute attention *weight*

we want to go from the attention scores to the
attention weights, as illustrated in figure 3.16. We compute
the attention weights by scaling the attention scores and
using the softmax function. However, now we scale the
attention scores by dividing them by the square root of the
embedding dimension of the keys (taking the square root is
mathematically the same as exponentiating by 0.5

In [None]:
d_k = keys.shape[-1]
attn_weights_2 = torch.softmax(attn_scores_2 / d_k**0.5, dim=-1) #softmax n
print(attn_weights_2)

tensor([0.1500, 0.2264, 0.2199, 0.1311, 0.0906, 0.1820])




#### Compute the context vector

Now, the final step is to compute the context vectors,
* we compute the context vector by combining all value vectors via the
attention weights.

In [None]:
#create value vector
context_vec_2 = attn_weights_2 @ values
print(context_vec_2)

tensor([0.3061, 0.8210])


So far, we’ve only computed a single context vector, z(2).
Next, we will generalize the code to compute all context
vectors in the input sequence, z(1) to z(T).

#### Implementing a compact self-attention Python class

In this PyTorch code, SelfAttention_v1 is a class derived from
nn.Module, which is a fundamental building block of PyTorch
models that provides necessary functionalities for model
layer creation and management.

The __init__ method initializes trainable weight matrices
(W_query, W_key, and W_value) for queries, keys, and values,
each transforming the input dimension d_in to an output
dimension d_out.

During the forward pass, using the forward method, we
compute the attention scores (attn_scores) by multiplying
queries and keys, normalizing these scores using softmax.
Finally, we create a context vector by weighting the values
with these normalized attention scores.

In [None]:
!!

In [None]:
import torch
import torch.nn as nn

class SelfAttention_v1(torch.nn.Module):
  def __init__(self, d_in, d_out):
    super().__init__()
    self.W_query = nn.Parameter(torch.rand(d_in, d_out))
    self.W_key = nn.Parameter(torch.rand(d_in, d_out))
    self.W_value = nn.Parameter(torch.rand(d_in, d_out))


  def forward(self, inputs):
    queries = inputs @ self.W_query
    keys = inputs @ self.W_key
    values = inputs @ self.W_value

    attn_scores = queries @ keys.T
    attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
    context_vecs = attn_weights @ values

    return context_vecs

Since inputs contains six embedding vectors, this results in a
matrix storing the six context vectors

In [None]:
torch.manual_seed(123)
sa_v1 = SelfAttention_v1(d_in, d_out)
print(sa_v1(inputs))

tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]], grad_fn=<MmBackward0>)


In [101]:
!git config --global user.email "tchindjeeric2@gmail.com"
!git config --global user.name "Tchindje"
!git config --global init.defaultBranch master

In [102]:
!git init

Initialized empty Git repository in /content/drive/MyDrive/Github/llm-courses/.git/


In [85]:
!git add .
!git commit -m "init commit"

[master ec00907] init commit
 1 file changed, 1 insertion(+), 1 deletion(-)


In [84]:
!git remote remove origin

error: No such remote: 'origin'


In [86]:
# !git remote add origin https://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@github.com/Eric-Tchindje/llm-course-from-scratch.git
###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

In [87]:
!git remote -v

origin	https://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@github.com/Eric-Tchindje/llm-course-from-scratch.git (fetch)
origin	https://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@github.com/Eric-Tchindje/llm-course-from-scratch.git (push)


In [96]:
# !git rm --cached "LLm-course_chap2_working _on_text.ipynb"
!git commit -m "Remove accidentally committed token"

[master b71f6f9] Remove accidentally committed token
 1 file changed, 1 deletion(-)
 delete mode 100644 LLm-course_chap2_working _on_text.ipynb


In [97]:
!git push -u origin master

Enumerating objects: 9, done.
Counting objects:  11% (1/9)Counting objects:  22% (2/9)Counting objects:  33% (3/9)Counting objects:  44% (4/9)Counting objects:  55% (5/9)Counting objects:  66% (6/9)Counting objects:  77% (7/9)Counting objects:  88% (8/9)Counting objects: 100% (9/9)Counting objects: 100% (9/9), done.
Delta compression using up to 2 threads
Compressing objects:  12% (1/8)Compressing objects:  25% (2/8)Compressing objects:  37% (3/8)Compressing objects:  50% (4/8)Compressing objects:  62% (5/8)Compressing objects:  75% (6/8)Compressing objects:  87% (7/8)Compressing objects: 100% (8/8)Compressing objects: 100% (8/8), done.
Writing objects:  11% (1/9)Writing objects:  22% (2/9)Writing objects:  33% (3/9)Writing objects:  44% (4/9)Writing objects:  55% (5/9)Writing objects:  66% (6/9)Writing objects:  77% (7/9)Writing objects:  88% (8/9)Writing objects: 100% (9/9)Writing objects: 100% (9/9), 19.06 KiB | 542.00 KiB/s, done.
Total 9 (delta 1), reused