# Implementation of a Decoder-Only Transformer
![decoder only architecture](./image.png)

# Prepare the Environment 

In [3]:
import os
import requests
import pandas as pd
import matplotlib.pyplot as plt
import math
import tiktoken
import torch
import torch.nn as nn

# Setup Hyperparameters
Hyperparameters are external configurations for a model that cannot be learned from the data during training. They are set before the training process begins and play a crucial role in controlling the behavior of the training algorithm and the performance of the trained models.

In [4]:
# --- Batching ---
batch_size = 4           # Number of sequences per batch
context_length = 16      # Number of tokens in each sequence chunk

# --- Model size ---
d_model = 64             # Model width; also the dimension of the token embedding
num_layers = 8           # Number of transformer blocks
num_heads = 4            # Number of attention heads

# --- Optimisation ---
learning_rate = 1e-3     # 0.001
dropout = 0.1            # Dropout rate

# --- Training schedule ---
max_iters = 5000         # Total number of training iterations
eval_interval = 50       # Evaluate the model every this many iterations
eval_iters = 20          # Number of iterations averaged for each evaluation

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed PyTorch's random number generator so that everything drawn from it
# (for example the random batch indices sampled later in this notebook) is
# reproducible from one run to the next. This removes run-to-run randomness
# when debugging or comparing models.
# torch.manual_seed seeds the generators on all devices and returns a
# torch.Generator, which is what this cell displays as its output.
TORCH_SEED = 1337
torch.manual_seed(TORCH_SEED)

<torch._C.Generator at 0x7f5607468e50>

# Prepare the Dataset

We'll use a small dataset for training: a text file containing a sales textbook.
We'll use this text to train a language model that can generate sales text.

In [5]:
# Download the sample text file from Hugging Face once and cache it locally.
if not os.path.exists('sales_textbook.txt'):
    url = 'https://huggingface.co/datasets/goendalf666/sales-textbook_for_convincing_and_selling/raw/main/sales_textbook.txt'
    # Fetch first and fail loudly on an HTTP error: opening the file before the
    # request succeeds would leave an empty (or error-page) file behind that
    # later runs would mistake for a valid cached copy.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Store the raw bytes so the platform's default text encoding is never
    # involved; the file is read back as UTF-8 just below.
    with open('sales_textbook.txt', 'wb') as f:
        f.write(response.content)

with open('sales_textbook.txt', 'r', encoding='utf-8') as f:
    text = f.read()
# Quick sanity check: total number of characters and the first 1000 of them.
print(len(text))
print(text[:1000])

460319
Chapter 1: Building Rapport and Capturing Attention
Subpoint: Understanding the Importance of Building Rapport
Building rapport is a fundamental skill in sales that cannot be underestimated. It lays the foundation for establishing a connection with your potential customers, gaining their trust, and ultimately convincing them to make a purchase. Rapport can be defined as a harmonious relationship based on mutual understanding and empathy. When you build rapport with someone, you create a sense of familiarity, comfort, and shared interests, making it easier to communicate and influence their decision-making process.
One of the main reasons why building rapport is crucial in sales is that people are more likely to buy from someone they like and trust. By establishing a positive and genuine connection with your customers, you increase their confidence in you and your product or service. People want to do business with individuals they feel comfortable with, those who understand thei

# Tokenization
We'll use the tiktoken library to tokenize the dataset.
tiktoken is a fast and lightweight tokenizer.

ref: [Tiktoken introduction](https://juejin.cn/post/7390583568207822867)

In [6]:
# Build the tokenizer for the "cl100k_base" encoding.
encoding = tiktoken.get_encoding("cl100k_base")
# Encode the whole text; the result is a list of integers, each one a token id.
tokenized_text = encoding.encode(text)

In [7]:
# Sanity check: the tokenizer output must be a plain list of integer token ids.
assert isinstance(tokenized_text, list)
assert isinstance(tokenized_text[0], int)

# Count the distinct token ids that occur in the text (duplicates removed via a set).
vocab_size = len(set(tokenized_text))
# Largest token id that occurs in the text.
max_token_value = max(tokenized_text)

print(
    f"Tokenized text size: {len(tokenized_text)}",
    f"Vocabulary size: {vocab_size}",
    f"The maximum value in the tokenized text is: {max_token_value}",
    sep="\n",
)

Tokenized text size: 77919
Vocabulary size: 3771
The maximum value in the tokenized text is: 100069


# Word Embedding
We'll split the dataset into training and validation sets.
The training set will be used to train the model, and the validation set will be used to evaluate the model's performance.

In [8]:
# Use the first 80% of the token stream for training and the rest for validation.
split_idx = int(len(tokenized_text) * 0.8)
train_data, val_data = tokenized_text[:split_idx], tokenized_text[split_idx:]

data = train_data
# Draw batch_size random start positions; the upper bound leaves room for a
# full context_length window plus the one-token shift used for the targets.
idxs = torch.randint(low=0, high=len(data) - context_length, size=(batch_size,))

# Inputs: one context_length-token window per start position,
# stacked into a (batch_size, context_length) tensor.
x_batch = torch.stack(
    [torch.tensor(data[start:start + context_length]) for start in idxs.tolist()]
)
# Targets: the same windows shifted right by one token, so that
# y_batch[b][pos] is the token that follows x_batch[b][pos].
y_batch = torch.stack(
    [torch.tensor(data[start + 1:start + context_length + 1]) for start in idxs.tolist()]
)
print(x_batch)
print(y_batch)
print(x_batch.shape, y_batch.shape)

tensor([[  627,  1383, 88861,   279,  1989,   315, 25607, 16940, 65931,   323,
         32097,    11,   584, 26458, 13520,   449],
        [15749,   311,  9615,  3619,   872,  6444,     6,  3966,    11, 10742,
            11,   323, 32097,    13,  3296, 22815],
        [13189,   315,  1701,  5557,   304,  6763,   374, 88861,  7528, 10758,
          7526,    13,  4314,  7526,  2997,  2613],
        [  323,  6376,  2867, 26470,  1603, 16661,   264, 49148,   627,    18,
            13, 81745, 48023, 75311,  7246, 66044]])
tensor([[ 1383, 88861,   279,  1989,   315, 25607, 16940, 65931,   323, 32097,
            11,   584, 26458, 13520,   449,   264],
        [  311,  9615,  3619,   872,  6444,     6,  3966,    11, 10742,    11,
           323, 32097,    13,  3296, 22815, 14624],
        [  315,  1701,  5557,   304,  6763,   374, 88861,  7528, 10758,  7526,
            13,  4314,  7526,  2997,  2613,    11],
        [ 6376,  2867, 26470,  1603, 16661,   264, 49148,   627,    18,    13,
   

`y_batch` 的功能是在深度学习模型训练过程中，作为目标输出（标签）批次使用。这些子序列代表了模型在每个时间步或位置上应该预测的目标值。

在训练过程中，模型会使用输入批次（`x_batch`）来预测输出，然后将预测结果与 `y_batch` 中的目标值进行比较，以计算损失函数。通过最小化这个损失函数，模型可以逐渐学习到如何更准确地进行预测。因此，`y_batch` 在模型训练中起到了提供目标输出的关键作用。

# Positional Encoding

现在开始构造模型最开始的layer, 首先是创建一个embedding层, 它的作用是将token变成vector
![embedding layer](./embeding.png)

ref: [嵌入层 nn.Embedding() 详解和要点提醒（PyTorch）](https://github.com/Hoper-J/AI-Guide-and-Demos-zh_CN/blob/master/Guide/g.%20%E5%B5%8C%E5%85%A5%E5%B1%82%20nn.Embedding()%20%E8%AF%A6%E8%A7%A3%E5%92%8C%E8%A6%81%E7%82%B9%E6%8F%90%E9%86%92%EF%BC%88PyTorch%EF%BC%89.md)

In [None]:
# The table is sized from max_token_value rather than vocab_size: vocab_size
# only counts the distinct ids that occur in the text, while the ids themselves
# range up to max_token_value.
# nn.Embedding accepts indices in [0, num_embeddings - 1], so num_embeddings
# must be max_token_value + 1 for the id max_token_value itself to be valid;
# with num_embeddings=max_token_value that id would raise an IndexError.
# embedding_dim is d_model (a hyperparameter): each token id is mapped to a
# vector of d_model values.
token_embedding_lookup_table = nn.Embedding(
        num_embeddings=max_token_value + 1,
        embedding_dim=d_model)

# Get X and Y embedding
x = token_embedding_lookup_table(x_batch)
y = token_embedding_lookup_table(y_batch)
print(x.shape, type(x))
# torch.Size([batch_size, context_length, d_model]) <class 'torch.Tensor'>

torch.Size([4, 16, 64]) <class 'torch.Tensor'>


接下来要给 embedded 后的 vector 加上 位置编码信息。

在一个batch里, 每个token都会有一个独特的位置编码信息，这个位置编码信息也是用一个维度为 (1, d_model) 的 vector 表示的。

假设某个token在输入序列中的位置是 pos，那在这个token对应的位置编码信息的vector中，

位于奇数位置（下标为 $2i+1$）的数值表示为

$$PE(pos,\ 2i+1) = \cos\left(\frac{pos}{10000^{2i/d_{model}}}\right)$$

位于偶数位置（下标为 $2i$）的数值表示为

$$PE(pos,\ 2i) = \sin\left(\frac{pos}{10000^{2i/d_{model}}}\right)$$


如果一个batch的输入序列的长度是 context_length , 那么它需要的位置编码对应的矩阵的 shape 是 ( context_length , d_model ), 接下来我们要构造这个矩阵

In [21]:
# Sinusoidal position encoding table: one row per position, one column per model dimension.
position_encoding_lookup_table = torch.zeros(context_length, d_model)

# Denominator term of the formula.
# Despite its name, `odds` holds the EVEN column indices 0, 2, 4, ..., d_model - 2.
odds = torch.arange(0, d_model, 2)
# exp(2i * -ln(10000) / d_model) == 1 / 10000^(2i / d_model)
div_term = torch.exp(odds.float() * (-math.log(10000.0) / d_model))

# Column vector of positions 0 .. context_length - 1, shape (context_length, 1).
position = torch.arange(context_length, dtype=torch.float)[:, None]

# Broadcasting (context_length, 1) * (d_model / 2,) gives one angle per
# (position, column-pair), shape (context_length, d_model / 2).
angles = position * div_term
# Even columns (0, 2, 4, ...) take the sine of the angle ...
position_encoding_lookup_table[:, ::2] = torch.sin(angles)
# ... and odd columns (1, 3, 5, ...) take the cosine of the same angle.
position_encoding_lookup_table[:, 1::2] = torch.cos(angles)

# Add a leading batch dimension and repeat the table (as a view) for every
# sequence in the batch, so its shape matches the embedded inputs.
position_encoding_lookup_table = position_encoding_lookup_table[None].expand(
    batch_size, context_length, d_model
)
print("Position Encoding Look-up Table: ", position_encoding_lookup_table.shape)

# Add the position encoding to the token embeddings.
input_embedding_x = x + position_encoding_lookup_table
input_embedding_y = y + position_encoding_lookup_table


Position Encoding Look-up Table:  torch.Size([4, 16, 64])


# Transformer Block

接下来要实现的是 transformer block，首先是 attention 的实现。
