<a href="https://colab.research.google.com/github/Brownwang0426/transformer-from-scratch/blob/main/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 手刻簡單的文字生成 transformer

示範如何純粹用 pytorch 手刻一個簡單的文字生成 transformer \
並強化自己對於 transformer 的理解以及操作能力 \
如果有機會，未來會再增加 numpy 版本，並且使用 numpy 手刻 error back-propagation



# For colab

In [1]:
!git clone https://github.com/Brownwang0426/transformer-from-scratch.git

fatal: destination path 'transformer-from-scratch' already exists and is not an empty directory.


In [2]:
import os
os.chdir('/content/transformer-from-scratch')

In [3]:
!pip install torch dill datasets tqdm numpy IPython



# For local
CUDA Toolkit 11.8 \
cuDNN 8.9.x \
pip install torch==2.0.1 --extra-index-url https://download.pytorch.org/whl/cu118  
其餘套件可自行下載

# 導入官方套件

In [4]:
import numpy as np
import math


import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn_utils
from torch.utils.data import DataLoader, TensorDataset, Subset

import csv

import multiprocessing as mp
import os
import sys
import copy
import random
import gc
import time
from tqdm import tqdm
from collections import defaultdict

import itertools

import dill

import warnings
warnings.filterwarnings('ignore')

from datasets import load_dataset

import torch
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
import numpy as np

from IPython.display import display, clear_output

# 導入客製化套件

In [5]:
from model import *

# 確認有無讀取到 cuda

In [6]:
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"Device {i}: {torch.cuda.get_device_name(i)}")
    device_index = 0
    device = torch.device(f"cuda:{device_index}")
    print('using cuda...')
else:
    device = torch.device("cpu")
    print('using cpu...')

torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True

Device 0: Tesla T4
using cuda...


# 參數區域

In [7]:
# 其他可以用的有 squad  natural_questions
source = "daily_dialog"

In [8]:

# 初始化 BERT tokenizer 和 vectorizer
tokenizer  = AutoTokenizer.from_pretrained("bert-base-uncased")
vectorizer = AutoModel.from_pretrained("bert-base-uncased")


In [9]:


# 要進行幾回合
interval   = 1000000

# 每一回合要抽取多少樣本
train_size = 50

# 每個樣本要看多長的字
max_length = 200


In [10]:

# 是否讀取之前訓練的模型
retrain = True

# 模型相關參數
sequence_size = max_length
feature_size = vectorizer.config.hidden_size
output_size = tokenizer.vocab_size
num_layers = 6
num_heads = 4
hidden_activation = 'tanh'
output_activation = 'softmax'
initializer = "xavier_normal"
optimizer = 'adam'
loss = 'NLLLoss'
bias = False
drop_rate = 0.0
alpha = 0.00001

# 訓練相關參數
num_epochs = 1
batch_size = 1

# 存檔區域
model_directory = f'model.pth'


In [11]:
# 機器回覆你的時候，要用多少個字（當沒有結束標記出現的時候）
response_length = 50

# 建立模型

In [12]:

# 建立模型
model = build_model(sequence_size,
                    feature_size,
                    output_size,
                    num_layers,
                    num_heads,
                    hidden_activation,
                    output_activation,
                    initializer,
                    optimizer,
                    loss,
                    bias,
                    drop_rate,
                    alpha)

# 將模型放到 cuda
model = model.to(device)

# 讀取參數
if retrain:
    try:
        model_dict = torch.load(model_directory)
        model.load_state_dict(model_dict[f'model'])
        print('Model loaded.')
    except:
        print('Model not loaded. Now using new model.')
        pass

Model not loaded. Now using new model.


# 準備資料並訓練模型

In [13]:
def create_dataset(input_vectors, input_ids, output_size, attention_masks):

    final_input  = []
    final_label  = []
    final_mask_1 = []
    final_mask_2 = []

    for i in tqdm(range(input_vectors.size(0))):
        for j in range(input_vectors.size(1) - 1):

            if attention_masks[i][j + 1] != 0:

                factored_mask       = torch.zeros_like(attention_masks[i])
                factored_mask[:j+1] = 1
                input               = input_vectors[i] * factored_mask.unsqueeze(1)
                final_input.append(input)

                label = torch.zeros(output_size)
                label[input_ids[i][j+1]] = 1
                final_label.append(label)

                mask_2 = factored_mask.unsqueeze(1) * factored_mask.unsqueeze(0)
                mask_2 = mask_2.unsqueeze(0)
                mask_1 = (mask_2 -1) * 1e20
                final_mask_1.append(mask_1)
                final_mask_2.append(mask_2)

    final_input  = torch.stack(final_input , dim=0)
    final_label  = torch.stack(final_label , dim=0)
    final_mask_1 = torch.stack(final_mask_1, dim=0)
    final_mask_2 = torch.stack(final_mask_2, dim=0)

    return final_input, final_label, final_mask_1, final_mask_2

In [14]:
# 讀取資料
dataset     = load_dataset(source, trust_remote_code=True )

questions = []
answers   = []
# Process each dialogue
for dialog in dataset['train']['dialog']:
    # Separate utterances into Person A and Person B
    person_a_utterances = dialog[::2]   # Odd-indexed utterances are Person A's
    person_b_utterances = dialog[1::2]  # Even-indexed utterances are Person B's

    person_max_length = max(len(person_a_utterances), len(person_b_utterances))
    if len(person_a_utterances) < person_max_length:
        person_b_utterances = person_b_utterances[:-1]
    if len(person_b_utterances) < person_max_length:
        person_a_utterances = person_a_utterances[:-1]

    # Join Person A's and Person B's utterances into strings
    questions.extend(person_a_utterances)
    answers.extend(person_b_utterances)


In [15]:


# 確定要抽取的 QA 大小
train_size  = min(train_size, len(questions))

# 開始抽取資料
for _ in range(interval):

    # 隨機 indices
    random_indices = random.sample(range(len(questions)), train_size)

    # 抽取 QA
    q_samples = [questions[i] for i in random_indices]
    a_samples = [answers[i]   for i in random_indices]

    # 建立 QA
    qa_pairs  = []
    for q_sample, a_sample in zip(q_samples, a_samples):
        qa_pairs.append(f"[CLS] {q_sample} [SEP] {a_sample} [SEP] ")

    # 偷看一下 QA 裡面有什麼東西
    for qa in qa_pairs[:3]:
        print(qa)
        print("-" * 50)

    # 改一下名稱
    sentences = qa_pairs

    # 將 QA tokenize
    tokenized_sentences = tokenizer(sentences, add_special_tokens=False, padding='max_length', max_length=max_length, truncation=True, return_tensors="pt")

    # 將 QA vectorize
    input_ids           = tokenized_sentences['input_ids']
    attention_masks     = tokenized_sentences['attention_mask']
    with torch.no_grad():
        input_vectors   = vectorizer(input_ids).last_hidden_state * ( attention_masks.unsqueeze(2) )

    # 生成 QA 訓練集
    final_input, final_label, final_mask_1, final_mask_2 = create_dataset(input_vectors, input_ids, output_size, attention_masks)

    # 彙整 QA 訓練集
    dataset    = TensorDataset(final_input, final_label, final_mask_1, final_mask_2)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # 訓練模型
    for epoch in range(num_epochs):

        model.train()

        running_loss = 0.0

        for batch_idx, (input, label, mask_1, mask_2) in tqdm(enumerate(dataloader), total=len(dataloader), desc="Training Progress", ncols=100, unit="batch"):

            input  = input.to(device)
            label  = label.to(device)
            mask_1 = mask_1.to(device)
            mask_2 = mask_2.to(device)

            optimizer = model.optimizer
            optimizer.zero_grad()

            loss_function   = model.loss_function
            output          = model(input, (mask_1, mask_2))
            loss            = loss_function(torch.log(output), label)
            loss.backward()     # get grad

            optimizer.step()    # update params

            running_loss += loss.item()

        epoch_loss = running_loss / len(dataloader)
        print(f"Epoch [{epoch+1}/{num_epochs}] finished. Average Loss: {epoch_loss:.4f}")

        model_dict = {}
        model_dict[f'model'] = model.state_dict()
        torch.save(model_dict, model_directory)


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


[CLS] what are you listening to ? Is that Beethoven or Mozart ?  [SEP]  it's Beethoven . Do you like it ?  [SEP] 
--------------------------------------------------
[CLS] There is too much traffic in Taipei these days , I think cars should be banned .  [SEP]  I'm not sure about that . How would people get to work ?  [SEP] 
--------------------------------------------------
[CLS]  Yes , right .  [SEP]  Great ! You are all set to enjoy the library .  [SEP] 
--------------------------------------------------


100%|██████████| 50/50 [00:01<00:00, 38.52it/s]
Training Progress: 100%|█████████████████████████████████████| 1415/1415 [00:42<00:00, 33.64batch/s]


Epoch [1/1] finished. Average Loss: nan
[CLS]  The bell captain will put it in storage room . When you are ready to leave , you can claim your luggage from him .  [SEP]  Thanks very much and goodbye !  [SEP] 
--------------------------------------------------
[CLS]  Not as long as it gets fixed .  [SEP]  All right , I will start working now .  [SEP] 
--------------------------------------------------
[CLS]  It's such a hot season , isn't it ? May I suggest thinning out the top ?  [SEP]  That's a good idea . But leave the front as it is now .  [SEP] 
--------------------------------------------------


100%|██████████| 50/50 [00:02<00:00, 24.61it/s]
Training Progress: 100%|█████████████████████████████████████| 1480/1480 [00:38<00:00, 38.09batch/s]


Epoch [1/1] finished. Average Loss: nan
[CLS]  Oh yeah ! Almost forgot ! I need to take fundamental linguistics , consumer psychology and neuroanatomy .  [SEP]  Wow , you are going to be busy this semester ! Okay , here you go . You are registered now , you ’ ll have to make your first tuition payment before classes start .  [SEP] 
--------------------------------------------------
[CLS]  You see . I told you you'd hit him if you weren't careful .  [SEP]  Oh , shut up , will you ? It was all your fault.If you'd kept quite this would never have happened .  [SEP] 
--------------------------------------------------
[CLS]  it is . Our customers really like it . Do you have any other questions about your new accounts ?  [SEP]  yes . What's the maximum amount that you are allowed to have in an overdraft ?  [SEP] 
--------------------------------------------------


100%|██████████| 50/50 [00:01<00:00, 27.04it/s]
Training Progress: 100%|█████████████████████████████████████| 1947/1947 [00:51<00:00, 37.60batch/s]


Epoch [1/1] finished. Average Loss: nan
[CLS]  Then can I exchange it for something else with the same price ?  [SEP]  Just a moment . Let me find out . Our manager Wil be here in a minute . So will you speak to him ?  [SEP] 
--------------------------------------------------
[CLS]  I wonder if anyone can call the Ridleys about this . Do you know them well ?  [SEP]  Not really .  [SEP] 
--------------------------------------------------
[CLS] Mr . White , I would like to give you notice that I will be leaving the company . It will be effective at the beginning of the next month .  [SEP]  Jessica , I am very sorry to hear that . Why are you leaving ?  [SEP] 
--------------------------------------------------


KeyboardInterrupt: 

# 來跟這個小模型用英文聊聊天吧

In [None]:
# 你要問的句子
sentence = "Please talk like human, dummy!"

In [None]:


# 幫你的問題補上 [CSL] 以及 [SEP] ，讓機器可以知道問句的開始與結束
sentence = "[CLS] " + sentence + " [SEP] "

# 機器的回答
response = ''

# 開始遞迴
for i in range(response_length):

    # 將 QA tokenize
    tokenized_sentence = tokenizer(sentence, add_special_tokens=False, padding='max_length', max_length=max_length, truncation=True, return_tensors="pt")

    # 將 QA vectorize
    input_id           = tokenized_sentence['input_ids']
    attention_mask     = tokenized_sentence['attention_mask']
    with torch.no_grad():
        input_vector   = vectorizer(input_id).last_hidden_state * ( attention_mask.unsqueeze(2) )
    input_vector = input_vector.to(device)

    # 製作對應 QA 長度的 mask
    mask_2 = attention_mask[0].unsqueeze(1) * attention_mask[0].unsqueeze(0)
    mask_2 = mask_2.unsqueeze(0).unsqueeze(0)
    mask_2 = mask_2.to(device)
    mask_1 = (mask_2 -1) * 1e20
    mask_1 = mask_1.to(device)

    # 將你的問題向量丟到模型去吧
    model.eval()
    output                  = model(input_vector, (mask_1, mask_2))

    # 選擇最高概率的詞
    most_probable_token_idx = torch.argmax(output, dim=-1).item()
    word = tokenizer.convert_ids_to_tokens(most_probable_token_idx)

    # 將機器人吐出的那個字拼接回去
    if word not in ['[SEP]']:
        sentence += ' ' + word
        response += ' ' + word
        clear_output(wait=True)  # Clear the previous output
        print(response, flush=False)
        display()  # Display the updated output
    else:
        print("[END]")
        break

    # 將機器人吐出的那個字拼接回去
    # while word in ['[SEP]', '.', ',', '?']:
    #     output[0][most_probable_token_idx] = 0
    #     most_probable_token_idx = torch.argmax(output, dim=-1).item()
    #     word = tokenizer.convert_ids_to_tokens(most_probable_token_idx)
    # sentence += ' ' + word
    # response += ' ' + word
    # clear_output(wait=True)  # Clear the previous output
    # print(response, flush=False)
    # display()  # Display the updated output
