<a href="https://colab.research.google.com/github/Brownwang0426/transformer-from-scratch/blob/main/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 手刻簡單的文字生成 transformer

示範如何純粹用 pytorch 手刻一個簡單的文字生成 transformer \
並強化自己對於 transformer 的理解以及操作能力 \
如果有機會，未來會再增加 numpy 版本，並且使用 numpy 手刻 error back-propagation



# For colab

In [1]:
!git clone https://github.com/Brownwang0426/transformer-from-scratch.git

fatal: destination path 'transformer-from-scratch' already exists and is not an empty directory.


In [2]:
import os
os.chdir('/content/transformer-from-scratch')

In [3]:
!pip install torch dill datasets tqdm numpy IPython



# For local
CUDA Toolkit 11.8 \
cuDNN 8.9.x \
pip install torch==2.0.1 --extra-index-url https://download.pytorch.org/whl/cu118  
其餘套件可自行下載

# 導入官方套件

In [18]:


import numpy as np
import math


import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn_utils
from torch.utils.data import DataLoader, TensorDataset, Subset

import csv

import multiprocessing as mp
import os
import sys
import copy
import random
import gc
import time
from tqdm import tqdm
from collections import defaultdict

import itertools

import dill

import warnings
warnings.filterwarnings('ignore')

from datasets import load_dataset

import torch
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
import numpy as np

from IPython.display import display, clear_output

# 導入客製化套件

In [19]:
from model import *

# 確認有無讀取到 cuda

In [20]:
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"Device {i}: {torch.cuda.get_device_name(i)}")
    device_index = 0
    device = torch.device(f"cuda:{device_index}")
    print('using cuda...')
else:
    device = torch.device("cpu")
    print('using cpu...')

torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True

Device 0: Tesla T4
using cuda...


# 參數區域

In [21]:
source = "daily_dialog" # 其他可以用的有 squad  natural_questions

retrain = True

interval   = 1000000
train_size = 50


In [22]:


max_length = 200

sequence_size =  max_length
feature_size = 768
num_layers = 3
num_heads = 4
hidden_activation = 'tanh'
output_activation = 'tanh'
initializer = "xavier_normal"
optimizer = 'adam'
loss = 'mean_squared_error'
bias = False
drop_rate = 0.0
alpha = 0.0000001

num_epochs = 100
batch_size = 1

model_directory = f'model.pth'


In [23]:

response_length = 50

# 建立模型

In [24]:

# 建立模型
model = build_model(sequence_size,
                    feature_size,
                    num_layers,
                    num_heads,
                    hidden_activation,
                    output_activation,
                    initializer,
                    optimizer,
                    loss,
                    bias,
                    drop_rate,
                    alpha)

# 將模型放到 cuda
model = model.to(device)

# 讀取參數
if retrain:
    try:
        model_dict = torch.load(model_directory)
        model.load_state_dict(model_dict[f'model'])
        print('Model loaded.')
    except:
        print('Model not loaded. Now using new model.')
        pass

Model loaded.


# 準備資料並訓練模型

In [25]:
def create_dataset(input_vectors, attention_masks):

    final_input  = []
    final_label  = []
    final_mask_1 = []
    final_mask_2 = []

    for i in tqdm(range(input_vectors.size(0))):
        for j in range(input_vectors.size(1) - 1):

            if attention_masks[i][j + 1] != 0:

                factored_mask       = torch.zeros_like(attention_masks[i])
                factored_mask[:j+1] = 1
                input               = input_vectors[i] * factored_mask.unsqueeze(1)
                final_input.append(input)

                label = input_vectors[i][j+1]
                final_label.append(label)

                mask_2 = factored_mask.unsqueeze(1) * factored_mask.unsqueeze(0)
                mask_2 = mask_2.unsqueeze(0)
                mask_1 = (mask_2 -1) * 1e20
                final_mask_1.append(mask_1)
                final_mask_2.append(mask_2)

    final_input  = torch.stack(final_input , dim=0)
    final_label  = torch.stack(final_label , dim=0)
    final_mask_1 = torch.stack(final_mask_1, dim=0)
    final_mask_2 = torch.stack(final_mask_2, dim=0)

    return final_input, final_label, final_mask_1, final_mask_2

In [26]:
# 讀取資料
dataset     = load_dataset(source, trust_remote_code=True )

questions = []
answers = []
# Process each dialogue
for dialog in dataset['train']['dialog']:
    # Separate utterances into Person A and Person B
    person_a_utterances = dialog[::2]  # Odd-indexed utterances are Person A's
    person_b_utterances = dialog[1::2]  # Even-indexed utterances are Person B's

    person_max_length = max(len(person_a_utterances), len(person_b_utterances))
    while len(person_a_utterances) < person_max_length:
        person_a_utterances.append("[SEP]")  # Add empty string or "[No Response]"
    while len(person_b_utterances) < person_max_length:
        person_b_utterances.append("[SEP]")  # Add empty string or "[No Response]"

    # Join Person A's and Person B's utterances into strings
    questions.extend(person_a_utterances)
    answers.extend(person_b_utterances)





# 確定要抽取的 QA 大小
train_size  = min(train_size, len(questions))

# 開始抽取資料
for _ in range(interval):

    # 隨機 indices
    random_indices = random.sample(range(len(questions)), train_size)

    # 抽取 QA
    questions = [questions[i] for i in random_indices]
    answers   = [answers[i]   for i in random_indices]

    # 建立 QA
    qa_pairs  = []
    for question, answer in zip(questions, answers):
        qa_pairs.append(f"[CLS] {question} [SEP] {answer} [SEP] ")

    # 偷看一下 QA 裡面有什麼東西
    for qa in qa_pairs[:3]:
        print(qa)
        print("-" * 50)

    # 初始化 BERT tokenizer 和 vectorizer
    tokenizer  =  AutoTokenizer.from_pretrained("roberta-base")
    vectorizer = AutoModel.from_pretrained("roberta-base")

    # 改一下名稱
    sentences = qa_pairs

    # 將 QA tokenize
    tokenized_sentences = tokenizer(sentences, add_special_tokens=False, padding='max_length', max_length=max_length, truncation=True, return_tensors="pt")

    # 將 QA vectorize
    input_ids           = tokenized_sentences['input_ids']
    attention_masks     = tokenized_sentences['attention_mask']
    with torch.no_grad():
        input_vectors   = vectorizer(input_ids).last_hidden_state * ( attention_masks.unsqueeze(2) )

    # 生成 QA 訓練集
    final_input, final_label, final_mask_1, final_mask_2 = create_dataset(input_vectors, attention_masks)

    # 彙整 QA 訓練集
    dataset    = TensorDataset(final_input, final_label, final_mask_1, final_mask_2)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # 訓練模型
    for epoch in range(num_epochs):

        model.train()

        running_loss = 0.0

        for batch_idx, (input, label, mask_1, mask_2) in tqdm(enumerate(dataloader), total=len(dataloader), desc="Training Progress", ncols=100, unit="batch"):

            input  = input.to(device)
            label  = label.to(device)
            mask_1 = mask_1.to(device)
            mask_2 = mask_2.to(device)

            optimizer = model.optimizer
            optimizer.zero_grad()

            loss_function   = model.loss_function
            output          = model(input, (mask_1, mask_2))
            loss            = loss_function(output, label)
            loss.backward()     # get grad

            optimizer.step()    # update params

            running_loss += loss.item()

        epoch_loss = running_loss / len(dataloader)
        print(f"Epoch [{epoch+1}/{num_epochs}] finished. Average Loss: {epoch_loss:.4f}")

        model_dict = {}
        model_dict[f'model'] = model.state_dict()
        torch.save(model_dict, model_directory)


[CLS]  No , she's out for lunch . May I take a message ?  [SEP]  Yes , please ask her to call John Smith .  [SEP] 
--------------------------------------------------
[CLS]  Oh , I have an elder sister . She is married .. Her husband is a businessman .  [SEP]  Do they live with you and your parents ?  [SEP] 
--------------------------------------------------
[CLS] I finally found an apartment that I want to rent .  [SEP]  Where did you see it ?  [SEP] 
--------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 50/50 [00:01<00:00, 38.13it/s]
Training Progress: 100%|█████████████████████████████████████| 2227/2227 [01:56<00:00, 19.19batch/s]


Epoch [1/100] finished. Average Loss: 1.3344


Training Progress: 100%|█████████████████████████████████████| 2227/2227 [01:56<00:00, 19.07batch/s]


Epoch [2/100] finished. Average Loss: 0.9750


Training Progress: 100%|█████████████████████████████████████| 2227/2227 [01:55<00:00, 19.22batch/s]


Epoch [3/100] finished. Average Loss: 0.7853


Training Progress: 100%|█████████████████████████████████████| 2227/2227 [01:55<00:00, 19.23batch/s]


Epoch [4/100] finished. Average Loss: 0.6596


Training Progress:  73%|██████████████████████████▉          | 1618/2227 [01:24<00:31, 19.20batch/s]


KeyboardInterrupt: 

# 來跟這個小模型用英文聊聊天吧

In [27]:
# 你要問的句子
sentence = "what the fuck are you talking"

In [28]:


# 初始化 BERT tokenizer 和 vectorizer
tokenizer  =  AutoTokenizer.from_pretrained("roberta-base")
vectorizer = AutoModel.from_pretrained("roberta-base")

# 幫你的問題補上 [CSL] 以及 [SEP] ，讓機器可以知道問句的開始與結束
sentence = "[CLS] " + sentence + " [SEP] "

# 機器的回答
response = ''

# 開始遞迴
for i in range(response_length):

    # 將 QA tokenize
    tokenized_sentence = tokenizer(sentence, add_special_tokens=False, padding='max_length', max_length=max_length, truncation=True, return_tensors="pt")

    # 將 QA vectorize
    input_id           = tokenized_sentence['input_ids']
    attention_mask     = tokenized_sentence['attention_mask']
    with torch.no_grad():
        input_vector   = vectorizer(input_id).last_hidden_state * ( attention_mask.unsqueeze(2) )
    input_vector = input_vector.to(device)

    # 製作對應 QA 長度的 mask
    mask_2 = attention_mask[0].unsqueeze(1) * attention_mask[0].unsqueeze(0)
    mask_2 = mask_2.unsqueeze(0).unsqueeze(0)
    mask_2 = mask_2.to(device)
    mask_1 = (mask_2 -1) * 1e20
    mask_1 = mask_1.to(device)

    # 將你的問題向量丟到模型去吧
    model.eval()
    output                 = model(input_vector, (mask_1, mask_2))


    # 提取詞嵌入矩陣
    vocab_embeddings = vectorizer.get_input_embeddings().weight .to(device)  # shape: (vocab_size, embedding_dim)

    # 創建線性層並共享嵌入矩陣權重
    vocab_size, embedding_dim = vocab_embeddings.shape
    linear_layer = torch.nn.Linear(embedding_dim, vocab_size, bias=True).to(device)
    linear_layer.weight.data = vocab_embeddings

    # 將模型輸出映射到詞彙空間
    logits = linear_layer(output)  # shape: (1, vocab_size)

    # 計算詞彙分佈
    probabilities = F.softmax(logits, dim=-1)  # shape: (1, vocab_size)


    # 選擇最高概率的詞
    most_probable_token_idx = torch.argmax(probabilities, dim=-1).item()









    # 將機器人吐出的那個字拼接回去
    if most_similar_token_idx != 102:
        word = tokenizer.convert_ids_to_tokens(most_probable_token_idx)
        sentence += ' ' + word
        response += ' ' + word
        clear_output(wait=True)  # Clear the previous output
        print(response, flush=False)
        display()  # Display the updated output
    else:
        print("[END]")
        break



 <mask> <mask> <mask> <mask> ĠSubstance <mask> ĠMonsanto ĠArche ĠTurtle ĠBris ĠAmmunition Ġepile Ġepile azor ĠFlood Node rance ĠBee ropy ropy ĠEg Mode orters rovers ĠBab rance ĠRiders put Ġmechan ropy ropy ropy


KeyboardInterrupt: 