<a href="https://colab.research.google.com/github/Brownwang0426/transformer-from-scratch/blob/main/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 手刻簡單的文字生成 transformer
## 1. 目的
示範如何純粹用 pytorch 手刻一個簡單的文字生成 transformer \
並強化自己對於 transformer 的理解以及操作能力
## 2. 未來
如果有機會，未來會再增加 numpy 版本，並且使用 numpy 手刻 error back-propagation
## 3. 套件
(for local) \
CUDA Toolkit 11.8 \
cuDNN 8.9.x \
pip install torch==2.0.1 --extra-index-url https://download.pytorch.org/whl/cu118  
其餘套件可自行下載

# 導入官方套件

In [1]:


import numpy as np
import math


import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn_utils
from torch.utils.data import DataLoader, TensorDataset, Subset

import csv

import multiprocessing as mp
import os
import sys
import copy
import random
import gc
import time
from tqdm import tqdm
from collections import defaultdict

import itertools

import dill

import warnings
warnings.filterwarnings('ignore')

from datasets import load_dataset

import torch
from transformers import BertTokenizer, BertModel
import numpy as np

from IPython.display import display, clear_output

# 導入客製化套件

In [2]:
from model import *

# 確認有無讀取到 cuda

In [3]:
# Pick the compute device: the first CUDA GPU when one is available,
# otherwise fall back to the CPU.
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print('using cpu...')
else:
    # List every visible GPU so the choice of device 0 can be checked by eye.
    for i in range(torch.cuda.device_count()):
        print(f"Device {i}: {torch.cuda.get_device_name(i)}")
    device_index = 0
    device = torch.device(f"cuda:{device_index}")
    print('using cuda...')

# Turn on cuDNN and its benchmark mode (see the torch.backends.cudnn docs).
# These flags are set on both the CUDA and the CPU path.
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True

Device 0: NVIDIA GeForce RTX 4090
using cuda...


# 參數區域

In [4]:
# Dataset name handed to load_dataset() in the training cell.
# The author lists "squad" and "natural_questions" as usable choices.
source = "natural_questions"

# When True, the model-building cell loads saved weights from `model_directory`.
retrain = True

# Number of sampling rounds executed by the outer loop of the training cell.
interval = 1_000_000

# Number of QA pairs drawn per round (the training cell caps it at the dataset size).
train_size = 200


In [5]:


# Token length every QA string is padded / truncated to by the tokenizer.
max_length = 200

# --- Arguments forwarded positionally to build_model() (defined in model.py) ---
sequence_size     = max_length            # model sequence length follows the tokenizer length
feature_size      = 768                   # presumably the width of the BERT vectors used as inputs/labels - confirm
num_layers        = 3
num_heads         = 4
hidden_activation = 'tanh'
output_activation = 'tanh'
initializer       = "xavier_normal"
optimizer         = 'adam'
loss              = 'mean_squared_error'
bias              = False
drop_rate         = 0.0
alpha             = 1e-7                  # NOTE(review): role not visible here (learning rate?) - confirm in model.py

# --- Training-loop settings ---
num_epochs = 100                          # epochs run on each sampled batch of QA pairs
batch_size = 1                            # DataLoader batch size

# Checkpoint file: read when `retrain` is True and rewritten after every epoch.
model_directory = 'model.pth'


# 建立模型

In [6]:

# Build the model from the hyper-parameters defined in the parameter cells.
# build_model() comes from the project's model.py (star-imported earlier).
model = build_model(sequence_size,
                    feature_size,
                    num_layers,
                    num_heads,
                    hidden_activation,
                    output_activation,
                    initializer,
                    optimizer,
                    loss,
                    bias,
                    drop_rate,
                    alpha)

# Move the model onto the selected device (GPU when available, otherwise CPU).
model = model.to(device)

# Optionally resume from a previously saved checkpoint.
if retrain:
    if os.path.isfile(model_directory):
        # map_location remaps tensors that were saved on another device
        # (e.g. a CUDA checkpoint opened on a CPU-only machine) onto `device`;
        # without it torch.load fails when the saving device is unavailable.
        # NOTE(review): torch.load unpickles the file - only open trusted checkpoints.
        model_dict = torch.load(model_directory, map_location=device)
        model.load_state_dict(model_dict['model'])
    else:
        # A fresh clone has no checkpoint yet: start from the initial weights
        # instead of aborting the whole notebook with FileNotFoundError.
        print(f"No checkpoint found at '{model_directory}'; training from scratch.")

# 準備資料並訓練模型

In [7]:
def create_dataset(input_vectors, attention_masks):
    """Expand each sentence into one next-token training sample per real token.

    For sentence ``i`` and position ``j`` whose successor ``j + 1`` is not
    padding (``attention_masks[i][j + 1] != 0``), one sample is produced:

    * input  - ``input_vectors[i]`` with every position after ``j`` zeroed,
    * label  - the vector at position ``j + 1``,
    * mask_2 - 0/1 matrix (leading dim of 1) that is 1 only where both the
               row and the column index are ``<= j``,
    * mask_1 - ``(mask_2 - 1) * 1e20``: 0 where mask_2 is 1, -1e20 elsewhere.

    As called in this notebook, ``input_vectors`` is
    (sentences, seq_len, features) and ``attention_masks`` is
    (sentences, seq_len).

    Returns:
        Tuple ``(final_input, final_label, final_mask_1, final_mask_2)`` with
        all samples stacked along a new leading dimension.

    Raises:
        RuntimeError: from ``torch.stack`` when no sample was produced.
    """
    contexts       = []
    targets        = []
    additive_masks = []
    pairwise_masks = []

    for row in tqdm(range(input_vectors.size(0))):
        for pos in range(input_vectors.size(1) - 1):
            nxt = pos + 1

            # Skip positions whose successor is padding: there is nothing to predict.
            if attention_masks[row][nxt] == 0:
                continue

            # 1 for positions 0..pos, 0 for everything after.
            prefix = torch.zeros_like(attention_masks[row])
            prefix[:nxt] = 1

            contexts.append(input_vectors[row] * prefix.unsqueeze(1))
            targets.append(input_vectors[row][nxt])

            # Outer product of the prefix with itself, plus a leading dim of size 1.
            visible = (prefix.unsqueeze(1) * prefix.unsqueeze(0)).unsqueeze(0)
            additive_masks.append((visible - 1) * 1e20)
            pairwise_masks.append(visible)

    return (
        torch.stack(contexts, dim=0),
        torch.stack(targets, dim=0),
        torch.stack(additive_masks, dim=0),
        torch.stack(pairwise_masks, dim=0),
    )

In [None]:
# Load the raw data once. `questions` / `answers` remain the FULL pools for
# every sampling round below (they are never overwritten by a sample).
dataset     = load_dataset(source, trust_remote_code=True)
questions   = dataset['train']['question']
answers     = dataset['train']['answers']

# Never ask for more QA pairs than the split contains.
train_size  = min(train_size, len(questions))

# Load the pretrained BERT tokenizer and encoder once; they do not change
# between rounds, so reloading them inside the loop was wasted work.
tokenizer  = BertTokenizer.from_pretrained('bert-base-uncased')
vectorizer = BertModel.from_pretrained('bert-base-uncased')

# Each round: draw a fresh random subset of QA pairs, embed it, train on it.
for _ in range(interval):

    # Sample indices from the full pool. The sample goes into its own names,
    # so the next round draws from the whole dataset again rather than
    # re-permuting the first round's subset.
    random_indices    = random.sample(range(len(questions)), train_size)
    sampled_questions = [questions[i] for i in random_indices]
    sampled_answers   = [answers[i]   for i in random_indices]

    # Build one "[CLS] question [SEP] answer [SEP] " string per pair.
    # assumes each answer is a dict with a non-empty 'text' list
    # (SQuAD-style schema) - TODO confirm for every supported `source`.
    qa_pairs = [
        f"[CLS] {question} [SEP] {answer['text'][0]} [SEP] "
        for question, answer in zip(sampled_questions, sampled_answers)
    ]

    # Peek at the first few QA strings.
    for qa in qa_pairs[:3]:
        print(qa)
        print("-" * 50)

    # Alias kept from the original notebook.
    sentences = qa_pairs

    # Tokenize: pad / truncate every string to `max_length` tokens. Special
    # tokens are already written into the strings, so none are added here.
    tokenized_sentences = tokenizer(sentences, add_special_tokens=False, padding='max_length', max_length=max_length, truncation=True, return_tensors="pt")

    # Vectorize with BERT (no gradients) and zero out the padding positions.
    # NOTE(review): the encoder is called without `attention_mask`, so padding
    # tokens are presumably attended to inside BERT - confirm this is intended.
    input_ids           = tokenized_sentences['input_ids']
    attention_masks     = tokenized_sentences['attention_mask']
    with torch.no_grad():
        input_vectors   = vectorizer(input_ids).last_hidden_state * (attention_masks.unsqueeze(2))

    # Expand every sentence into next-token prediction samples.
    final_input, final_label, final_mask_1, final_mask_2 = create_dataset(input_vectors, attention_masks)

    # Wrap the samples for batching. A separate name keeps `dataset` pointing
    # at the raw dataset loaded above.
    train_dataset = TensorDataset(final_input, final_label, final_mask_1, final_mask_2)
    dataloader    = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Train on this round's samples.
    for epoch in range(num_epochs):

        model.train()

        running_loss = 0.0

        for batch_input, label, mask_1, mask_2 in tqdm(dataloader, total=len(dataloader), desc="Training Progress", ncols=100, unit="batch"):

            batch_input = batch_input.to(device)
            label       = label.to(device)
            mask_1      = mask_1.to(device)
            mask_2      = mask_2.to(device)

            # Distinct local names: the configuration strings `optimizer` and
            # `loss` from the parameter cell must survive, otherwise
            # re-running the model-building cell would receive the wrong objects.
            model_optimizer = model.optimizer
            model_optimizer.zero_grad()

            loss_function = model.loss_function
            output        = model(batch_input, (mask_1, mask_2))
            batch_loss    = loss_function(output, label)
            batch_loss.backward()      # compute gradients

            model_optimizer.step()     # update parameters

            running_loss += batch_loss.item()

        epoch_loss = running_loss / len(dataloader)
        print(f"Epoch [{epoch+1}/{num_epochs}] finished. Average Loss: {epoch_loss:.4f}")

        # Checkpoint after every epoch so an interrupted run can be resumed.
        model_dict = {'model': model.state_dict()}
        torch.save(model_dict, model_directory)
        

Downloading data:  89%|████████▉ | 255/287 [48:35<32:15, 60.49s/files]

In [None]:

# Manual checkpoint: store the current weights under the 'model' key, the
# same layout the checkpoint-loading code reads back.
model_dict = {'model': model.state_dict()}
torch.save(model_dict, model_directory)