首先import

In [None]:
import math, copy, time
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data

检查一下设备

In [None]:
# Report whether CUDA is usable.  The current device index is only queried
# when CUDA is available, because torch.cuda.current_device() raises on a
# CPU-only installation.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.current_device())

模型参数

In [None]:
# Hyper-parameters read as module-level globals by the model classes below.
n_vocab = 50257             # rows of the embedding table / width of the output logits
max_context_len = 256       # token budget per sample used when tokenizing the corpus
n_layers = 12               # number of stacked decoder layers
n_heads = 12                # attention heads per layer
d_model = 768               # embedding / hidden size
d_ff = 4 * d_model          # feed-forward inner size (3072)
d_k = d_model // n_heads    # per-head query/key size (64)
d_v = d_k                   # per-head value size (64)

In [None]:
def clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    replicas = nn.ModuleList()
    for _ in range(N):
        # deepcopy gives every replica its own parameters.
        replicas.append(copy.deepcopy(module))
    return replicas

embedding层(包含输入和输出的embedding)

In [None]:
class Embeddings(nn.Module):
    """Token embedding table whose outputs are scaled by ``sqrt(d_model)``."""

    def __init__(self):
        super().__init__()
        # nn.Embedding draws its initial weights from a normal distribution.
        self.lut = nn.Embedding(num_embeddings=n_vocab, embedding_dim=d_model)

    def forward(self, x):
        """Look up the embedding of every token id in ``x`` and scale it."""
        scale = math.sqrt(d_model)
        return self.lut(x) * scale

位置编码(dropout=0.1)

In [None]:
class PositionEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first tensor, then apply dropout.

    The table is precomputed for ``max_len`` positions and stored as the
    non-trainable buffer ``pe`` with shape ``[max_len, 1, d_model]``.
    """

    def __init__(self, dropout=0.1, max_len=4096):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # [max_len, 1]
        inv_freq = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = positions * inv_freq

        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(angles)  # even channels
        table[:, 1::2] = torch.cos(angles)  # odd channels
        # The singleton middle axis broadcasts over the batch dimension in forward().
        self.register_buffer('pe', table.unsqueeze(1))

    def forward(self, x):
        """Return ``dropout(x + pe[:seq_len])``.

        x: [seq_len, batch_size, d_model]
        """
        seq_len = x.size(0)
        return self.dropout(x + self.pe[:seq_len])

注意力机制

In [None]:
def attn_padmask(pad_mask):  # [batch_size, max_length]
    """Build a key-padding mask for self-attention.

    Args:
        pad_mask: ``[batch_size, seq_len]`` tensor in which 0 marks a padding
            position and any non-zero value marks a real token (this is the
            layout of the tokenizer's ``attention_mask`` passed in by
            ``GPT2.forward``).

    Returns:
        Bool tensor ``[batch_size, seq_len, seq_len]`` (an expanded view) that
        is True wherever the *key* position is padding, i.e. wherever
        attention must be blocked.
    """
    batch_size, seq_len = pad_mask.size()

    # Positions equal to 0 are padding and become True.  The original code
    # skipped this comparison, which masked the real tokens instead.
    is_pad = pad_mask.eq(0).unsqueeze(1)  # [batch_size, 1, len_k]
    return is_pad.expand(batch_size, seq_len, seq_len)  # [batch_size, len_q, len_k]

def attn_mask(seq):
    """Build a causal (look-ahead) mask for ``seq``.

    Only the first two sizes of ``seq`` are read.  The result is a CPU bool
    tensor ``[batch_size, tgt_len, tgt_len]`` that is True strictly above the
    main diagonal (future positions) and False elsewhere.
    """
    batch_size, tgt_len = seq.size(0), seq.size(1)
    ones = torch.ones(batch_size, tgt_len, tgt_len)
    # diagonal=1 keeps the main diagonal itself unmasked.
    return torch.triu(ones, diagonal=1).bool()

In [None]:
class SDPAttn(nn.Module):
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d)) V``."""

    def __init__(self):
        super(SDPAttn, self).__init__()

    def forward(self, Q, K, V, attnmask=None, dropout=None):
        """Attend from ``Q`` over ``K``/``V``.

        Args:
            Q, K, V: tensors whose last two axes are ``[len, head_dim]``
                (``MultiHeadAttn`` passes ``[batch_size, n_heads, len, d_k]``).
            attnmask: optional bool tensor broadcastable to the score matrix;
                True entries are excluded from attention.
            dropout: optional callable applied to the attention weights.

        Returns:
            ``(context, attn)`` where ``attn`` holds the attention weights
            ``[..., len_q, len_k]`` and ``context`` is ``attn @ V``.
        """
        # Scale by the actual head width of Q rather than a module-level
        # constant, so the layer stays correct for any head size.
        scale = math.sqrt(Q.size(-1))
        scores = torch.matmul(Q, K.transpose(-1, -2)) / scale
        if attnmask is not None:
            # In place on purpose: the mask must broadcast to the score shape,
            # so a mis-shaped mask fails loudly instead of silently expanding.
            scores.masked_fill_(attnmask, -1e9)

        attn = torch.softmax(scores, dim=-1)
        if dropout is not None:
            attn = dropout(attn)
        return torch.matmul(attn, V), attn

多头注意力机制

In [None]:
class MultiHeadAttn(nn.Module):
    """Multi-head attention built from four ``d_model x d_model`` projections.

    ``linears[0:3]`` project the query, key and value inputs; ``linears[3]``
    projects the concatenated heads back to ``d_model``.  The attention
    weights of the most recent call are kept in ``self.attn``.
    """

    def __init__(self, dropout=0.1):
        super(MultiHeadAttn, self).__init__()
        assert d_model % n_heads == 0
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.SDPAttn = SDPAttn()
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, Q, K, V, attn_mask=None):
        """Run attention.

        Args:
            Q, K, V: ``[batch_size, len, d_model]`` inputs.
            attn_mask: optional ``[batch_size, len_q, len_k]`` bool mask,
                True where attention is forbidden.

        Returns:
            ``[batch_size, len_q, d_model]`` tensor.
        """
        batch_size = Q.size(0)

        # zip stops after the first three projections; the fourth is the
        # output projection used at the end.
        Q, K, V = [
            proj(x).view(batch_size, -1, n_heads, d_k).transpose(1, 2)
            for proj, x in zip(self.linears, (Q, K, V))
        ]

        if attn_mask is not None:
            # Share one mask across heads as a broadcast view instead of
            # materialising n_heads copies.
            attn_mask = attn_mask.unsqueeze(1).expand(-1, n_heads, -1, -1)

        # Call the module (not .forward) so hooks run, and hand over the
        # attention dropout that was constructed in __init__ but never used.
        context, self.attn = self.SDPAttn(Q, K, V, attnmask=attn_mask, dropout=self.dropout)
        # [batch_size, len_q, n_heads * d_v]
        context = context.transpose(1, 2).reshape(batch_size, -1, n_heads * d_v)
        return self.linears[-1](context)

全连接层

In [None]:
class FeedForward(nn.Module):
    """Position-wise MLP: Linear(d_model -> d_ff), ReLU, Dropout, Linear(d_ff -> d_model).

    No residual connection or normalisation is applied here.
    """

    def __init__(self, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map ``[..., d_model]`` to ``[..., d_model]``."""
        hidden = self.w_1(x)
        hidden = self.relu(hidden)
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

设置 Decoder Layer

In [None]:
class DecoderLayer(nn.Module):
    """One decoder block: normalise, self-attend, add; normalise, feed-forward, add.

    A single ``LayerNorm`` instance (``self.layernorm``) is applied before
    both sub-layers, so the two normalisations share their parameters.
    """

    def __init__(self):
        super().__init__()
        self.dec_attn = MultiHeadAttn()
        self.layernorm = nn.LayerNorm(d_model)
        self.ffn = FeedForward()

    def forward(self, dec_inputs, attn_mask):
        """Return a tensor with the same shape as ``dec_inputs``."""
        normed = self.layernorm(dec_inputs)
        attn_output = self.dec_attn(normed, normed, normed, attn_mask=attn_mask)
        # First residual connection: around self-attention.
        hidden = attn_output + dec_inputs
        # Second residual connection: around the feed-forward network.
        return hidden + self.ffn(self.layernorm(hidden))

Decoder

In [None]:
class Decoder(nn.Module):
    """Decoder-only stack: embedding, position encoding, ``n_layers`` blocks, final LayerNorm."""

    def __init__(self):
        super(Decoder, self).__init__()
        self.emb = Embeddings()
        self.pos_enc = PositionEncoding()
        self.layernorm = nn.LayerNorm(d_model)
        self.layers = nn.ModuleList([DecoderLayer() for _ in range(n_layers)])
        # Not used by forward(); kept so the parameter names in state dicts
        # saved from this class stay loadable.
        self.ffn = FeedForward()

    def forward(self, dec_inputs, pad_mask):
        """Encode a batch of token ids.

        Args:
            dec_inputs: ``[batch_size, tgt_len]`` token ids.
            pad_mask: ``[batch_size, tgt_len]`` padding indicator handed to
                ``attn_padmask``.

        Returns:
            ``[batch_size, tgt_len, d_model]`` hidden states.
        """
        device = dec_inputs.device

        dec_outputs = self.emb(dec_inputs)  # [batch_size, tgt_len, d_model]
        # PositionEncoding works sequence-first, so transpose in AND back out.
        # The layers below (and the batch-first masks) need
        # [batch_size, tgt_len, d_model]; leaving the tensor sequence-first made
        # attention run across the batch axis and the mask shapes mismatch.
        dec_outputs = self.pos_enc(dec_outputs.transpose(0, 1)).transpose(0, 1)

        # Build the masks on the inputs' device instead of hard-coding CUDA,
        # so the model also runs on CPU.
        dec_pad_mask = attn_padmask(pad_mask).to(device)    # [batch_size, tgt_len, tgt_len]
        dec_attnmask = attn_mask(dec_inputs).to(device)     # [batch_size, tgt_len, tgt_len]
        # A position is blocked when either the padding mask or the causal
        # mask is non-zero there.
        dec_mask = dec_pad_mask.bool() | dec_attnmask.bool()

        for layer in self.layers:
            dec_outputs = layer(dec_outputs, dec_mask)
        return self.layernorm(dec_outputs)

In [None]:
class GPT2(nn.Module):
    """Decoder stack followed by a linear projection onto the vocabulary."""

    def __init__(self):
        super().__init__()
        self.decoder = Decoder()
        self.projection = nn.Linear(d_model, n_vocab)

    def forward(self, dec_inputs, attention_mask):
        """Return next-token logits flattened over batch and time.

        dec_inputs: [batch_size, tgt_len]
        attention_mask: forwarded unchanged to the decoder as its pad mask.
        Returns: [batch_size * tgt_len, n_vocab]
        """
        hidden = self.decoder(dec_inputs, attention_mask)  # [batch_size, tgt_len, d_model]
        logits = self.projection(hidden)                   # [batch_size, tgt_len, n_vocab]
        vocab_dim = logits.size(-1)
        return logits.view(-1, vocab_dim)

建立模型

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

#model = GPT2().to(device)
#print(model)

训练

In [None]:
class TrainState:
    """Track number of steps, examples, and tokens processed.

    The fields are class-level defaults; ``train`` updates them with ``+=``
    on the instance it receives.
    """

    step: int = 0  # batches processed (train() adds 1 per batch and never resets it)
    accum_step: int = 0  # optimizer updates performed
    samples: int = 0  # total number of examples seen
    tokens: int = 0  # total number of tokens processed

In [None]:
from transformers import BertTokenizer,GPT2Tokenizer

# Shared tokenizer used by the dataset, the training loop and generation.
# The trailing comment names an alternative checkpoint that was tried.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")#("uer/gpt2-chinese-cluecorpussmall")
#tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# The string literal below is a disabled scratch cell (tokenizer experiments);
# as a bare string expression it is never executed.
'''
#tokenizer.bos_token = tokenizer.cls_token_id
#tokenizer.eos_token = tokenizer.sep_token_id
#tokenizer.pad_token = tokenizer.convert_tokens_to_ids('[SEP]')

with open('train.txt','r') as f:
    data = f.readlines()
#print(data[0])

dataencode = tokenizer.batch_encode_plus(data[0:2],add_special_tokens=False, padding="max_length",max_length=max_context_len, return_tensors="pt")

#print(tokenizer(["这是一个测试","this is another one"],padding="max_length",max_length=max_context_len,return_length=True, return_tensors="pt"))
print(dataencode['input_ids'].shape)
print(dataencode)
print(tokenizer.batch_decode(dataencode['input_ids']))
a=0
for i in dataencode['input_ids'][0]:
    if i==8667:
        a+=1
print(a)
print(tokenizer(["这是一个测试","this is another one"],padding="longest",return_tensors="pt")['input_ids'])
'''

In [None]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

# Settings read by the DataLoader construction and by train().
train_args = dict(batch_size=32, epochs=100)
#bos, eos, pad, sep = tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.pad_token_id, tokenizer.sep_token_id


class MyDataSet(Dataset):
    """Line-per-sample text corpus, tokenized once up front.

    Every line of ``path`` becomes one sample, padded or truncated to
    ``max_context_len - 1`` tokens with no special tokens added (``train``
    adds the [CLS]/[SEP] ids itself).

    Args:
        path: text file to read, one training sample per line.
    """

    def __init__(self, path='train.txt'):
        super(MyDataSet, self).__init__()
        # Decode explicitly as UTF-8 instead of the platform default, so the
        # corpus reads the same on every OS/locale.
        with open(path, 'r', encoding='utf-8') as f:
            self.data = f.readlines()
        self.encode = tokenizer.batch_encode_plus(
            self.data,
            padding="max_length",
            truncation=True,
            max_length=max_context_len - 1,
            return_tensors="pt",
            add_special_tokens=False,
        )
        self.input_ids = self.encode['input_ids']
        self.attention_masks = self.encode['attention_mask']

    def __len__(self):
        """Number of samples (rows of ``input_ids``)."""
        return self.input_ids.shape[0]

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for sample ``idx``."""
        return self.input_ids[idx], self.attention_masks[idx]

In [None]:
# Smoke test: build the dataset, report its size, then print the shape and
# contents of the token-id tensor of every batch.
train_dataset = MyDataSet()
loader = DataLoader(train_dataset, batch_size=train_args['batch_size'])

print(len(train_dataset))
for token_ids, _attention in loader:
    print(token_ids.shape)
    print(token_ids)

In [None]:
def train(model, train_args, train_dataset=None, train_state=None, PATH='model_checkpoint/checkpoint', mode="train"):
    """Train ``model`` for next-token prediction.

    Each batch of token ids ``x`` is turned into an input ``[CLS] + x`` and a
    target ``x + [SEP]`` (the input shifted left by one).  Targets equal to 0
    are ignored by the loss.

    Args:
        model: called as ``model(ids, mask)`` and expected to return logits
            of shape ``[batch_size * seq_len, n_vocab]``.
        train_args: dict with ``'batch_size'`` and ``'epochs'``.
        train_dataset: dataset yielding ``(input_ids, attention_mask)``.
            ``None`` (default) builds ``MyDataSet()`` when the function is
            called, not when it is defined.
        train_state: ``TrainState`` whose counters are updated in place.
            ``None`` (default) creates a fresh one per call.
        PATH: checkpoint prefix; ``'<PATH>_<epoch>.pth'`` is written every
            10th epoch (epochs 0, 10, 20, ...).
        mode: gradients are applied and progress is printed only when this
            is ``"train"``.

    Raises:
        RuntimeError: re-raised for any runtime error other than CUDA
            out-of-memory, which is counted and the batch skipped.
    """
    import os  # local import: only needed to create the checkpoint directory

    # Defaults are resolved here; as default-argument expressions they were
    # evaluated once at definition time (loading the whole corpus) and shared
    # between calls.
    if train_dataset is None:
        train_dataset = MyDataSet()
    if train_state is None:
        train_state = TrainState()
    train_dataloader = DataLoader(train_dataset, batch_size=train_args['batch_size'])

    model.train()
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = optim.Adam(model.parameters(), lr=1e-5)
    # NOTE(review): the scheduler is stepped once per optimizer update, so the
    # learning rate is divided by 10 every 100 updates -- confirm this is the
    # intended schedule rather than a per-epoch decay.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)
    oom_time = 0
    accum_iter = 1  # micro-batches accumulated per optimizer update

    # Look the special-token ids up once instead of on every batch.
    cls_id = tokenizer.convert_tokens_to_ids('[CLS]')
    sep_id = tokenizer.convert_tokens_to_ids('[SEP]')

    # torch.save does not create missing directories.
    checkpoint_dir = os.path.dirname(PATH)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    for epoch in range(train_args['epochs']):
        start = time.time()
        tokens = 0
        n_accum = 0

        for i, inputs in enumerate(train_dataloader):
            token_ids, token_mask = inputs[0], inputs[1]
            n_rows = token_ids.shape[0]

            # labels = ids followed by [SEP]; batch = [CLS] followed by ids.
            labels = torch.cat([token_ids, token_ids.new_full((n_rows, 1), sep_id)], dim=-1)
            batch = torch.cat([token_ids.new_full((n_rows, 1), cls_id), token_ids], dim=-1)
            # [CLS] is a real token, so it gets mask value 1 like every other
            # non-padding token in the tokenizer's attention mask.
            batch_mask = torch.cat([token_mask.new_ones((n_rows, 1)), token_mask], dim=-1)

            batch = batch.to(device)
            batch_mask = batch_mask.to(device)
            labels = labels.to(device)
            batch_tokens = batch.shape[0] * batch.shape[1]
            try:
                out = model(batch, batch_mask)  # [batch_size * seq_len, n_vocab]
                loss = criterion(out, labels.view(-1))
                if mode == "train":
                    # Scale so accumulated gradients average over micro-batches.
                    (loss / accum_iter).backward()
                    train_state.step += 1
                    train_state.samples += batch.shape[0]
                    train_state.tokens += batch_tokens
                    # Update after every accum_iter-th micro-batch.
                    if (i + 1) % accum_iter == 0:
                        optimizer.step()
                        optimizer.zero_grad(set_to_none=True)
                        scheduler.step()
                        n_accum += 1
                        train_state.accum_step += 1

                tokens += batch_tokens
                if i % 20 == 0 and mode == "train":
                    lr = optimizer.param_groups[0]["lr"]
                    elapsed = time.time() - start
                    print(
                        (
                            "Epoch Step: %6d | Accumulation Step: %3d | Loss: %6.2f "
                            + "| Tokens / Sec: %7.1f | Learning Rate: %6.1e"
                        )
                        % (i, n_accum, loss.item(), tokens / elapsed, lr)
                    )
                    start = time.time()
                    tokens = 0
                del loss
            except RuntimeError as exception:
                if "out of memory" in str(exception):
                    # Best effort: count the failure, free cached blocks and
                    # continue with the next batch.
                    oom_time += 1
                    print("WARNING: ran out of memory for {} times".format(oom_time))
                    if hasattr(torch.cuda, 'empty_cache'):
                        torch.cuda.empty_cache()
                else:
                    print(str(exception))
                    raise

        if epoch % 10 == 0:
            torch.save(model.state_dict(), PATH + '_{}.pth'.format(epoch))
        print('epoch {}/{} finished'.format(epoch + 1, train_args['epochs']))

In [None]:
#train(model,train_args)

保存模型并加载

In [None]:
# File that holds the saved model weights (a state dict).
PATH = "model_save/model_finals.pth"
#PATH = 'model_checkpoint/checkpoint_0_steps50000.pth'
#torch.save(model.state_dict(), PATH)

In [None]:
# Rebuild the architecture and restore the saved weights.  Every tensor is
# mapped onto `device`, so a checkpoint written on any GPU also loads on a
# machine with a different GPU layout or with no GPU at all (the previous
# remapping only handled 'cuda:4' -> 'cuda:0').
model1 = GPT2()
model1.load_state_dict(torch.load(PATH, map_location=device))

In [None]:
from random import randint

def prediction(dec_output, top_k=0):
    """Choose the next token id from the logits of the last position.

    Args:
        dec_output: ``[seq_len, n_vocab]`` logits; only the last row is used.
        top_k: with ``top_k > 1`` one of the ``top_k`` highest-scoring ids is
            picked uniformly at random (not weighted by probability).  With
            ``top_k <= 1`` the single best id is returned.  Values larger
            than the vocabulary size are clamped.

    Returns:
        0-dim integer tensor holding the chosen token id.
    """
    last_logits = dec_output[-1]
    top_k = min(top_k, last_logits.shape[-1])  # safety check
    if top_k > 1:
        candidates = torch.topk(last_logits, top_k).indices
        return candidates[randint(0, top_k - 1)]
    # Previously this branch returned the raw logits row rather than an id.
    return last_logits.argmax(dim=-1)

def _append_token(input_ids, attention_mask, token_id, mask_value):
    """Return copies of both ``[1, len]`` tensors with one extra column appended."""
    new_id = torch.tensor([[token_id]], dtype=input_ids.dtype, device=input_ids.device)
    new_mask = torch.tensor([[mask_value]], dtype=attention_mask.dtype, device=attention_mask.device)
    return torch.cat([input_ids, new_id], -1), torch.cat([attention_mask, new_mask], -1)


def greedy_decoder(model, inputs, gen_len=100, start_symbol='[CLS]'):
    """Autoregressively extend a single prompt.

    Despite the name, each step samples through ``prediction(..., 5)``, i.e.
    among the five best-scoring tokens.  Generation stops after a ``[SEP]``
    token has been appended, or once more than ``gen_len`` tokens were
    generated, in which case a ``[PAD]`` (mask 0) is appended instead.

    The model is switched to eval mode (dropout off) and run without
    gradient tracking for the duration of the call; its previous
    training/eval mode is restored afterwards.

    Args:
        model: called as ``model(input_ids, attention_mask)`` and expected to
            return ``[seq_len, n_vocab]`` logits for the single prompt.
        inputs: mapping with ``'input_ids'`` and ``'attention_mask'`` of shape
            ``[1, prompt_len]``.  Both entries are replaced in place with the
            extended tensors, as before.
        gen_len: generation budget (see above).
        start_symbol: unused; kept for backward compatibility.

    Returns:
        The extended ``input_ids`` tensor, ``[1, prompt_len + generated]``.
    """
    sep_id = tokenizer.convert_tokens_to_ids('[SEP]')
    pad_id = tokenizer.convert_tokens_to_ids('[PAD]')

    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    start_len = input_ids.shape[1]

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            while True:
                if input_ids.shape[1] - start_len > gen_len:
                    # Budget exhausted: close the sequence with a masked [PAD].
                    input_ids, attention_mask = _append_token(input_ids, attention_mask, pad_id, 0)
                    break
                logits = model(input_ids, attention_mask)
                # int() turns the 0-dim tensor into a plain Python id.
                next_id = int(prediction(logits, 5))
                input_ids, attention_mask = _append_token(input_ids, attention_mask, next_id, 1)
                if next_id == sep_id:
                    break
    finally:
        model.train(was_training)
        inputs['input_ids'] = input_ids
        inputs['attention_mask'] = attention_mask

    return input_ids


# Generation demo.  Alternative prompts that were tried:
#test_str = '你要能到林荫中去有多好！——那儿有一座高高的山'
# test_str = '你要能到林荫中去有多好！那儿有一座高高的山'
test_str = '难道还怕火焰不成？'
test_encode = tokenizer.encode_plus('[CLS]'+test_str, add_special_tokens=False,return_tensors="pt",truncation=True).to(device)
# Record the prompt length in tokens before generating: greedy_decoder
# extends test_encode in place, and len(test_str) counts characters (and
# misses the [CLS] token), so it cannot be compared with a token count.
prompt_len = test_encode['input_ids'].shape[1]
model1.to(device)
result = greedy_decoder(model1, test_encode)

# Number of tokens appended by the decoder, then the decoded full sequence.
print(result.shape[1] - prompt_len)
print(tokenizer.decode(result[0]))

In [None]:
# Single forward pass over a fixed prompt to inspect per-position predictions.
test_str = '你要能到林荫中去有多好！——那儿有一座高高的山'
# The prompt is deliberately not padded to max_context_len: it was observed
# that a [PAD] in the input makes the model emit [SEP] at that position.
test_encode = tokenizer.encode_plus('[CLS]'+test_str, add_special_tokens=False,return_tensors="pt",truncation=True)
print('encode',test_encode['input_ids'].shape)

# Use the loaded network `model1`; the name `model` is never assigned in this
# file.  Inference runs in eval mode (dropout off) and without autograd.
model1.to(device)
model1.eval()
with torch.no_grad():
    test_out = model1(test_encode['input_ids'].to(device),test_encode['attention_mask'].to(device))

print(test_encode['input_ids'].shape)
print(test_out.shape)
tok = torch.topk(test_out, 3)  # three best logits and ids for every position
print('tok',tok.indices.shape,tok)
test_pred = test_out.max(-1).indices  # best id for every position
print(test_pred.shape)
print(test_pred)
print(tokenizer.decode(test_encode['input_ids'][0]))
print(tokenizer.decode(test_pred))

In [None]:
# `tok` is a torch.topk result: show its values, then the shape of the
# last column of those values.
print(tok.values)
print(tok.values[:, -1].shape)

In [None]:
# Parameter count of the loaded network, printed in millions.  Uses `model1`
# because the name `model` is never assigned in this file.
model_size = sum(p.numel() for p in model1.parameters())
print(model_size / 1000**2)