In [3]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
from torchtext.data.utils import get_tokenizer
from example import Transformer
import numpy as np
from torch.autograd import Variable

In [4]:
# TEXT = torchtext.data.Field(tokenize=get_tokenizer('basic_english'), init_token='<sos>', eos_token='<eos>', lower=True)
TEXT = torchtext.data.Field(tokenize=get_tokenizer('basic_english'), init_token='<sos>', eos_token='<eos>', lower=True)

train_txt, val_txt, test_txt = torchtext.datasets.WikiText2.splits(TEXT)

TEXT.build_vocab(train_txt)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
def batchify(data, bsz):
    data = TEXT.numericalize([data.examples[0].text])

    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)


batch_size = 20
eval_batch_size = 10

train_data = batchify(train_txt, batch_size)
val_data = batchify(val_txt, eval_batch_size)
test_data = batchify(test_txt, batch_size)

bptt = 35

def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    src = source[i:i+seq_len]
    target = source[i+1:i+seq_len+1].view(-1)
    return src, target

def nopeak_mask(size):
    np_mask = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
    np_mask = Variable(torch.from_numpy(np_mask == 0).to(device))
    return np_mask

def create_masks(src, trg):
    
    src_mask = (src != src_pad).unsqueeze(-2).to(device)

    if trg is not None:
        trg_mask = (trg != opt.trg_pad).unsqueeze(-2).to(device)
        size = trg.size(1)  # get seq_len for matrix
        np_mask = nopeak_mask(size).to(device)
        trg_mask = trg_mask & np_mask

    else:
        trg_mask = None
    return src_mask, trg_mask

In [9]:
mask = (1 - torch.triu(torch.ones(20,20), diagonal=1)).unsqueeze(0)
mask.shape

torch.Size([1, 20, 20])

In [5]:
ntokens = len(TEXT.vocab.stoi)
emsize = 200
nhid = 200
nlayers = 2
nhead = 2
dropout = 0.2
lr = 0.1
# model = make_model(source_vocab=ntokens, target_vocab=ntokens)
model = Transformer(ntokens, ntokens, d_model=512, N=4, heads=2)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=1, gamma=0.95)


In [17]:
import time

def train():
    model.train()
    total_loss = 0.
    start_time = time.time()

    for batch, i in enumerate(range(0, train_data.size(0)-1, bptt)):
        source, target = get_batch(train_data, i)
        optimizer.zero_grad()
        # src_mask, tgt_mask = create_masks(source, target)
        output = model(source, target, None, None)
        loss = criterion(output.view(-1, ntokens), target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(),0.5)
        optimizer.step()
        total_loss += loss.item()
        log_interval = 200

        if batch % log_interval ==0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print("epochs:",epoch,"\t","batches:",batch,"\t","loss:",loss,"\t",elapsed * 1000 / log_interval,"ms/batch","\n")
            total_loss = 0
            start_time = time.time()

def evaluate(eval_model, data_source):
    eval_model.eval()
    total_loss = 0
    with torch.no_grad():
        for i in range(0, data_source.size(0), bptt):
            source, target = get_batch(data_source, i)
            # src_mask, tgt_mask = create_masks(source, target)
            output = eval_model(source, target, None, None)
            loss = criterion(output.view(-1, ntokens), target)
            total_loss += loss.item()

    return total_loss

In [None]:
best_val_loss = float("inf")

epochs = 3

best_model = None

for epoch in range(1, epochs + 1):

    train()

    val_loss = evaluate(model, val_data)

    print('#' * 89)
    print("valid loss:", val_loss)
    print('#' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model
    
    scheduler.step()

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import math

# 假设你已经有输入和目标数据，这里用数字表示词的索引
input_sequence = torch.tensor([[2, 4, 6, 8, 10]])
target_sequence = torch.tensor([[4, 6, 8, 10, 12]])

# 参数设置
vocab_size = 10000  # 词汇表大小
embedding_dim = 256  # 词嵌入维度
num_heads = 8  # 注意力头数
num_layers = 4  # 编码器和解码器层数
max_seq_len = 1000
# 位置编码
class PositionalEncoding(nn.Module):
    def __init__(self, max_seq_len, embedding_dim):
        super(PositionalEncoding, self).__init__()
        self.embedding_dim = embedding_dim
        self.pe = torch.zeros(max_seq_len, embedding_dim)
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))
        self.pe[:, 0::2] = torch.sin(position * div_term)
        self.pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = self.pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)].to(x.device)

# 定义Transformer模型
class Transformer(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_heads, num_layers):
        super(Transformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.positional_encoding = PositionalEncoding(max_seq_len, embedding_dim)
        self.transformer = nn.Transformer(d_model=embedding_dim, nhead=num_heads, num_encoder_layers=num_layers,
                                          num_decoder_layers=num_layers)
        self.fc = nn.Linear(embedding_dim, vocab_size)

    def forward(self, src, tgt):
        src = self.embedding(src)
        src = self.positional_encoding(src)
        tgt = self.embedding(tgt)
        tgt = self.positional_encoding(tgt)
        output = self.transformer(src, tgt)
        output = self.fc(output)
        return output

# 创建模型
model = Transformer(vocab_size, embedding_dim, num_heads, num_layers)

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 训练循环
for epoch in range(10):
    optimizer.zero_grad()
    output = model(input_sequence, target_sequence)
    loss = criterion(output.view(-1, vocab_size), target_sequence.view(-1))
    loss.backward()
    optimizer.step()
    print(f'Epoch [{epoch+1}/{10}], Loss: {loss.item():.4f}')


Epoch [1/10], Loss: 9.6667
Epoch [2/10], Loss: 5.1785
Epoch [3/10], Loss: 3.2876
Epoch [4/10], Loss: 2.1381
Epoch [5/10], Loss: 1.5051
Epoch [6/10], Loss: 0.9559
Epoch [7/10], Loss: 0.6781
Epoch [8/10], Loss: 0.5433
Epoch [9/10], Loss: 0.4247
Epoch [10/10], Loss: 0.3404
