In [1]:
from datasets import load_dataset

dataset = load_dataset("opus100", "en-zh")
print(dataset)

DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 1000000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})


In [2]:
print(dataset['train'][0])

{'translation': {'en': 'Sixty-first session', 'zh': '第六十一届会议'}}


In [3]:
import torch
import torch.nn as nn 

class T5Embedding(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
    
    def forward(self, x):
        return self.embedding(x)

In [4]:
import math 

class PositionalEncoding(nn.Module): 
    def __init__(self, d_model, max_len=5000): 
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float()*(-math.log(10000.) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)
    
    def forward(self, x): 
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len].to(x.device)


pe = PositionalEncoding(512)
x = torch.randn(size=(2,5,512))
# print(x[1,0,:])

[para.numel() for para in pe.parameters()]

[]

In [None]:
class MultiHeadAttention(nn.Module): 
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "d_model必须被num_heads整除"
        self.num_heads = num_heads
        self.d_model = d_model
        self.d_k = d_model // num_heads
        self.q_linear = nn.Linear(d_model, d_model)    
        self.k_linear = nn.Linear(d_model, d_model)    
        self.v_linear = nn.Linear(d_model, d_model)    
        self.out_linear = nn.Linear(d_model, d_model)
        
    # 始终应用掩码（如果 mask 为 None，则传入全 1 掩码）
    def forward(self, q, k, v, mask): 
        batch_size = q.size(0)
        q = self.q_linear(q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        k = self.k_linear(k).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

        scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.d_k)
        
        scores = scores.masked_fill(mask==0, -1e9)
        
        attention = torch.softmax(scores, dim=-1)
        output = torch.matmul(attention, v)
        
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        
        return self.out_linear(output)
    

attn = MultiHeadAttention(512, 512)

In [6]:
params = [param.numel() for param in attn.parameters()]
print(params)
sum = 0
for i in params:
    sum += i 
print(sum)   

[262144, 512, 262144, 512, 262144, 512, 262144, 512]
1050624


In [7]:
class FeedForward(nn.Module): 
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        
    def forward(self, x): 
        return self.linear2(torch.relu(self.linear1(x)))
    
ffn = FeedForward(512, 2048)
param = [para.numel() for para in ffn.parameters()]
print(param)
sum = 0 
for i in param:
    sum += i
print(sum)

[1048576, 2048, 1048576, 512]
2099712


In [8]:
class EncoderLayer(nn.Module): 
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask=None):
        x = x + self.dropout(self.self_attn(self.norm1(x), self.norm1(x), self.norm1(x), mask))
        x = x + self.dropout(self.ff(self.norm2(x)))
        return x 

encoder_block = EncoderLayer(512, 8, 2048, 0.1)
params = [param.numel() for param in encoder_block.parameters()]
print(params)
sum = 0 
for i in params:
    sum += i
    
# 参数量统计
print(sum)

[262144, 512, 262144, 512, 262144, 512, 262144, 512, 1048576, 2048, 1048576, 512, 512, 512, 512, 512]
3152384


In [9]:
class DecoderLayer(nn.Module): 
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model) 
        self.norm2 = nn.LayerNorm(d_model) 
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, enc_output, src_mask=None, tgt_mask=None): 
         x = x + self.dropout(self.self_attn(self.norm1(x), self.norm1(x), self.norm1(x), tgt_mask))
         x = x + self.dropout(self.cross_attn(self.norm2(x), enc_output, enc_output, src_mask))
         x = x + self.dropout(self.ff(self.norm3(x)))
         return x 
     
decoder_block = DecoderLayer(512, 8, 2048)

params = [para.numel() for para in decoder_block.parameters()]
print(params)
sum = 0
for i in params:
    sum += i
print(sum)

[262144, 512, 262144, 512, 262144, 512, 262144, 512, 262144, 512, 262144, 512, 262144, 512, 262144, 512, 1048576, 2048, 1048576, 512, 512, 512, 512, 512, 512, 512]
4204032


In [10]:
# 组合T5模型
class T5Model(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, d_ff, num_layers, dropout=0.1):
        super().__init__()
        self.embedding = T5Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model)
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.output_layer = nn.Linear(d_model, vocab_size)
    
    def forward(self, src_input, tgt_input, src_mask=None, tgt_mask=None): 
        src_emb = self.pos_encoding(self.embedding(src_input))
        tgt_emb = self.pos_encoding(self.embedding(tgt_input))
        
        enc_output = src_emb 
        for layer in self.encoder_layers:
            enc_output = layer(enc_output, src_mask)
        
        dec_output = tgt_emb
        for layer in self.decoder_layers:
            dec_output = layer(dec_output, enc_output, src_mask, tgt_mask)
        
        return self.output_layer(dec_output)
    
t5 = T5Model(20000, 768, 8, 2048, 6, 0.1)

sum = 0 

for name,param in t5.named_parameters(): 
    print(f"{name} ==> 参数量：{param.numel()}")
    sum += param.numel()
    
print(sum)

embedding.embedding.weight ==> 参数量：15360000
encoder_layers.0.self_attn.q_linear.weight ==> 参数量：589824
encoder_layers.0.self_attn.q_linear.bias ==> 参数量：768
encoder_layers.0.self_attn.k_linear.weight ==> 参数量：589824
encoder_layers.0.self_attn.k_linear.bias ==> 参数量：768
encoder_layers.0.self_attn.v_linear.weight ==> 参数量：589824
encoder_layers.0.self_attn.v_linear.bias ==> 参数量：768
encoder_layers.0.self_attn.out_linear.weight ==> 参数量：589824
encoder_layers.0.self_attn.out_linear.bias ==> 参数量：768
encoder_layers.0.ff.linear1.weight ==> 参数量：1572864
encoder_layers.0.ff.linear1.bias ==> 参数量：2048
encoder_layers.0.ff.linear2.weight ==> 参数量：1572864
encoder_layers.0.ff.linear2.bias ==> 参数量：768
encoder_layers.0.norm1.weight ==> 参数量：768
encoder_layers.0.norm1.bias ==> 参数量：768
encoder_layers.0.norm2.weight ==> 参数量：768
encoder_layers.0.norm2.bias ==> 参数量：768
encoder_layers.1.self_attn.q_linear.weight ==> 参数量：589824
encoder_layers.1.self_attn.q_linear.bias ==> 参数量：768
encoder_layers.1.self_attn.k_linear.weig

In [11]:
# # 生成词汇表
# from generate_tokenizer import gen_tokenizer

# # 传入语料库文件， 输出tokenizer的json文件
# src_path = "corpus.txt"
# tokenizer_path = "translation.json"
# gen_tokenizer(src_path, tokenizer_path)


In [12]:
from transformers import PreTrainedTokenizerFast

# 加载 tokenizer，显式设置特殊 token
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="translation.json",
    pad_token="[PAD]",
    unk_token="[UNK]",
    bos_token="[BOS]",
    eos_token="[EOS]"
)

# 打印特殊 token 和映射
print("Pad token:", tokenizer.pad_token)
print("特殊 token 映射：", tokenizer.special_tokens_map)
print("Pad token ID:", tokenizer.pad_token_id)
print("BOS token ID:", tokenizer.convert_tokens_to_ids("[BOS]"))
print("BOS token ID:", tokenizer.convert_tokens_to_ids("[BOS]"))
print("EOS token ID:", tokenizer.convert_tokens_to_ids("[EOS]"))
print("PAD token ID:", tokenizer.convert_tokens_to_ids("[PAD]"))
print("EOS token ID:", tokenizer.eos_token_id)

Pad token: [PAD]
特殊 token 映射： {'bos_token': '[BOS]', 'eos_token': '[EOS]', 'unk_token': '[UNK]', 'pad_token': '[PAD]'}
Pad token ID: 0
BOS token ID: 2
BOS token ID: 2
EOS token ID: 3
PAD token ID: 0
EOS token ID: 3


In [13]:
# 输入文本
text = "Hello, world! 你好，世界！"

# 转换为 token ID
input_ids = tokenizer.encode(text)
print(f"Token ID: {input_ids}")

# 查看分词后的 token
tokens = tokenizer.tokenize(text)
print("分词结果：", tokens)

encoded = tokenizer(
    text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512
)
print("Encoded 输出:", encoded)

# 提取 input_ids 和 attention_mask
input_ids = encoded["input_ids"]
attention_mask = encoded["attention_mask"]
print("Input IDs:", input_ids)
print("Attention Mask:", attention_mask)

# 转换为 token 查看
tokens_with_special = tokenizer.convert_ids_to_tokens(input_ids[0])
print("带特殊 token 的分词结果：", tokens_with_special)

Token ID: [2, 14860, 18, 10298, 7, 13791, 9209, 9863, 9198, 3]
分词结果： ['Hello', ',', 'world', '!', '你好', '，', '世界', '！']
Encoded 输出: {'input_ids': tensor([[    2, 14860,    18, 10298,     7, 13791,  9209,  9863,  9198,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Input IDs: tensor([[    2, 14860,    18, 10298,     7, 13791,  9209,  9863,  9198,     3]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
带特殊 token 的分词结果： ['[BOS]', 'Hello', ',', 'world', '!', '你好', '，', '世界', '！', '[EOS]']


In [14]:
# 输入文本
text = "Hello, world! 你好，世界！"

# 转换为token ID 
input_ids = tokenizer.encode(text)
print(f"Token ID:{input_ids}")

# 查看分词后的token
tokens = tokenizer.tokenize(text)
print("分词结果：", tokens)

encoded = tokenizer(
    text,                   
    # return_tensors="pt",        # 返回 PyTorch 张量（"tf" 表示 TensorFlow，None 表示普通列表）
    padding="max_length",               # 自动填充（如果处理批量文本）
    truncation=True,            # 自动截断（如果超过最大长度）
    max_length=512              # 最大序列长度
)

print("Encoded 输出:", encoded)

# 提取 input_ids 和 attention_mask
input_ids = encoded["input_ids"]
attention_mask = encoded["attention_mask"]
print("Input IDs:", input_ids)
print("Attention Mask:", attention_mask)

Token ID:[2, 14860, 18, 10298, 7, 13791, 9209, 9863, 9198, 3]
分词结果： ['Hello', ',', 'world', '!', '你好', '，', '世界', '！']
Encoded 输出: {'input_ids': [2, 14860, 18, 10298, 7, 13791, 9209, 9863, 9198, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [28]:
# 创建Dataset
from torch.utils.data import Dataset, DataLoader

class TranslationDataset(Dataset): 
    def __init__(self, dataset, tokenizer, max_length = 10):
        self.dataset = dataset
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.pad_token_id = [self.tokenizer.convert_tokens_to_ids(self.tokenizer.pad_token)]

    def __len__(self): 
        return len(self.dataset)
    
    def __getitem__(self, index):
        translation = self.dataset[index]['translation']
        input = self.tokenizer(
            translation["en"],                   
            # return_tensors="pt",        # 返回 PyTorch 张量（"tf" 表示 TensorFlow，None 表示普通列表）
            padding="max_length",               # 自动填充（如果处理批量文本）
            truncation=True,            # 自动截断（如果超过最大长度）
            max_length=self.max_length              # 最大序列长度
        )
        output = self.tokenizer(
            translation["zh"],                   
            # return_tensors="pt",        # 返回 PyTorch 张量（"tf" 表示 TensorFlow，None 表示普通列表）
            padding="max_length",               # 自动填充（如果处理批量文本）
            truncation=True,            # 自动截断（如果超过最大长度）
            max_length=self.max_length              # 最大序列长度
        )

        # 提取 input_ids（去掉 batch 维度）
        src_input = input["input_ids"] # [seq_len]
        src_attention_mask = input["attention_mask"]
        tgt_input = output["input_ids"] # [seq_len] 
        tgt_output = tgt_input[1:] + self.pad_token_id # 去掉第一个 token（通常是 <BOS>）
        
        return [src_input, src_attention_mask, tgt_input, tgt_output]

def collate_fn(batch):
    src_input, src_attention_mask, tgt_input, tgt_output = zip(*batch)
    return (torch.tensor(src_input, dtype=torch.long)
            , torch.tensor(src_attention_mask, dtype=torch.long).unsqueeze(1).unsqueeze(2)
            , torch.tensor(tgt_input, dtype=torch.long)
            , torch.tensor(tgt_output, dtype=torch.long))
        
train_dataset = TranslationDataset(dataset=dataset["train"], tokenizer=tokenizer, max_length=15)
train_loader = DataLoader(train_dataset, batch_size=160, shuffle=True, collate_fn=collate_fn)

In [29]:
#训练模型

vocab_size = tokenizer.vocab_size
d_model = 512
num_heads = 8
d_ff = 2048
num_layers = 6
dropout = 0.1

model = T5Model(vocab_size, d_model, num_heads, d_ff, num_layers, dropout)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.convert_tokens_to_ids(tokenizer.pad_token))
optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

In [17]:
param_num = [para.numel() for para in model.parameters()]
sum = 0 
for n in param_num:
    sum += n
    
print(sum)

76938496


In [18]:
print(next(iter(train_loader))[1].shape)
aa = next(iter(train_loader))[1].unsqueeze(1).unsqueeze(2)

print(aa.shape)


tgt_test = next(iter(train_loader))[2]
tgt_test

torch.Size([96, 1, 1, 10])
torch.Size([96, 1, 1, 1, 1, 10])


tensor([[    2, 11946,  9209,  9609,  1270,  6746,  7460,  3444, 26296,     3],
        [    2, 10381,  2688, 10604, 10406, 12602,  9209,  2256, 25404,     3],
        [    2,    45, 10978,    18,  5303,  2166,  7434, 13365,     3,     0],
        [    2, 16049, 13932,     3,     0,     0,     0,     0,     0,     0],
        [    2,    23,    20,  7224,  6999, 20563,  9812, 10115, 26918,     3],
        [    2,  1952,     3,     0,     0,     0,     0,     0,     0,     0],
        [    2, 24134,  9209, 19631, 10071, 22235, 12865,  9209,  2671,     3],
        [    2, 13939,     3,     0,     0,     0,     0,     0,     0,     0],
        [    2, 17927,  1813, 16046, 11085,  7628,  2372, 10931,  9506,     3],
        [    2,  1111,  9733, 11709,  1000, 19505,   799,     3,     0,     0],
        [    2, 12499, 16681, 13915,  9863,     3,     0,     0,     0,     0],
        [    2, 12291, 11052,  9621,     3,     0,     0,     0,     0,     0],
        [    2,   813,    24,    20, 100

In [19]:
aa

tensor([[[[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]]],




        [[[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]]],




        [[[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]]],




        [[[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]]],




        [[[[[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]]]],




        [[[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]]],




        [[[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]]],




        [[[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]]],




        [[[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]]],




        [[[[[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]]]],




        [[[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]]],




        [[[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]]],




        [[[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]]],




        [[[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]]],




        [[[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]]],




        [[[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]]],




        [[[[[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]]]],




        [[[[[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]]]],




        [[[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]]],




        [[[[

In [20]:
def create_mask(tgt, pad_idx): 
    tgt_seq_len = tgt.size(1)
    tgt_mask = torch.tril(torch.ones((tgt_seq_len, tgt_seq_len))).bool().to(tgt.device)
    tgt_mask = tgt_mask & (tgt != pad_idx).unsqueeze(1).unsqueeze(2)
    return tgt_mask 

pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
print(pad_token_id)

test_size = tgt_test.shape[1]

tgt_mask = torch.tril(torch.ones((test_size, test_size))).bool()
tgt_mask = tgt_mask & (tgt_test != pad_token_id).unsqueeze(1).unsqueeze(2)
print(test_size)
tgt_mask

0
10


tensor([[[[ True, False, False,  ..., False, False, False],
          [ True,  True, False,  ..., False, False, False],
          [ True,  True,  True,  ..., False, False, False],
          ...,
          [ True,  True,  True,  ...,  True, False, False],
          [ True,  True,  True,  ...,  True,  True, False],
          [ True,  True,  True,  ...,  True,  True,  True]]],


        [[[ True, False, False,  ..., False, False, False],
          [ True,  True, False,  ..., False, False, False],
          [ True,  True,  True,  ..., False, False, False],
          ...,
          [ True,  True,  True,  ...,  True, False, False],
          [ True,  True,  True,  ...,  True,  True, False],
          [ True,  True,  True,  ...,  True,  True,  True]]],


        [[[ True, False, False,  ..., False, False, False],
          [ True,  True, False,  ..., False, False, False],
          [ True,  True,  True,  ..., False, False, False],
          ...,
          [ True,  True,  True,  ...,  True, Fa

In [30]:
from tqdm import tqdm

pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
num_epochs = 5 
for epoch in range(num_epochs): 
    model.train()
    total_loss = 0
    for (src_input, src_attention_mask, tgt_input, tgt_output) in tqdm(train_loader):
        src_input = src_input.to(device)
        tgt_input = tgt_input.to(device)
        tgt_output = tgt_output.to(device)
        src_attention_mask = src_attention_mask.to(device)
        
        tgt_mask = create_mask(tgt_input, pad_token_id)
        
        optimizer.zero_grad()
        logits = model(src_input, tgt_input, src_attention_mask, tgt_mask)
        
        # test测试是否正确配置项目
        # print(f"tgt_output.shape:{tgt_output.shape}")      # tgt_output.shape:torch.Size([2, 10])
        # print(f"logits.shape:{logits.shape}")              # logits.shape:torch.Size([2, 10, 32000])
        # break 
    
        loss = loss_fn(logits.view(-1, vocab_size), tgt_output.view(-1))
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
    print(f"Epoch {epoch + 1} / {num_epochs}, Loss:{total_loss / len(train_loader)}")
    
        

100%|██████████| 6250/6250 [17:00<00:00,  6.12it/s]


Epoch 1 / 5, Loss:5.421014827575684


100%|██████████| 6250/6250 [16:50<00:00,  6.18it/s]


Epoch 2 / 5, Loss:4.214462287063599


100%|██████████| 6250/6250 [17:15<00:00,  6.03it/s]


Epoch 3 / 5, Loss:3.7815352058410645


100%|██████████| 6250/6250 [17:03<00:00,  6.11it/s]


Epoch 4 / 5, Loss:3.521689265899658


100%|██████████| 6250/6250 [16:47<00:00,  6.20it/s]

Epoch 5 / 5, Loss:3.3347109633636474





In [31]:
def translate(model, src_text, max_length=10000): 
    model.eval()
    input = tokenizer(text=src_text, return_tensors="pt")
    src_input = input['input_ids'].to(device)
    src_mask_attention = input['attention_mask'].to(device)
    print(src_input)
    print(src_mask_attention)

    bos_id = tokenizer.bos_token_id
    eos_id = tokenizer.eos_token_id

    tgt_ids = [bos_id]
    for _ in range(max_length):
        tgt_input = torch.tensor([tgt_ids], dtype=torch.long).to(device)
        tgt_mask = torch.tril(torch.ones((len(tgt_ids), len(tgt_ids)))).bool().to(device)
        tgt_mask = tgt_mask & (tgt_input != pad_token_id).unsqueeze(1).unsqueeze(2)
        
        with torch.no_grad():
            logits = model(src_input, tgt_input, src_mask_attention, tgt_mask)
            next_token = logits[0, -1].argmax().item()
            
        if next_token == eos_id:
            break 
        
        tgt_ids.append(next_token)
        

    return ''.join(tokenizer.convert_ids_to_tokens(tgt_ids)[1:])

In [48]:


src_text = "Dreams give life purpose and direction."

translated = translate(model, src_text)

print(f"翻译结果：{translated}")

tensor([[    2,    42, 12000,    89, 10841, 10850, 12409,  9312, 16620,    20,
             3]], device='cuda:0')
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')
翻译结果：Dreamsgivelifeand方向.
