In [16]:
import transformers
from transformers import MBartTokenizer, MBartForConditionalGeneration
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
import torch.optim as optim
pd.set_option('display.max_colwidth', None)
from sklearn.model_selection import train_test_split
from torch.optim import Adam

In [17]:
file_path="/kaggle/input/translate/.csv"
df = pd.read_csv(file_path)
print(df.head(2))

                                                            Text  \
0                 It can be a very complicated thing, the ocean.   
1  And it can be a very complicated thing, what human health is.   

               Label  
0      海洋是一个非常复杂的事物。  
1  人类的健康也是一件非常复杂的事情。  


In [18]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [19]:
tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-50')
model = MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-50')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.


In [20]:
def tokenize_data(source_texts, target_texts, tokenizer, src_lang='en_XX', tgt_lang='zh_CN', max_length=128):
    source_encodings = tokenizer(source_texts, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
    target_encodings = tokenizer(target_texts, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
    return source_encodings, target_encodings

# 处理训练集数据
train_source_texts = train_df['Text'].tolist()
train_target_texts = train_df['Label'].tolist()
train_source_encodings, train_target_encodings = tokenize_data(train_source_texts, train_target_texts, tokenizer)

# 处理测试集数据
test_source_texts = test_df['Text'].tolist()
test_target_texts = test_df['Label'].tolist()
test_source_encodings, test_target_encodings = tokenize_data(test_source_texts, test_target_texts, tokenizer)


In [21]:
class TranslationDataset(Dataset):
    def __init__(self, source_encodings, target_encodings):
        self.source_encodings = source_encodings
        self.target_encodings = target_encodings

    def __len__(self):
        return len(self.source_encodings['input_ids'])

    def __getitem__(self, idx):
        source_item = {key: torch.tensor(val[idx]) for key, val in self.source_encodings.items()}
        target_item = {key: torch.tensor(val[idx]) for key, val in self.target_encodings.items()}
        return source_item, target_item

# 创建数据加载器
train_dataset = TranslationDataset(train_source_encodings, train_target_encodings)
test_dataset = TranslationDataset(test_source_encodings, test_target_encodings)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [22]:
optimizer = Adam(model.parameters(), lr=5e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): Embedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartEncoderLayer(
          (self_attn): MBartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (fi

In [23]:
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    
    for source_batch, target_batch in train_loader:
        optimizer.zero_grad()
        input_ids = source_batch['input_ids'].to(device)
        attention_mask = source_batch['attention_mask'].to(device)
        labels = target_batch['input_ids'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()
        
    avg_train_loss = total_train_loss / len(train_loader)
    
    model.eval()
    total_eval_loss = 0
    
    with torch.no_grad():
        for source_batch, target_batch in test_loader:
            input_ids = source_batch['input_ids'].to(device)
            attention_mask = source_batch['attention_mask'].to(device)
            labels = target_batch['input_ids'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_eval_loss += loss.item()
    
    avg_eval_loss = total_eval_loss / len(test_loader)
    
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {avg_train_loss}")
    print(f"Eval Loss: {avg_eval_loss}")

  source_item = {key: torch.tensor(val[idx]) for key, val in self.source_encodings.items()}
  target_item = {key: torch.tensor(val[idx]) for key, val in self.target_encodings.items()}


Epoch 1/3
Train Loss: 0.6896660949803735
Eval Loss: 0.44413446116447447
Epoch 2/3
Train Loss: 0.3520221918960505
Eval Loss: 0.4511162173748016
Epoch 3/3
Train Loss: 0.24645656605236843
Eval Loss: 0.4883433943986893


In [24]:
model_save_path = "/kaggle/working/mbart_translation_model"
model.save_pretrained(model_save_path)

# 保存 tokenizer
tokenizer_save_path = "/kaggle/working/mbart_tokenizer"
tokenizer.save_pretrained(tokenizer_save_path)

Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


('/kaggle/working/mbart_tokenizer/tokenizer_config.json',
 '/kaggle/working/mbart_tokenizer/special_tokens_map.json',
 '/kaggle/working/mbart_tokenizer/sentencepiece.bpe.model',
 '/kaggle/working/mbart_tokenizer/added_tokens.json')

In [28]:
model = MBartForConditionalGeneration.from_pretrained(model_save_path)
tokenizer = MBartTokenizer.from_pretrained(tokenizer_save_path)
model.to(device)
def translate_text(text, tokenizer, model, src_lang='en_XX', tgt_lang='zh_CN', max_length=128):
    # 准备输入数据
    inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True).to(device)
    
    # 翻译
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            decoder_start_token_id=tokenizer.lang_code_to_id[tgt_lang],
            max_length=max_length
        )
    
    # 解码生成的文本
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_text

# 测试翻译
test_sentences = [
    "I want to eat delicious food.",
    "I want to watch a movie.",
    "What are you doing?",
    "I want to become Superman."
]
for sentence in test_sentences:
    translated = translate_text(sentence, tokenizer, model)
    print(f"Input: {sentence}")
    print(f"Translated: {translated}\n")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Input: I want to eat delicious food.
Translated: 我想要吃出美味的食物。

Input: I want to watch a movie.
Translated: 我想要看个电影。

Input: What are you doing?
Translated: 你在做什么?

Input: I want to become Superman.
Translated: 我想要成为超人的。

