In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
print(f"CUDA Device Count: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
   print(f"Device {i}: {torch.cuda.get_device_name(i)}")

Using device: cuda
CUDA Device Count: 1
Device 0: Tesla V100-PCIE-32GB


In [2]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from tqdm import tqdm

# 设置代理
proxy = "http://sisproxy.hkg.agoda.local:3128"
proxies = {"http": proxy, "https": proxy}

# 加载SQuAD数据
def load_squad(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        squad_dict = json.load(f)
    
    contexts = []
    questions = []
    answers = []
    for article in squad_dict["data"]:
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                question = qa["question"]
                for answer in qa["answers"]:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer["text"])
    return contexts, questions, answers

class SQuADDataset(Dataset):
    def __init__(self, contexts, questions, answers, tokenizer, max_len=512):
        self.contexts = contexts
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        context = self.contexts[idx]
        question = self.questions[idx]
        answer = self.answers[idx]

        input_text = f"question: {question} context: {context}"
        target_text = answer

        inputs = self.tokenizer.encode_plus(
            input_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        targets = self.tokenizer.encode_plus(
            target_text,
            max_length=64,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': targets['input_ids'].squeeze()
        }

# 主函数
def main():
    model_name = "google-t5/t5-base" 
    tokenizer = T5Tokenizer.from_pretrained(model_name, proxies=proxies)
    print('Tokenizer loaded successfully')
    model = T5ForConditionalGeneration.from_pretrained(model_name, proxies=proxies).to(device)
    print('Model loaded successfully')

    contexts, questions, answers = load_squad("train-v1.1.json")
    train_dataset = SQuADDataset(contexts, questions, answers, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

    optimizer = AdamW(model.parameters(), lr=5e-5)

    num_epochs = 3
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

    model.save_pretrained("./t5-finetuned-squad-custom")
    tokenizer.save_pretrained("./t5-finetuned-squad-custom")
    print("Model saved to ./t5-finetuned-squad-custom")

if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Tokenizer loaded successfully
Model loaded successfully


Epoch 1/3: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10950/10950 [1:08:00<00:00,  2.68it/s]


Epoch 1/3, Average Loss: 0.0548


Epoch 2/3: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10950/10950 [1:08:02<00:00,  2.68it/s]


Epoch 2/3, Average Loss: 0.0195


Epoch 3/3: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10950/10950 [1:08:00<00:00,  2.68it/s]


Epoch 3/3, Average Loss: 0.0157
Model saved to ./t5-finetuned-squad-custom
