## Pipeline

In [3]:
import torch
import torch.optim as optim
import torch.nn as nn

from datasets import load_dataset
from transformers import pipeline

import numpy as np
import matplotlib.pyplot as plt
from tqdm import *
import sys

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# 判断可用的设备是 CPU 还是 GPU，并将模型移动到对应的计算资源设备上
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
#文本分类
classifier = pipeline("sentiment-analysis")

result = classifier("This is a great movie. I enjoyed it a lot!")[0]
print(result)

result = classifier("This movie is so bad, I almost fell asleep.")[0]
print(result)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


{'label': 'POSITIVE', 'score': 0.9998773336410522}
{'label': 'NEGATIVE', 'score': 0.9997857213020325}


## 数据集查看

In [5]:
from datasets import load_dataset

imdb_dataset = load_dataset('imdb')# 加载imdb数据集
print(imdb_dataset['train'][0]) # 查看第一条数据
print(imdb_dataset['train'][-1]) # 查看最后一条数据

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

## 数据处理

In [6]:
#定义数据集
class Dataset(torch.utils.data.Dataset):
    def __init__(self, split):
        self.dataset = load_dataset(path='imdb', split=split)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        text = self.dataset[i]['text']
        label = self.dataset[i]['label']
        return text, label


train_dataset = Dataset('train')
test_dataset = Dataset('test')

In [7]:
print(len(train_dataset), len(test_dataset))

25000 25000


## 词元化

In [8]:
from transformers import AutoTokenizer

#加载Tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
tokenizer

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [9]:
# 数据集处理函数
def collate_fn(data):
    sents = [i[0] for i in data]
    labels = [i[1] for i in data]

    #编码
    data = tokenizer.batch_encode_plus(batch_text_or_text_pairs=sents,
                                   truncation=True,
                                   padding='max_length',
                                   max_length=500,
                                   return_tensors='pt',
                                   return_length=True)

    #input_ids:编码之后的数字
    #attention_mask:是补零的位置是0,其他位置是1
    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']
    labels = torch.LongTensor(labels)

    return input_ids, attention_mask, token_type_ids, labels


#定义数据加载器
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                     batch_size=32,
                                     collate_fn=collate_fn,
                                     shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                              batch_size=32,
                                              collate_fn=collate_fn,
                                              shuffle=True)


## 建立模型

In [10]:
from transformers import BertModel

#加载预训练bert模型
pretrained = BertModel.from_pretrained('bert-base-cased').to(device)

#不训练,不需要计算梯度
for param in pretrained.parameters():
    param.requires_grad_(False)

In [11]:
# 定义下游任务模型
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        with torch.no_grad():
            out = pretrained(input_ids=input_ids,
                       attention_mask=attention_mask,
                       token_type_ids=token_type_ids)
        out = self.fc(out.last_hidden_state[:, 0]) # 最后一层隐藏层作为输入
        out = out.softmax(dim=1)
        return out

model = Model().to(device)

In [12]:
# 定义训练器
class Trainer:
    def __init__(self, model, train_loader, valid_loader):
        # 初始化训练数据集和验证数据集的dataloader
        self.train_loader = train_loader
        self.valid_loader = valid_loader
        
        self.device = device
        self.model = model.to(self.device)
        
        # 定义优化器、损失函数和学习率调度器
        self.optimizer = optim.AdamW(self.model.parameters(), lr=0.001)
        self.criterion = nn.CrossEntropyLoss()
        self.scheduler = optim.lr_scheduler.ExponentialLR(self.optimizer, gamma=0.95)
        
        # 记录训练过程中的损失和验证过程中的准确率
        self.train_losses = []
        self.val_accuracy = []
    
    def train(self, num_epochs):
        # tqdm用于显示进度条并评估任务时间开销
        for epoch in tqdm(range(num_epochs), file=sys.stdout):
            # 记录损失值
            total_loss = 0

            # 批量训练
            self.model.train()
            
            for input_ids, attention_mask, token_type_ids, labels in train_loader:
                # 预测、损失函数、反向传播
                self.optimizer.zero_grad()
                outputs = self.model(input_ids=input_ids.to(self.device), attention_mask=attention_mask.to(self.device), token_type_ids=token_type_ids.to(self.device)).to(self.device)
                loss = self.criterion(outputs, labels.to(self.device))
                loss.backward()
                self.optimizer.step()
                total_loss += loss.item()
            
            # 更新优化器的学习率
            self.scheduler.step()
            # 计算验证集的准确率
            accuracy = self.validate()
            
            # 记录训练集损失和验证集准确率
            self.train_losses.append(total_loss)
            self.val_accuracy.append(accuracy)
            
            # 打印中间值
            tqdm.write("Epoch: {0} Loss: {1} Acc: {2}".format(
                epoch, self.train_losses[-1], self.val_accuracy[-1]))
    
    def validate(self):
        # 测试模型，不计算梯度
        self.model.eval()
        
        # 记录总数和预测正确数
        total = 0
        correct = 0
        
        with torch.no_grad():
            for input_ids, attention_mask, token_type_ids, labels in self.valid_loader:
                outputs = self.model(input_ids=input_ids.to(self.device), attention_mask=attention_mask.to(self.device), token_type_ids=token_type_ids.to(self.device)).to(self.device)
                # 记录验证集总数和预测正确数
                total += labels.size(0)
                correct += (outputs.argmax(1) == labels.to(self.device)).sum().item()
        
        # 返回准确率
        accuracy = correct / total
        return accuracy

## 模型训练和验证

In [None]:
# 创建一个 Trainer 类的实例
trainer = Trainer(model, train_loader, test_loader)
# 训练模型，迭代 30 个周期
trainer.train(num_epochs = 10)

  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# 使用Matplotlib绘制损失曲线图
plt.plot(trainer.train_losses, label='loss')
plt.legend()
plt.show()

In [None]:
# 使用Matplotlib绘制准确率曲线图
plt.plot(trainer.val_accuracy, label='accuracy')
plt.legend()
plt.show()

## 直接finetune

In [None]:
# 导入必要的库
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate

# 加载数据集
dataset = load_dataset("imdb")

# 加载 BERT 分词器
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# 定义用于对输入文本进行分词的函数
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# 对数据集进行分词处理
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 从数据集中选择一小部分用于训练和测试
small_train_dataset = tokenized_datasets["train"].shuffle(seed=0).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=0).select(range(1000))

# 加载 BERT-base-cased 模型用于序列分类任务
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

# 加载准确率度量
metric = evaluate.load("accuracy")

# 定义用于计算评估指标的函数
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# 设置训练参数
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10, 
    evaluation_strategy="epoch")

# 创建一个 Trainer 实例
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

# 训练模型
trainer.train()

# 将训练好的模型保存到磁盘上
model.save_pretrained('./results/imdb_model')

In [None]:
# 加载模型
model = AutoModelForSequenceClassification.from_pretrained('./results/imdb_model')

# 创建pipeline
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

# 测试模型
result = classifier('This is a great movie. I enjoyed it a lot!')
print(result)

# 测试模型
result = classifier('This movie is so bad, I almost fell asleep.')
print(result)