# 自己的情感分析任务
针对自己的数据集的情感分析任务，加载之前训练好的预训练模型，用自己的游记文本数据再次训练模型，学习游记文本中的语义。<br/>
加入其他网络模型，比如加入LSTM

## 参数设置和变量设置

In [1]:
# Identifier of the pretrained checkpoint; the same value is used to load both
# the tokenizer and the encoder further down.
check_point = "schen/longformer-chinese-base-4096"
batch_size = 2 # number of samples per batch

## 加载数据

In [2]:
from datasets import load_dataset
from datasets import load_from_disk
# 加载一个评估标准，默认的评估标准
from datasets import load_metric

In [3]:
# Each split lives in its own CSV file. split='train' is requested for every
# file, including the validation and test CSVs.
# NOTE(review): presumably the csv loader exposes a single data file under the
# 'train' split regardless of its content — confirm against the datasets docs.
train_dataset = load_dataset('csv',data_files='../data/MyDataset/data2/train_dataset.csv',split='train')
valid_dataset = load_dataset('csv',data_files='../data/MyDataset/data2/valid_dataset.csv',split='train')
test_dataset = load_dataset('csv',data_files='../data/MyDataset/data2/test_dataset.csv',split='train')

Using custom data configuration default-5602383f9cde0ea3
Reusing dataset csv (/home/chenli/.cache/huggingface/datasets/csv/default-5602383f9cde0ea3/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)
Using custom data configuration default-062c84d526dcea84
Reusing dataset csv (/home/chenli/.cache/huggingface/datasets/csv/default-062c84d526dcea84/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)
Using custom data configuration default-0f8395db45727ded
Reusing dataset csv (/home/chenli/.cache/huggingface/datasets/csv/default-0f8395db45727ded/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


In [4]:
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 2755
})

## 数据预处理

In [5]:
from transformers import AutoTokenizer
    
# Load the tokenizer that belongs to the pretrained checkpoint, so the token
# ids match the encoder's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(check_point)

In [6]:
tokenizer

PreTrainedTokenizerFast(name_or_path='schen/longformer-chinese-base-4096', vocab_size=21128, model_max_len=4096, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [7]:
# Tokenization
def preprocess_function(data):
    """Tokenize the ``text`` field of a batch of examples.

    The tokenizer is asked to truncate longer inputs and to pad shorter ones
    to a fixed ``max_length`` of 1500.

    Args:
        data: a batch of examples exposing a ``'text'`` entry.

    Returns:
        The tokenizer's encoding for the batch.
    """
    texts = data['text']
    encoding = tokenizer(
        texts,
        truncation=True,
        max_length=1500,
        padding='max_length',
    )
    return encoding

In [8]:
# Tokenize the training split in batches, drop the raw text column and expose
# the target column under the name "labels".
encoded_train_dataset = train_dataset.map(
    function=preprocess_function,
    batched=True,
    remove_columns=['text'],
).rename_column("label", "labels")
encoded_train_dataset

Loading cached processed dataset at /home/chenli/.cache/huggingface/datasets/csv/default-5602383f9cde0ea3/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-49dd6547c0c3a89d.arrow


Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2755
})

In [9]:
# Tokenize the validation split in batches, drop the raw text column and
# expose the target column under the name "labels".
encoded_valid_dataset = valid_dataset.map(
    function=preprocess_function,
    batched=True,
    remove_columns=['text'],
).rename_column("label", "labels")
encoded_valid_dataset

Loading cached processed dataset at /home/chenli/.cache/huggingface/datasets/csv/default-062c84d526dcea84/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-0cdccafb6189a66b.arrow


Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 344
})

In [10]:
# Tokenize the test split in batches, drop the raw text column and expose the
# target column under the name "labels".
encoded_test_dataset = test_dataset.map(
    function=preprocess_function,
    batched=True,
    remove_columns=['text'],
).rename_column("label", "labels")
encoded_test_dataset

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 345
})

## 数据集加载器

In [11]:
import torch
from transformers.data.data_collator import DataCollatorWithPadding
# One collator instance is enough; it is shared by the three loaders.
data_collator = DataCollatorWithPadding(tokenizer)

# Training loader: reshuffle every epoch. drop_last=True keeps every batch at
# exactly batch_size samples.
train_dataloader = torch.utils.data.DataLoader(
    dataset=encoded_train_dataset,
    batch_size=batch_size,
    collate_fn=data_collator,
    shuffle=True,
    drop_last=True,
)

# Evaluation loaders: every sample must be scored, so the last incomplete
# batch is kept (drop_last=False); shuffling brings nothing when only
# aggregate accuracy is computed, so a fixed order is used.
valid_dataloader = torch.utils.data.DataLoader(
    dataset=encoded_valid_dataset,
    batch_size=batch_size,
    collate_fn=data_collator,
    shuffle=False,
    drop_last=False,
)
test_dataloader = torch.utils.data.DataLoader(
    dataset=encoded_test_dataset,
    batch_size=batch_size,
    collate_fn=data_collator,
    shuffle=False,
    drop_last=False,
)

## 微调预训练模型
针对自己数据集进行微调 <br/>
加入LSTM网络模型

In [12]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch import nn
from transformers import AutoModel

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using {device} device')
class NeuralNetwork(nn.Module):
    """Sentiment classifier: pretrained encoder -> bidirectional LSTM -> linear head.

    The encoder is loaded from ``check_point``. Its last-layer token
    representations are fed to a bidirectional LSTM; the LSTM's final forward
    and backward hidden states are concatenated and mapped to 2 class logits.
    """

    def __init__(self):
        super().__init__()
        # Pretrained encoder; forward() consumes its last-layer hidden states.
        self.longformer = AutoModel.from_pretrained(check_point)
        # Bidirectional LSTM on top of the encoder (768 = encoder hidden size).
        self.lstm = nn.LSTM(input_size=768, hidden_size=512, batch_first=True, bidirectional=True)
        # Classification head: 1024 = 2 directions x 512 hidden units -> 2 classes.
        # Further layers could be inserted before it to try to improve results.
        self.linear = nn.Linear(1024, 2)

    def forward(self, input_ids, token_type_ids, attention_mask):
        """Compute class logits for a batch.

        Args:
            input_ids: token ids, [batch, seq].
            token_type_ids: segment ids, [batch, seq].
            attention_mask: 1 for real tokens, 0 for padding, [batch, seq].

        Returns:
            Logits of shape [batch, 2].
        """
        # Keyword arguments are required here: the Hugging Face encoder's
        # forward() takes attention_mask *before* token_type_ids, so passing
        # the tensors positionally in this method's parameter order would
        # feed the segment ids as the mask and the mask as segment ids.
        hidden_states = self.longformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        ).last_hidden_state  # [batch, seq, 768]

        # h_n is [num_directions, batch, 512]: h_n[-2] is the forward
        # direction after the last step, h_n[-1] the backward direction after
        # the first step. Slicing the output sequence at position -1 instead
        # would yield a backward half that has processed a single token only.
        # NOTE(review): inputs are padded to a fixed length upstream, so the
        # forward state is still computed across padding positions; packing
        # the sequences by their true length would avoid that.
        _, (h_n, _) = self.lstm(hidden_states)
        sentence_repr = torch.cat((h_n[-2], h_n[-1]), dim=1)  # [batch, 1024]

        # Project the sentence representation onto the 2 classes.
        return self.linear(sentence_repr)

# Build the network and move it to the selected device (GPU when available).
model = NeuralNetwork().to(device)
# Switch to evaluation mode. As the last expression of the cell, the returned
# module is also what gets displayed; train_loop() switches back to training
# mode before any optimisation step.
model.eval()

Using cuda device


Some weights of the model checkpoint at schen/longformer-chinese-base-4096 were not used when initializing BertModel: ['bert.encoder.layer.0.attention.self.query_global.weight', 'bert.encoder.layer.0.attention.self.query_global.bias', 'bert.encoder.layer.0.attention.self.key_global.weight', 'bert.encoder.layer.0.attention.self.key_global.bias', 'bert.encoder.layer.0.attention.self.value_global.weight', 'bert.encoder.layer.0.attention.self.value_global.bias', 'bert.encoder.layer.1.attention.self.query_global.weight', 'bert.encoder.layer.1.attention.self.query_global.bias', 'bert.encoder.layer.1.attention.self.key_global.weight', 'bert.encoder.layer.1.attention.self.key_global.bias', 'bert.encoder.layer.1.attention.self.value_global.weight', 'bert.encoder.layer.1.attention.self.value_global.bias', 'bert.encoder.layer.2.attention.self.query_global.weight', 'bert.encoder.layer.2.attention.self.query_global.bias', 'bert.encoder.layer.2.attention.self.key_global.weight', 'bert.encoder.layer.

NeuralNetwork(
  (longformer): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(4096, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affi

## 模型训练

In [13]:
from tqdm.auto import tqdm # 显示它的进度条，会更好看点

def train_loop(dataloader, model, loss_fn, optimizer, lr_scheduler, epoch, total_loss):
    """Train ``model`` for one epoch and return the updated cumulative loss.

    Args:
        dataloader: loader yielding batches that expose ``labels``,
            ``input_ids``, ``token_type_ids`` and ``attention_mask``.
        model: the network to optimise.
        loss_fn: loss function, called as ``loss_fn(pred, labels)``.
        optimizer: optimiser holding the model parameters.
        lr_scheduler: learning-rate scheduler, stepped once per batch so the
            rate changes with the number of steps instead of staying fixed.
        epoch: 1-based index of the current epoch.
        total_loss: sum of the batch losses of all previous epochs.

    Returns:
        ``total_loss`` increased by the losses of every batch of this epoch.
    """
    steps_per_epoch = len(dataloader)
    progress_bar = tqdm(range(steps_per_epoch))
    progress_bar.set_description(f'loss: {0:>7f}')
    # Batches processed in earlier epochs; needed for the running average.
    batches_before = (epoch - 1) * steps_per_epoch

    model.train()
    for step, batch in enumerate(dataloader, start=1):
        labels, input_ids, token_type_ids, attention_mask = (
            getattr(batch, field).to(device)
            for field in ('labels', 'input_ids', 'token_type_ids', 'attention_mask')
        )
        logits = model(input_ids, token_type_ids, attention_mask)
        loss = loss_fn(logits, labels)

        loss.backward()        # back-propagate
        optimizer.step()       # update the parameters from the gradients
        lr_scheduler.step()    # adjust the learning rate
        optimizer.zero_grad()  # clear the gradients for the next batch

        # Accumulate the loss and show its average over every batch seen so
        # far, previous epochs included.
        total_loss += loss.item()
        running_mean = total_loss / (batches_before + step)
        progress_bar.set_description(f'loss: {running_mean:>7f}')
        progress_bar.update(1)

    return total_loss

def test_loop(dataloader, model, mode='Test'):
    """Evaluate ``model`` on ``dataloader`` and return its accuracy.

    Accuracy is computed over the samples the loader actually yields rather
    than over ``len(dataloader.dataset)``: a loader built with
    ``drop_last=True`` skips the final incomplete batch, and dividing by the
    dataset size would then under-report the accuracy.

    Args:
        dataloader: loader yielding batches that expose ``labels``,
            ``input_ids``, ``token_type_ids`` and ``attention_mask``.
        model: the network to evaluate.
        mode: ``'Valid'`` or ``'Test'``; only used as the label of the
            printed line.

    Returns:
        Fraction of correctly classified samples in [0, 1]; ``0.0`` when the
        loader yields no sample.
    """
    assert mode in ['Valid', 'Test']
    correct = 0
    seen = 0

    model.eval()
    with torch.no_grad():
        for data in dataloader:
            labels = data.labels.to(device)
            input_ids = data.input_ids.to(device)
            token_type_ids = data.token_type_ids.to(device)
            attention_mask = data.attention_mask.to(device)
            pred = model(input_ids, token_type_ids, attention_mask)
            correct += (pred.argmax(1) == labels).sum().item()
            seen += labels.size(0)

    # Guard against an empty loader instead of dividing by zero.
    accuracy = correct / seen if seen else 0.0
    print(f"{mode} Accuracy: {(100*accuracy):>0.1f}%\n")
    return accuracy

# 开始训练

## 20221116 训练
加入LSTM网络模型

In [14]:
from torch.optim import AdamW
from transformers import get_scheduler

learning_rate = 1e-5  # initial learning rate
epoch_num = 10  # number of training epochs

loss_fn = nn.CrossEntropyLoss()  # cross-entropy over the class logits
# The AdamW class shipped with transformers is deprecated in favour of
# torch.optim.AdamW. eps and weight_decay are passed explicitly with the
# defaults of the transformers class so the optimiser settings stay the same.
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)
lr_scheduler = get_scheduler(
    "linear",  # decrease the learning rate linearly over the training steps
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=epoch_num*len(train_dataloader),
)

total_loss = 0.
best_acc = 0.
for t in range(epoch_num):
    print(f"Epoch {t+1}/{epoch_num}\n-------------------------------")
    total_loss = train_loop(train_dataloader, model, loss_fn, optimizer, lr_scheduler, t+1, total_loss)
    valid_acc = test_loop(valid_dataloader, model, mode='Valid')
    # Keep a checkpoint only when the validation accuracy improves.
    if valid_acc > best_acc:
        best_acc = valid_acc
        print('saving new weights...\n')
        # Save the model weights; the file name records epoch and accuracy.
        torch.save(model.state_dict(), f'epoch_{t+1}_valid_acc_{(100*valid_acc):0.1f}_model_weights.bin')
print("Done!")

Epoch 1/10
-------------------------------


  0%|          | 0/1377 [00:00<?, ?it/s]

ValueError: too many values to unpack (expected 2)