# 自己的情感分析任务
针对自己数据集的情感分析任务：加载之前训练好的预训练模型，用自己的游记文本数据对模型进行微调，使其学习游记文本中的语义。


## 参数设置和变量设置

In [1]:
# Checkpoint to start from, and the directory that receives new checkpoints.
model_dir = "/home/chenli/pre_model/20221108/checkpoint-14400/"
output_dir = "/home/chenli/pre_model/20221109"

# Binary classification: positive vs. negative sentiment.
num_labels = 2

# Optimisation settings: examples per batch, learning rate, training epochs.
batch_size = 1
learning_rate = 1e-5
num_train_epochs = 5

## 加载数据

In [2]:
from datasets import load_dataset
from datasets import load_from_disk
# 加载一个评估标准，默认的评估标准
from datasets import load_metric

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Each CSV file holds one split; load_dataset exposes a lone file under the
# 'train' key, hence split='train' for all three.
train_dataset, valid_dataset, test_dataset = (
    load_dataset('csv', data_files=csv_path, split='train')
    for csv_path in (
        '../data/MyDataset/train_dataset.csv',
        '../data/MyDataset/valid_dataset.csv',
        '../data/MyDataset/test_dataset.csv',
    )
)

Using custom data configuration default-f4542a74482b2188
Found cached dataset csv (/home/chenli/.cache/huggingface/datasets/csv/default-f4542a74482b2188/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)
Using custom data configuration default-9358b381c3caed87
Found cached dataset csv (/home/chenli/.cache/huggingface/datasets/csv/default-9358b381c3caed87/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)
Using custom data configuration default-4bbbcf0261746e3e
Found cached dataset csv (/home/chenli/.cache/huggingface/datasets/csv/default-4bbbcf0261746e3e/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


In [4]:
# Display the training split (its features and row count).
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 1889
})

In [5]:
# Load the GLUE metric with the MRPC configuration, then display it.
metric = load_metric("glue","mrpc")
metric

  metric = load_metric("glue","mrpc")


Metric(name: "glue", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: """
Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
    predictions: list of predictions to score.
        Each translation should be tokenized into a list of tokens.
    references: list of lists of references for each translation.
        Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
    "accuracy": Accuracy
    "f1": F1 score
    "pearson": Pearson Correlation
    "spearmanr": Spearman Correlation
    "matthews_correlation": Matthew Correlation
Examples:

    >>> glue_metric = datasets.load_metric('glue', 'sst2')  # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]
    >>> references = [0, 1]
    >>> predictions = [0, 1]
    >>> results = glue_metric.compute(predictions=predictions, references=references)
    >>> print(res

## 数据预处理

In [4]:
from transformers import AutoTokenizer
    
# Earlier variant, kept for reference:
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
# Load the tokenizer from the checkpoint directory configured in model_dir.
tokenizer = AutoTokenizer.from_pretrained(model_dir)

In [7]:
# Display the loaded tokenizer and its settings.
tokenizer

PreTrainedTokenizerFast(name_or_path='/home/chenli/pre_model/20221108/checkpoint-14400/', vocab_size=21128, model_max_len=4096, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [5]:
# Tokenization
def preprocess_function(data, max_length=3000):
    """Tokenize the ``text`` field of ``data`` to a fixed length.

    Args:
        data: Mapping with a ``'text'`` entry; this is what ``Dataset.map``
            passes in when the function is used as its callback.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 3000, the value that used to be hard-coded here.

    Returns:
        The result of calling the module-level ``tokenizer`` on the text.
    """
    return tokenizer(
        data['text'],
        padding='max_length',
        max_length=max_length,
        truncation=True,
    )

In [6]:
# Tokenize the training split in batches; the raw 'text' column is dropped
# once it has been encoded.
encoded_train_dataset = train_dataset.map(
    preprocess_function,
    remove_columns=['text'],
    batched=True,
)
encoded_train_dataset

100%|██████████| 2/2 [00:03<00:00,  1.85s/ba]


Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1889
})

In [7]:
# Sanity check: length of input_ids for the first encoded training example.
len(encoded_train_dataset[0]['input_ids'])

3000

In [8]:
# The model takes its targets through an argument called "labels", so the
# dataset column has to carry that name.
encoded_train_dataset = encoded_train_dataset.rename_column(
    original_column_name="label",
    new_column_name="labels",
)
encoded_train_dataset

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1889
})

In [9]:
# Tokenize the validation split in batches; the raw 'text' column is dropped
# once it has been encoded.
encoded_valid_dataset = valid_dataset.map(
    preprocess_function,
    remove_columns=['text'],
    batched=True,
)
encoded_valid_dataset

100%|██████████| 1/1 [00:00<00:00,  2.17ba/s]


Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 236
})

In [10]:
# The model takes its targets through an argument called "labels", so the
# dataset column has to carry that name.
encoded_valid_dataset = encoded_valid_dataset.rename_column(
    original_column_name="label",
    new_column_name="labels",
)
encoded_valid_dataset

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 236
})

In [11]:
# Tokenize the test split in batches; the raw 'text' column is dropped once
# it has been encoded.
encoded_test_dataset = test_dataset.map(
    preprocess_function,
    remove_columns=['text'],
    batched=True,
)
encoded_test_dataset

100%|██████████| 1/1 [00:00<00:00,  2.07ba/s]


Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 237
})

In [12]:
# The model takes its targets through an argument called "labels", so the
# dataset column has to carry that name.
encoded_test_dataset = encoded_test_dataset.rename_column(
    original_column_name="label",
    new_column_name="labels",
)
encoded_test_dataset

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 237
})

## 微调预训练模型
针对自己数据集进行微调

是这样加载模型吗？

In [13]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

# Load the checkpoint from model_dir with a sequence-classification head
# sized for num_labels classes.
model = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=num_labels)
# Switch the model to evaluation mode; as the last expression of the cell,
# the returned module is also displayed.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(4096, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [14]:
# Validation metric that decides which checkpoint counts as the best one.
metric_name = "accuracy"

args = TrainingArguments(
    # Where checkpoints are written and how long to train.
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    # Currently disabled overrides:
    # learning_rate=learning_rate,
    # weight_decay=0.01,
    # The same per-device batch size is used for training and evaluation.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and save once per epoch, then reload the checkpoint with the
    # best metric_name value when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [15]:
import numpy as np
from functools import lru_cache


@lru_cache(maxsize=None)
def _glue_mrpc_metric():
    """Load the GLUE/MRPC metric once and reuse it for every evaluation."""
    return load_metric('glue','mrpc')


def compute_metrics(eval_preds):
    """Score one evaluation pass with the GLUE/MRPC metric.

    Args:
        eval_preds: Pair ``(logits, labels)`` holding the model outputs and
            the reference labels.

    Returns:
        The dictionary produced by the metric's ``compute`` call.
    """
    logits, labels = eval_preds  # model outputs and ground-truth labels
    # The predicted class is the index of the largest logit on the last axis.
    predictions = np.argmax(logits, axis=-1)
    # The metric is cached so that it is not re-loaded on every evaluation.
    return _glue_mrpc_metric().compute(predictions=predictions, references=labels)

In [16]:
# Wire the model, training configuration, encoded splits, tokenizer and
# metric callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_valid_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

训练前先评估一下

In [47]:
# Baseline: evaluate before any fine-tuning, using the Trainer's default
# evaluation set.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 236
  Batch size = 2
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Using the latest cached version of the module from /home/chenli/.cache/huggingface/modules/datasets_modules/metrics/glue/91f3cfc5498873918ecf119dbf806fb10815786c84f41b85a5d3c47c1519b343 (last modified on Mon Nov  7 19:47:24 2022) since it couldn't be found locally at glue, or remotely on the Hugging Face Hub.


{'eval_loss': 3.103043794631958,
 'eval_accuracy': 0.6059322033898306,
 'eval_f1': 0.6618181818181819,
 'eval_runtime': 4249.1272,
 'eval_samples_per_second': 0.056,
 'eval_steps_per_second': 0.028}

In [None]:
# Same evaluation, but on the encoded test split.
trainer.evaluate(eval_dataset=encoded_test_dataset)

In [18]:
# Display the full set of training arguments held by the Trainer.
trainer.args

TrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=True,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
jit_mode_eval=False,
label_n

In [None]:
# Run prediction on the encoded test split.
trainer.predict(test_dataset=encoded_test_dataset)

# 开始训练

## 20221110 训练
model_dir = "/home/chenli/pre_model/20221108/checkpoint-14400/" <br/>
batch_size = 1 # 每一批次的数量 <br/>
num_labels = 2 # 多少分类，这里是二分类问题，积极和消极 <br/>
output_dir = "/home/chenli/pre_model/20221109" # 模型保存路径 <br/>
num_train_epochs = 5 # 训练轮次 <br/>
把文本长度统一填充/截断到 3000 个 token，并且只有在 batch_size=1 时才能跑通（更大的批次会内存不足）

In [None]:
# Start fine-tuning with the Trainer configured earlier in this notebook.
trainer.train()

***** Running training *****
  Num examples = 1889
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 9445
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.1783,1.275712,0.754237,0.859903


***** Running Evaluation *****
  Num examples = 236
  Batch size = 1
  metric = load_metric('glue','mrpc')
Saving model checkpoint to /home/chenli/pre_model/20221109/checkpoint-1889
Configuration saved in /home/chenli/pre_model/20221109/checkpoint-1889/config.json
Model weights saved in /home/chenli/pre_model/20221109/checkpoint-1889/pytorch_model.bin
tokenizer config file saved in /home/chenli/pre_model/20221109/checkpoint-1889/tokenizer_config.json
Special tokens file saved in /home/chenli/pre_model/20221109/checkpoint-1889/special_tokens_map.json


## 模型加载（定义）
加载权重前要先定义模型结构 <br/>
这里不再是模型的定义了，模型已经进行了一轮预训练，可以直接加载进来。

In [8]:
from torch import nn
from transformers import AutoModel

# Prefer the GPU when PyTorch reports one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using {device} device')

# Name of the pretrained encoder that the model class loads.
checkpoint = "schen/longformer-chinese-base-4096"

class NeuralNetwork(nn.Module):
    """Pretrained encoder followed by a linear classification head.

    Args:
        num_labels: Number of output classes of the linear head. Defaults to
            2, the value that used to be hard-coded.
    """

    def __init__(self, num_labels=2):
        super().__init__()
        self.bert_encoder = AutoModel.from_pretrained(checkpoint)
        # Size the head from the encoder's own config instead of a literal
        # 768, so the two cannot drift apart if the checkpoint changes.
        hidden_size = self.bert_encoder.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, x):
        """Return class logits for a batch.

        Args:
            x: Mapping of keyword arguments forwarded to the encoder.

        Returns:
            Output of the linear head applied to each sequence's first
            hidden state.
        """
        encoder_output = self.bert_encoder(**x)
        # The hidden state at position 0 of each sequence is used as its
        # summary vector.
        cls_vectors = encoder_output.last_hidden_state[:, 0]
        return self.classifier(cls_vectors)

# Build the classifier, move it to the selected device, and print its layout.
model = NeuralNetwork().to(device)
print(model)

Using cpu device


Some weights of the model checkpoint at schen/longformer-chinese-base-4096 were not used when initializing BertModel: ['bert.encoder.layer.9.attention.self.query_global.weight', 'bert.encoder.layer.5.attention.self.key_global.weight', 'bert.encoder.layer.11.attention.self.value_global.weight', 'cls.predictions.transform.dense.bias', 'bert.encoder.layer.9.attention.self.value_global.bias', 'bert.encoder.layer.0.attention.self.key_global.bias', 'bert.encoder.layer.2.attention.self.query_global.weight', 'bert.encoder.layer.8.attention.self.query_global.weight', 'bert.encoder.layer.7.attention.self.value_global.weight', 'bert.encoder.layer.7.attention.self.key_global.weight', 'bert.encoder.layer.3.attention.self.value_global.bias', 'bert.encoder.layer.0.attention.self.value_global.bias', 'bert.encoder.layer.7.attention.self.query_global.weight', 'bert.encoder.layer.2.attention.self.query_global.bias', 'bert.encoder.layer.3.attention.self.query_global.bias', 'bert.encoder.layer.2.attention.

NeuralNetwork(
  (bert_encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(4096, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [9]:
# Restore previously saved weights into the model.
# map_location remaps tensors that were saved from another device (e.g. CUDA)
# onto the device selected for this session, so loading also works on a
# CPU-only host.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
model.load_state_dict(
    torch.load(
        '/home/chenli/pre_model/2022.10.22/epoch_4_valid_acc_95.0_model_weights.bin',
        map_location=device,
    )
)
model.eval()

NeuralNetwork(
  (bert_encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(4096, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

## 模型训练
针对自己的数据集进行训练 <br/>
我们将每一轮 Epoch 分为训练循环和验证/测试循环。在训练循环中计算损失、优化模型的参数，在验证/测试循环中评估模型的性能

In [11]:
from tqdm.auto import tqdm # 显示它的进度条，会更好看点

def train_loop(dataloader, model, loss_fn, optimizer, lr_scheduler, epoch, total_loss, log_every=200):
    """Train for one epoch and return the updated cumulative loss.

    Args:
        dataloader: Iterable of ``(X, y)`` batches that also supports ``len()``.
        model: The model being trained; called as ``model(X)``.
        loss_fn: Loss function, called as ``loss_fn(pred, y)``.
        optimizer: Optimizer that updates the model parameters.
        lr_scheduler: Learning-rate scheduler, stepped once per batch.
        epoch: 1-based index of the current epoch.
        total_loss: Sum of batch losses accumulated over earlier epochs.
        log_every: Print the running loss every ``log_every`` batches.
            Defaults to 200; a falsy value disables the printout.

    Returns:
        ``total_loss`` plus the losses of every batch in this epoch.
    """
    # Batches completed in earlier epochs, assuming each epoch runs the same
    # number of batches; used to average the loss over the whole run.
    finish_batch_num = (epoch-1)*len(dataloader)

    model.train()
    # The context manager closes the progress bar even if a batch raises.
    with tqdm(range(len(dataloader))) as progress_bar:
        progress_bar.set_description(f'loss: {0:>7f}')
        for batch, (X, y) in enumerate(dataloader, start=1):
            X, y = X.to(device), y.to(device)
            pred = model(X)
            loss = loss_fn(pred, y)

            optimizer.zero_grad()  # clear gradients left from the previous step
            loss.backward()        # back-propagate
            optimizer.step()       # update the parameters
            lr_scheduler.step()    # adjust the learning rate

            total_loss += loss.item()
            running_loss = total_loss/(finish_batch_num + batch)
            # `batch` is already the 1-based count of finished batches, so the
            # printed index and the average both refer to the same batch.
            if log_every and batch % log_every == 0:
                print(f'[{epoch},{batch:5d}] running_loss:{running_loss:.3f}')
            progress_bar.set_description(f'loss: {running_loss:>7f}')
            progress_bar.update(1)

    return total_loss

def test_loop(dataloader, model, mode='Test'):
    """Measure classification accuracy over ``dataloader``.

    Args:
        dataloader: Iterable of ``(X, y)`` batches exposing a ``dataset``
            attribute whose length is the number of examples.
        model: Model to evaluate; called as ``model(X)``.
        mode: Label used in the printed report, ``'Valid'`` or ``'Test'``.

    Returns:
        Fraction of examples whose arg-max prediction equals the label.
    """
    assert mode in ['Valid', 'Test']
    total_examples = len(dataloader.dataset)
    hits = 0

    model.eval()
    # No gradients are needed while scoring.
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            logits = model(inputs)
            matches = (logits.argmax(1) == targets).type(torch.float)
            hits += matches.sum().item()

    accuracy = hits / total_examples
    print(f"{mode} Accuracy: {(100*accuracy):>0.1f}%\n")
    return accuracy

# 2022.11.3 训练结果
用自己的数据集进行训练，加载模型。<br/>
batch_size=2，共945个批次，轮次=10

In [43]:
from transformers import AdamW, get_scheduler

learning_rate = 1e-5 # learning rate
epoch_num = 10 # number of epochs

loss_fn = nn.CrossEntropyLoss() # loss function: cross-entropy
optimizer = AdamW(model.parameters(), lr=learning_rate) # AdamW, a commonly used optimizer
# NOTE(review): train_dataloader / valid_dataloader are not defined in the
# visible part of this notebook; presumably built in an earlier cell.
lr_scheduler = get_scheduler(
    "linear",# linear schedule: the learning rate decreases gradually
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=epoch_num*len(train_dataloader),
)

total_loss = 0.
best_acc = 0.
for t in range(epoch_num):
    print(f"Epoch {t+1}/{epoch_num}\n-------------------------------")
    # total_loss is threaded through the epochs so the running average spans
    # the whole run.
    total_loss = train_loop(train_dataloader, model, loss_fn, optimizer, lr_scheduler, t+1, total_loss)
    valid_acc = test_loop(valid_dataloader, model, mode='Valid')
    if valid_acc > best_acc:
        best_acc = valid_acc
        print('saving new weights...\n')
        # save the model weights
        torch.save(model.state_dict(), f'epoch_{t+1}_valid_acc_{(100*valid_acc):0.1f}_model_weights.bin')
print("Done!")

# Weights are written only for epochs that beat the best validation accuracy so far



Epoch 1/10
-------------------------------


loss: 0.003257:   0%|          | 2/945 [00:38<5:10:46, 19.77s/it]

RuntimeError: [enforce fail at alloc_cpu.cpp:66] . DefaultCPUAllocator: can't allocate memory: you tried to allocate 1043540064 bytes. Error code 12 (Cannot allocate memory)

# 2022.11.4 训练
用自己的数据集进行训练，不加载模型。<br/>
batch_size=2，共945个批次，轮次=10

In [10]:
from transformers import AdamW, get_scheduler

learning_rate = 1e-5 # learning rate
epoch_num = 10 # number of epochs

loss_fn = nn.CrossEntropyLoss() # loss function: cross-entropy
optimizer = AdamW(model.parameters(), lr=learning_rate) # AdamW, a commonly used optimizer
# NOTE(review): train_dataloader / valid_dataloader are not defined in the
# visible part of this notebook; presumably built in an earlier cell.
lr_scheduler = get_scheduler(
    "linear",# linear schedule: the learning rate decreases gradually
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=epoch_num*len(train_dataloader),
)

total_loss = 0.
best_acc = 0.
for t in range(epoch_num):
    print(f"Epoch {t+1}/{epoch_num}\n-------------------------------")
    # total_loss is threaded through the epochs so the running average spans
    # the whole run.
    total_loss = train_loop(train_dataloader, model, loss_fn, optimizer, lr_scheduler, t+1, total_loss)
    valid_acc = test_loop(valid_dataloader, model, mode='Valid')
    if valid_acc > best_acc:
        best_acc = valid_acc
        print('saving new weights...\n')
        # save the model weights
        torch.save(model.state_dict(), f'epoch_{t+1}_valid_acc_{(100*valid_acc):0.1f}_model_weights.bin')
print("Done!")

# Weights are written only for epochs that beat the best validation accuracy so far



Epoch 1/10
-------------------------------


loss: 0.000000:   0%|          | 0/945 [00:00<?, ?it/s]

RuntimeError: [enforce fail at alloc_cpu.cpp:66] . DefaultCPUAllocator: can't allocate memory: you tried to allocate 1610612736 bytes. Error code 12 (Cannot allocate memory)

# 2022.11.4
模型加载<br/>
batch_size=1进行训练

In [12]:
from transformers import AdamW, get_scheduler

learning_rate = 1e-5 # learning rate
epoch_num = 10 # number of epochs

loss_fn = nn.CrossEntropyLoss() # loss function: cross-entropy
optimizer = AdamW(model.parameters(), lr=learning_rate) # AdamW, a commonly used optimizer
# NOTE(review): train_dataloader / valid_dataloader are not defined in the
# visible part of this notebook; presumably built in an earlier cell.
lr_scheduler = get_scheduler(
    "linear",# linear schedule: the learning rate decreases gradually
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=epoch_num*len(train_dataloader),
)

total_loss = 0.
best_acc = 0.
for t in range(epoch_num):
    print(f"Epoch {t+1}/{epoch_num}\n-------------------------------")
    # total_loss is threaded through the epochs so the running average spans
    # the whole run.
    total_loss = train_loop(train_dataloader, model, loss_fn, optimizer, lr_scheduler, t+1, total_loss)
    valid_acc = test_loop(valid_dataloader, model, mode='Valid')
    if valid_acc > best_acc:
        best_acc = valid_acc
        print('saving new weights...\n')
        # save the model weights
        torch.save(model.state_dict(), f'epoch_{t+1}_valid_acc_{(100*valid_acc):0.1f}_model_weights.bin')
print("Done!")

# Weights are written only for epochs that beat the best validation accuracy so far

Epoch 1/10
-------------------------------


loss: 0.069808:   0%|          | 1/1889 [01:09<36:15:58, 69.15s/it]

RuntimeError: [enforce fail at alloc_cpu.cpp:66] . DefaultCPUAllocator: can't allocate memory: you tried to allocate 751576752 bytes. Error code 12 (Cannot allocate memory)