In [1]:
# --- Run configuration -------------------------------------------------------
# Pretrained checkpoint to fine-tune (alternative baseline: "bert-base-chinese").
model_checkpoint = "schen/longformer-chinese-base-4096"
# Directory the model checkpoints are saved to.
output_dir = "/home/chenli/pre_model/BERT_and_Longformer/Longformer"

# Binary classification task: positive vs. negative.
num_labels = 2

# Number of examples per batch.
batch_size = 2
learning_rate = 1e-5
# weight_decay is intentionally left out of this cell; 0.01 is the suggested
# setting (a much smaller value has almost no effect).

# Number of training epochs. Roughly 5 is the suggested value: with too many
# epochs the loss keeps dropping but the fine-tuned model may end up
# over-trained and perform worse.
num_train_epochs = 10

In [2]:
from datasets import load_dataset
from datasets import load_from_disk
# 加载一个评估标准，默认的评估标准
from datasets import load_metric

In [3]:
# Load the datasets: one CSV file each for training, validation and test.
# All three files are requested with split='train'.
train_dataset, valid_dataset, test_dataset = (
    load_dataset('csv', data_files=csv_path, split='train')
    for csv_path in (
        '../data/MyDataset/travel_comment_data/train_dataset.csv',
        '../data/MyDataset/travel_comment_data/valid_dataset.csv',
        '../data/MyDataset/travel_comment_data/test_dataset.csv',
    )
)

Using custom data configuration default-be87a0d28564f2ae
Reusing dataset csv (/home/chenli/.cache/huggingface/datasets/csv/default-be87a0d28564f2ae/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)
Using custom data configuration default-e6970b6e6097a9c9
Reusing dataset csv (/home/chenli/.cache/huggingface/datasets/csv/default-e6970b6e6097a9c9/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)
Using custom data configuration default-8cb27c720d5c6049
Reusing dataset csv (/home/chenli/.cache/huggingface/datasets/csv/default-8cb27c720d5c6049/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


In [4]:
# Inspect the raw training split (column names and row count).
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 33858
})

In [5]:
# Inspect the raw validation split (column names and row count).
valid_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 4232
})

In [6]:
# Inspect the raw test split (column names and row count).
test_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 4233
})

In [7]:
from transformers import AutoTokenizer
    
# Load the tokenizer published with `model_checkpoint`
# (the fast implementation is requested via use_fast=True).
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [8]:
# Tokenization
def preprocess_function(data, max_length=100):
    """Tokenize the ``text`` entry of a batch to a fixed sequence length.

    Args:
        data: Mapping with a ``'text'`` entry; that entry is passed straight
            to the module-level ``tokenizer``.
        max_length: Length every sequence is padded to and truncated at.
            Defaults to 100, the value that used to be hard-coded here, so
            existing ``Dataset.map(preprocess_function, ...)`` calls behave
            as before. Pass a larger value to keep more of long inputs.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        data['text'],
        padding='max_length',
        max_length=max_length,
        truncation=True,
    )

In [9]:
# Tokenize every split in batches with the same settings and drop the raw
# 'text' column from the result.
train_encoded_dataset, val_encoded_dataset, test_encoded_dataset = (
    raw_split.map(
        function=preprocess_function,
        batched=True,
        remove_columns=['text'],
    )
    for raw_split in (train_dataset, valid_dataset, test_dataset)
)

  0%|          | 0/34 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

In [10]:
# Inspect the tokenized training split (columns after mapping, row count).
train_encoded_dataset

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 33858
})

In [11]:
# Inspect the tokenized validation split (columns after mapping, row count).
val_encoded_dataset

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 4232
})

In [12]:
# Inspect the tokenized test split (columns after mapping, row count).
test_encoded_dataset

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 4233
})

In [13]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the pretrained checkpoint with a classification head for `num_labels` classes.
# NOTE(review): the load log for this cell reports that the checkpoint is
# instantiated as BertForSequenceClassification and that the `*_global`
# attention weights are not used -- confirm that running this checkpoint as a
# plain BERT model is intended.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

Some weights of the model checkpoint at schen/longformer-chinese-base-4096 were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'bert.encoder.layer.2.attention.self.query_global.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'bert.encoder.layer.6.attention.self.value_global.bias', 'bert.encoder.layer.0.attention.self.query_global.weight', 'bert.encoder.layer.5.attention.self.value_global.weight', 'bert.encoder.layer.5.attention.self.value_global.bias', 'bert.encoder.layer.8.attention.self.query_global.bias', 'bert.encoder.layer.4.attention.self.query_global.weight', 'bert.encoder.layer.10.attention.self.key_global.bias', 'bert.encoder.layer.5.attention.self.key_global.bias', 'bert.encoder.layer.4.attention.self.value_global.weight', 'cls.seq_relationship.weight', 'bert.encoder.layer.4.attention.self.key_global.weight', 'bert.encoder.layer.5.attention.self.query_global.weight', 'bert.encoder.layer.

In [14]:
# Metric used to decide which checkpoint is the best one.
metric_name = "accuracy"

args = TrainingArguments(
    # Output location, plus evaluation / checkpoint cadence
    # (strategy choices noted by the author: no / step / epoch).
    output_dir=output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Reload the best-scoring checkpoint (by `metric_name`) after training.
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # Optimisation settings.
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    weight_decay=0.01,
    # Per-device batch sizes.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=1,
)

In [15]:
import numpy as np
def compute_metrics(eval_preds):
    """Compute accuracy and binary F1 for one evaluation pass.

    The scores are computed directly with NumPy rather than by calling
    ``load_metric('glue', 'mrpc')`` on every evaluation, so no metric script
    has to be resolved (locally or remotely) each time this function runs.
    The returned keys are the same as before: ``'accuracy'`` and ``'f1'``.

    Args:
        eval_preds: Pair ``(logits, labels)``. ``logits`` holds per-class
            scores with the class axis last (or a tuple whose first item is
            that array); ``labels`` holds the integer class ids. Class ``1``
            is treated as the positive class for F1.

    Returns:
        dict: ``{'accuracy': float, 'f1': float}``. Both are ``0.0`` when
        they would otherwise be undefined (no examples / no positives).
    """
    logits, labels = eval_preds  # predicted scores and ground-truth labels
    if isinstance(logits, tuple):
        # Extra model outputs may be bundled with the logits; logits come first.
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    total = labels.size
    accuracy = float(np.sum(predictions == labels)) / total if total else 0.0

    predicted_pos = predictions == 1
    actual_pos = labels == 1
    true_pos = int(np.sum(predicted_pos & actual_pos))
    false_pos = int(np.sum(predicted_pos & ~actual_pos))
    false_neg = int(np.sum(~predicted_pos & actual_pos))
    denominator = 2 * true_pos + false_pos + false_neg
    # With no positive predictions and no positive labels F1 is reported as 0.0.
    f1 = 2 * true_pos / denominator if denominator else 0.0

    return {"accuracy": accuracy, "f1": f1}

In [16]:
# Bundle the model, training arguments, encoded train/validation splits,
# tokenizer and metric callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_encoded_dataset,
    eval_dataset=val_encoded_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

### 20230206

In [17]:
# Run fine-tuning; evaluation and checkpoint saving happen once per epoch,
# as configured in `args`.
trainer.train()

***** Running training *****
  Num examples = 33858
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 84650


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1339,0.186501,0.962429,0.974838
2,0.1236,0.169053,0.965501,0.976759
3,0.0926,0.15552,0.972117,0.981102
4,0.0631,0.17434,0.968809,0.978839
5,0.0417,0.200965,0.96621,0.977262
6,0.028,0.256067,0.966682,0.977573
7,0.0208,0.311681,0.964319,0.976013
8,0.0113,0.345583,0.963611,0.975587
9,0.0142,0.338754,0.965737,0.976959
10,0.0112,0.354895,0.966446,0.977403


***** Running Evaluation *****
  Num examples = 4232
  Batch size = 2
Using the latest cached version of the module from /home/chenli/.cache/huggingface/modules/datasets_modules/metrics/glue/91f3cfc5498873918ecf119dbf806fb10815786c84f41b85a5d3c47c1519b343 (last modified on Fri Nov 11 21:04:53 2022) since it couldn't be found locally at glue, or remotely on the Hugging Face Hub.
Saving model checkpoint to /home/chenli/pre_model/BERT_and_Longformer/Longformer/checkpoint-8465
Configuration saved in /home/chenli/pre_model/BERT_and_Longformer/Longformer/checkpoint-8465/config.json
Model weights saved in /home/chenli/pre_model/BERT_and_Longformer/Longformer/checkpoint-8465/pytorch_model.bin
tokenizer config file saved in /home/chenli/pre_model/BERT_and_Longformer/Longformer/checkpoint-8465/tokenizer_config.json
Special tokens file saved in /home/chenli/pre_model/BERT_and_Longformer/Longformer/checkpoint-8465/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4232
  Batch

  Batch size = 2
Saving model checkpoint to /home/chenli/pre_model/BERT_and_Longformer/Longformer/checkpoint-84650
Configuration saved in /home/chenli/pre_model/BERT_and_Longformer/Longformer/checkpoint-84650/config.json
Model weights saved in /home/chenli/pre_model/BERT_and_Longformer/Longformer/checkpoint-84650/pytorch_model.bin
tokenizer config file saved in /home/chenli/pre_model/BERT_and_Longformer/Longformer/checkpoint-84650/tokenizer_config.json
Special tokens file saved in /home/chenli/pre_model/BERT_and_Longformer/Longformer/checkpoint-84650/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from /home/chenli/pre_model/BERT_and_Longformer/Longformer/checkpoint-25395 (score: 0.972117202268431).


TrainOutput(global_step=84650, training_loss=0.059437910438076814, metrics={'train_runtime': 31259.2388, 'train_samples_per_second': 10.831, 'train_steps_per_second': 2.708, 'total_flos': 1.739924631324e+16, 'train_loss': 0.059437910438076814, 'epoch': 10.0})

## 评估测试 — evaluate the reloaded best checkpoint on the test set

In [18]:
# Evaluate on the held-out test split instead of the validation split
# the trainer was constructed with.
trainer.evaluate(eval_dataset=test_encoded_dataset)

***** Running Evaluation *****
  Num examples = 4233
  Batch size = 2


{'eval_loss': 0.17506630718708038,
 'eval_accuracy': 0.9688164422395464,
 'eval_f1': 0.9794967381174278,
 'eval_runtime': 400.9112,
 'eval_samples_per_second': 10.558,
 'eval_steps_per_second': 5.28,
 'epoch': 10.0}