In [1]:
# Run configuration. The commented-out assignments are alternative checkpoints.
# model_checkpoint = "bert-base-chinese"
# model_checkpoint = "schen/longformer-chinese-base-4096"
model_checkpoint = "voidful/albert_chinese_tiny"
batch_size = 2 # examples per batch; passed to TrainingArguments as per_device_train_batch_size
num_labels = 2 # number of classes; binary classification here: positive vs. negative
# NOTE(review): absolute, user-specific path whose last component says "roberta"
# although the active checkpoint is ALBERT -- confirm this is intended.
output_dir = "/home/chenli/pre_model/roberta" # directory where model checkpoints are saved
learning_rate = 1e-5 # learning rate
# weight_decay=0.01 # weight decay (the original note called it learning-rate decay); 0.01 is enough. If weight_decay is set too small it has almost no effect.
num_train_epochs = 5 # training epochs; about 5 is enough. Do not set it too high: the loss keeps falling but fine-tuning quality gets worse, possibly from over-training.

In [2]:
from datasets import load_dataset
from datasets import load_from_disk
# 加载一个评估标准，默认的评估标准
from datasets import load_metric

In [3]:
# Load the three CSV files. Each file is read on its own, so its rows are
# requested through split='train' in all three cases.
train_dataset, valid_dataset, test_dataset = (
    load_dataset('csv', data_files=csv_path, split='train')
    for csv_path in (
        '../../data/MyDataset/data2/train_dataset.csv',
        '../../data/MyDataset/data2/valid_dataset.csv',
        '../../data/MyDataset/data2/test_dataset.csv',
    )
)

Using custom data configuration default-5602383f9cde0ea3
Reusing dataset csv (/home/chenli/.cache/huggingface/datasets/csv/default-5602383f9cde0ea3/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)
Using custom data configuration default-062c84d526dcea84
Reusing dataset csv (/home/chenli/.cache/huggingface/datasets/csv/default-062c84d526dcea84/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)
Using custom data configuration default-0f8395db45727ded
Reusing dataset csv (/home/chenli/.cache/huggingface/datasets/csv/default-0f8395db45727ded/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


In [4]:
# Show the training split's columns and row count.
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 2755
})

In [5]:
# Show the validation split's columns and row count.
valid_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 344
})

In [6]:
# Show the test split's columns and row count.
test_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 345
})

In [8]:
from transformers import AutoTokenizer

# Build the tokenizer that belongs to the selected checkpoint, asking for the
# fast implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)

In [20]:
# Tokenization
def preprocess_function(data, max_length=500):
    """Tokenize the ``text`` field of a batch to a fixed sequence length.

    Every sequence is padded up to, or truncated down to, ``max_length``
    tokens, so all encoded rows have the same length.

    Args:
        data: Mapping with a ``'text'`` entry holding the raw text(s); this is
            what ``Dataset.map(..., batched=True)`` passes in.
        max_length: Target length for padding and truncation. Defaults to 500,
            the value that used to be hard-coded here. Keep it at or below 512,
            the ``max_position_embeddings`` of the checkpoint config. To use a
            different value with ``Dataset.map``, pass
            ``fn_kwargs={'max_length': ...}``.

    Returns:
        The output of the module-level ``tokenizer`` for these texts.
    """
    return tokenizer(
        data['text'],
        padding='max_length',
        max_length=max_length,
        truncation=True,
    )

In [21]:
# Tokenize all three splits with identical settings. The raw 'text' column is
# dropped, so only 'label' and the tokenizer outputs remain.
train_encoded_dataset, val_encoded_dataset, test_encoded_dataset = (
    split.map(
        function=preprocess_function,
        batched=True,
        remove_columns=['text'],
    )
    for split in (train_dataset, valid_dataset, test_dataset)
)

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [22]:
# Check the encoded training split: 'text' is gone and the tokenizer columns are present.
train_encoded_dataset

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2755
})

In [23]:
# Check the encoded validation split's columns and row count.
val_encoded_dataset

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 344
})

In [24]:
# Check the encoded test split's columns and row count.
test_encoded_dataset

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 345
})

In [25]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the checkpoint as a sequence classifier with `num_labels` output classes.
# NOTE(review): the checkpoint config lists AlbertForMaskedLM as its
# architecture, so the classification head is presumably newly initialised -- confirm.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
)

loading configuration file https://huggingface.co/voidful/albert_chinese_tiny/resolve/main/config.json from cache at /home/chenli/.cache/huggingface/transformers/b08a63442fa70fa1e124af576eb48002271fc62678a8f86047fa45cb3323bd11.4d160a4a4557ca7887bb3f096f73e5ec3f44d2026bc8706fdd499190dcbc4f7b
Model config AlbertConfig {
  "architectures": [
    "AlbertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 312,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 1248,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 12,
  "num_hidden_groups": 1,
  "num_hidden_layers": 4,
  "num_memory_blocks": 0,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "tra

In [26]:
# Metric used to pick the best checkpoint.
metric_name = "accuracy"

# Evaluate and save once per epoch, then reload the best checkpoint (by
# `metric_name`) when training finishes.
args = TrainingArguments(
    # where checkpoints go
    output_dir=output_dir,
    # optimisation
    learning_rate=learning_rate,
    weight_decay=0.01,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=1,
    # evaluation / checkpoint cadence
    evaluation_strategy="epoch",
    save_strategy="epoch",  # accepted values: "no", "steps", "epoch"
    # best-model selection
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # _n_gpu=2
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [27]:
import numpy as np
from functools import lru_cache


@lru_cache(maxsize=None)
def _load_eval_metric():
    """Load the GLUE/MRPC metric once and reuse it for every evaluation pass."""
    return load_metric('glue', 'mrpc')


def compute_metrics(eval_preds):
    """Turn the model's evaluation outputs into metric scores.

    The metric object is loaded on first use and cached. Previously it was
    re-loaded on every evaluation pass, which repeated the metric-script lookup
    each time.

    Args:
        eval_preds: Pair ``(logits, labels)``: the model's raw class scores and
            the ground-truth labels.

    Returns:
        The dict produced by the GLUE/MRPC metric's ``compute`` (the training
        log shows accuracy and F1).
    """
    logits, labels = eval_preds  # predictions and ground truth
    # The predicted class is the index of the highest score along the last axis.
    predictions = np.argmax(logits, axis=-1)
    return _load_eval_metric().compute(predictions=predictions, references=labels)

In [28]:
# Wire the model, training arguments, encoded splits, tokenizer and metric
# callback into one Trainer. Validation data is used for per-epoch evaluation.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_encoded_dataset,
    eval_dataset=val_encoded_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

### 2023-05-22
Run with `max_length = 512` (tokenizer padding/truncation length).

In [18]:
# Fine-tune the model; the log below shows an evaluation and a checkpoint save per epoch.
# NOTE(review): this cell's execution count (18) is lower than the cells above it,
# so its output presumably comes from an earlier run with max_length = 512 -- confirm.
trainer.train()

***** Running training *****
  Num examples = 2755
  Num Epochs = 5
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 3445


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.235,0.211675,0.956395,0.96063
2,0.1157,0.185118,0.962209,0.966234
3,0.1086,0.177464,0.962209,0.966408
4,0.073,0.165171,0.965116,0.969072
5,0.0977,0.172477,0.965116,0.969072


***** Running Evaluation *****
  Num examples = 344
  Batch size = 2
Using the latest cached version of the module from /home/chenli/.cache/huggingface/modules/datasets_modules/metrics/glue/91f3cfc5498873918ecf119dbf806fb10815786c84f41b85a5d3c47c1519b343 (last modified on Fri Nov 11 21:04:53 2022) since it couldn't be found locally at glue, or remotely on the Hugging Face Hub.
Saving model checkpoint to /home/chenli/pre_model/roberta/checkpoint-689
Configuration saved in /home/chenli/pre_model/roberta/checkpoint-689/config.json
Model weights saved in /home/chenli/pre_model/roberta/checkpoint-689/pytorch_model.bin
tokenizer config file saved in /home/chenli/pre_model/roberta/checkpoint-689/tokenizer_config.json
Special tokens file saved in /home/chenli/pre_model/roberta/checkpoint-689/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 344
  Batch size = 2
Saving model checkpoint to /home/chenli/pre_model/roberta/checkpoint-1378
Configuration saved in /home/chenli/pr

TrainOutput(global_step=3445, training_loss=0.121810318458236, metrics={'train_runtime': 753.1517, 'train_samples_per_second': 18.29, 'train_steps_per_second': 4.574, 'total_flos': 55476055296000.0, 'train_loss': 0.121810318458236, 'epoch': 5.0})

评估测试 (Evaluation on the test set)

In [19]:
# Evaluate on the encoded test split instead of the validation split given to the Trainer.
trainer.evaluate(eval_dataset=test_encoded_dataset)

***** Running Evaluation *****
  Num examples = 345
  Batch size = 2


{'eval_loss': 0.19281823933124542,
 'eval_accuracy': 0.9652173913043478,
 'eval_f1': 0.9692307692307692,
 'eval_runtime': 5.1966,
 'eval_samples_per_second': 66.389,
 'eval_steps_per_second': 33.291,
 'epoch': 5.0}

### 2023-05-22
Run with `max_length = 500` (tokenizer padding/truncation length).

In [29]:
# Fine-tune the model; the log below shows an evaluation and a checkpoint save per epoch.
trainer.train()

***** Running training *****
  Num examples = 2755
  Num Epochs = 5
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 3445


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2469,0.218863,0.953488,0.958115
2,0.1053,0.221176,0.950581,0.955614
3,0.1078,0.207534,0.959302,0.963351
4,0.0745,0.189246,0.962209,0.966057
5,0.1044,0.191292,0.962209,0.966057


***** Running Evaluation *****
  Num examples = 344
  Batch size = 2
Saving model checkpoint to /home/chenli/pre_model/roberta/checkpoint-689
Configuration saved in /home/chenli/pre_model/roberta/checkpoint-689/config.json
Model weights saved in /home/chenli/pre_model/roberta/checkpoint-689/pytorch_model.bin
tokenizer config file saved in /home/chenli/pre_model/roberta/checkpoint-689/tokenizer_config.json
Special tokens file saved in /home/chenli/pre_model/roberta/checkpoint-689/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 344
  Batch size = 2
Using the latest cached version of the module from /home/chenli/.cache/huggingface/modules/datasets_modules/metrics/glue/91f3cfc5498873918ecf119dbf806fb10815786c84f41b85a5d3c47c1519b343 (last modified on Fri Nov 11 21:04:53 2022) since it couldn't be found locally at glue, or remotely on the Hugging Face Hub.
Saving model checkpoint to /home/chenli/pre_model/roberta/checkpoint-1378
Configuration saved in /home/chenli/pr

TrainOutput(global_step=3445, training_loss=0.1257414475926813, metrics={'train_runtime': 282.7778, 'train_samples_per_second': 48.713, 'train_steps_per_second': 12.183, 'total_flos': 54175835250000.0, 'train_loss': 0.1257414475926813, 'epoch': 5.0})

In [30]:
# Evaluate on the encoded test split instead of the validation split given to the Trainer.
trainer.evaluate(eval_dataset=test_encoded_dataset)

***** Running Evaluation *****
  Num examples = 345
  Batch size = 2


{'eval_loss': 0.19573403894901276,
 'eval_accuracy': 0.9623188405797102,
 'eval_f1': 0.9665809768637531,
 'eval_runtime': 4.9276,
 'eval_samples_per_second': 70.014,
 'eval_steps_per_second': 35.108,
 'epoch': 5.0}