# 自己的情感分析任务
针对自己的数据集的情感分析任务，加载之前训练好的预训练模型，用自己的游记文本数据再次训练模型，学习游记文本中的语义


## 参数设置和变量设置

In [1]:
model_dir = "/home/chenli/pre_model/checkpoint-14400/"
batch_size = 2 # 每一批次的数量
num_labels = 2 # 多少分类，这里是二分类问题，积极和消极
output_dir = "/home/chenli/pre_model/20221112" # 模型保存路径
learning_rate = 1e-5 # 学习率
num_train_epochs = 10 # 训练轮次

## 加载数据

In [2]:
from datasets import load_dataset
from datasets import load_from_disk
# 加载一个评估标准，默认的评估标准
from datasets import load_metric

In [3]:
train_dataset = load_dataset('csv',data_files='../data/MyDataset/data2/train_dataset.csv',split='train')
valid_dataset = load_dataset('csv',data_files='../data/MyDataset/data2/valid_dataset.csv',split='train')
test_dataset = load_dataset('csv',data_files='../data/MyDataset/data2/test_dataset.csv',split='train')

Using custom data configuration default-5602383f9cde0ea3
Reusing dataset csv (/home/chenli/.cache/huggingface/datasets/csv/default-5602383f9cde0ea3/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)
Using custom data configuration default-062c84d526dcea84
Reusing dataset csv (/home/chenli/.cache/huggingface/datasets/csv/default-062c84d526dcea84/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)
Using custom data configuration default-0f8395db45727ded
Reusing dataset csv (/home/chenli/.cache/huggingface/datasets/csv/default-0f8395db45727ded/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


In [4]:
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 2755
})

In [6]:
metric = load_metric("glue","mrpc")
metric

Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

Metric(name: "glue", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: """
Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
    predictions: list of predictions to score.
        Each translation should be tokenized into a list of tokens.
    references: list of lists of references for each translation.
        Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
    "accuracy": Accuracy
    "f1": F1 score
    "pearson": Pearson Correlation
    "spearmanr": Spearman Correlation
    "matthews_correlation": Matthew Correlation
Examples:

    >>> glue_metric = datasets.load_metric('glue', 'sst2')  # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]
    >>> references = [0, 1]
    >>> predictions = [0, 1]
    >>> results = glue_metric.compute(predictions=predictions, references=references)
    >>> print(res

## 数据预处理

In [4]:
from transformers import AutoTokenizer
    
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
# 加载分词器
tokenizer = AutoTokenizer.from_pretrained(model_dir)

In [6]:
tokenizer

PreTrainedTokenizerFast(name_or_path='/home/chenli/pre_model/checkpoint-14400/', vocab_size=21128, model_max_len=4096, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [5]:
# 分词
def preprocess_function(data):
    return tokenizer(data['text'],padding='max_length',max_length=1500,truncation=True)

In [7]:
encoded_train_dataset = train_dataset.map(function=preprocess_function,
                     batched=True,
                     remove_columns=['text'])
encoded_train_dataset

Loading cached processed dataset at /home/chenli/.cache/huggingface/datasets/csv/default-5602383f9cde0ea3/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-748542c6017cdd78.arrow


Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2755
})

In [9]:
len(encoded_train_dataset[0]['input_ids'])

1500

In [8]:
# Rename the label column to labels because the model expects the argument to be named labels
encoded_train_dataset = encoded_train_dataset.rename_column("label", "labels")
encoded_train_dataset

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2755
})

In [9]:
encoded_valid_dataset = valid_dataset.map(function=preprocess_function,
                     batched=True,
                     remove_columns=['text'])
encoded_valid_dataset

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 344
})

In [10]:
# Rename the label column to labels because the model expects the argument to be named labels
encoded_valid_dataset = encoded_valid_dataset.rename_column("label", "labels")
encoded_valid_dataset

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 344
})

In [11]:
encoded_test_dataset = test_dataset.map(function=preprocess_function,
                     batched=True,
                     remove_columns=['text'])
encoded_test_dataset

Loading cached processed dataset at /home/chenli/.cache/huggingface/datasets/csv/default-0f8395db45727ded/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-8755639d5d2841f2.arrow


Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 345
})

In [12]:
# Rename the label column to labels because the model expects the argument to be named labels
encoded_test_dataset = encoded_test_dataset.rename_column("label", "labels")
encoded_test_dataset

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 345
})

## 微调预训练模型
针对自己数据集进行微调

是这样加载模型吗？

In [15]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')
# 加载原始模型
model = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=num_labels)
model.to(device)
# 启动模型
model.eval()

Using cuda device


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(4096, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [14]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
metric_name = "accuracy"

args = TrainingArguments(
    output_dir = output_dir,
    evaluation_strategy = "epoch",
    learning_rate = learning_rate,
    weight_decay=0.01,
    #save_strategy = "epoch",
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = num_train_epochs,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [15]:
import numpy as np
def compute_metrics(eval_preds):
    metric = load_metric('glue','mrpc')
    logits,labels = eval_preds # 预测值和真实值
    predictions = np.argmax(logits,axis=-1)
    return metric.compute(predictions=predictions,references=labels)

In [19]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

## 训练前先评估一下

### 20221111 训练前评估

In [47]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 236
  Batch size = 2
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Using the latest cached version of the module from /home/chenli/.cache/huggingface/modules/datasets_modules/metrics/glue/91f3cfc5498873918ecf119dbf806fb10815786c84f41b85a5d3c47c1519b343 (last modified on Mon Nov  7 19:47:24 2022) since it couldn't be found locally at glue, or remotely on the Hugging Face Hub.


{'eval_loss': 3.103043794631958,
 'eval_accuracy': 0.6059322033898306,
 'eval_f1': 0.6618181818181819,
 'eval_runtime': 4249.1272,
 'eval_samples_per_second': 0.056,
 'eval_steps_per_second': 0.028}

In [24]:
# 20221111 GPU服务器评估的
trainer.evaluate()



{'eval_loss': 3.2056150436401367,
 'eval_accuracy': 0.5932203389830508,
 'eval_f1': 0.6444444444444445,
 'eval_runtime': 56.4997,
 'eval_samples_per_second': 4.177}

In [25]:
trainer.evaluate(eval_dataset=encoded_test_dataset)

Using the latest cached version of the module from /home/chenli/.cache/huggingface/modules/datasets_modules/metrics/glue/91f3cfc5498873918ecf119dbf806fb10815786c84f41b85a5d3c47c1519b343 (last modified on Fri Nov 11 21:04:53 2022) since it couldn't be found locally at glue, or remotely on the Hugging Face Hub.


{'eval_loss': 3.3177285194396973,
 'eval_accuracy': 0.5654008438818565,
 'eval_f1': 0.6308243727598566,
 'eval_runtime': 154.2689,
 'eval_samples_per_second': 1.536}

In [16]:
# 模型训练参数
trainer.args

TrainingArguments(output_dir=/home/chenli/pre_model/20221111, overwrite_output_dir=False, do_train=False, do_eval=None, do_predict=False, evaluation_strategy=EvaluationStrategy.EPOCH, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=10, max_steps=-1, lr_scheduler_type=SchedulerType.LINEAR, warmup_steps=0, logging_dir=runs/Nov11_21-53-03_yuanshan-ai01, logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level=O1, fp16_backend=auto, local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=500, dataloader_num_workers=0, past_index=-1, run_name=/home/chenli/pre_model/20221111, disable_tqdm=False, remove_unused_columns=True, label_name

In [26]:
trainer.predict(test_dataset=encoded_test_dataset)



PredictionOutput(predictions=array([[ 1.7803435 , -1.6993524 ],
       [ 4.3600445 , -3.9928086 ],
       [-4.6731315 ,  4.2382636 ],
       [ 4.312089  , -3.9896758 ],
       [ 4.4367046 , -4.036243  ],
       [-2.7386227 ,  2.8976026 ],
       [ 4.208422  , -3.7916079 ],
       [ 3.5128722 , -3.1391063 ],
       [-4.6174827 ,  4.064423  ],
       [-3.6724384 ,  3.6719346 ],
       [ 4.358325  , -4.000134  ],
       [ 3.9915786 , -3.4594805 ],
       [ 4.39191   , -4.031818  ],
       [ 4.391055  , -4.0118213 ],
       [ 4.3339343 , -3.972685  ],
       [-4.1531625 ,  3.919069  ],
       [-4.3742166 ,  4.0700526 ],
       [-3.2580388 ,  3.0946445 ],
       [ 4.2600293 , -3.928056  ],
       [-3.8968449 ,  3.7068942 ],
       [-4.570051  ,  4.1923323 ],
       [ 3.8919013 , -3.378051  ],
       [-3.1564581 ,  3.230141  ],
       [ 4.248337  , -3.8271031 ],
       [ 4.4168496 , -4.0184016 ],
       [ 4.24119   , -3.8115687 ],
       [-3.855914  ,  3.646643  ],
       [-4.349819  ,  4.08

### 20221111 训练后的评估

In [18]:
# 20221111 GPU服务器评估的
trainer.evaluate()



Using the latest cached version of the module from /home/chenli/.cache/huggingface/modules/datasets_modules/metrics/glue/91f3cfc5498873918ecf119dbf806fb10815786c84f41b85a5d3c47c1519b343 (last modified on Fri Nov 11 21:04:53 2022) since it couldn't be found locally at glue, or remotely on the Hugging Face Hub.


{'eval_loss': 0.9832331538200378,
 'eval_accuracy': 0.7542372881355932,
 'eval_f1': 0.8599033816425121,
 'eval_runtime': 131.7387,
 'eval_samples_per_second': 1.791,
 'epoch': 10.0}

In [19]:
trainer.evaluate(eval_dataset=encoded_test_dataset)



{'eval_loss': 0.879255473613739,
 'eval_accuracy': 0.7805907172995781,
 'eval_f1': 0.8767772511848342,
 'eval_runtime': 31.7737,
 'eval_samples_per_second': 7.459,
 'epoch': 10.0}

In [None]:
trainer.predict(test_dataset=encoded_test_dataset)

### 20221112 训练前评估

In [20]:
# 模型训练参数
trainer.args

TrainingArguments(output_dir=/home/chenli/pre_model/20221112, overwrite_output_dir=False, do_train=False, do_eval=None, do_predict=False, evaluation_strategy=EvaluationStrategy.EPOCH, prediction_loss_only=False, per_device_train_batch_size=2, per_device_eval_batch_size=2, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=1e-05, weight_decay=0.01, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=10, max_steps=-1, lr_scheduler_type=SchedulerType.LINEAR, warmup_steps=0, logging_dir=runs/Nov12_13-25-32_yuanshan-ai01, logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level=O1, fp16_backend=auto, local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=500, dataloader_num_workers=0, past_index=-1, run_name=/home/chenli/pre_model/20221112, disable_tqdm=False, remove_unused_columns=True, label_nam

In [21]:
# 20221112 GPU服务器评估的
trainer.evaluate()



Using the latest cached version of the module from /home/chenli/.cache/huggingface/modules/datasets_modules/metrics/glue/91f3cfc5498873918ecf119dbf806fb10815786c84f41b85a5d3c47c1519b343 (last modified on Fri Nov 11 21:04:53 2022) since it couldn't be found locally at glue, or remotely on the Hugging Face Hub.


{'eval_loss': 2.3056046962738037,
 'eval_accuracy': 0.6831395348837209,
 'eval_f1': 0.6784660766961652,
 'eval_runtime': 131.928,
 'eval_samples_per_second': 2.607}

In [22]:
# 20221112 对测试集进行评估
trainer.evaluate(eval_dataset=encoded_test_dataset)



{'eval_loss': 2.5707414150238037,
 'eval_accuracy': 0.6521739130434783,
 'eval_f1': 0.6511627906976745,
 'eval_runtime': 30.5454,
 'eval_samples_per_second': 11.295}

In [23]:
trainer.predict(test_dataset=encoded_test_dataset)



PredictionOutput(predictions=array([[-3.9755266 ,  3.7398612 ],
       [-2.1951642 ,  2.1820455 ],
       [ 3.8285606 , -3.4650245 ],
       [ 4.1110287 , -3.685024  ],
       [ 4.2735567 , -3.9067492 ],
       [-4.836384  ,  4.317507  ],
       [-4.4138474 ,  4.0330114 ],
       [ 1.8987294 , -2.1746373 ],
       [-2.729421  ,  2.5207024 ],
       [ 4.360242  , -3.958197  ],
       [ 4.376498  , -4.011953  ],
       [ 3.8747442 , -3.5028982 ],
       [ 4.475733  , -4.000242  ],
       [-4.348307  ,  4.0007772 ],
       [ 4.107893  , -3.6853325 ],
       [ 4.329009  , -3.9644237 ],
       [-4.4139233 ,  3.9566133 ],
       [ 3.904513  , -3.576008  ],
       [ 4.384933  , -4.024852  ],
       [-4.8293667 ,  4.4141774 ],
       [ 2.49205   , -2.5287247 ],
       [-4.880908  ,  4.32991   ],
       [ 2.4057236 , -2.498466  ],
       [-4.8401446 ,  4.332661  ],
       [-2.7338622 ,  2.6253526 ],
       [ 4.3459554 , -3.9959042 ],
       [ 4.469338  , -4.064098  ],
       [-4.339988  ,  3.96

## 20221112 训练后评估

In [25]:
# 模型训练参数
trainer.args

TrainingArguments(output_dir=/home/chenli/pre_model/20221112, overwrite_output_dir=False, do_train=False, do_eval=None, do_predict=False, evaluation_strategy=EvaluationStrategy.EPOCH, prediction_loss_only=False, per_device_train_batch_size=2, per_device_eval_batch_size=2, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=1e-05, weight_decay=0.01, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=10, max_steps=-1, lr_scheduler_type=SchedulerType.LINEAR, warmup_steps=0, logging_dir=runs/Nov12_13-25-32_yuanshan-ai01, logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level=O1, fp16_backend=auto, local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=500, dataloader_num_workers=0, past_index=-1, run_name=/home/chenli/pre_model/20221112, disable_tqdm=False, remove_unused_columns=True, label_nam

In [26]:
# 20221112 GPU服务器评估的
trainer.evaluate()



{'eval_loss': 0.13330306112766266,
 'eval_accuracy': 0.9767441860465116,
 'eval_f1': 0.9792746113989639,
 'eval_runtime': 129.3818,
 'eval_samples_per_second': 2.659,
 'epoch': 10.0}

In [27]:
# 20221112 对测试集进行评估
trainer.evaluate(eval_dataset=encoded_test_dataset)



{'eval_loss': 0.17834007740020752,
 'eval_accuracy': 0.9652173913043478,
 'eval_f1': 0.9693877551020408,
 'eval_runtime': 29.6151,
 'eval_samples_per_second': 11.649,
 'epoch': 10.0}

# 开始训练

## 20221110 训练
model_dir = "/home/chenli/pre_model/20221108/checkpoint-14400/" <br/>
batch_size = 1 # 每一批次的数量 <br/>
num_labels = 2 # 多少分类，这里是二分类问题，积极和消极 <br/>
output_dir = "/home/chenli/pre_model/20221109" # 模型保存路径 <br/>
num_train_epochs = 5 # 训练轮次 <br/>
把文本统一成3000，并且batch_size=1才跑通

In [None]:
trainer.train()

***** Running training *****
  Num examples = 1889
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 9445
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.1783,1.275712,0.754237,0.859903


***** Running Evaluation *****
  Num examples = 236
  Batch size = 1
  metric = load_metric('glue','mrpc')
Saving model checkpoint to /home/chenli/pre_model/20221109/checkpoint-1889
Configuration saved in /home/chenli/pre_model/20221109/checkpoint-1889/config.json
Model weights saved in /home/chenli/pre_model/20221109/checkpoint-1889/pytorch_model.bin
tokenizer config file saved in /home/chenli/pre_model/20221109/checkpoint-1889/tokenizer_config.json
Special tokens file saved in /home/chenli/pre_model/20221109/checkpoint-1889/special_tokens_map.json


## 20221111 训练
GPU服务器训练 <br/>
model_dir = "/home/chenli/pre_model/checkpoint-14400/" <br/>
batch_size = 1 # 每一批次的数量 <br/>
num_labels = 2 # 多少分类，这里是二分类问题，积极和消极 <br/>
output_dir = "/home/chenli/pre_model/20221111" # 模型保存路径 <br/>
learning_rate = 1e-5 # 学习率 <br/>
num_train_epochs = 10 # 训练轮次 <br/>
GPU 两块

In [16]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Runtime,Samples Per Second
1,0.389,0.983233,0.754237,0.859903,132.391,1.783
2,0.9016,0.895185,0.754237,0.859903,31.8635,7.407
3,0.9685,1.140204,0.754237,0.859903,134.2238,1.758
4,0.9089,0.872491,0.754237,0.859903,32.6964,7.218
5,0.9782,1.101258,0.754237,0.859903,31.9618,7.384
6,0.9868,0.994989,0.754237,0.859903,32.087,7.355
7,0.8918,1.001408,0.754237,0.859903,33.4583,7.054
8,1.0092,1.008894,0.754237,0.859903,40.936,5.765
9,0.9077,1.028743,0.754237,0.859903,31.8379,7.413
10,0.9902,0.993204,0.754237,0.859903,31.8167,7.417


Using the latest cached version of the module from /home/chenli/.cache/huggingface/modules/datasets_modules/metrics/glue/91f3cfc5498873918ecf119dbf806fb10815786c84f41b85a5d3c47c1519b343 (last modified on Fri Nov 11 21:04:53 2022) since it couldn't be found locally at glue, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /home/chenli/.cache/huggingface/modules/datasets_modules/metrics/glue/91f3cfc5498873918ecf119dbf806fb10815786c84f41b85a5d3c47c1519b343 (last modified on Fri Nov 11 21:04:53 2022) since it couldn't be found locally at glue, or remotely on the Hugging Face Hub.


TrainOutput(global_step=9450, training_loss=0.8838690266281208, metrics={'train_runtime': 7203.9919, 'train_samples_per_second': 1.312, 'total_flos': 23806318502640000, 'epoch': 10.0})

In [17]:
# 模型训练参数
trainer.args

TrainingArguments(output_dir=/home/chenli/pre_model/20221111, overwrite_output_dir=False, do_train=False, do_eval=None, do_predict=False, evaluation_strategy=EvaluationStrategy.EPOCH, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=10, max_steps=-1, lr_scheduler_type=SchedulerType.LINEAR, warmup_steps=0, logging_dir=runs/Nov11_22-01-32_yuanshan-ai01, logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level=O1, fp16_backend=auto, local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=500, dataloader_num_workers=0, past_index=-1, run_name=/home/chenli/pre_model/20221111, disable_tqdm=False, remove_unused_columns=True, label_name

## 20221112 训练
model_dir = "/home/chenli/pre_model/checkpoint-14400/" <br/>
batch_size = 2 # 每一批次的数量 <br/>
num_labels = 2 # 多少分类，这里是二分类问题，积极和消极 <br/>
output_dir = "/home/chenli/pre_model/20221112" # 模型保存路径 <br/>
learning_rate = 1e-5 # 学习率 <br/>
weight_decay=0.01 <br/>
num_train_epochs = 10 # 训练轮次 <br/>
GPU两块 <br/>
添加了消极文本数据1000篇

In [24]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Runtime,Samples Per Second
1,0.1803,0.161894,0.973837,0.976623,30.3625,11.33
2,0.0956,0.133303,0.976744,0.979275,30.6061,11.24
3,0.0787,0.184718,0.962209,0.965517,30.9305,11.122
4,0.0652,0.142353,0.976744,0.979275,130.7439,2.631
5,0.0702,0.143575,0.976744,0.979275,129.9804,2.647
6,0.0525,0.1708,0.97093,0.974093,30.4172,11.309
7,0.0317,0.231094,0.97093,0.973958,32.3242,10.642
8,0.028,0.220399,0.973837,0.976378,140.7558,2.444
9,0.0158,0.244575,0.968023,0.970976,30.2519,11.371
10,0.0123,0.219517,0.973837,0.976378,30.5707,11.253


Using the latest cached version of the module from /home/chenli/.cache/huggingface/modules/datasets_modules/metrics/glue/91f3cfc5498873918ecf119dbf806fb10815786c84f41b85a5d3c47c1519b343 (last modified on Fri Nov 11 21:04:53 2022) since it couldn't be found locally at glue, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /home/chenli/.cache/huggingface/modules/datasets_modules/metrics/glue/91f3cfc5498873918ecf119dbf806fb10815786c84f41b85a5d3c47c1519b343 (last modified on Fri Nov 11 21:04:53 2022) since it couldn't be found locally at glue, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /home/chenli/.cache/huggingface/modules/datasets_modules/metrics/glue/91f3cfc5498873918ecf119dbf806fb10815786c84f41b85a5d3c47c1519b343 (last modified on Fri Nov 11 21:04:53 2022) since it couldn't be found locally at glue, or remotely on the Hugging Face Hub.


TrainOutput(global_step=6890, training_loss=0.05755438195603679, metrics={'train_runtime': 6806.2861, 'train_samples_per_second': 1.012, 'total_flos': 26040130019100000, 'epoch': 10.0})

## 超参数搜索
Trainer同样支持超参搜索，使用optuna or Ray Tune代码库。

反注释下面两行安装依赖：

In [20]:
! pip install optuna
! pip install ray[tune]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting optuna
  Downloading optuna-3.0.3-py3-none-any.whl (348 kB)
[K     |████████████████████████████████| 348 kB 20 kB/s eta 0:00:012
Collecting cliff
  Downloading cliff-4.0.0-py3-none-any.whl (80 kB)
[K     |████████████████████████████████| 80 kB 25 kB/s eta 0:00:01
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.8.2
  Downloading cmaes-0.9.0-py3-none-any.whl (23 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.8.1-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 35 kB/s eta 0:00:01
[?25hCollecting importlib-resources
  Downloading importlib_resources-5.10.0-py3-none-any.whl (34 kB)
Collecting Mako
  Downloading Ma

Installing collected packages: distlib, virtualenv, tabulate, ray
Successfully installed distlib-0.3.6 ray-2.1.0 tabulate-0.9.0 virtualenv-20.16.6
You should consider upgrading via the '/home/anaconda/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


超参搜索时，Trainer将会返回多个训练好的模型，所以需要传入一个定义好的模型从而让Trainer可以不断重新初始化该传入的模型

In [16]:
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=num_labels)

和之前调用 Trainer类似:

In [17]:
trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

调用方法hyperparameter_search。注意，这个过程可能很久，我们可以先用部分数据集进行超参搜索，再进行全量训练。 比如使用1/10的数据进行搜索

In [18]:
best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize")

[32m[I 2022-11-12 15:49:06,605][0m A new study created in memory with name: no-name-22dfba30-7ebc-496f-a5cc-7c8aef494c61[0m
[33m[W 2022-11-12 15:49:12,325][0m Trial 0 failed because of the following error: RuntimeError('Caught RuntimeError in replica 0 on device 0.\nOriginal Traceback (most recent call last):\n  File "/home/anaconda/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker\n    output = module(*input, **kwargs)\n  File "/home/anaconda/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl\n    return forward_call(*input, **kwargs)\n  File "/home/anaconda/anaconda3/envs/pytorch/lib/python3.7/site-packages/transformers/models/bert/modeling_bert.py", line 1505, in forward\n    return_dict=return_dict,\n  File "/home/anaconda/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl\n    return forward_call(*input, **kwarg

RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/anaconda/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/home/anaconda/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/anaconda/anaconda3/envs/pytorch/lib/python3.7/site-packages/transformers/models/bert/modeling_bert.py", line 1505, in forward
    return_dict=return_dict,
  File "/home/anaconda/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/anaconda/anaconda3/envs/pytorch/lib/python3.7/site-packages/transformers/models/bert/modeling_bert.py", line 976, in forward
    return_dict=return_dict,
  File "/home/anaconda/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/anaconda/anaconda3/envs/pytorch/lib/python3.7/site-packages/transformers/models/bert/modeling_bert.py", line 574, in forward
    output_attentions,
  File "/home/anaconda/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/anaconda/anaconda3/envs/pytorch/lib/python3.7/site-packages/transformers/models/bert/modeling_bert.py", line 460, in forward
    past_key_value=self_attn_past_key_value,
  File "/home/anaconda/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/anaconda/anaconda3/envs/pytorch/lib/python3.7/site-packages/transformers/models/bert/modeling_bert.py", line 393, in forward
    output_attentions,
  File "/home/anaconda/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/anaconda/anaconda3/envs/pytorch/lib/python3.7/site-packages/transformers/models/bert/modeling_bert.py", line 308, in forward
    attention_scores = attention_scores / math.sqrt(self.attention_head_size)
RuntimeError: CUDA out of memory. Tried to allocate 6.44 GiB (GPU 0; 14.76 GiB total capacity; 8.83 GiB already allocated; 4.73 GiB free; 8.87 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


hyperparameter_search会返回效果最好的模型相关的参数：

In [None]:
best_run

将Trainner设置为搜索到的最好参数，进行训练：

In [None]:
for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)

trainer.train()