# 文本分类实例

## Step1 导入相关包

In [1]:
import os

# Restrict which GPU this process can see. The assignment is placed before the
# transformers/datasets imports that follow in this cell.
# NOTE(review): the device index "4" is machine-specific — confirm it exists
# on the host where this notebook is run.
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Step2 加载数据集

In [2]:
# Read the CSV file as a single "train" split, then discard every row whose
# "review" field is missing.
dataset = load_dataset(
    "csv",
    data_files="../05分布式训练篇/ChnSentiCorp_htl_all.csv",
    split="train",
).filter(lambda row: row["review"] is not None)
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

## Step3 划分数据集


In [3]:
# Hold out 10% of the rows as the evaluation split. A fixed seed makes the
# shuffle deterministic, so the train/test partition (and therefore every
# metric reported later) is reproducible after a kernel restart.
datasets = dataset.train_test_split(test_size=0.1, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

## Step4 数据集预处理

In [4]:
import torch

# Load the tokenizer published under the "hfl/rbt3" checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

def process_function(examples):
    """Tokenize the "review" field and attach the labels.

    Args:
        examples: Mapping that provides a "review" entry (passed to the
            tokenizer) and a "label" entry (copied through unchanged).

    Returns:
        The tokenizer output with an additional "labels" key holding
        ``examples["label"]``. Inputs are truncated to at most 128 tokens;
        no padding is applied here.
    """
    encoded = tokenizer(examples["review"], truncation=True, max_length=128)
    encoded["labels"] = examples["label"]
    return encoded

# Apply the tokenization in batches to every split. The original columns are
# removed so that only what process_function returns is kept.
tokenized_datasets = datasets.map(
    process_function,
    batched=True,
    remove_columns=datasets["train"].column_names,
)
tokenized_datasets

Map: 100%|██████████| 6988/6988 [00:01<00:00, 5882.65 examples/s]
Map: 100%|██████████| 777/777 [00:00<00:00, 8208.11 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 777
    })
})

## Step5 创建模型

In [5]:
def model_init():
    """Return a new sequence-classification model built from "hfl/rbt3".

    Every call loads the checkpoint again, so callers always receive a
    freshly constructed model instance.
    """
    return AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")

## Step6 创建评估函数

In [6]:
import evaluate

# Load the two metrics used by the evaluation function: accuracy first, then F1.
acc_metric, f1_metric = evaluate.load("accuracy"), evaluate.load("f1")

In [7]:
def eval_metric(eval_predict):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_predict: Pair that unpacks into ``(predictions, labels)``.
            ``predictions`` must support ``.argmax(axis=-1)``; the argmax
            over the last axis is used as the predicted class.

    Returns:
        A single dict combining the results of ``acc_metric.compute`` and
        ``f1_metric.compute`` (accuracy entries first, then F1 entries).
    """
    raw_predictions, references = eval_predict
    predicted_classes = raw_predictions.argmax(axis=-1)
    return {
        **acc_metric.compute(predictions=predicted_classes, references=references),
        **f1_metric.compute(predictions=predicted_classes, references=references),
    }

## Step7 创建TrainingArguments

In [8]:
# Training configuration. `eval_strategy` replaces the deprecated
# `evaluation_strategy` keyword; the captured repr of this object in the
# notebook already lists the field under the name `eval_strategy`.
train_args = TrainingArguments(
    output_dir="./checkpoints",        # directory where checkpoints are written
    per_device_train_batch_size=64,    # batch size per device during training
    per_device_eval_batch_size=128,    # batch size per device during evaluation
    logging_steps=100,                 # log every 100 steps
    eval_strategy="epoch",             # evaluate at the end of every epoch
    save_strategy="epoch",             # save a checkpoint at the end of every epoch
    save_total_limit=2,                # keep at most 2 checkpoints on disk
    learning_rate=2e-5,                # learning rate
    weight_decay=0.01,                 # weight decay
    metric_for_best_model="f1",        # metric used to pick the best checkpoint
    load_best_model_at_end=True,       # reload the best checkpoint when training ends
)
train_args

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=epoch,
eval_use_gather_object

## Step8 创建Trainer

In [9]:
from transformers import DataCollatorWithPadding
# Assemble the Trainer. The model is supplied as a factory (model_init)
# rather than as an instance, and batches are padded by the data collator.
trainer = Trainer(
    model_init=model_init,
    args=train_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=eval_metric,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Step9 模型训练

In [14]:
# Run one training job with the fixed hyperparameters from train_args; the
# return value is displayed as the cell output.
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4049,0.291725,0.880309,0.915837
2,0.2718,0.270209,0.885457,0.918274
3,0.2418,0.26222,0.889318,0.92037


TrainOutput(global_step=330, training_loss=0.2976711099798029, metrics={'train_runtime': 31.3353, 'train_samples_per_second': 669.022, 'train_steps_per_second': 10.531, 'total_flos': 351909933963264.0, 'train_loss': 0.2976711099798029, 'epoch': 3.0})

## Step10 模型训练（超参数自动搜索）

In [15]:
def default_hp_space_optuna(trial):
    """Sample one hyperparameter configuration from ``trial``.

    Args:
        trial: Object exposing ``suggest_float``, ``suggest_int`` and
            ``suggest_categorical``.

    Returns:
        Dict with the sampled values for "learning_rate" (log-scaled float
        in [1e-6, 1e-4]), "num_train_epochs" (int in [1, 5]), "seed"
        (int in [1, 40]), "per_device_train_batch_size" (one of 4, 8, 16,
        32, 64) and "optim" (one of "sgd", "adamw_hf").
    """
    search_space = {}
    search_space["learning_rate"] = trial.suggest_float(
        "learning_rate", 1e-6, 1e-4, log=True
    )
    search_space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 1, 5)
    search_space["seed"] = trial.suggest_int("seed", 1, 40)
    search_space["per_device_train_batch_size"] = trial.suggest_categorical(
        "per_device_train_batch_size", [4, 8, 16, 32, 64]
    )
    search_space["optim"] = trial.suggest_categorical("optim", ["sgd", "adamw_hf"])
    return search_space

In [16]:
# Run 10 search trials, maximizing the evaluation F1 score. The backend is
# pinned to Optuna because the search-space function uses the Optuna trial
# API (suggest_float / suggest_int / suggest_categorical); leaving it to
# auto-detection could select a different installed backend.
trainer.hyperparameter_search(
    hp_space=default_hp_space_optuna,
    compute_objective=lambda metrics: metrics["eval_f1"],
    direction="maximize",
    backend="optuna",
    n_trials=10,
)

[I 2024-12-11 07:51:00,558] A new study created in memory with name: no-name-58e95eab-6699-4085-aa12-62bc758a8a7b


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.672,0.662222,0.671815,0.7981
2,0.6569,0.637774,0.706564,0.827534
3,0.6495,0.626894,0.706564,0.828054
4,0.6278,0.622083,0.707851,0.828937
5,0.6387,0.620722,0.707851,0.828937


[I 2024-12-11 07:52:07,912] Trial 0 finished with value: 0.8289374529012811 and parameters: {'learning_rate': 3.327051311355805e-05, 'num_train_epochs': 5, 'seed': 11, 'per_device_train_batch_size': 16, 'optim': 'sgd'}. Best is trial 0 with value: 0.8289374529012811.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.8799,0.855397,0.298584,0.021544
2,0.8323,0.828971,0.293436,0.028319


[I 2024-12-11 07:52:28,136] Trial 1 finished with value: 0.02831858407079646 and parameters: {'learning_rate': 9.158423745771438e-05, 'num_train_epochs': 2, 'seed': 4, 'per_device_train_batch_size': 64, 'optim': 'sgd'}. Best is trial 0 with value: 0.8289374529012811.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7506,0.739275,0.335907,0.208589
2,0.7396,0.731983,0.343629,0.267241


[I 2024-12-11 07:52:47,927] Trial 2 finished with value: 0.2672413793103448 and parameters: {'learning_rate': 3.408122506023807e-05, 'num_train_epochs': 2, 'seed': 12, 'per_device_train_batch_size': 64, 'optim': 'sgd'}. Best is trial 0 with value: 0.8289374529012811.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7517,0.750978,0.317889,0.192073
2,0.746,0.746472,0.324324,0.215247
3,0.742,0.743394,0.326898,0.222883
4,0.741,0.741586,0.332046,0.240117
5,0.7374,0.741007,0.337194,0.250364


[I 2024-12-11 07:53:38,620] Trial 3 finished with value: 0.25036390101892286 and parameters: {'learning_rate': 3.8549151182395826e-06, 'num_train_epochs': 5, 'seed': 33, 'per_device_train_batch_size': 32, 'optim': 'sgd'}. Best is trial 0 with value: 0.8289374529012811.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6873,0.681661,0.604891,0.746071
2,0.6557,0.638595,0.696268,0.820941
3,0.6555,0.626701,0.707851,0.828937
4,0.6351,0.623724,0.707851,0.828937


[I 2024-12-11 07:56:43,130] Trial 4 finished with value: 0.8289374529012811 and parameters: {'learning_rate': 2.9908744606902177e-05, 'num_train_epochs': 4, 'seed': 17, 'per_device_train_batch_size': 4, 'optim': 'sgd'}. Best is trial 0 with value: 0.8289374529012811.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4523,0.309912,0.867439,0.907291


[I 2024-12-11 07:56:55,436] Trial 5 finished with value: 0.9072907290729073 and parameters: {'learning_rate': 1.752184625079546e-05, 'num_train_epochs': 1, 'seed': 34, 'per_device_train_batch_size': 64, 'optim': 'adamw_hf'}. Best is trial 5 with value: 0.9072907290729073.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3435,0.277418,0.888031,0.921692
2,0.2899,0.249766,0.906049,0.934175
3,0.2523,0.243454,0.911197,0.937443
4,0.2224,0.242064,0.908623,0.935747


[I 2024-12-11 07:57:41,256] Trial 6 finished with value: 0.9357466063348416 and parameters: {'learning_rate': 8.830023040172657e-06, 'num_train_epochs': 4, 'seed': 33, 'per_device_train_batch_size': 32, 'optim': 'adamw_hf'}. Best is trial 6 with value: 0.9357466063348416.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3272,0.393507,0.894466,0.922201
2,0.3299,0.365877,0.912484,0.939068
3,0.2864,0.444708,0.903475,0.933215
4,0.0941,0.515035,0.903475,0.93188


[I 2024-12-11 08:00:58,045] Trial 7 finished with value: 0.9318801089918256 and parameters: {'learning_rate': 3.081905472373097e-05, 'num_train_epochs': 4, 'seed': 17, 'per_device_train_batch_size': 4, 'optim': 'adamw_hf'}. Best is trial 6 with value: 0.9357466063348416.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3051,0.236322,0.899614,0.928571
2,0.204,0.232384,0.902188,0.93053


[I 2024-12-11 08:01:22,284] Trial 8 finished with value: 0.9305301645338209 and parameters: {'learning_rate': 6.812815878123918e-05, 'num_train_epochs': 2, 'seed': 20, 'per_device_train_batch_size': 32, 'optim': 'adamw_hf'}. Best is trial 6 with value: 0.9357466063348416.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7459,0.753693,0.333333,0.251445


[I 2024-12-11 08:01:32,834] Trial 9 pruned. 


BestRun(run_id='6', objective=0.9357466063348416, hyperparameters={'learning_rate': 8.830023040172657e-06, 'num_train_epochs': 4, 'seed': 33, 'per_device_train_batch_size': 32, 'optim': 'adamw_hf'}, run_summary=None)

## 训练过程可视化
1. 在终端中激活名为 abc 的 conda 环境并进入 checkpoints 目录，然后执行：`tensorboard --logdir=runs --host=0.0.0.0 --port=8418`

2. 在 VS Code 中按 Ctrl+Shift+P，搜索 TensorBoard