## Homework: 使用完整的 YelpReviewFull 数据集训练，看 Acc 最高能到多少

## 全量测试集运行结果

```json
{
    "eval_loss": 0.7691748142242432, 
    "eval_accuracy": 0.69276, 
    "eval_runtime": 228.3275, 
    "eval_samples_per_second": 218.984, 
    "eval_steps_per_second": 13.686, 
    "epoch": 3.0}
```

## 代码说明
因为用的是云服务器，无法让 Jupyter 一直保持运行，所以改写成 Python 脚本并用 nohup 在后台运行。运行结束后，评估结果会单独写入文件 `final_eval.txt`。

下面是运行代码：

In [None]:
import os

# Point the Hugging Face cache at the AutoDL temporary data disk and route
# Hub traffic through the mirror.
# NOTE: these variables are read when the Hugging Face libraries are first
# imported, so they must be set BEFORE importing `datasets` / `transformers`.
# Setting them afterwards has no effect on the cache location or endpoint.
# The imports that follow are therefore deliberately placed below this block.
os.environ['HF_HOME'] = '/root/autodl-tmp/hf'
os.environ['HF_HUB_CACHE'] = '/root/autodl-tmp/hf/hub'
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com/'

import subprocess
import random

import pandas as pd
import datasets
from datasets import load_dataset
from IPython.display import display, HTML
from transformers import AutoTokenizer

# Load the Yelp Review Full dataset.
print('loading datasets...')
dataset = load_dataset(path="yelp_review_full")
print('data sets loaded!')

# Tokenizer used for the simple preprocessing step below; it comes from the
# same "bert-base-cased" checkpoint that the classifier is initialised from.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-cased",
)

def tokenize_function(examples):
    """Tokenize the ``text`` field of ``examples`` with the module tokenizer.

    The tokenizer is called with truncation enabled and the ``"max_length"``
    padding strategy. The tokenizer's output is returned unchanged so that
    ``Dataset.map`` can merge it into the dataset.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

print('mapping datasets...')
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)
print('mapping done!')

# Use the full dataset (no sub-sampling): shuffle the complete train and
# test splits with a fixed seed.
train_split = tokenized_datasets["train"]
test_split = tokenized_datasets["test"]
full_train_dataset = train_split.shuffle(seed=42)
full_eval_dataset = test_split.shuffle(seed=42)

from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer
import json

# Directory that receives checkpoints and the final saved model.
model_dir = "/root/autodl-tmp/saved-models/bert-base-cased-finetune-yelp"

# Sequence classifier initialised from bert-base-cased with five output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=5,
)

# Accuracy metric consumed by compute_metrics during evaluation.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    ``eval_pred`` is unpacked into ``(logits, labels)``. The predicted class
    of each example is the argmax of its logits over the last axis, and the
    result of the module-level ``metric`` is returned as-is.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    return metric.compute(references=labels, predictions=predicted_classes)


# Training configuration: no evaluation during training, log and checkpoint
# every 1000 steps, 3 epochs with a per-device batch size of 32, fp16 enabled.
training_args = TrainingArguments(
    output_dir=model_dir,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    fp16=True,
    dataloader_num_workers=1,
    evaluation_strategy="no",
    logging_strategy='steps',
    logging_steps=1000,
    save_steps=1000,
)

# The Trainer ties together the model, the full train/eval splits and the
# accuracy callback.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=full_train_dataset,
    eval_dataset=full_eval_dataset,
)

print('training started!')
# If a run is interrupted by an error, resume from a saved checkpoint by
# calling train() like this instead:
# trainer.train(resume_from_checkpoint=model_dir + '/checkpoint-30000')

# Start training.
trainer.train()
print('training finished!')

# Persist the model and trainer state immediately after training finishes.
print('saving...')
trainer.save_model(model_dir)
trainer.save_state()
print('saved!')

# Evaluate the model on the complete test split.
full_test_dataset = tokenized_datasets["test"].shuffle(seed=64)
eval_result = trainer.evaluate(full_test_dataset)

# Write the evaluation result to a file as JSON text.
serialized_result = json.dumps(eval_result)
with open("final_eval.txt", "w", encoding="utf-8") as results_file:
    results_file.write(serialized_result)

print(eval_result)

