In [None]:
import torch
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# Load the receipt CSV as a single "train" split, then carve out an
# evaluation set from it.
data_files = {"train": "receipt_dataset_6000.csv"}
dataset = load_dataset("csv", data_files=data_files)
# Hold out 20% of the rows for evaluation. The fixed seed makes the shuffle
# (and therefore the train/test membership) identical on every run, so
# metrics from different runs are comparable.
dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)

# Category name -> integer class id; ids follow the listing order below.
label2id = {
    name: idx
    for idx, name in enumerate(["交通", "食飯", "購物", "娛樂", "其他"])
}
# Inverse lookup: integer class id -> category name.
id2label = dict(zip(label2id.values(), label2id))

def encode(example):
    """Attach the integer class id for an example's text label.

    Looks up ``example['label']`` in the module-level ``label2id`` mapping
    and stores the result under ``example['labels']``. The example is
    modified in place and also returned.

    Raises:
        KeyError: if the label is not a key of ``label2id``.
    """
    class_id = label2id[example['label']]
    example['labels'] = class_id
    return example

# Apply encode() to every example of the split dataset, adding the integer
# `labels` column alongside the original text `label` column.
dataset = dataset.map(encode)

# Load the tokenizer that belongs to the 'bert-base-chinese' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

def tokenize(batch):
    """Tokenize the ``text`` field of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 256 tokens, so all
    encoded examples have the same length. Returns the tokenizer's output
    unchanged.
    """
    encoding_options = {
        "padding": 'max_length',
        "truncation": True,
        "max_length": 256,
    }
    return tokenizer(batch['text'], **encoding_options)

# Tokenize in batches, then drop the raw columns that are no longer needed
# once the token encodings and the integer `labels` column exist.
tokenized_datasets = dataset.map(tokenize, batched=True).remove_columns(
    ['text', 'label', 'total_price']
)
# Switch the output format to 'torch' (applied in place).
tokenized_datasets.set_format('torch')

# Load the pretrained 'bert-base-chinese' weights into a sequence
# classification model. The number of output classes is derived from the
# label mapping so the head size cannot drift out of sync when categories
# are added or removed; the id/label maps are passed alongside it.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-chinese',
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for an evaluation pass.

    Args:
        eval_pred: a pair ``(logits, labels)``; the predicted class for each
            row is the argmax of ``logits`` over its last axis.

    Returns:
        A dict with keys ``"accuracy"`` and ``"f1"`` (weighted average).
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predicted_ids)
    weighted_f1 = f1_score(labels, predicted_ids, average='weighted')
    return {"accuracy": accuracy, "f1": weighted_f1}

# Training configuration: 5 epochs, batch size 16 for both training and
# evaluation, 200 warmup steps and weight decay of 0.01.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=200,
    weight_decay=0.01,
    logging_dir="./logs",
    # Log the training loss once per epoch so it lines up with the per-epoch
    # evaluation rows. With the default step-based logging interval the
    # per-epoch table showed "No log" for the first epoch and repeated the
    # same training loss for consecutive epochs.
    logging_strategy="epoch",
    # Evaluate and checkpoint at the same cadence; load_best_model_at_end
    # needs a saved checkpoint for every evaluation it may pick from.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # "accuracy" is the key returned by compute_metrics.
    metric_for_best_model="accuracy",
    report_to="none",
)

# Wire the model, training configuration, the two tokenized splits and the
# metric function into a Trainer. The held-out 'test' split is used as the
# evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics
)

# Run fine-tuning, then write the model and the tokenizer into the same
# directory.
save_dir = "./fine_tuned_hk_classifier"
trainer.train()
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
print("Finish！")

Map: 100%|██████████| 4800/4800 [00:01<00:00, 4202.82 examples/s]
Map: 100%|██████████| 1200/1200 [00:00<00:00, 4029.43 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.000485,1.0,1.0
2,0.143900,0.001005,0.999167,0.999167
3,0.143900,0.001082,1.0,1.0
4,0.003000,0.000105,1.0,1.0
5,0.000200,9.3e-05,1.0,1.0




Model saved to ./fine_tuned_hk_classifier_modified
