In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset

In [2]:
import pandas as pd

# Load the labelled training data from an Excel workbook; the path is relative
# to the notebook's working directory.
# NOTE(review): later cells read the "text" and "label" columns of this frame —
# confirm the workbook provides both.
dataset_file = r"Sentiment_classification_training_dataset.xlsx"
df = pd.read_excel(dataset_file)

In [None]:
# Wrap the pandas frame in a Hugging Face Dataset, tagged as the "train" split.
dataset_row = Dataset.from_pandas(
    df=df,
    split="train",
)
# Bare expression so the notebook displays the dataset summary.
dataset_row

In [None]:
from datasets import DatasetDict, ClassLabel

# The stratified split further down needs the label column to be a ClassLabel
# rather than a plain integer column. The position of each name in the list is
# the integer id it stands for (three classes in total).
label_feature = ClassLabel(names=["neutral", "positive", "negative"])  # ids 0, 1, 2

# Re-type the "label" column so the dataset carries the class names.
# NOTE(review): assumes the raw labels are already integers in 0..2 — confirm.
datasets_row = dataset_row.cast_column("label", label_feature)

# Sanity check: show the resulting feature and its number of classes.
label_info = datasets_row.features["label"]
print(label_info, label_info.num_classes)

In [5]:
# Hold out 20% of the rows for evaluation, stratified on "label" so both splits
# keep the same class proportions; the fixed seed makes the split repeatable.
datasets = datasets_row.train_test_split(
    test_size=0.20,
    seed=40,
    stratify_by_column="label",
)

In [None]:
import torch

# Load the tokenizer that belongs to the ProsusAI/finbert checkpoint so the
# inputs are encoded the way the pretrained model expects.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

def process_function(examples):
    """Tokenize a batch of examples and attach their labels.

    Args:
        examples: A batch mapping with a "text" entry (the inputs to tokenize)
            and a "label" entry (the class ids).

    Returns:
        The tokenizer output for the batch, truncated to at most 128 tokens,
        with an extra "labels" entry copied from ``examples["label"]``.
    """
    # No padding here: sequences are padded per batch by the data collator.
    encoded = tokenizer(examples["text"], truncation=True, max_length=128)
    encoded["labels"] = examples["label"]
    return encoded


# Tokenize both splits batch-wise. The raw columns are dropped afterwards so
# only the tokenizer output and "labels" remain.
tokenized_datasets = datasets.map(
    process_function,
    batched=True,
    remove_columns=datasets_row.column_names,
)
# Bare expression so the notebook displays the resulting splits.
tokenized_datasets

In [7]:
# FinBERT is itself a three-class fine-tuned checkpoint, so its classification
# head has the same shape as the one requested here; ignore_mismatched_sizes is
# kept as a safeguard and only matters if the shapes ever differ.
#
# The checkpoint's config carries FinBERT's own id2label/label2id mapping, which
# need not follow the id order used in this notebook. Override it with the names
# from `label_feature` so the fine-tuned model (and anything loading the saved
# model later) reports the class names that match the training ids.
id2label = dict(enumerate(label_feature.names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    "ProsusAI/finbert",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

In [8]:
import evaluate

# Accuracy is loaded on its own; precision, recall and F1 are bundled so a
# single compute() call can evaluate all three with shared arguments.
metrics_1 = evaluate.load("accuracy")

metrics_2 = evaluate.combine(
    [evaluate.load(metric_name) for metric_name in ("precision", "recall", "f1")]
)

In [9]:
def eval_metric(eval_predict):
    """Compute macro precision/recall/F1 plus accuracy for one evaluation pass.

    Args:
        eval_predict: A pair ``(predictions, labels)``. The predictions are
            reduced with ``argmax`` over the last axis to obtain class ids.
            NOTE(review): presumably raw logits of shape (n, num_classes) —
            confirm against the Trainer version in use.

    Returns:
        A single dict merging the macro-averaged results of ``metrics_2`` with
        the result of ``metrics_1``.
    """
    scores, labels = eval_predict
    predicted_ids = scores.argmax(axis=-1)

    # Macro averaging weights every class equally, regardless of its frequency.
    macro_results = metrics_2.compute(
        predictions=predicted_ids,
        references=labels,
        average="macro",
    )
    accuracy_result = metrics_1.compute(
        predictions=predicted_ids,
        references=labels,
    )

    return macro_results | accuracy_result

In [10]:
train_args = TrainingArguments(
    # Where checkpoints go; keep at most two on disk.
    output_dir="./checkpoints",
    save_total_limit=2,
    # Optimisation settings.
    num_train_epochs=10,
    per_device_train_batch_size=32,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch; the two schedules are kept
    # identical so the best checkpoint can be restored when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Model selection: highest "f1" reported by the metric function wins.
    metric_for_best_model="f1",
    greater_is_better=True,
    load_best_model_at_end=True,
    # Disable external experiment trackers.
    report_to="none",
)


In [11]:
from transformers import DataCollatorWithPadding

# Pads each batch dynamically, since tokenization applied truncation only.
padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Wire model, arguments, data splits, collator and metric function together.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=padding_collator,
    compute_metrics=eval_metric,
)

In [None]:
# Run fine-tuning with the Trainer configured above. As the last expression of
# the cell, the value returned by train() is displayed by the notebook.
trainer.train()

In [13]:
# Persist the fine-tuned model. The Trainer was not handed the tokenizer
# directly, so depending on the transformers version save_model() may write
# only the model files; save the tokenizer explicitly into the same directory
# so it can be loaded on its own later.
model_dir = "FinBERT_SENTIMENT_TASK"
trainer.save_model(model_dir)
tokenizer.save_pretrained(model_dir);  # trailing ';' hides the returned file list