In [38]:
!pip install transformers datasets



In [39]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import torch


In [40]:
# 確認加速設備 (GPU/CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [41]:
# 載入 Financial PhraseBank 資料集
dataset = load_dataset("takala/financial_phrasebank", "sentences_allagree")
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 2264
    })
})


In [42]:
# 載入分詞器
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# 定義預處理函數
def preprocess_function(examples):
    texts = examples["sentence"]
    labels = examples["label"]
    labels = [0 if label == "negative" else 1 if label == "neutral" else 2 for label in labels]
    tokenized_inputs = tokenizer(texts, truncation=True, padding="max_length", max_length=128)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# 對資料進行預處理
tokenized_dataset = dataset.map(preprocess_function, batched=True)
print(tokenized_dataset)


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2264
    })
})


In [43]:
from datasets import DatasetDict

# 分割資料集為訓練集與測試集
train_test_split = tokenized_dataset["train"].train_test_split(test_size=0.2)
dataset = DatasetDict({
    "train": train_test_split["train"],
    "test": train_test_split["test"]
})

print(dataset)


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1811
    })
    test: Dataset({
        features: ['sentence', 'label', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 453
    })
})


In [44]:
# 載入 BERT 模型，並設定輸出類別數 (3 個情緒分類標籤)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [45]:
# 設定訓練參數
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=50,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True  # 確保使用最優模型
)



In [46]:
# 初始化 Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)


In [47]:
# 開始訓練模型
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.1552,0.017676
2,0.006,0.002584
3,0.0022,0.001341
4,0.0013,0.000879
5,0.0009,0.000632
6,0.0007,0.000487
7,0.0005,0.000394
8,0.0005,0.00033
9,0.0004,0.000284
10,0.0003,0.000248


TrainOutput(global_step=1450, training_loss=0.005527998213535817, metrics={'train_runtime': 1165.1455, 'train_samples_per_second': 77.716, 'train_steps_per_second': 1.244, 'total_flos': 5956229993817600.0, 'train_loss': 0.005527998213535817, 'epoch': 50.0})

In [48]:
# 使用測試集進行評估
results = trainer.evaluate()
print("Evaluation Results:", results)


Evaluation Results: {'eval_loss': 5.286174200591631e-05, 'eval_runtime': 1.6704, 'eval_samples_per_second': 271.197, 'eval_steps_per_second': 4.789, 'epoch': 50.0}


In [49]:
# 保存訓練好的模型和分詞器
model.save_pretrained("./sentiment_model")
tokenizer.save_pretrained("./sentiment_model")


('./sentiment_model/tokenizer_config.json',
 './sentiment_model/special_tokens_map.json',
 './sentiment_model/vocab.txt',
 './sentiment_model/added_tokens.json')

In [50]:
# 測試句子
test_texts = [
    "The company's profit has increased significantly this quarter.",  # Positive
    "The increase in costs negatively affected the revenue.",          # Negative
    "The company's performance remained stable."                       # Neutral
]

# 預處理測試句子
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt").to(device)

# 推論
with torch.no_grad():
    outputs = model(**test_encodings)

# 取得預測結果
preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()

# 將數字標籤轉換為文字標籤
label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
predicted_labels = [label_map[pred] for pred in preds]

# 輸出預測結果
print("Predicted Labels:", predicted_labels)


Predicted Labels: ['Positive', 'Positive', 'Positive']
