<a href="https://colab.research.google.com/github/B10956048/113_Finance/blob/main/week10test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers datasets



In [None]:
from datasets import load_dataset

# Hub identifier and configuration name of the dataset to load.
DATASET_ID = "takala/financial_phrasebank"
DATASET_CONFIG = "sentences_50agree"

# Load the dataset.
dataset = load_dataset(DATASET_ID, DATASET_CONFIG)

# Inspect the dataset structure (splits, columns, row counts).
print(dataset)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 4846
    })
})


In [None]:
from transformers import BertTokenizer

# Load the pretrained BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the ``sentence`` field of a batch of examples.

    Sequences longer than 128 tokens are truncated. No padding is applied
    here: padding every row to the maximum length would make all batches
    128 tokens wide regardless of their content. Padding is left to the
    data collator used at training time, which pads per batch.

    Args:
        examples: A batch from the dataset, indexable by column name; only
            the ``"sentence"`` column is read.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["sentence"], max_length=128, truncation=True)


# Tokenize the whole dataset, processing examples in batches.
tokenized_dataset = dataset.map(tokenize_function, batched=True)


In [None]:
from transformers import DataCollatorWithPadding

# Split the tokenized data into train (80%) and test (20%) sets.
# A fixed seed makes the shuffle deterministic, so the same rows land in the
# same split on every run.
train_test_split = tokenized_dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

# Collator that pads the samples of each batch to a common length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [None]:
from transformers import BertForSequenceClassification

# Checkpoint to start from and the number of output classes.
PRETRAINED_CHECKPOINT = "bert-base-uncased"
NUM_CLASSES = 3

# Initialise BERT with a sequence-classification head for NUM_CLASSES labels.
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_CHECKPOINT,
    num_labels=NUM_CLASSES,
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration passed to the Trainer.
# `eval_strategy` replaces the deprecated `evaluation_strategy` argument name.
# NOTE(review): the recorded run shows Weights & Biases being initialised during
# training; if that is not wanted, set `report_to` explicitly — confirm the intent.
training_args = TrainingArguments(
    output_dir="./results",            # where checkpoints are written
    eval_strategy="epoch",             # run evaluation at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,                # keep at most one checkpoint on disk
    logging_dir="./logs",
    logging_steps=10,                  # log the training loss every 10 steps
)

# 定義評估指標
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class of each
            sample is the argmax of ``logits`` along the last axis.

    Returns:
        A dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``; precision, recall and F1 are weighted averages over
        the classes.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # zero_division=0 keeps the score at 0 for a class that is never predicted
    # (the default behaviour) without emitting an UndefinedMetricWarning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="weighted", zero_division=0
    )
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}




In [None]:
# Assemble the Trainer from the model, configuration, data splits and metrics.
# The tokenizer is passed as `processing_class`; the older `tokenizer=` keyword
# is deprecated in `Trainer.__init__`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned training summary is displayed as the cell output.
trainer.train()


  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3169,0.438994,0.849485,0.848133,0.849726,0.849485
2,0.2716,0.497262,0.858763,0.859082,0.859584,0.858763
3,0.1286,0.629586,0.857732,0.857838,0.857963,0.857732


TrainOutput(global_step=1455, training_loss=0.33741119536747227, metrics={'train_runtime': 353.8705, 'train_samples_per_second': 32.859, 'train_steps_per_second': 4.112, 'total_flos': 764870705335296.0, 'train_loss': 0.33741119536747227, 'epoch': 3.0})

In [None]:
# Directory that receives both the fine-tuned weights and the tokenizer files.
SAVE_DIR = "./financial_sentiment_model"

model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)


('./financial_sentiment_model/tokenizer_config.json',
 './financial_sentiment_model/special_tokens_map.json',
 './financial_sentiment_model/vocab.txt',
 './financial_sentiment_model/added_tokens.json')

In [None]:
from transformers import pipeline

import torch

# Load the fine-tuned model and tokenizer from disk. Without an explicit
# `device` the pipeline stays on CPU even when a GPU is available, so use the
# first GPU when there is one.
sentiment_analyzer = pipeline(
    "text-classification",
    model="./financial_sentiment_model",
    tokenizer="./financial_sentiment_model",
    device=0 if torch.cuda.is_available() else -1,
)

# Class index -> sentiment name.
# NOTE(review): assumed to match the dataset's label encoding — confirm.
label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}


def to_readable_label(raw_label):
    """Map a pipeline label such as ``"LABEL_2"`` to its sentiment name.

    Labels that do not end in ``_<integer>``, or whose index is not in
    ``label_map``, are returned unchanged instead of raising.
    """
    _, separator, class_index = raw_label.rpartition("_")
    if separator and class_index.isdigit():
        return label_map.get(int(class_index), raw_label)
    return raw_label


# Try the model on new sentences.
texts = [
    "The company's profit has increased significantly this quarter.",  # profit growth
    "The increase in costs negatively affected the revenue.",          # rising costs hurting revenue
    "The company's performance remained stable.",                      # stable performance
]
predictions = sentiment_analyzer(texts)
readable_predictions = [
    {"label": to_readable_label(pred["label"]), "score": pred["score"]}
    for pred in predictions
]
print(readable_predictions)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'Positive', 'score': 0.9974042773246765}, {'label': 'Negative', 'score': 0.9966392517089844}, {'label': 'Positive', 'score': 0.9970753192901611}]
