<a href="https://colab.research.google.com/github/ArnyWu/-Preview-AI-/blob/main/week10%E4%BD%9C%E6%A5%AD_%E9%87%91%E8%9E%8D%E6%96%87%E6%9C%AC%E6%83%85%E7%B7%92%E8%BE%A8%E8%AD%98Hugging_Face.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers datasets torch scikit-learn

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, concatenate_datasets
import torch

# 選擇不同的資料配置進行微調
configs = ["sentences_75agree", "sentences_allagree"]

# 加載並合併數據集
datasets = []
for config in configs:
    print(f"\nLoading dataset with configuration: {config}...")
    dataset = load_dataset("takala/financial_phrasebank", config, trust_remote_code=True)
    datasets.append(dataset["train"])

# 合併數據集
print("Concatenating datasets...")
merged_dataset = concatenate_datasets(datasets)

# 檢查數據格式
print(merged_dataset.features)

# 資料預處理
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

def preprocess_function(examples):
    return tokenizer(examples['sentence'], truncation=True, padding='max_length', max_length=128)

print("Tokenizing dataset...")
tokenized_dataset = merged_dataset.map(preprocess_function, batched=True)

# 將標籤轉為整數類別
label_map = {0: "negative", 1: "neutral", 2: "positive"}  # 更新映射以匹配數據集中的整數標籤

def encode_labels(example):
    example['label'] = example['label']  # 保留數據集中的整數標籤
    return example

tokenized_dataset = tokenized_dataset.map(encode_labels)

# 分割訓練、驗證與測試數據集
print("Splitting dataset...")
split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
val_dataset = split_dataset["test"]

# 模型定義
print("Initializing model...")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=3)
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# 訓練參數設置
training_args = TrainingArguments(
    output_dir="./results_merged",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs_merged",
    logging_steps=10,
    load_best_model_at_end=True,
)

# 設置訓練器
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    accuracy = (predictions == torch.tensor(labels)).float().mean().item()
    return {"accuracy": accuracy}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# 開始訓練
print("Starting training for merged dataset...")
trainer.train()

# 模型評估
print("Evaluating model...")
evaluation_results = trainer.evaluate()
print(evaluation_results)

# 保存模型
print("Saving model for merged dataset...")
trainer.save_model("./financial_sentiment_model_merged")

# 測試預測函數
def predict_sentiment(sentence):
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    inputs = {key: value.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) for key, value in inputs.items()}
    model.eval()  # 設置模型為評估模式
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    sentiment_map = {0: "negative", 1: "neutral", 2: "positive"}
    sentiment = sentiment_map[torch.argmax(probs).item()]
    return {"sentiment": sentiment, "probabilities": probs.tolist()}

# 測試示例 1
sentence = "The company's revenue growth is excellent."
predicted_sentiment = predict_sentiment(sentence)
print(f"Predicted sentiment for \"{sentence}\" using merged dataset:", predicted_sentiment)
# 測試示例 2
# 試試用中文測試多語言模型
sentence = "公司的收入增長非常出色。"
predicted_sentiment = predict_sentiment(sentence)
print(f"Predicted sentiment for \"{sentence}\" using merged dataset:", predicted_sentiment)



Loading dataset with configuration: sentences_75agree...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.88k [00:00<?, ?B/s]

financial_phrasebank.py:   0%|          | 0.00/6.04k [00:00<?, ?B/s]

FinancialPhraseBank-v1.0.zip:   0%|          | 0.00/682k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3453 [00:00<?, ? examples/s]


Loading dataset with configuration: sentences_allagree...


Generating train split:   0%|          | 0/2264 [00:00<?, ? examples/s]

Concatenating datasets...
{'sentence': Value(dtype='string', id=None), 'label': ClassLabel(names=['negative', 'neutral', 'positive'], id=None)}


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Tokenizing dataset...


Map:   0%|          | 0/5717 [00:00<?, ? examples/s]

Map:   0%|          | 0/5717 [00:00<?, ? examples/s]

Splitting dataset...
Initializing model...


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting training for merged dataset...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy
1,0.201,0.212888,0.922203
2,0.119,0.182397,0.949301
3,0.0764,0.149354,0.965035
4,0.0359,0.174198,0.967657
5,0.0324,0.168474,0.967657


Evaluating model...


{'eval_loss': 0.1493539661169052, 'eval_accuracy': 0.9650349617004395, 'eval_runtime': 7.9093, 'eval_samples_per_second': 144.641, 'eval_steps_per_second': 4.552, 'epoch': 5.0}
Saving model for merged dataset...
Predicted sentiment for "The company's revenue growth is excellent." using merged dataset: {'sentiment': 'positive', 'probabilities': [[0.0018886958714574575, 0.0015520453453063965, 0.9965593218803406]]}
Predicted sentiment for "公司的收入增長非常出色。" using merged dataset: {'sentiment': 'positive', 'probabilities': [[0.006365248002111912, 0.003359535476192832, 0.9902751445770264]]}


In [4]:
# 資料集中的一些範例
for i in range(3):
    example = merged_dataset[i]
    sentence = example['sentence']
    label = example['label']
    sentiment = label_map[label]  # 將數字標籤轉換為情緒文字
    print(f"Sentence: {sentence}")
    print(f"Correct Label: {sentiment} (Label Value: {label})")
    print("---")

Sentence: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
Correct Label: neutral (Label Value: 1)
---
Sentence: With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .
Correct Label: positive (Label Value: 2)
---
Sentence: For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .
Correct Label: positive (Label Value: 2)
---
