In [None]:
!pip3 install datasets==2.21.0
!pip3 install torchmetrics

Collecting datasets==2.21.0
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets==2.21.0)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets==2.21.0)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets==2.21.0)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.6.1,>=2023.1.0 (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets==2.21.0)
  Downloading fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from datasets==2.21.0)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import transformers as T
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
from torchmetrics import SpearmanCorrCoef, Accuracy, F1Score
import os
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
# 有些中文的標點符號在tokenizer編碼以後會變成[UNK]，所以將其換成英文標點
token_replacement = [
    ["：" , ":"],
    ["，" , ","],
    ["“" , "\""],
    ["”" , "\""],
    ["？" , "?"],
    ["……" , "..."],
    ["！" , "!"]
]

In [None]:
tokenizer = T.BertTokenizer.from_pretrained("google-bert/bert-base-uncased", cache_dir="/content/cache/")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class SemevalDataset(Dataset):
    def __init__(self, split="train") -> None:
        super().__init__()
        assert split in ["train", "validation", "test"]
        self.data = load_dataset(
            "sem_eval_2014_task_1", split=split, cache_dir="/content/cache/"
        ).to_list()

    def __getitem__(self, index):
        d = self.data[index]
        # 把中文標點替換掉
        for k in ["premise", "hypothesis"]:
            for tok in token_replacement:
                d[k] = d[k].replace(tok[0], tok[1])
        return d

    def __len__(self):
        return len(self.data)

data_sample = SemevalDataset(split="train").data[:3]
print(f"Dataset example: \n{data_sample[0]} \n{data_sample[1]} \n{data_sample[2]}")

Downloading builder script:   0%|          | 0.00/5.20k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/3.56k [00:00<?, ?B/s]

The repository for sem_eval_2014_task_1 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/sem_eval_2014_task_1.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/87.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/93.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4927 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset example: 
{'sentence_pair_id': 1, 'premise': 'A group of kids is playing in a yard and an old man is standing in the background', 'hypothesis': 'A group of boys in a yard is playing and a man is standing in the background', 'relatedness_score': 4.5, 'entailment_judgment': 0} 
{'sentence_pair_id': 2, 'premise': 'A group of children is playing in the house and there is no man standing in the background', 'hypothesis': 'A group of kids is playing in a yard and an old man is standing in the background', 'relatedness_score': 3.200000047683716, 'entailment_judgment': 0} 
{'sentence_pair_id': 3, 'premise': 'The young boys are playing outdoors and the man is smiling nearby', 'hypothesis': 'The kids are playing outdoors near a man with a smile', 'relatedness_score': 4.699999809265137, 'entailment_judgment': 1}


In [None]:
# Define the hyperparameters
lr = 3e-5
epochs = 3
train_batch_size = 8
validation_batch_size = 8

In [None]:
# TODO1: Create batched data for DataLoader
# `collate_fn` is a function that defines how the data batch should be packed.
# This function will be called in the DataLoader to pack the data batch.

def collate_fn(batch):
    # TODO1-1: Implement the collate_fn function
    # Write your code here
    # The input parameter is a data batch (tuple), and this function packs it into tensors.
    # Use tokenizer to pack tokenize and pack the data and its corresponding labels.
    # Return the data batch and labels for each sub-task.
    # Tokenize premise and hypothesis
    premises = [item["premise"] for item in batch]
    hypotheses = [item["hypothesis"] for item in batch]
    relatedness_scores = torch.tensor([item["relatedness_score"] for item in batch], dtype=torch.float)
    entailment_judgment = torch.tensor([item["entailment_judgment"] for item in batch], dtype=torch.long)

    # 使用 tokenizer 對批量輸入進行編碼
    encoding = tokenizer(premises, hypotheses, padding=True, truncation=True, return_tensors="pt")

    return {
        "input_ids": encoding["input_ids"],
        "attention_mask": encoding["attention_mask"],
        "token_type_ids": encoding["token_type_ids"],
        "relatedness_scores": relatedness_scores,
        "entailment_judgment": entailment_judgment
    }

# TODO1-2: Define your DataLoader
dl_train = DataLoader(SemevalDataset(split="train"), batch_size=train_batch_size, collate_fn=collate_fn, shuffle=True)# Write your code here
dl_validation = DataLoader(SemevalDataset(split="validation"), batch_size=validation_batch_size, collate_fn=collate_fn, shuffle=True)# Write your code here
next(iter(dl_train))

{'input_ids': tensor([[  101,  1037,  2158,  2003,  2652,  1037, 14601,   102,  1037,  2158,
           2003, 14601,  2075,  2006,  2055,  1037,  2377,   102,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0],
         [  101,  1037,  8928,  2003,  2108,  2209,  2011,  1037,  2611,   102,
           2028,  2450,  2003, 12560,  2007,  1037,  8928,   102,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0],
         [  101,  1996,  2879,  2003,  9361,  1037,  8094,   102,  1037,  2879,
           2003,  9361,  1037,  8094,   102,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0],
         [  101,  1037,  3899,  2007,  3585,  6519,  2003,  1999,  1996,  2300,
            102,  1037,  2829,  3899,  2003, 12885,  3061,  1999,  199

In [None]:
# TODO2: Construct your model
class MultiLabelModel(torch.nn.Module):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Write your code here
        # Define what modules you will use in the model
        self.bert = T.BertModel.from_pretrained("google-bert/bert-base-uncased", cache_dir="/content/cache/")
        self.relatedness_head = torch.nn.Linear(self.bert.config.hidden_size, 1)
        self.entailment_head = torch.nn.Linear(self.bert.config.hidden_size, 3)
        self.dropout = torch.nn.Dropout(p=0.2)

    def forward(self,  input_ids, attention_mask, token_type_ids):
        # Write your code here
        # Forward pass
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        pooled_output = outputs.pooler_output #

        # 避免過擬和Dropout
        pooled_output = self.dropout(pooled_output)

        # 預測相關性分數
        relatedness_score = self.relatedness_head(pooled_output).squeeze(-1)

        # 預測蘊涵關係
        entailment_logits = self.entailment_head(pooled_output)

        return relatedness_score, entailment_logits

In [None]:
model = MultiLabelModel().to(device)
print(model._modules)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

{'bert': BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inpl

In [None]:
# TODO3: Define your optimizer and loss function

# TODO3-1: Define your Optimizer
optimizer = AdamW(model.parameters(), lr=lr)# Write your code here

# TODO3-2: Define your loss functions (you should have two)
# Write your code here
class CombinedLoss(torch.nn.Module):
    def __init__(self, alpha=0.5):
        super(CombinedLoss, self).__init__()
        self.mse_loss = torch.nn.MSELoss()
        self.ce_loss = torch.nn.CrossEntropyLoss()
        self.alpha = alpha

    def forward(self, pred_numeric, true_numeric, pred_category, true_category):
        loss_numeric = self.mse_loss(pred_numeric, true_numeric)
        loss_category = self.ce_loss(pred_category, true_category)


        total_loss = self.alpha * loss_numeric + (1 - self.alpha) * loss_category

        return total_loss
criterion = CombinedLoss(alpha=0.5)
# scoring functions
spc = SpearmanCorrCoef().to(device)
acc = Accuracy(task="multiclass", num_classes=3).to(device)
f1 = F1Score(task="multiclass", num_classes=3, average='macro').to(device)



In [None]:
for ep in range(epochs):
    pbar = tqdm(dl_train)
    pbar.set_description(f"Training epoch [{ep+1}/{epochs}]")
    model.train()
    train_loss = 0
    # TODO4: Write the training loop
    # Write your code here
    # train your model
    # clear gradient
    # forward pass
    # compute loss
    # back-propagation
    # model optimization
    for batch in pbar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        relatedness_targets = batch['relatedness_scores'].to(device)
        entailment_targets = batch['entailment_judgment'].to(device)


        relatedness_scores, entailment_logits = model(input_ids, attention_mask, token_type_ids)


        loss = criterion(relatedness_scores, relatedness_targets, entailment_logits, entailment_targets)


        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    pbar = tqdm(dl_validation)
    pbar.set_description(f"Validation epoch [{ep+1}/{epochs}]")
    model.eval()
    val_relatedness_scores, val_entailment_logits, val_relatedness_targets, val_entailment_targets = None, None, None, None
    # TODO5: Write the evaluation loop
    # Write your code here
    # Evaluate your model
    # Output all the evaluation scores (SpearmanCorrCoef, Accuracy, F1Score)
    with torch.no_grad():  # Disable gradient calculation during evaluation
        for batch in pbar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            relatedness_targets = batch['relatedness_scores'].to(device)
            entailment_targets = batch['entailment_judgment'].to(device)
            relatedness_scores, entailment_logits = model(input_ids, attention_mask, token_type_ids)

            # store each batch results
            if val_relatedness_scores == None and val_entailment_logits==None and val_relatedness_targets==None and val_entailment_targets == None:
                val_relatedness_scores = relatedness_scores
                val_entailment_logits = entailment_logits
                val_relatedness_targets = relatedness_targets
                val_entailment_targets = entailment_targets
            else:
                val_relatedness_scores = torch.cat((val_relatedness_scores, relatedness_scores), dim=0)
                val_entailment_logits = torch.cat((val_entailment_logits, entailment_logits), dim=0)
                val_relatedness_targets = torch.cat((val_relatedness_targets, relatedness_targets), dim=0)
                val_entailment_targets = torch.cat((val_entailment_targets, entailment_targets), dim=0)

    spearman = spc(val_relatedness_scores, val_relatedness_targets)
    accuracy = acc(val_entailment_logits, val_entailment_targets)
    f1score = f1(val_entailment_logits, val_entailment_targets)

        # 5. Output evaluation scores
    print(f"spearman: {spearman}\naccuracy: {accuracy}\nf1score: {f1score}")
    print("="*50)
    os.makedirs('./saved_models', exist_ok=True)
    torch.save(model, f'./saved_models/ep{ep}.ckpt')

Training epoch [1/3]: 100%|██████████| 563/563 [00:49<00:00, 11.38it/s]
Validation epoch [1/3]: 100%|██████████| 63/63 [00:01<00:00, 45.74it/s]


spearman: 0.8252767324447632
accuracy: 0.8360000252723694
f1score: 0.8352042436599731


Training epoch [2/3]: 100%|██████████| 563/563 [00:48<00:00, 11.53it/s]
Validation epoch [2/3]: 100%|██████████| 63/63 [00:01<00:00, 45.61it/s]


spearman: 0.830572247505188
accuracy: 0.8640000224113464
f1score: 0.8603941798210144


Training epoch [3/3]: 100%|██████████| 563/563 [00:49<00:00, 11.34it/s]
Validation epoch [3/3]: 100%|██████████| 63/63 [00:01<00:00, 52.56it/s]


spearman: 0.8275524973869324
accuracy: 0.8500000238418579
f1score: 0.8442127108573914


For test set predictions, you can write perform evaluation simlar to #TODO5.

In [None]:
# Test_data
dl_test = DataLoader(SemevalDataset(split="test").data, batch_size=8, collate_fn=collate_fn, shuffle=True)
model = torch.load("/content/saved_models/ep1.ckpt")
model.eval()
test_relatedness_scores, test_entailment_logits, test_relatedness_targets, test_entailment_targets = None, None, None, None
pbar = tqdm(dl_test)
with torch.no_grad():
    for batch in pbar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        relatedness_targets = batch['relatedness_scores'].to(device)
        entailment_targets = batch['entailment_judgment'].to(device)
        relatedness_scores, entailment_logits = model(input_ids, attention_mask, token_type_ids)

        # store each batch results
        if test_relatedness_scores == None and test_entailment_logits==None and test_relatedness_targets==None and test_entailment_targets == None:
            test_relatedness_scores = relatedness_scores
            test_entailment_logits = entailment_logits
            test_relatedness_targets = relatedness_targets
            test_entailment_targets = entailment_targets
        else:
            test_relatedness_scores = torch.cat((test_relatedness_scores, relatedness_scores), dim=0)
            test_entailment_logits = torch.cat((test_entailment_logits, entailment_logits), dim=0)
            test_relatedness_targets = torch.cat((test_relatedness_targets, relatedness_targets), dim=0)
            test_entailment_targets = torch.cat((test_entailment_targets, entailment_targets), dim=0)

spearman = spc(test_relatedness_scores, test_relatedness_targets)
accuracy = acc(test_entailment_logits, test_entailment_targets)
f1score = f1(test_entailment_logits, test_entailment_targets)

print(f"spearman: {spearman}\naccuracy: {accuracy}\nf1score: {f1score}")

  model = torch.load("/content/saved_models/ep1.ckpt")
100%|██████████| 616/616 [00:11<00:00, 52.29it/s]


spearman: 0.8215921521186829
accuracy: 0.8684797883033752
f1score: 0.8615207672119141


In [None]:
import pandas as pd
from sklearn.metrics import classification_report, mean_squared_error

# 收集預測結果與真實標籤
relatedness_predictions = test_relatedness_scores.cpu().numpy()
entailment_predictions = torch.argmax(test_entailment_logits, dim=1).cpu().numpy()
relatedness_targets = test_relatedness_targets.cpu().numpy()
entailment_targets = test_entailment_targets.cpu().numpy()

# 建立 DataFrame 儲存結果
test_results = pd.DataFrame({
    "premise": [item["premise"] for item in SemevalDataset(split="test").data],
    "hypothesis": [item["hypothesis"] for item in SemevalDataset(split="test").data],
    "relatedness_true": relatedness_targets,
    "relatedness_pred": relatedness_predictions,
    "entailment_true": entailment_targets,
    "entailment_pred": entailment_predictions
})

# 計算相關性預測的 MSE
relatedness_mse = mean_squared_error(test_results["relatedness_true"], test_results["relatedness_pred"])
print(f"Mean Squared Error (Relatedness): {relatedness_mse:.4f}")

# 分析蘊含關係分類錯誤
classification_report_str = classification_report(
    test_results["entailment_true"],
    test_results["entailment_pred"],
    target_names=["Neutral", "Contradiction", "Entailment"]
)
print("Classification Report (Entailment):")
print(classification_report_str)


Mean Squared Error (Relatedness): 0.2413
Classification Report (Entailment):
               precision    recall  f1-score   support

      Neutral       0.89      0.89      0.89      2793
Contradiction       0.83      0.83      0.83      1414
   Entailment       0.85      0.88      0.87       720

     accuracy                           0.87      4927
    macro avg       0.86      0.86      0.86      4927
 weighted avg       0.87      0.87      0.87      4927



In [None]:
# 計算每個 premise 對應的平均 MSE
relatedness_error_by_premise = test_results.groupby("premise").apply(
    lambda x: mean_squared_error(x["relatedness_true"], x["relatedness_pred"])
)

# 計算每個 hypothesis 的分類準確率
entailment_accuracy_by_hypothesis = test_results.groupby("hypothesis").apply(
    lambda x: (x["entailment_true"] == x["entailment_pred"]).mean()
)

# 輸出錯誤分析
print("Relatedness Error by Premise:")
print(relatedness_error_by_premise.sort_values(ascending=False).head(10))

print("Entailment Accuracy by Hypothesis:")
print(entailment_accuracy_by_hypothesis.sort_values().head(10))


  relatedness_error_by_premise = test_results.groupby("premise").apply(


Relatedness Error by Premise:
premise
The students are filling the classroom                          5.294359
One panda is climbing                                           4.945291
A young man is pushing a motocross bike up a dirt hill          4.465765
A man and a young boy are jumping off of a yellow kayak         4.389011
The man is silent and still                                     3.640218
A tower is being looked at by a blonde lady                     3.593285
One girl is jumping off a rock and another is standing on it    3.415971
A man is tearing up the pictures of a lake                      3.366041
A woman is washing a big pepper                                 3.240942
Children in red shirts are sleeping in the leaves               3.117961
dtype: float64
Entailment Accuracy by Hypothesis:
hypothesis
A little girl in a green coat and a boy holding a red sled are lying in the snow                              0.0
A girl in blue sweater is holding a multicolor toy and is

  entailment_accuracy_by_hypothesis = test_results.groupby("hypothesis").apply(


In [None]:
# 單任務模型 (相關性預測)
class RelatednessModel(torch.nn.Module):
    def __init__(self):
        super(RelatednessModel, self).__init__()
        self.bert = T.BertModel.from_pretrained("google-bert/bert-base-uncased")
        self.fc = torch.nn.Linear(self.bert.config.hidden_size, 1)
        self.dropout = torch.nn.Dropout(p=0.2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = self.dropout(outputs.pooler_output)
        return self.fc(pooled_output).squeeze(-1)

# 初始化相關性預測模型
relatedness_model = RelatednessModel().to(device)
relatedness_criterion = torch.nn.MSELoss()
relatedness_optimizer = AdamW(relatedness_model.parameters(), lr=3e-5)


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# 單任務模型 (蘊含關係)
class EntailmentModel(torch.nn.Module):
    def __init__(self):
        super(EntailmentModel, self).__init__()
        self.bert = T.BertModel.from_pretrained("google-bert/bert-base-uncased")
        self.fc = torch.nn.Linear(self.bert.config.hidden_size, 3)  # 3分類
        self.dropout = torch.nn.Dropout(p=0.2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = self.dropout(outputs.pooler_output)
        return self.fc(pooled_output)

# 初始化蘊含關係模型
entailment_model = EntailmentModel().to(device)
entailment_criterion = torch.nn.CrossEntropyLoss()
entailment_optimizer = AdamW(entailment_model.parameters(), lr=3e-5)


In [None]:
# 訓練相關性預測模型
for epoch in range(epochs):
    relatedness_model.train()
    total_loss = 0
    for batch in dl_train:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        targets = batch['relatedness_scores'].to(device)

        outputs = relatedness_model(input_ids, attention_mask, token_type_ids)
        loss = relatedness_criterion(outputs, targets)

        relatedness_optimizer.zero_grad()
        loss.backward()
        relatedness_optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Relatedness Loss: {total_loss:.4f}")

# 訓練蘊含關係模型
for epoch in range(epochs):
    entailment_model.train()
    total_loss = 0
    for batch in dl_train:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        targets = batch['entailment_judgment'].to(device)

        outputs = entailment_model(input_ids, attention_mask, token_type_ids)
        loss = entailment_criterion(outputs, targets)

        entailment_optimizer.zero_grad()
        loss.backward()
        entailment_optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Entailment Loss: {total_loss:.4f}")


Epoch 1, Relatedness Loss: 352.7566
Epoch 2, Relatedness Loss: 151.1279
Epoch 3, Relatedness Loss: 108.6996
Epoch 1, Entailment Loss: 300.8612
Epoch 2, Entailment Loss: 180.0536
Epoch 3, Entailment Loss: 122.7693


In [None]:
# 測試相關性預測與蘊含關係模型的三個指標
relatedness_model.eval()
entailment_model.eval()
dl_test = DataLoader(SemevalDataset(split="test").data, batch_size=8, collate_fn=collate_fn, shuffle=True)

test_relatedness_scores, test_entailment_logits, test_relatedness_targets, test_entailment_targets = None, None, None, None

with torch.no_grad():
    for batch in dl_test:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)

        # 相關性預測
        relatedness_targets = batch['relatedness_scores'].to(device)
        relatedness_outputs = relatedness_model(input_ids, attention_mask, token_type_ids)

        # 蘊含關係預測
        entailment_targets = batch['entailment_judgment'].to(device)
        entailment_outputs = entailment_model(input_ids, attention_mask, token_type_ids)

        # 收集結果
        if test_relatedness_scores is None:
            test_relatedness_scores = relatedness_outputs
            test_entailment_logits = entailment_outputs
            test_relatedness_targets = relatedness_targets
            test_entailment_targets = entailment_targets
        else:
            test_relatedness_scores = torch.cat((test_relatedness_scores, relatedness_outputs), dim=0)
            test_entailment_logits = torch.cat((test_entailment_logits, entailment_outputs), dim=0)
            test_relatedness_targets = torch.cat((test_relatedness_targets, relatedness_targets), dim=0)
            test_entailment_targets = torch.cat((test_entailment_targets, entailment_targets), dim=0)

# 計算指標
spearman = spc(test_relatedness_scores, test_relatedness_targets)
accuracy = acc(test_entailment_logits, test_entailment_targets)
f1score = f1(test_entailment_logits, test_entailment_targets)

# 輸出結果
print(f"Spearman Correlation Coefficient: {spearman:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1score:.4f}")


Spearman Correlation Coefficient: 0.8310
Accuracy: 0.8565
F1 Score: 0.8498
