In [22]:
import transformers as T
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
from torchmetrics import SpearmanCorrCoef, Accuracy, F1Score
device = "cuda:0" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f'device = {device}')

device = cuda:0


In [23]:
# 有些中文的標點符號在tokenizer編碼以後會變成[UNK]，所以將其換成英文標點
token_replacement = [
    ["：" , ":"],
    ["，" , ","],
    ["“" , "\""],
    ["”" , "\""],
    ["？" , "?"],
    ["……" , "..."],
    ["！" , "!"]
]

In [24]:
class SemevalDataset(Dataset):
    def __init__(self, split="train") -> None:
        super().__init__()
        assert split in ["train", "validation", "test"]
        self.data = load_dataset(
            "sem_eval_2014_task_1", split=split, cache_dir="./cache/", trust_remote_code=True
        ).to_list()

    def __getitem__(self, index):
        d = self.data[index]
        # 把中文標點替換掉
        for k in ["premise", "hypothesis"]:
            for tok in token_replacement:
                d[k] = d[k].replace(tok[0], tok[1])
        return d

    def __len__(self):
        return len(self.data)

data_sample = SemevalDataset(split="train").data[:3]
print(f"Dataset example: \n{data_sample[0]} \n{data_sample[1]} \n{data_sample[2]}")

Dataset example: 
{'sentence_pair_id': 1, 'premise': 'A group of kids is playing in a yard and an old man is standing in the background', 'hypothesis': 'A group of boys in a yard is playing and a man is standing in the background', 'relatedness_score': 4.5, 'entailment_judgment': 0} 
{'sentence_pair_id': 2, 'premise': 'A group of children is playing in the house and there is no man standing in the background', 'hypothesis': 'A group of kids is playing in a yard and an old man is standing in the background', 'relatedness_score': 3.200000047683716, 'entailment_judgment': 0} 
{'sentence_pair_id': 3, 'premise': 'The young boys are playing outdoors and the man is smiling nearby', 'hypothesis': 'The kids are playing outdoors near a man with a smile', 'relatedness_score': 4.699999809265137, 'entailment_judgment': 1}


In [25]:
# Define the hyperparameters
lr = 3e-5
epochs = 10
train_batch_size = 50
validation_batch_size = 50
test_batch_size = 50

In [26]:
tokenizer = T.BertTokenizer.from_pretrained("bert-base-uncased")

In [27]:
# TODO1: Create batched data for DataLoader
# `collate_fn` is a function that defines how the data batch should be packed.
# This function will be called in the DataLoader to pack the data batch.

def collate_fn(batch):
    # TODO1-1: Implement the collate_fn function
    # Write your code here
    # The input parameter is a data batch (tuple), and this function packs it into tensors.
    # Use tokenizer to pack tokenize and pack the data and its corresponding labels.
    # Return the data batch and labels for each sub-task.
    # Extract relevant fields from the batch
    sentence_pairs = [item['premise'] + " [SEP] " + item['hypothesis'] for item in batch]
    relatedness_scores = [item['relatedness_score'] for item in batch]
    entailment_judgments = [item['entailment_judgment'] for item in batch]
    
    # Tokenize the input text and convert to tensors with padding
    encoded_inputs = tokenizer(
        sentence_pairs,
        return_tensors='pt',
        padding=True,            # Pad the sequences to the maximum length in the batch
        truncation=True,          # Truncate sequences that are longer than the max model length
        max_length=512            # You can change max_length depending on your data/model
    )
    
    # Convert relatedness scores and entailment judgments to tensors
    relatedness_scores_tensor = torch.tensor(relatedness_scores, dtype=torch.float)
    entailment_judgments_tensor = torch.tensor(entailment_judgments, dtype=torch.long)
    
    # Return the dictionary containing input_ids, attention_mask, relatedness_scores, and entailment_judgments
    return {
        'input_ids': encoded_inputs['input_ids'],
        'token_type_ids': encoded_inputs['token_type_ids'],
        'attention_mask': encoded_inputs['attention_mask'],
        'relatedness_score': relatedness_scores_tensor,
        'entailment_judgment': entailment_judgments_tensor
    }

    
# TODO1-2: Define your DataLoader
ds_train = SemevalDataset(split="train").data
ds_validation = SemevalDataset(split="validation").data
dl_train = DataLoader(ds_train, batch_size=train_batch_size, collate_fn=collate_fn) # Write your code here
dl_validation = DataLoader(ds_validation, batch_size=validation_batch_size, collate_fn=collate_fn) # Write your code here

In [28]:
# TODO2: Construct your model
class MultiLabelModel(torch.nn.Module):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Write your code here
        # Define what modules you will use in the model
        self.bert = T.BertModel.from_pretrained("bert-base-uncased")
        self.linear_list = torch.nn.Sequential(torch.nn.Linear(self.bert.config.hidden_size, 2000), 
                                               torch.nn.Linear(2000, 500), 
                                               torch.nn.Linear(500, 125),
                                               )
        self.linear_relatedness = torch.nn.Linear(125, 1)  # For predicting relatedness score
        self.linear_entailment = torch.nn.Linear(125, 3)   # For predicting entailment judgment (e.g., 3 classes)
    def forward(self, **kwargs):
        # Write your code here
        # Forward pass
        # Forward pass through BERT
        input_ids = kwargs['input_ids']
        attention_mask = kwargs['attention_mask']
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.pooler_output  # Get the pooled output for classification tasks
        
        x = self.linear_list(x)
        
        # Predict relatedness score
        relatedness_score = self.linear_relatedness(x).squeeze(-1)
        
        # Predict entailment judgment
        entailment_judgment = self.linear_entailment(x)
        
        return {
            'relatedness_score': relatedness_score,
            'entailment_judgment': entailment_judgment
        }

In [29]:
model = MultiLabelModel().to(device)

In [30]:
# TODO3: Define your optimizer and loss function

# TODO3-1: Define your Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=lr)# Write your code here

# TODO3-2: Define your loss functions (you should have two)
# Write your code here
MSE = torch.nn.MSELoss()
CrossEntorpy = torch.nn.CrossEntropyLoss()

# scoring functions
spc = SpearmanCorrCoef().to(device)
acc = Accuracy(task="multiclass", num_classes=3).to(device)
f1 = F1Score(task="multiclass", num_classes=3, average='macro').to(device)

In [31]:
for ep in range(epochs):
    pbar = tqdm(dl_train)
    pbar.set_description(f"Training epoch [{ep+1}/{epochs}]")
    model.train()
    # TODO4: Write the training loop
    # Write your code here
    # train your model
    # clear gradient
    # forward pass
    # compute loss
    # back-propagation
    # model optimization
    for batch in pbar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        relatedness_scores = batch['relatedness_score'].to(device)
        entailment_judgments = batch['entailment_judgment'].to(device)
        
        optimizer.zero_grad()
        pred = model(input_ids = input_ids, attention_mask = attention_mask)

        loss_regression = MSE(pred['relatedness_score'], relatedness_scores)
        loss_classification = CrossEntorpy(pred['entailment_judgment'], entailment_judgments)
        # weight be 1 means only trains the resgression part, and 0 means only trains the classification part
        # 0.5 means training at the same time -> multi-output
        loss = 0 * loss_regression + 1 * loss_classification
        loss.backward()
        optimizer.step()
        

    pbar = tqdm(dl_validation)
    pbar.set_description(f"Validation epoch [{ep+1}/{epochs}]")
    model.eval()
    # TODO5: Write the evaluation loop
    # Write your code here
    # Evaluate your model
    # Output all the evaluation scores (SpearmanCorrCoef, Accuracy, F1Score)
    with torch.no_grad():
        for batch in pbar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            relatedness_score = batch['relatedness_score'].to(device)
            entailment_judgment = batch['entailment_judgment'].to(device)
            
            # Forward pass
            pred = model(input_ids=input_ids, attention_mask=attention_mask)
            
            # Compute loss
            spc.update(relatedness_score, pred['relatedness_score'])
            acc.update(entailment_judgment, torch.argmax(pred["entailment_judgment"], dim=-1))
            f1.update(entailment_judgment, torch.argmax(pred["entailment_judgment"], dim=-1))
        
        print(f'spc = {spc.compute()}, acc = {acc.compute()}, f1 = {f1.compute()}')
        spc.reset()
        acc.reset()
        f1.reset()
    # torch.save(model, f'./ep{ep}.ckpt')

Training epoch [1/10]: 100%|██████████| 90/90 [00:06<00:00, 14.13it/s]
Validation epoch [1/10]: 100%|██████████| 10/10 [00:00<00:00, 27.67it/s]


spc = 0.3857594430446625, acc = 0.7580000162124634, f1 = 0.7792052030563354


Training epoch [2/10]: 100%|██████████| 90/90 [00:06<00:00, 14.20it/s]
Validation epoch [2/10]: 100%|██████████| 10/10 [00:00<00:00, 28.41it/s]


spc = -0.12660935521125793, acc = 0.7820000052452087, f1 = 0.7944146990776062


Training epoch [3/10]: 100%|██████████| 90/90 [00:06<00:00, 14.27it/s]
Validation epoch [3/10]: 100%|██████████| 10/10 [00:00<00:00, 28.11it/s]


spc = -0.2774164080619812, acc = 0.8479999899864197, f1 = 0.846908688545227


Training epoch [4/10]: 100%|██████████| 90/90 [00:06<00:00, 14.27it/s]
Validation epoch [4/10]: 100%|██████████| 10/10 [00:00<00:00, 28.06it/s]


spc = -0.3434978723526001, acc = 0.8539999723434448, f1 = 0.852716326713562


Training epoch [5/10]: 100%|██████████| 90/90 [00:06<00:00, 14.19it/s]
Validation epoch [5/10]: 100%|██████████| 10/10 [00:00<00:00, 27.76it/s]


spc = -0.4419223368167877, acc = 0.8379999995231628, f1 = 0.8384534120559692


Training epoch [6/10]: 100%|██████████| 90/90 [00:06<00:00, 14.22it/s]
Validation epoch [6/10]: 100%|██████████| 10/10 [00:00<00:00, 27.90it/s]


spc = -0.2994832396507263, acc = 0.8339999914169312, f1 = 0.8330016136169434


Training epoch [7/10]: 100%|██████████| 90/90 [00:06<00:00, 14.19it/s]
Validation epoch [7/10]: 100%|██████████| 10/10 [00:00<00:00, 28.11it/s]


spc = -0.22948558628559113, acc = 0.8379999995231628, f1 = 0.8429731130599976


Training epoch [8/10]: 100%|██████████| 90/90 [00:06<00:00, 14.23it/s]
Validation epoch [8/10]: 100%|██████████| 10/10 [00:00<00:00, 28.25it/s]


spc = -0.18127447366714478, acc = 0.8379999995231628, f1 = 0.8412142992019653


Training epoch [9/10]: 100%|██████████| 90/90 [00:06<00:00, 14.22it/s]
Validation epoch [9/10]: 100%|██████████| 10/10 [00:00<00:00, 28.44it/s]


spc = -0.23653079569339752, acc = 0.8500000238418579, f1 = 0.8469316959381104


Training epoch [10/10]: 100%|██████████| 90/90 [00:06<00:00, 14.22it/s]
Validation epoch [10/10]: 100%|██████████| 10/10 [00:00<00:00, 27.93it/s]

spc = -0.1394266039133072, acc = 0.8259999752044678, f1 = 0.8224334716796875





For test set predictions, you can write perform evaluation simlar to #TODO5.

In [32]:
ds_test = SemevalDataset(split="test").data
dl_test = DataLoader(dataset=ds_test, batch_size=test_batch_size, collate_fn=collate_fn)

pbar = tqdm(dl_test)
pbar.set_description(f"Test")
model.eval()

with torch.no_grad():
    for batch in pbar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        relatedness_score = batch['relatedness_score'].to(device)
        entailment_judgment = batch['entailment_judgment'].to(device)
        
        output = model(input_ids=input_ids, attention_mask=attention_mask)
        
       # calculate scores
        spc.update(relatedness_score, output['relatedness_score'])
        acc.update(entailment_judgment, torch.argmax(output["entailment_judgment"], dim=-1))
        f1.update(entailment_judgment, torch.argmax(output["entailment_judgment"], dim=-1))
        
    print(f'\nspc = {spc.compute()}')
    print(f'acc = {acc.compute()}')
    print(f'f1 = {f1.compute()}')
    spc.reset()
    acc.reset()
    f1.reset()

Test: 100%|██████████| 99/99 [00:02<00:00, 34.66it/s]


spc = -0.20264136791229248
acc = 0.8376293778419495
f1 = 0.8322234153747559



