In [122]:
# Imports
import os
import math
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.cuda.amp import autocast, GradScaler
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import f1_score
from tqdm import tqdm
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import importlib
import wic
from transformers import AutoTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
importlib.reload(wic)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\amush\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet_ic to
[nltk_data]     C:\Users\amush\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet_ic is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\amush\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Gloss Bert

In [123]:
tokenizer = AutoTokenizer.from_pretrained('kanishka/GlossBERT')
model = BertForSequenceClassification.from_pretrained('kanishka/GlossBERT')
model.to(device)

# Custom Dataset to handle sentence pairs with labels
# Custom Dataset
class SentencePairDataset(Dataset):
    def __init__(self, sentence_pairs_with_labels):
        self.data = sentence_pairs_with_labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# Collate function
def collate_fn_glossb(batch):
    sentences1 = [item[0] for item in batch]
    sentences2 = [item[1] for item in batch]
    labels = torch.tensor([item[2] for item in batch], dtype=torch.long)
    inputs = tokenizer(sentences1, sentences2, return_tensors="pt", padding=True, truncation=True)
    return inputs, labels, batch

# Evaluation Function
def classify_and_evaluate(dataloader):
    total = 0
    correct = 0
    results = []
    model.eval()

    with torch.no_grad():
        for inputs, labels, raw_batch in dataloader:
            # Move inputs and labels to device
            inputs = {key: val.to(device) for key, val in inputs.items()}
            labels = labels.to(device)

            # Forward pass
            outputs = model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_labels = torch.argmax(probs, dim=-1)

            # Accuracy
            total += labels.size(0)
            correct += (predicted_labels == labels).sum().item()

            # Store results
            for i in range(len(raw_batch)):
                sentence1, sentence2, gold_label = raw_batch[i]
                pred_label = predicted_labels[i].item()
                confidence = probs[i][1].item()
                results.append({
                    'sentence1': sentence1,
                    'sentence2': sentence2,
                    'gold_label': gold_label,
                    'predicted_label': pred_label,
                    'confidence': confidence
                })

    accuracy = correct / total if total > 0 else 0.0
    return results, accuracy



# Custom Fine-Tuned Models

In [124]:
# Dataset
class WSDSiameseDataset(Dataset):
    def __init__(self, sentence_pairs, labels, tokenizer, max_length=128):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentence_pairs)

    def __getitem__(self, idx):
        sent1, sent2 = self.sentence_pairs[idx]
        label = self.labels[idx]

        tokens_1 = self.tokenizer(sent1, padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt")
        tokens_2 = self.tokenizer(sent2, padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt")

        return {
            "input_ids_1": tokens_1["input_ids"].squeeze(0),
            "attention_mask_1": tokens_1["attention_mask"].squeeze(0),
            "input_ids_2": tokens_2["input_ids"].squeeze(0),
            "attention_mask_2": tokens_2["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.float),
        }

# Model
class SiameseBERT(nn.Module):
    def __init__(self, model_name):
        super(SiameseBERT, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.fc = nn.Linear(self.bert.config.hidden_size, 256)
        # Define cosine distance as a lambda function
        self.distance = lambda x, y: F.cosine_similarity(x, y)

    def get_embedding(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token
        proj = self.fc(cls_embedding)
        return F.normalize(proj, p=2, dim=1)  # L2-normalize for cosine

    def forward(self, input_ids_1, attention_mask_1, input_ids_2, attention_mask_2, labels):
        emb1 = self.get_embedding(input_ids_1, attention_mask_1)
        emb2 = self.get_embedding(input_ids_2, attention_mask_2)
        return labels - self.distance(emb1, emb2)  # returns tensor of distances

# Contrastive Loss
class ContrastiveLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, distance, label):
        loss = (1 - label) * distance.pow(2) + label * torch.clamp(self.margin - distance, min=0.0).pow(2)
        return loss.mean()

# Collate Function
def collate_fn_custom(batch):
    return {
        "input_ids_1": torch.stack([item["input_ids_1"] for item in batch]),
        "attention_mask_1": torch.stack([item["attention_mask_1"] for item in batch]),
        "input_ids_2": torch.stack([item["input_ids_2"] for item in batch]),
        "attention_mask_2": torch.stack([item["attention_mask_2"] for item in batch]),
        "labels": torch.stack([item["label"] for item in batch]),
    }

# Evaluation

def evaluate(model, loader, loss_fn, device):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for batch in loader:
            distances = model(
                batch["input_ids_1"].to(device),
                batch["attention_mask_1"].to(device),
                batch["input_ids_2"].to(device),
                batch["attention_mask_2"].to(device),
                batch["labels"].to(device)
            )
            loss = loss_fn(distances, batch["labels"].to(device))
            total_loss += loss.item()
    return total_loss / len(loader)

# Prediction

def predict(model, loader, device):
    model.eval()
    all_distances, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            distances = model(
                batch["input_ids_1"].to(device),
                batch["attention_mask_1"].to(device),
                batch["input_ids_2"].to(device),
                batch["attention_mask_2"].to(device),
                batch["labels"].to(device)
            )
            all_distances.extend(distances.cpu().numpy())
            all_labels.extend(batch["labels"].cpu().numpy())
    return np.array(all_distances), np.array(all_labels)

# Metrics

def compute_accuracy_f1(distances, labels, threshold=0.0):
    preds = (distances > threshold).astype(int)
    print(labels)
    print(preds)
    accuracy = (preds == labels).mean()
    f1 = f1_score(labels, preds)
    return accuracy, f1


# TinyBert Fine-Tuned 

In [125]:
model_name = "huawei-noah/TinyBERT_General_4L_312D"
tokenizer_tiny = AutoTokenizer.from_pretrained(model_name)
model_tiny = SiameseBERT(model_name=model_name)
model_tiny.bert.load_state_dict(torch.load(r"C:\Users\amush\INLP_Project\Finetuning\fine_tuned_tiny.pth"))
loss_fn = ContrastiveLoss(margin=0.5)

  model_tiny.bert.load_state_dict(torch.load(r"C:\Users\amush\INLP_Project\Finetuning\fine_tuned_tiny.pth"))


# DistilBert Fine-Tuned

In [126]:
model_name = "distilbert-base-uncased"
tokenizer_distil = AutoTokenizer.from_pretrained(model_name)
model_distil = SiameseBERT(model_name=model_name)
model_distil.bert.load_state_dict(torch.load(r"C:\Users\amush\INLP_Project\Finetuning\fine_tuned_distill.pth"))
loss_fn = ContrastiveLoss(margin=0.5)

  model_distil.bert.load_state_dict(torch.load(r"C:\Users\amush\INLP_Project\Finetuning\fine_tuned_distill.pth"))


In [127]:
import pandas as pd

# SemEval 2015

In [128]:
sem15 = pd.read_csv(r'C:\Users\amush\INLP_Project\Finetuning\semeval2015.csv')
sem15

Unnamed: 0,sent1,sent2,lemma,ground_truth
0,This document is a summary of the European Pub...,This document is a summary of the European Pub...,document,0
1,It explains how the Committee for Medicinal Pr...,It explains how the Committee for Medicinal Pr...,explain,0
2,If you want more information on the basis of t...,"If we want to understand how it works , the be...",want,0
3,If you want more information on the basis of t...,If you want to use a typical f(x) function it ...,want,0
4,"If we want to understand how it works , the be...",If you want to use a typical f(x) function it ...,want,0
...,...,...,...,...
102,"The Foundation organised , together with the E...",The Foundation recently published a comparativ...,foundation,0
103,"The Foundation organised , together with the E...",The Foundation aims to document the characteri...,foundation,0
104,The Foundation recently published a comparativ...,The Foundation aims to document the characteri...,foundation,0
105,Case Studies Each national report contains a p...,These case studies analysed the background of ...,case_study,0


### TinyBert Performance

In [None]:
sentence_pairs = list(zip(sem15["sent1"], sem15["sent2"]))
labels = list(sem15["ground_truth"])
model_tiny.to(device)
test_dataset = WSDSiameseDataset(sentence_pairs, labels, tokenizer_tiny)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn_custom)
evaluate(model_tiny, test_loader, loss_fn, device=device)

0.6336020231246948

### DistilBert Performance

In [None]:
sentence_pairs = list(zip(sem15["sent1"], sem15["sent2"]))
labels = list(sem15["ground_truth"])
model_distil.to(device)
test_dataset = WSDSiameseDataset(sentence_pairs, labels, tokenizer_distil)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn_custom)
evaluate(model_tiny, test_loader, loss_fn, device=device)

0.6077485382556915

### GlossBert Performance

In [141]:
glossbert_data = [(sem15['sent1'][i], sem15['sent2'][i], sem15['ground_truth'][i]) for i in range(len(sem15))]
dataset = SentencePairDataset(glossbert_data)
dataloader = DataLoader(dataset, batch_size=2, shuffle=False, collate_fn=collate_fn_glossb)
model.to(device)
# Run classification and compute accuracy
results, accuracy = classify_and_evaluate(dataloader)
accuracy

0.5794392523364486

# SemEval 2013

In [142]:
sem13 = pd.read_csv(r'C:\Users\amush\INLP_Project\Finetuning\semeval2013.csv')
sem13

Unnamed: 0,sent1,sent2,lemma,ground_truth
0,The U.N.-sponsored climate conference -- chara...,"Artur Runge-Metzger , who heads international ...",climate,0
1,"It gives a lot of flexibility to the process ,...",There is a lot of consensus between the Left a...,lot,0
2,"Together , the countries would cut emissions b...",Some of the countries most vulnerable to the i...,country,0
3,U.S. special climate envoy Todd Stern rejected...,U.S. firms were in some cases at a disadvantag...,u.s.,0
4,U.S. special climate envoy Todd Stern rejected...,Major U.S. firms such as Chevron and ConocoPhi...,u.s.,0
...,...,...,...,...
145,"The only difference , the degree of generosity",The only difference resides in the degree of g...,difference,0
146,"In 2005 , the fear of invasion of the national...",Fears about the impact of immigrants are based...,fear,0
147,The National Credit Union Administration ( NCU...,The regulator had also approached other major ...,regulator,0
148,The National Credit Union Administration ( NCU...,The regulator is demanding that the Frankfurt-...,regulator,0


### TinyBert Performance

In [165]:
sentence_pairs = list(zip(sem13["sent1"], sem13["sent2"]))
labels = list(sem13["ground_truth"])
model_tiny.to(device)
test_dataset = WSDSiameseDataset(sentence_pairs, labels, tokenizer_tiny)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn_custom)
evaluate(model_tiny, test_loader, loss_fn, device=device)

0.7464426159858704

### DistilBert Performance

In [None]:
sentence_pairs = list(zip(sem13["sent1"], sem13["sent2"]))
labels = list(sem13["ground_truth"])
model_distil.to(device)
test_dataset = WSDSiameseDataset(sentence_pairs, labels, tokenizer_distil)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn_custom)
evaluate(model_tiny, test_loader, loss_fn, device=device)

0.7541502594947815

### GlossBert Performance

In [169]:
glossbert_data = [(sem13['sent1'][i], sem13['sent2'][i], sem13['ground_truth'][i]) for i in range(len(sem13))]
dataset = SentencePairDataset(glossbert_data)
dataloader = DataLoader(dataset, batch_size=32, shuffle=False, collate_fn=collate_fn_glossb)
model.to(device)
# Run classification and compute accuracy
results, accuracy = classify_and_evaluate(dataloader)
accuracy

0.5533333333333333

# RAW-C (Related Words in Context)

In [180]:
rawc = pd.read_csv(r'C:\Users\amush\INLP_Project\Finetuning\raw-c.csv')
rawc = rawc[['word', 'sentence1', 'sentence2', 'same']]
rawc = rawc.rename(columns={'word':'lemma', 'sentence1': 'sent1', 'sentence2': 'sent2', 'same':'ground_truth'})
rawc['ground_truth']  = rawc['ground_truth'].apply(lambda x : 0 if x == False else 1)
rawc

Unnamed: 0,lemma,sent1,sent2,ground_truth
0,act,It was a desperate act.,It was a magic act.,0
1,act,It was a desperate act.,It was a comedic act.,0
2,act,It was a humane act.,It was a magic act.,0
3,act,It was a humane act.,It was a comedic act.,0
4,act,It was a desperate act.,It was a humane act.,1
...,...,...,...,...
667,yard,It was five yards.,They were cluttered yards.,0
668,yard,It was ten yards.,They were big yards.,0
669,yard,It was ten yards.,They were cluttered yards.,0
670,yard,It was five yards.,It was ten yards.,1


### TinyBert Performance

In [187]:
sentence_pairs = list(zip(rawc["sent1"], rawc["sent2"]))
labels = list(rawc["ground_truth"])
model_tiny.to(device)
test_dataset = WSDSiameseDataset(sentence_pairs, labels, tokenizer_tiny)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn_custom)
evaluate(model_tiny, test_loader, loss_fn, device=device)

0.7395407330422175

### DistilBert Performance

In [189]:
sentence_pairs = list(zip(rawc["sent1"], rawc["sent2"]))
labels = list(rawc["ground_truth"])
model_distil.to(device)
test_dataset = WSDSiameseDataset(sentence_pairs, labels, tokenizer_distil)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn_custom)
evaluate(model_tiny, test_loader, loss_fn, device=device)

0.7395407330422175

### GlossBert Performance

In [192]:
glossbert_data = [(rawc['sent1'][i], rawc['sent2'][i], rawc['ground_truth'][i]) for i in range(len(rawc))]
dataset = SentencePairDataset(glossbert_data)
dataloader = DataLoader(dataset, batch_size=32, shuffle=False, collate_fn=collate_fn_glossb)
model.to(device)
# Run classification and compute accuracy
results, accuracy = classify_and_evaluate(dataloader)
accuracy

0.6190476190476191