In [11]:
# Imports
import os
import math
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.cuda.amp import autocast, GradScaler
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import f1_score
from tqdm import tqdm
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import importlib
import wic
from transformers import AutoTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cdist
# Re-execute the local `wic` module so the kernel uses its current source.
importlib.reload(wic)
# Use the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\amush\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet_ic to
[nltk_data]     C:\Users\amush\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet_ic is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\amush\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# GlossBERT

In [12]:
# Load the GlossBERT tokenizer and sequence-classification weights from the
# same checkpoint, then place the model on the selected device.
glossbert_checkpoint = 'kanishka/GlossBERT'
tokenizer = AutoTokenizer.from_pretrained(glossbert_checkpoint)
model = BertForSequenceClassification.from_pretrained(glossbert_checkpoint)
model = model.to(device)

# Custom Dataset to handle sentence pairs with labels
class SentencePairDataset(Dataset):
    """Map-style dataset that serves pre-built records unchanged.

    Indexing returns whatever the caller stored at that position of
    ``sentence_pairs_with_labels``; no copying, validation or tokenisation
    happens here.
    """

    def __init__(self, sentence_pairs_with_labels):
        # Hold a reference to the caller's sequence as-is.
        self.data = sentence_pairs_with_labels

    def __getitem__(self, idx):
        record = self.data[idx]
        return record

    def __len__(self):
        n_records = len(self.data)
        return n_records

# Collate function
def collate_fn_glossb(batch):
    """Collate ``(sentence1, sentence2, label)`` records into model inputs.

    Uses the module-level ``tokenizer`` to encode the two sentence lists as
    pairs (padded, truncated, returned as PyTorch tensors).

    Returns a 3-tuple: the tokenizer output, a ``torch.long`` tensor of the
    labels, and the original ``batch`` so callers can report the raw records.
    """
    first_sentences, second_sentences, gold = [], [], []
    for record in batch:
        first_sentences.append(record[0])
        second_sentences.append(record[1])
        gold.append(record[2])

    label_tensor = torch.tensor(gold, dtype=torch.long)
    encoded = tokenizer(
        first_sentences,
        second_sentences,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    return encoded, label_tensor, batch

# Evaluation Function
def classify_and_evaluate(dataloader):
    """Run the module-level ``model`` over ``dataloader`` and score its predictions.

    Every batch yielded by ``dataloader`` must be an
    ``(inputs, labels, raw_batch)`` triple, where ``inputs`` is a mapping of
    tensors accepted by the model, ``labels`` is a tensor of gold labels and
    ``raw_batch`` is the list of ``(sentence1, sentence2, gold_label)`` records.

    Returns:
        ``(results, accuracy)``. ``results`` has one dict per example with the
        keys ``sentence1``, ``sentence2``, ``gold_label``, ``predicted_label``
        and ``confidence`` (the softmax probability of class index 1, whatever
        the predicted class is). ``accuracy`` is correct / total, or ``0.0``
        when the dataloader produced no examples.
    """
    records = []
    n_seen = 0
    n_correct = 0
    model.eval()

    with torch.no_grad():
        for inputs, labels, raw_batch in dataloader:
            # Tensors must live on the same device as the model.
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
            labels = labels.to(device)

            logits = model(**inputs).logits
            probs = logits.softmax(dim=-1)
            predicted_labels = probs.argmax(dim=-1)

            n_seen += labels.size(0)
            n_correct += (predicted_labels == labels).sum().item()

            # One result row per raw record, in batch order.
            for i, (sentence1, sentence2, gold_label) in enumerate(raw_batch):
                records.append({
                    'sentence1': sentence1,
                    'sentence2': sentence2,
                    'gold_label': gold_label,
                    'predicted_label': predicted_labels[i].item(),
                    'confidence': probs[i][1].item()
                })

    if n_seen > 0:
        accuracy = n_correct / n_seen
    else:
        accuracy = 0.0
    return records, accuracy



# Custom Fine-Tuned Model (Triplet Loss Based)

In [13]:
class WSDTripletDataset(Dataset):
    """Triplet dataset read positionally from a pandas DataFrame.

    Despite the parameter name ``hf_dataset``, rows are fetched with ``.iloc``.
    Each row must provide the columns ``anchor``, ``positive``, ``negative``
    and ``target_word``. The three text columns are tokenised on access;
    ``target_word`` is passed through untouched.
    """

    _TEXT_FIELDS = ('anchor', 'positive', 'negative')

    def __init__(self, hf_dataset, tokenizer, max_length):
        self.dataset, self.tokenizer, self.max_length = hf_dataset, tokenizer, max_length

    def __len__(self):
        return len(self.dataset)

    def _encode(self, text):
        """Tokenise ``text`` to ``max_length`` and squeeze dimension 0 of each tensor."""
        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {key: encoded[key].squeeze(0) for key in ('input_ids', 'attention_mask')}

    def __getitem__(self, idx):
        row = self.dataset.iloc[idx]
        sample = {field: self._encode(row[field]) for field in self._TEXT_FIELDS}
        sample['target_word'] = row['target_word']
        return sample

In [14]:
class TripletBERT(nn.Module):
    """Pretrained encoder followed by a linear projection to 256 dimensions.

    ``get_embedding`` projects the encoder's hidden state at sequence
    position 0; ``forward`` applies it to the anchor, positive and negative
    inputs of a triplet.
    """

    def __init__(self, model_name):
        super().__init__()
        # Attribute names feed the state_dict keys, so they must stay stable.
        self.bert = AutoModel.from_pretrained(model_name)
        self.hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(self.hidden_size, 256)

    def get_embedding(self, input_ids, attention_mask):
        """Return the projected position-0 hidden state for each input row."""
        hidden_states = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        first_token = hidden_states[:, 0, :]
        return self.fc(first_token)

    def forward(self, anchor, positive, negative):
        """Embed each triplet member; returns ``(anchor, positive, negative)`` embeddings.

        Each argument is a mapping with ``"input_ids"`` and ``"attention_mask"``.
        """
        return tuple(
            self.get_embedding(part["input_ids"], part["attention_mask"])
            for part in (anchor, positive, negative)
        )

## TinyBert Fine-Tuned

In [15]:
# Rebuild the TinyBERT triplet encoder and restore its fine-tuned weights.
model_name = "huawei-noah/TinyBERT_General_4L_312D"
tokenizer_tiny = AutoTokenizer.from_pretrained(model_name)

# Reinitialize model and load state_dict
model_tiny = TripletBERT(model_name=model_name).to(device)

# weights_only=True restricts unpickling to tensors and plain containers, so
# the checkpoint file cannot execute arbitrary code when it is loaded.
# NOTE(review): presumably also removes the warning recorded in this cell's
# output — confirm on a re-run.
tiny_state_dict = torch.load(
    r"C:\Users\amush\INLP_Project\Finetuning\triplet_fine_tuned_tinybert.pth",
    map_location=device,
    weights_only=True,
)
model_tiny.load_state_dict(tiny_state_dict)


  model_tiny.load_state_dict(torch.load(r"C:\Users\amush\INLP_Project\Finetuning\triplet_fine_tuned_tinybert.pth", map_location=device))


<All keys matched successfully>

## DistilBert Fine-Tuned

In [16]:
# Rebuild the DistilBERT triplet encoder and restore its fine-tuned weights.
model_name = "distilbert-base-uncased"
tokenizer_distil = AutoTokenizer.from_pretrained(model_name)

# Reinitialize model and load state_dict
model_distil = TripletBERT(model_name=model_name).to(device)

# weights_only=True restricts unpickling to tensors and plain containers, so
# the checkpoint file cannot execute arbitrary code when it is loaded.
# NOTE(review): presumably also removes the warning recorded in this cell's
# output — confirm on a re-run.
distil_state_dict = torch.load(
    r"C:\Users\amush\INLP_Project\Finetuning\triplet_fine_tuned_distil.pth",
    map_location=device,
    weights_only=True,
)
model_distil.load_state_dict(distil_state_dict)


  model_distil.load_state_dict(torch.load(r"C:\Users\amush\INLP_Project\Finetuning\triplet_fine_tuned_distil.pth", map_location=device))


<All keys matched successfully>

In [17]:
import pandas as pd

# SemEval 2015

In [52]:
# Read the SemEval-2015 sentence-pair table and preview its first five rows.
sem15_csv = r'C:\Users\amush\INLP_Project\Finetuning\semeval2015.csv'
sem15 = pd.read_csv(sem15_csv)
sem15.head()

Unnamed: 0,sent1,sent2,lemma,ground_truth
0,This document is a summary of the European Pub...,This document is a summary of the European Pub...,document,0
1,It explains how the Committee for Medicinal Pr...,It explains how the Committee for Medicinal Pr...,explain,0
2,If you want more information on the basis of t...,"If we want to understand how it works , the be...",want,0
3,If you want more information on the basis of t...,If you want to use a typical f(x) function it ...,want,0
4,"If we want to understand how it works , the be...",If you want to use a typical f(x) function it ...,want,0


### TinyBert Performance

In [86]:
# TinyBERT on SemEval-2015: predict "same sense" (1) when the Euclidean
# distance between the L2-normalised sentence embeddings is below `threshold`.
# NOTE(review): the threshold is hand-picked — confirm it was not tuned on
# this evaluation set.
threshold = 0.01
correct = 0
model_tiny.eval()

# Iterate by position (zip) rather than by index label, so the loop does not
# depend on the frame having a default 0..n-1 index.
for sentence1, sentence2, gold in zip(sem15['sent1'], sem15['sent2'], sem15['ground_truth']):
    unit_embeddings = []
    for sentence in (sentence1, sentence2):
        tokens = tokenizer_tiny(sentence, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
        with torch.no_grad():
            embedding = model_tiny.get_embedding(
                tokens['input_ids'].to(device),
                tokens['attention_mask'].to(device),
            )
        embedding = embedding.cpu().numpy()
        # Scale each row to unit L2 norm before measuring distance.
        unit_embeddings.append(embedding / np.linalg.norm(embedding, axis=1, keepdims=True))

    # One sentence per side gives a 1x1 distance matrix; take the scalar
    # explicitly instead of relying on the truthiness of a 1x1 array.
    distance = cdist(unit_embeddings[0], unit_embeddings[1], metric='euclidean')[0, 0]
    label = int(distance < threshold)
    correct += int(label == gold)

print(correct / len(sem15))
    

0.6822429906542056


### DistilBert Performance

In [85]:
# DistilBERT on SemEval-2015: predict "same sense" (1) when the Euclidean
# distance between the L2-normalised sentence embeddings is below `threshold`.
# NOTE(review): the threshold is hand-picked — confirm it was not tuned on
# this evaluation set.
threshold = 0.4
correct = 0
model_distil.eval()

# Iterate by position (zip) rather than by index label, so the loop does not
# depend on the frame having a default 0..n-1 index.
for sentence1, sentence2, gold in zip(sem15['sent1'], sem15['sent2'], sem15['ground_truth']):
    unit_embeddings = []
    for sentence in (sentence1, sentence2):
        tokens = tokenizer_distil(sentence, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
        with torch.no_grad():
            embedding = model_distil.get_embedding(
                tokens['input_ids'].to(device),
                tokens['attention_mask'].to(device),
            )
        embedding = embedding.cpu().numpy()
        # Scale each row to unit L2 norm before measuring distance.
        unit_embeddings.append(embedding / np.linalg.norm(embedding, axis=1, keepdims=True))

    # One sentence per side gives a 1x1 distance matrix; take the scalar
    # explicitly instead of relying on the truthiness of a 1x1 array.
    distance = cdist(unit_embeddings[0], unit_embeddings[1], metric='euclidean')[0, 0]
    label = int(distance < threshold)
    correct += int(label == gold)

print(correct / len(sem15))
    

0.7476635514018691


### GlossBert Performance

In [62]:
# Score GlossBERT on the SemEval-2015 pairs: build (sent1, sent2, label)
# records, batch them through the GlossBERT collate function, then evaluate.
glossbert_data = list(zip(
    sem15['sent1'].to_numpy(),
    sem15['sent2'].to_numpy(),
    sem15['ground_truth'].to_numpy(),
))
dataset = SentencePairDataset(glossbert_data)
dataloader = DataLoader(
    dataset,
    collate_fn=collate_fn_glossb,
    batch_size=2,
    shuffle=False,
)
model.to(device)
# Run classification and compute accuracy
results, accuracy = classify_and_evaluate(dataloader)
accuracy

0.5794392523364486

# SemEval 2013

In [64]:
# Read the SemEval-2013 sentence-pair table and preview its first five rows.
sem13_csv = r'C:\Users\amush\INLP_Project\Finetuning\semeval2013.csv'
sem13 = pd.read_csv(sem13_csv)
sem13.head()

Unnamed: 0,sent1,sent2,lemma,ground_truth
0,The U.N.-sponsored climate conference -- chara...,"Artur Runge-Metzger , who heads international ...",climate,0
1,"It gives a lot of flexibility to the process ,...",There is a lot of consensus between the Left a...,lot,0
2,"Together , the countries would cut emissions b...",Some of the countries most vulnerable to the i...,country,0
3,U.S. special climate envoy Todd Stern rejected...,U.S. firms were in some cases at a disadvantag...,u.s.,0
4,U.S. special climate envoy Todd Stern rejected...,Major U.S. firms such as Chevron and ConocoPhi...,u.s.,0


### TinyBert Performance

In [84]:
# TinyBERT on SemEval-2013: predict "same sense" (1) when the Euclidean
# distance between the L2-normalised sentence embeddings is below `threshold`.
# NOTE(review): the threshold is hand-picked — confirm it was not tuned on
# this evaluation set.
threshold = 0.01
correct = 0
model_tiny.eval()

# Iterate by position (zip) rather than by index label, so the loop does not
# depend on the frame having a default 0..n-1 index.
for sentence1, sentence2, gold in zip(sem13['sent1'], sem13['sent2'], sem13['ground_truth']):
    unit_embeddings = []
    for sentence in (sentence1, sentence2):
        tokens = tokenizer_tiny(sentence, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
        with torch.no_grad():
            embedding = model_tiny.get_embedding(
                tokens['input_ids'].to(device),
                tokens['attention_mask'].to(device),
            )
        embedding = embedding.cpu().numpy()
        # Scale each row to unit L2 norm before measuring distance.
        unit_embeddings.append(embedding / np.linalg.norm(embedding, axis=1, keepdims=True))

    # One sentence per side gives a 1x1 distance matrix; take the scalar
    # explicitly instead of relying on the truthiness of a 1x1 array.
    distance = cdist(unit_embeddings[0], unit_embeddings[1], metric='euclidean')[0, 0]
    label = int(distance < threshold)
    correct += int(label == gold)

print(correct / len(sem13))
    

0.7466666666666667


### DistilBert Performance

In [83]:
# DistilBERT on SemEval-2013: predict "same sense" (1) when the Euclidean
# distance between the L2-normalised sentence embeddings is below `threshold`.
# NOTE(review): the threshold is hand-picked — confirm it was not tuned on
# this evaluation set.
threshold = 0.4
correct = 0
model_distil.eval()

# Iterate by position (zip) rather than by index label, so the loop does not
# depend on the frame having a default 0..n-1 index.
for sentence1, sentence2, gold in zip(sem13['sent1'], sem13['sent2'], sem13['ground_truth']):
    unit_embeddings = []
    for sentence in (sentence1, sentence2):
        tokens = tokenizer_distil(sentence, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
        with torch.no_grad():
            embedding = model_distil.get_embedding(
                tokens['input_ids'].to(device),
                tokens['attention_mask'].to(device),
            )
        embedding = embedding.cpu().numpy()
        # Scale each row to unit L2 norm before measuring distance.
        unit_embeddings.append(embedding / np.linalg.norm(embedding, axis=1, keepdims=True))

    # One sentence per side gives a 1x1 distance matrix; take the scalar
    # explicitly instead of relying on the truthiness of a 1x1 array.
    distance = cdist(unit_embeddings[0], unit_embeddings[1], metric='euclidean')[0, 0]
    label = int(distance < threshold)
    correct += int(label == gold)

print(correct / len(sem13))
    

0.7733333333333333


In [68]:
# Score GlossBERT on the SemEval-2013 pairs: build (sent1, sent2, label)
# records, batch them through the GlossBERT collate function, then evaluate.
glossbert_data = list(zip(
    sem13['sent1'].to_numpy(),
    sem13['sent2'].to_numpy(),
    sem13['ground_truth'].to_numpy(),
))
dataset = SentencePairDataset(glossbert_data)
dataloader = DataLoader(
    dataset,
    collate_fn=collate_fn_glossb,
    batch_size=32,
    shuffle=False,
)
model.to(device)
# Run classification and compute accuracy
results, accuracy = classify_and_evaluate(dataloader)
accuracy

0.5533333333333333

# RAW-C (Related Words in Context)

In [69]:
# Load RAW-C, keep the four columns used below, and rename them to the
# lemma / sent1 / sent2 / ground_truth schema shared with the SemEval frames.
rawc_csv = r'C:\Users\amush\INLP_Project\Finetuning\raw-c.csv'
rawc = (
    pd.read_csv(rawc_csv)
    .loc[:, ['word', 'sentence1', 'sentence2', 'same']]
    .rename(columns={
        'word': 'lemma',
        'sentence1': 'sent1',
        'sentence2': 'sent2',
        'same': 'ground_truth',
    })
)
# Encode the label as 0 when the value equals False and 1 for anything else.
rawc['ground_truth']  = rawc['ground_truth'].apply(lambda x : 0 if x == False else 1)
rawc

Unnamed: 0,lemma,sent1,sent2,ground_truth
0,act,It was a desperate act.,It was a magic act.,0
1,act,It was a desperate act.,It was a comedic act.,0
2,act,It was a humane act.,It was a magic act.,0
3,act,It was a humane act.,It was a comedic act.,0
4,act,It was a desperate act.,It was a humane act.,1
...,...,...,...,...
667,yard,It was five yards.,They were cluttered yards.,0
668,yard,It was ten yards.,They were big yards.,0
669,yard,It was ten yards.,They were cluttered yards.,0
670,yard,It was five yards.,It was ten yards.,1


### TinyBert Performance

In [81]:
# TinyBERT on RAW-C: predict "same sense" (1) when the Euclidean distance
# between the L2-normalised sentence embeddings is below `threshold`.
# NOTE(review): the threshold is hand-picked — confirm it was not tuned on
# this evaluation set.
threshold = 0.01
correct = 0
model_tiny.eval()

# Iterate by position (zip) rather than by index label, so the loop does not
# depend on the frame having a default 0..n-1 index.
for sentence1, sentence2, gold in zip(rawc['sent1'], rawc['sent2'], rawc['ground_truth']):
    unit_embeddings = []
    for sentence in (sentence1, sentence2):
        tokens = tokenizer_tiny(sentence, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
        with torch.no_grad():
            embedding = model_tiny.get_embedding(
                tokens['input_ids'].to(device),
                tokens['attention_mask'].to(device),
            )
        embedding = embedding.cpu().numpy()
        # Scale each row to unit L2 norm before measuring distance.
        unit_embeddings.append(embedding / np.linalg.norm(embedding, axis=1, keepdims=True))

    # One sentence per side gives a 1x1 distance matrix; take the scalar
    # explicitly instead of relying on the truthiness of a 1x1 array.
    distance = cdist(unit_embeddings[0], unit_embeddings[1], metric='euclidean')[0, 0]
    label = int(distance < threshold)
    correct += int(label == gold)

print(correct / len(rawc))
    

0.6279761904761905


### DistilBert Performance

In [82]:
# DistilBERT on RAW-C: predict "same sense" (1) when the Euclidean distance
# between the L2-normalised sentence embeddings is below `threshold`.
# NOTE(review): the threshold is hand-picked — confirm it was not tuned on
# this evaluation set.
threshold = 0.4
correct = 0
model_distil.eval()

# Iterate by position (zip) rather than by index label, so the loop does not
# depend on the frame having a default 0..n-1 index.
for sentence1, sentence2, gold in zip(rawc['sent1'], rawc['sent2'], rawc['ground_truth']):
    unit_embeddings = []
    for sentence in (sentence1, sentence2):
        tokens = tokenizer_distil(sentence, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
        with torch.no_grad():
            embedding = model_distil.get_embedding(
                tokens['input_ids'].to(device),
                tokens['attention_mask'].to(device),
            )
        embedding = embedding.cpu().numpy()
        # Scale each row to unit L2 norm before measuring distance.
        unit_embeddings.append(embedding / np.linalg.norm(embedding, axis=1, keepdims=True))

    # One sentence per side gives a 1x1 distance matrix; take the scalar
    # explicitly instead of relying on the truthiness of a 1x1 array.
    distance = cdist(unit_embeddings[0], unit_embeddings[1], metric='euclidean')[0, 0]
    label = int(distance < threshold)
    correct += int(label == gold)

print(correct / len(rawc))
    

0.6934523809523809


In [80]:
# Score GlossBERT on the RAW-C pairs: build (sent1, sent2, label) records,
# batch them through the GlossBERT collate function, then evaluate.
glossbert_data = list(zip(
    rawc['sent1'].to_numpy(),
    rawc['sent2'].to_numpy(),
    rawc['ground_truth'].to_numpy(),
))
dataset = SentencePairDataset(glossbert_data)
dataloader = DataLoader(
    dataset,
    collate_fn=collate_fn_glossb,
    batch_size=32,
    shuffle=False,
)
model.to(device)
# Run classification and compute accuracy
results, accuracy = classify_and_evaluate(dataloader)
accuracy

0.6190476190476191