In [1]:
# Imports
import os
import math
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.cuda.amp import autocast, GradScaler
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import f1_score
from tqdm import tqdm
import torch.nn.functional as F
from sklearn.model_selection import train_test_split


In [2]:
# Dataset
class WSDSiameseDataset(Dataset):
    def __init__(self, sentence_pairs, labels, tokenizer, max_length=128):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentence_pairs)

    def __getitem__(self, idx):
        sent1, sent2 = self.sentence_pairs[idx]
        label = self.labels[idx]

        tokens_1 = self.tokenizer(sent1, padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt")
        tokens_2 = self.tokenizer(sent2, padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt")

        return {
            "input_ids_1": tokens_1["input_ids"].squeeze(0),
            "attention_mask_1": tokens_1["attention_mask"].squeeze(0),
            "input_ids_2": tokens_2["input_ids"].squeeze(0),
            "attention_mask_2": tokens_2["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.float),
        }

# Model
class SiameseBERT(nn.Module):
    def __init__(self, model_name):
        super(SiameseBERT, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.fc = nn.Linear(self.bert.config.hidden_size, 256)
        # Define cosine distance as a lambda function
        self.distance = lambda x, y: F.cosine_similarity(x, y)

    def get_embedding(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token
        proj = self.fc(cls_embedding)
        return F.normalize(proj, p=2, dim=1)  # L2-normalize for cosine

    def forward(self, input_ids_1, attention_mask_1, input_ids_2, attention_mask_2, labels):
        emb1 = self.get_embedding(input_ids_1, attention_mask_1)
        emb2 = self.get_embedding(input_ids_2, attention_mask_2)
        return labels - self.distance(emb1, emb2)  # returns tensor of distances

# Contrastive Loss
class ContrastiveLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, distance, label):
        loss = (1 - label) * distance.pow(2) + label * torch.clamp(self.margin - distance, min=0.0).pow(2)
        return loss.mean()

# Collate Function
def collate_fn(batch):
    return {
        "input_ids_1": torch.stack([item["input_ids_1"] for item in batch]),
        "attention_mask_1": torch.stack([item["attention_mask_1"] for item in batch]),
        "input_ids_2": torch.stack([item["input_ids_2"] for item in batch]),
        "attention_mask_2": torch.stack([item["attention_mask_2"] for item in batch]),
        "labels": torch.stack([item["label"] for item in batch]),
    }

# Evaluation

def evaluate(model, loader, loss_fn, device):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for batch in loader:
            distances = model(
                batch["input_ids_1"].to(device),
                batch["attention_mask_1"].to(device),
                batch["input_ids_2"].to(device),
                batch["attention_mask_2"].to(device),
                batch["labels"].to(device)
            )
            loss = loss_fn(distances, batch["labels"].to(device))
            total_loss += loss.item()
    return total_loss / len(loader)

# Prediction

def predict(model, loader, device):
    model.eval()
    all_distances, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            distances = model(
                batch["input_ids_1"].to(device),
                batch["attention_mask_1"].to(device),
                batch["input_ids_2"].to(device),
                batch["attention_mask_2"].to(device),
                batch["labels"].to(device)
            )
            all_distances.extend(distances.cpu().numpy())
            all_labels.extend(batch["labels"].cpu().numpy())
    return np.array(all_distances), np.array(all_labels)

# Metrics

def compute_accuracy_f1(distances, labels, threshold=0.0):
    preds = (distances > threshold).astype(int)
    print(labels)
    print(preds)
    accuracy = (preds == labels).mean()
    f1 = f1_score(labels, preds)
    return accuracy, f1


In [3]:
model_name = "huawei-noah/TinyBERT_General_4L_312D"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = SiameseBERT(model_name=model_name)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
loss_fn = ContrastiveLoss(margin=0.5)

2025-05-02 14:45:12.655652: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746197112.678141     109 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746197112.684885     109 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# Manually define a balanced list of 4 test samples (2 label 0 and 2 label 1)
balanced_test_samples = [
    {
        "Sentence 1": "The lawyer presented his case in court.",
        "Sentence 2": "The detective reviewed the case again to find new clues.",
        "Lemma": "case",
        "Label": 1
    },
    {
        "Sentence 1": "He closed the deal with a handshake.",
        "Sentence 2": "She found a great deal on shoes online.",
        "Lemma": "deal",
        "Label": 1
    },
    {
        "Sentence 1": "He went to the bank to deposit his paycheck.",
        "Sentence 2": "She visited the bank with her friend yesterday to withdraw.",
        "Lemma": "bank",
        "Label": 1
    },
    {
        "Sentence 1": "The chef prepared a delicious dish.",
        "Sentence 2": "The satellite captured a dish image from space.",
        "Lemma": "dish",
        "Label": 0
    },
    {
        "Sentence 1": "The coach gave a motivational speech before the game.",
        "Sentence 2": "The coach broke down on the highway.",
        "Lemma": "coach",
        "Label": 0
    },
    {
        "Sentence 1": "She dropped her ring on the floor.",
        "Sentence 2": "The ring of the phone startled everyone.",
        "Lemma": "ring",
        "Label": 0
    }
]

# Convert to DataFrame so it can be used easily
df_balanced_manual = pd.DataFrame(balanced_test_samples)
df_balanced_manual


Unnamed: 0,Sentence 1,Sentence 2,Lemma,Label
0,The lawyer presented his case in court.,The detective reviewed the case again to find ...,case,1
1,He closed the deal with a handshake.,She found a great deal on shoes online.,deal,1
2,He went to the bank to deposit his paycheck.,She visited the bank with her friend yesterday...,bank,1
3,The chef prepared a delicious dish.,The satellite captured a dish image from space.,dish,0
4,The coach gave a motivational speech before th...,The coach broke down on the highway.,coach,0
5,She dropped her ring on the floor.,The ring of the phone startled everyone.,ring,0


In [5]:
# =============================
# Train full data, test on sampled examples
# =============================

def train_and_eval(data_path, definition_col, model_save_name):
    # Step 1: Load and prepare training data
    df = pd.read_csv(data_path).rename(columns={definition_col: "Gloss"})
    df = df.dropna(subset=["Gloss"])
    df["Gloss"] = df["Gloss"].astype(str)
    sentence_pairs = list(zip(df["Context Sentence"], df["Gloss"]))
    labels = list(df["Label"])

    # Train/val split (90/10)
    train_pairs, val_pairs, train_labels, val_labels = train_test_split(
        sentence_pairs, labels, test_size=0.1, random_state=42
    )

    train_dataset = WSDSiameseDataset(train_pairs, train_labels, tokenizer)
    val_dataset = WSDSiameseDataset(val_pairs, val_labels, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

    # Step 2: Train the model
    model.to(device)
    scaler = GradScaler()
    best_val_loss = float('inf')

    for epoch in range(2):  # You can increase this
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc="Training"):
            optimizer.zero_grad()
            with autocast():
                distances = model(
                    batch["input_ids_1"].to(device),
                    batch["attention_mask_1"].to(device),
                    batch["input_ids_2"].to(device),
                    batch["attention_mask_2"].to(device),
                    batch["labels"].to(device)
                )
                loss = loss_fn(distances, batch["labels"].to(device))
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            total_loss += loss.item()

        val_loss = evaluate(model, val_loader, loss_fn, device)
        print(f"Training Loss: {total_loss/len(train_loader):.5f}, Validation Loss: {val_loss:.5f}")
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.bert.state_dict(), model_save_name)
            print(f"[INFO] Saved best model with val loss: {val_loss:.4f}")

    # Step 3: Evaluate on manually defined test set
    print("\n===== Manual Evaluation on Balanced Samples =====")
    test_pairs = list(zip(df_balanced_manual["Sentence 1"], df_balanced_manual["Sentence 2"]))
    test_labels = list(df_balanced_manual["Label"])

    test_dataset = WSDSiameseDataset(test_pairs, test_labels, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

    distances, labels = predict(model, test_loader, device)

    for i in range(len(df_balanced_manual)):
        print(f"\n🔹 Example {i+1}")
        print(f"Sentence 1: {df_balanced_manual.iloc[i]['Sentence 1']}")
        print(f"Sentence 2: {df_balanced_manual.iloc[i]['Sentence 2']}")
        print(f"Lemma: {df_balanced_manual.iloc[i]['Lemma']}")
        print(f"True Label: {df_balanced_manual.iloc[i]['Label']} | Predicted Distance: {distances[i]:.4f}")

    acc, f1 = compute_accuracy_f1(distances, labels)
    print(f"\n✅ Accuracy: {acc*100:.4f}%, F1 Score: {f1:.4f}")


In [6]:
# Run
train_and_eval("/kaggle/input/datasets-contrastive/context_gloss_pairs_mixed.csv", "Synset Gloss Definition", "shared_weights_gloss.pth")

  scaler = GradScaler()
  with autocast():
Training: 100%|██████████| 2226/2226 [03:08<00:00, 11.78it/s]


Training Loss: 0.00139, Validation Loss: 0.00002
[INFO] Saved best model with val loss: 0.0000

===== Manual Evaluation on Balanced Samples =====

🔹 Example 1
Sentence 1: The lawyer presented his case in court.
Sentence 2: The detective reviewed the case again to find new clues.
Lemma: case
True Label: 1 | Predicted Distance: 0.0187

🔹 Example 2
Sentence 1: He closed the deal with a handshake.
Sentence 2: She found a great deal on shoes online.
Lemma: deal
True Label: 1 | Predicted Distance: 0.0164

🔹 Example 3
Sentence 1: He went to the bank to deposit his paycheck.
Sentence 2: She visited the bank with her friend yesterday to withdraw.
Lemma: bank
True Label: 1 | Predicted Distance: 0.0126

🔹 Example 4
Sentence 1: The chef prepared a delicious dish.
Sentence 2: The satellite captured a dish image from space.
Lemma: dish
True Label: 0 | Predicted Distance: -0.9812

🔹 Example 5
Sentence 1: The coach gave a motivational speech before the game.
Sentence 2: The coach broke down on the hig

In [7]:
model_name = "huawei-noah/TinyBERT_General_4L_312D"
model = SiameseBERT(model_name=model_name)
model.bert.load_state_dict(torch.load("/kaggle/working/shared_weights_gloss.pth"))
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

  model.bert.load_state_dict(torch.load("/kaggle/working/shared_weights_gloss.pth"))


In [8]:
train_and_eval("/kaggle/input/datasets-contrastive/context_hypernym_pairs_mixed.csv", "Hypernym Gloss Definition", "shared_weights_gloss_hypernym.pth")


  scaler = GradScaler()
  with autocast():
Training: 100%|██████████| 1494/1494 [02:06<00:00, 11.85it/s]


Training Loss: 0.00012, Validation Loss: 0.00012
[INFO] Saved best model with val loss: 0.0001

===== Manual Evaluation on Balanced Samples =====

🔹 Example 1
Sentence 1: The lawyer presented his case in court.
Sentence 2: The detective reviewed the case again to find new clues.
Lemma: case
True Label: 1 | Predicted Distance: 0.0031

🔹 Example 2
Sentence 1: He closed the deal with a handshake.
Sentence 2: She found a great deal on shoes online.
Lemma: deal
True Label: 1 | Predicted Distance: 0.0017

🔹 Example 3
Sentence 1: He went to the bank to deposit his paycheck.
Sentence 2: She visited the bank with her friend yesterday to withdraw.
Lemma: bank
True Label: 1 | Predicted Distance: 0.0017

🔹 Example 4
Sentence 1: The chef prepared a delicious dish.
Sentence 2: The satellite captured a dish image from space.
Lemma: dish
True Label: 0 | Predicted Distance: -0.9973

🔹 Example 5
Sentence 1: The coach gave a motivational speech before the game.
Sentence 2: The coach broke down on the hig