<a href="https://colab.research.google.com/github/DharshiBalasubramaniyam/super-duper-rotary-phone/blob/main/dual-encoder-model-xlm-r/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

## Training

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import json
import random
import torch.nn.functional as F

In [None]:
# Pick the compute device once: the GPU if PyTorch can see one, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cpu


In [None]:
# Question tower (Tamil questions): tokenizer plus an encoder placed on `device`.
question_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
question_encoder = AutoModel.from_pretrained("xlm-roberta-base")
question_encoder = question_encoder.to(device)

# Passage tower (English passages): a second, independent copy of the same checkpoint,
# so the two encoders hold separate weights.
passage_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
passage_encoder = AutoModel.from_pretrained("xlm-roberta-base")
passage_encoder = passage_encoder.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# 2. Custom Dataset
class QADataset(Dataset):
    """Question/passage pairs loaded from a JSON file.

    The file is expected to hold a JSON array of objects, each with a
    "question" key and a "positive_passage" key. Any other keys are ignored.
    """

    def __init__(self, path):
        """Load the whole JSON file at ``path`` into memory.

        The file is decoded explicitly as UTF-8 so that non-ASCII text
        (e.g. Tamil questions) is read the same way regardless of the
        platform's default encoding.
        """
        with open(path, encoding="utf-8") as f:
            self.data = json.load(f)

    def __len__(self):
        """Return the number of question/passage pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(question, positive_passage)`` pair at position ``idx``."""
        entry = self.data[idx]
        return entry["question"], entry["positive_passage"]


In [None]:
def collate(batch):
    """Turn a list of (question, passage) pairs into two tokenized batches.

    Questions go through ``question_tokenizer`` and passages through
    ``passage_tokenizer``; both are padded, truncated to 128 tokens, returned
    as PyTorch tensors and moved to the global ``device``.

    Returns:
        A ``(question_tokens, passage_tokens)`` tuple of tokenizer outputs.
    """
    question_texts, passage_texts = zip(*batch)

    # Identical tokenization settings for both sides of the pair.
    tokenizer_kwargs = dict(return_tensors="pt", padding=True, truncation=True, max_length=128)

    question_tokens = question_tokenizer(list(question_texts), **tokenizer_kwargs).to(device)
    passage_tokens = passage_tokenizer(list(passage_texts), **tokenizer_kwargs).to(device)
    return question_tokens, passage_tokens

In [None]:
# Mount Google Drive at /content/drive (Colab only); the training cell reads its
# dataset from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 4. Dual Encoder Training Loop
# Hyperparameters, read as module-level globals by the training function.
batch_size = 10     # pairs per batch; the similarity matrix is batch_size x batch_size
no_of_epoches = 5   # number of full passes over the dataset
def train(temperature=0.05):
    """Fine-tune both encoders with an in-batch-negatives contrastive loss.

    Every batch holds ``batch_size`` (question, positive passage) pairs. Each
    question is scored against every passage in the batch; the passage at the
    same index is the positive and all others act as negatives.

    Args:
        temperature: Positive scalar the cosine-similarity logits are divided
            by before the cross-entropy loss. The embeddings are L2-normalised,
            so raw similarities are confined to [-1, 1]; with such a narrow
            range the softmax over the batch is close to uniform and the loss
            sits near ln(batch_size). A small temperature widens the logit
            range so the loss can actually separate positives from negatives.

    Returns:
        The ``(question_encoder, passage_encoder)`` pair. These are the same
        global module objects, updated in place.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")

    question_encoder.train()
    passage_encoder.train()
    dataset = QADataset("/content/drive/My Drive/Colab Notebooks/fyrp/ta_en.json")
    # drop_last=True keeps every similarity matrix square at batch_size x batch_size.
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate, drop_last=True)

    # A single optimizer updates the parameters of both encoders.
    optimizer = torch.optim.AdamW(list(question_encoder.parameters()) + list(passage_encoder.parameters()), lr=3e-5)

    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(no_of_epoches):
        for step, (q_inputs, p_inputs) in enumerate(dataloader):  # (q_inputs, p_inputs) returned by collate_fn
            # Clear gradients first so nothing left over from an interrupted
            # earlier run leaks into this step.
            optimizer.zero_grad()

            # Encode the inputs using the two encoders separately.
            # **q_inputs unpacks the tokenizer output into keyword arguments like input_ids, attention_mask.
            # .last_hidden_state[:, 0] takes the hidden state of the first token as the sentence representation.
            q_emb = question_encoder(**q_inputs).last_hidden_state[:, 0]
            p_emb = passage_encoder(**p_inputs).last_hidden_state[:, 0]

            # Unit-length embeddings: the dot product below is a cosine similarity.
            q_emb = F.normalize(q_emb, p=2, dim=1)
            p_emb = F.normalize(p_emb, p=2, dim=1)

            # q_emb: batch_size x embedding_dimension
            # p_emb: batch_size x embedding_dimension
            # sim_matrix: batch_size x batch_size, entry (i, j) scores question i against passage j.
            sim_matrix = torch.matmul(q_emb, p_emb.T) / temperature
            # Positives are on the diagonal: question i belongs with passage i.
            labels = torch.arange(sim_matrix.size(0), device=sim_matrix.device)
            loss = loss_fn(sim_matrix, labels)

            loss.backward()
            optimizer.step()

            print(f"Epoch: {epoch} Step: {step} Loss: {loss.item():.4f}")
    return question_encoder, passage_encoder

In [None]:
# Run training and rebind the notebook-level encoder names to the pair train() returns.
question_encoder, passage_encoder = train()

Epoch: 0 Step: 0 Loss: 2.3042
Epoch: 0 Step: 1 Loss: 2.3022
Epoch: 0 Step: 2 Loss: 2.3052
Epoch: 1 Step: 0 Loss: 2.3124
Epoch: 1 Step: 1 Loss: 2.3062
Epoch: 1 Step: 2 Loss: 2.3069
Epoch: 2 Step: 0 Loss: 2.2984
Epoch: 2 Step: 1 Loss: 2.3025
Epoch: 2 Step: 2 Loss: 2.3068
Epoch: 3 Step: 0 Loss: 2.3028
Epoch: 3 Step: 1 Loss: 2.3025
Epoch: 3 Step: 2 Loss: 2.3021
Epoch: 4 Step: 0 Loss: 2.3034
Epoch: 4 Step: 1 Loss: 2.3031
Epoch: 4 Step: 2 Loss: 2.3026


In [None]:
# Retrieval sanity check: rank a few English passages against one Tamil question.

# Make sure the models are in eval mode (disables dropout).
question_encoder.eval()
passage_encoder.eval()

# Sample Tamil question (change as needed).
# English gloss (approximate): "If one wants to adopt a child, what is the
# maximum age of a child who can be adopted?"
sample_question = "ஒரு குழந்தையை தத்தெடுக்க தேவைப்பட்டால் தத்தெடுக்கக்கூடிய குழந்தையின் அதிகபட்ச வயது என்ன?"

# Sample candidate English passages
candidate_passages = [
    "Paris is the capital of France.",
    "Age of the adopted child should not be more than 14 years",
    "The age of the applicant should be more than 25 years",
    "London is the capital of the United Kingdom."
]

# Tokenize and encode the question.
# Inputs must live on the same device as the encoders, otherwise the forward
# pass fails when the models are on the GPU.
q_inputs = question_tokenizer(sample_question, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
with torch.no_grad():
    q_emb = question_encoder(**q_inputs).last_hidden_state[:, 0]  # first-token embedding
    # Normalise exactly as in training so the score is a cosine similarity.
    q_emb = F.normalize(q_emb, p=2, dim=1)

# Tokenize and encode all candidate passages in one batch.
p_inputs = passage_tokenizer(candidate_passages, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
with torch.no_grad():
    p_emb = passage_encoder(**p_inputs).last_hidden_state[:, 0]  # first-token embeddings for all passages
    p_emb = F.normalize(p_emb, p=2, dim=1)

# Cosine similarity between the question and every passage.
scores = torch.matmul(q_emb, p_emb.T)  # shape: [1 x num_passages]

# Convert scores to CPU and numpy for easy handling
scores = scores.cpu().numpy().flatten()
print(scores)
# Find best passage
best_idx = scores.argmax()
print(f"Best passage (score={scores[best_idx]:.4f}): {candidate_passages[best_idx]}")

[348.0043  347.65564 347.50555 347.71603]
Best passage (score=348.0043): Paris is the capital of France.
