## Choose Devise

In [1]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


There are 1 GPU(s) available.
We will use the GPU: NVIDIA RTX A4000


## Import Libraries

In [2]:
pip install pytorch-transformers

Collecting pytorch-transformers
  Downloading pytorch_transformers-1.2.0-py3-none-any.whl (176 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.4/176.4 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 kB[0m [31m88.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25ldone
[?25h  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895242 sha256=f9e04d5191aa71bc96a3dad421d4ac3ac0f2c8d3ca3eeaf47794a50f580ffad8
  Stored in directory: /root/.cache/pip/wheels/42/79/78/5ad3b042cb2d97c294535162cdbaf9b167e3b186eae55ab72d
Successfully built sacremoses
Installing collected packages: sacremoses, pytorch-transformers
Successfully installed pytorch-transformers-1.2.0 sacr

In [3]:
!pip install transformers

[0m

In [4]:
import torch
import os
import string
import copy
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import classification_report
from pytorch_transformers import *
import numpy as np
import json
import collections
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd

In [5]:
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

## Import Dataset

In [6]:
#load_data basically reads in data --> takes in everything from jsonl files
def load_data(filename):
    data = []
    # read in each line and add it to list
    with open(filename, mode = "r") as file:
        for line in file:
            data.append(json.loads(line))
    return data


train_json_objs = load_data("train.jsonl")

sentences1=[]
sentences2=[]
words=[]
labels=[]
for i in range(0,len(train_json_objs)):
    sentences1.append(train_json_objs[i]['sentence1'])
    sentences2.append(train_json_objs[i]['sentence2'])
    words.append(train_json_objs[i]['word'])
    labels.append(train_json_objs[i]['label'])

print(len(sentences1))

5428


## Import Model & Model Tokenizer | Model Definition

### List of models present in the experiment:
The below models specified were used for the purpose to understand how the models perform when trained and tested on the Hindi Language:

* bert-base-multilingual-cased

* xlm-roberta-base

Please remove the # character to run the chosen model.

In [7]:
#tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
model_name='bert-base-multilingual-cased'
print(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

xlm-roberta-base


Downloading config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading sentencepiece.bpe.model:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

In [8]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels = 2, # The number of output labels--2 for binary classification.
    # output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = True, # Whether the model returns all hidden-states.
)
model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

## Create Train Dataset

The below functions will find the range of indexes of the target words from the two context sentences

In [9]:
def find_indexes_before(list1, list2):
    index = 0
    while index <= len(list1) - len(list2):
        if list1[index:index + len(list2)] == list2:
            return list(range(index, index + len(list2)))
        index += 1
    return []

def find_indexes_after(list1, before_length,list2):
    index = 0
    while index <= len(list1) - len(list2):
        if list1[index:index + len(list2)] == list2:
            #print(index)
            return list(range(index +before_length, index +before_length +  len(list2)))
        index += 1
    return []

In [15]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [16]:
# This step is done to preprocess th target words in the English WiC dataset
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

The below function creates the dataset in the required format that is it contains a dictionary of lists with:
1. Input IDs -  sequence of integer tokens that represent the input text. Each token in the text is mapped to a unique integer ID based on a tokenizer. BERT uses a fixed-size vocabulary, and each token in the text is converted to its corresponding ID from the vocabulary.

2. Attention mask - binary mask tensor that indicates which tokens in the input should be attended to (receive attention) and which tokens should be ignored. It is used to handle variable-length input sequences. The mask has the same length as the input sequence and contains 0s and 1s

3. Target Word Location in sentence 1 - used to extract embeddings of target word from sentence 1

4. Target Word Location in sentence 2 - used to extract embeddings of target word from sentence 2

5. Labels

In [17]:

def create_data_set(sentences1,sentences2,target_word,labels):
    wic_padded=[]
    missed=[]
    for i in range(0,len(sentences1)):
        lemmatized_tokens=[]
        tokens = word_tokenize(sentences1[i])
        for t1 in tokens:
            lemma = lemmatizer.lemmatize(t1)
            lemmatized_tokens.append(lemma.lower())
        lemmatized_sentence1 = ' '.join(lemmatized_tokens)

        lemmatized_tokens=[]
        tokens = word_tokenize(sentences2[i])
        for t1 in tokens:
            lemma = lemmatizer.lemmatize(t1)
            lemmatized_tokens.append(lemma.lower())
        lemmatized_sentence2 = ' '.join(lemmatized_tokens)

        #print(words[i])
        #sentence = f"[CLS] {lemmatized_sentence1} [SEP] {lemmatized_sentence2} [SEP]"
        sentence = f"<s> {sentences1[i]}</s><s>{sentences2[i]}</s>"
        #print(sentence)
        tokens=tokenizer(sentence, add_special_tokens=False,pad_to_max_length=True,
                  truncation=True,max_length=512)
        input_ids = tokens["input_ids"]
        attention_mask = tokens["attention_mask"]
        #print(input_ids)
        target_word=words[i]
        target_token = tokenizer.encode(target_word)
        target_token=target_token[1:-1]

        #For XLM-RoBERTa
        s_token_index = tokenizer.convert_tokens_to_ids('</s>')
        sep_occurrences = [index for index, token_id in enumerate(input_ids) if token_id == s_token_index]

        #ANY OTHER MODEL
        #sep_occurrences = [index for index, token_id in enumerate(input_ids) if token_id == tokenizer.sep_token_id]

        if len(sep_occurrences)!=0:
            sep_index = sep_occurrences[0]

            tokens_before_sep = input_ids[:sep_index]
            tokens_after_sep = input_ids[sep_index + 1:]

            is_present1= str(target_token).replace("[","").replace("]","") in str(tokens_before_sep)
            is_present2= str(target_token).replace("[","").replace("]","") in str(tokens_after_sep)

            if is_present1!=False and is_present2!=False:
                target_word_ids_before_sep = find_indexes_before(tokens_before_sep,target_token)
                target_word_ids_after_sep = find_indexes_after(tokens_after_sep,len(tokens_before_sep)-1,target_token)
                if labels[i]==False:
                    label=0
                else:
                    label=1

                mask_tensor_sent1 = torch.zeros_like(torch.tensor(input_ids))
                mask_tensor_sent1[target_word_ids_before_sep] = 1
                mask_tensor_sent2 = torch.zeros_like(torch.tensor(input_ids))
                mask_tensor_sent2[target_word_ids_after_sep] = 1
                sample_data = {
                                "input_ids": torch.tensor(input_ids),
                                "attention_mask": torch.tensor(attention_mask),
                                "word1_locs": mask_tensor_sent1,
                                "word2_locs": mask_tensor_sent2,
                                "labels": torch.tensor(label),
                                "sentence": sentence,
                                "target_word":words[i]
                            }

                # Append the data for the current sample to the list
                wic_padded.append(sample_data)
            else:missed.append([sentence, words[i]])
        else:missed.append([sentence, words[i]])
    return wic_padded,missed

In [18]:
#create training dataset

wic_train_set,l1 = create_data_set(sentences1,sentences2,words,labels)



In [19]:
BATCH_SIZE = 16

In [20]:
train_labels=[]
for i in wic_train_set:
    x=i["labels"].item()
    train_labels.append(x)


In [21]:
for i in wic_train_set:
    x=i["word1_locs"]
    y=i["word2_locs"]
    print("sentence: ",i["sentence"])
    print("target_word: ",i["target_word"])
    print("word1 location: ",x)
    print("word2 location: ",y)
    break

sentence:  <s> Do you want to come over to my place later?</s><s>A political system with no place for the less prominent groups.</s>
target_word:  place
word1 location:  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0

In [22]:
train_data = TensorDataset(
    torch.stack([sample["input_ids"] for sample in wic_train_set]),
    torch.stack([sample["attention_mask"] for sample in wic_train_set]),
    torch.stack([sample["word1_locs"] for sample in wic_train_set]),
    torch.stack([sample["word2_locs"] for sample in wic_train_set]),
    torch.stack([sample["labels"] for sample in wic_train_set])
)

# Create a sampler and loader
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

## Create Validation Dataset

In [23]:
#load_data basically reads in data --> takes in everything from jsonl files
def load_data(filename):
    data = []
    # read in each line and add it to list
    with open(filename, mode = "r") as file:
        for line in file:
            data.append(json.loads(line))
    return data


train_json_objs = load_data("val.jsonl")

sentences1=[]
sentences2=[]
words=[]
labels=[]
for i in range(0,len(train_json_objs)):
    sentences1.append(train_json_objs[i]['sentence1'])
    sentences2.append(train_json_objs[i]['sentence2'])
    words.append(train_json_objs[i]['word'])
    labels.append(train_json_objs[i]['label'])

print(len(sentences1))

638


In [24]:
#create validation dataset

wic_val_set,l1 = create_data_set(sentences1,sentences2,words,labels)

In [25]:

val_data = TensorDataset(
    torch.stack([sample["input_ids"] for sample in wic_val_set]),
    torch.stack([sample["attention_mask"] for sample in wic_val_set]),
    torch.stack([sample["word1_locs"] for sample in wic_val_set]),
    torch.stack([sample["word2_locs"] for sample in wic_val_set]),
    torch.stack([sample["labels"] for sample in wic_val_set])
)

# Create a sampler and loader
val_sampler = RandomSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=BATCH_SIZE)

## Model Architecture

In [26]:

class WiC_Head(torch.nn.Module):
    def __init__(self, model_used,weights,embedding_size=768):
        super(WiC_Head, self).__init__()
        self.model=model_used
        self.embedding_size = embedding_size
        self.linear_diff = torch.nn.Linear(embedding_size, 100, bias=True)
        self.linear_seperator = torch.nn.Linear(100, 2, bias=True)
        self.loss = torch.nn.CrossEntropyLoss(weight=weights)
        self.activation = torch.nn.ReLU()
        self.softmax = torch.nn.Softmax()

        self.to(device)
        self.linear_diff.to(device)
        self.loss.to(device)
        self.linear_seperator.to(device)
        self.activation.to(device)
        self.softmax.to(device)

    def forward(self, input_ids=None, attention_mask=None, word1_locs=None, word2_locs=None, labels=None):
        batch_size = input_ids.shape[0]
        input_ids_tensor = input_ids.to(device)
        attention_mask_tensor = attention_mask.to(device)
        word1_locs = word1_locs.to(device)
        word1_locs=word1_locs.unsqueeze(1)

        word2_locs = word2_locs.to(device)
        word2_locs=word2_locs.unsqueeze(1)

        outputs=model(input_ids_tensor,attention_mask_tensor)

        token_embeddings=outputs.hidden_states[-1]

        token_embeddings = torch.squeeze(token_embeddings, dim=0)

        word1_embs=torch.matmul(word1_locs.float(),token_embeddings.float()).view(batch_size, self.embedding_size)

        word2_embs=torch.matmul(word2_locs.float(),token_embeddings.float()).view(batch_size, self.embedding_size)

        diff = word1_embs - word2_embs

        layer1_results = self.activation(self.linear_diff(diff))
        logits = self.softmax(self.linear_seperator(layer1_results))
        if labels is not None:
            loss = self.loss(logits.view(-1, 2).to(device), labels.view(-1).to(device))
            outputs = (loss, logits)
        return outputs


We need to ensure that both classes are assigned weights accordingly in case any class in our dataset over represented

In [27]:
from sklearn.utils.class_weight import compute_class_weight

#compute the class weights
class_wts = compute_class_weight('balanced', classes=[0,1], y=train_labels)

print(class_wts)

weights= torch.tensor(class_wts,dtype=torch.float)
weights = weights.to(device)

[1.07196198 0.93709199]


In [28]:
class_model = WiC_Head(model, weights,embedding_size = 768)

class_model.to(device)

WiC_Head(
  (model): XLMRobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(250002, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0): RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (dense): Linear(in

## Optimization Function

I used an improved version of the Adam Optimizer called AdamW for the below reasons:

AdamW is an adaptation of the Adam optimizer, designed to incorporate the weight decay (L2 regularization) term directly into the optimization process. The "W" in AdamW stands for "weight decay." 

In [29]:

param_optimizer = list(class_model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=1e-5)

## Fine Tune the Model

In [30]:
torch.cuda.empty_cache()

In [None]:
BATCH_SIZE = 16 #decreased the size until the CPU stops dying
EPOCHS = 10 #could do more for higher accuracy buts takes too long

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels, return_predict_correctness = False):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    if return_predict_correctness:
        return np.sum(pred_flat == labels_flat) / len(labels_flat), pred_flat == labels_flat
    else:
        return np.sum(pred_flat == labels_flat) / len(labels_flat)

best_weights = class_model.state_dict()
logits_train=[]
labels_train=[]
logits_test=[]
labels_test=[]

# maximize from 0
max_val_acc = (0, 0)

# Store our loss and accuracy for plotting
train_loss=[]
train_accuracy=[]
val_loss=[]
val_accuracy=[]
epoch_number = 0

while epoch_number < EPOCHS:
    epoch_number += 1
    print(f"Training epoch #{epoch_number}")

    # Tracking variables
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    # Training
    class_model.train()

    #class_model.embedder.requires_grad_ = False
    # Train the data for each epoch
    for step, batch in enumerate(train_dataloader):
        b_input_ids, b_input_mask, b_word1, b_word2, b_labels = batch
        #reset gradient
        optimizer.zero_grad()
        # get input and compute loss
        b_input_ids = b_input_ids.to(device)
        b_input_mask = b_input_mask.to(device)
        b_word1 = b_word1.to(device)
        b_word2 = b_word2.to(device)
        b_labels=b_labels.to(device)
        loss, logits = class_model(input_ids=b_input_ids, attention_mask=b_input_mask, word1_locs = b_word1, word2_locs = b_word2,labels=b_labels)
        torch.cuda.empty_cache()
        # get gradient
        loss.backward()
        #accelerator.backward(loss)
        # Update model
        optimizer.step()

        logits = logits.detach().cpu().numpy()
        logits_train.append(logits)
        label_ids = b_labels.cpu().numpy()
        labels_train.append(label_ids)
        # Calculate the accuracy
        b_accuracy = flat_accuracy(logits, label_ids) # For RobertaForClassification
        # Append to fit history
        train_loss.append(loss.item())
        train_accuracy.append(b_accuracy)
        # Update tracking variables
        tr_loss += loss.item()
        tr_accuracy += b_accuracy
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Training:\n\tLoss: {}; Accuracy: {}".format(tr_loss/nb_tr_steps, tr_accuracy/nb_tr_steps))

    # Validation
    class_model.eval()
    # Evaluate data for one epoch
    for batch in val_dataloader:
        b_input_ids, b_input_mask, b_word1, b_word2, b_labels  = batch
        # don't store gradients
        with torch.no_grad():
          # get input and compute loss
            loss, logits = class_model(b_input_ids, attention_mask=b_input_mask, word1_locs = b_word1, word2_locs = b_word2,labels=b_labels)
            #print(logits)
        logits = logits.detach().cpu().numpy()
        logits_test.append(logits)
        label_ids = b_labels.cpu().numpy()
        labels_test.append(label_ids)
        # Calculate the accuracy
        b_accuracy = flat_accuracy(logits, label_ids) # For RobertaForClassification

        # Append to fit history
        val_loss.append(loss.item())
        val_accuracy.append(b_accuracy)
        # Update tracking variables
        eval_loss += loss.item()
        eval_accuracy += b_accuracy
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1

    eval_acc = eval_accuracy/nb_eval_steps
    if eval_acc >= max_val_acc[0]:
        max_val_acc = (eval_acc, epoch_number)

    print("Validation:\n\tLoss={}; Accuracy: {}".format(eval_loss/nb_eval_steps, eval_accuracy/nb_eval_steps))
print(f"Best Validation accuracy ({max_val_acc[0]}) obtained at epoch #{max_val_acc[1]}.")
# Reload the best weights (from memory)
class_model.load_state_dict(best_weights)

Training epoch #1


  logits = self.softmax(self.linear_seperator(layer1_results))


Training:
	Loss: 0.6924933300475882; Accuracy: 0.48874158249158245
Validation:
	Loss=0.6908018016815185; Accuracy: 0.5375
Training epoch #2


In [38]:
PATH = 'saved_weights_xlmr_1.pt'
torch.save(class_model.state_dict(), PATH)

## Test the Model

In [39]:
# Load the dataset into a pandas dataframe.
df = pd.read_csv("hindi-wsd_test.csv")

print(df.head(10))
df1=df.sample(2000)


sentences1= df1.context_instance1.values
sentences2= df1.context_instance2.values
words= df1.target_word.values
labels = df1.labels.values

  target_word  word_index                                  context_instance1  \
0       उत्तर           5   जमशेदपुर : केंद्रीय माध्यमिक शिक्षा बोर्ड की ...   
1         अंग           1   हाथ के किसी उपकरण (औजार) से किसी चीज को इच्छि...   
2         कलम           8   कलम ो को अलगअलग गुच्छों में बांध लेते हैं और ...   
3        कमान           7   इंका और हरियाणा संघर्ष समिति के बीच निर्णायक ...   
4         मूल          49   ज्येष्ठा मूल या अश्विनी नक्षत्र में जन्म लेने...   
5         लाल          50   लाल रंग खेल कूद में आपकी क्षमता को भी बढाने व...   
6          दर          38   महीने में खुले बाजार में बिकने वाली चीनी के थ...   
7       ग्राम          18   भौतिक डिलीवरी के बारे में सिन्हा ने बताया कि ...   
8         तिल          32                                               नाक:   
9          मत          46   घटनाक्रम के अन्तर्गत बुद्ध को विभिन्न मुद्राओ...   

                                   context_instance2 start1 end1 start2 end2  \
0   दिशाओं का निर्धारण उत्तर से होना चा

The below section is separate from the above one as the English dataset and Hindi dataset are preprocessed differently

In [40]:
def find_indexes_before(list1, list2):
    index = 0
    while index <= len(list1) - len(list2):
        if list1[index:index + len(list2)] == list2:
            return list(range(index, index + len(list2)))
        index += 1
    return []

def find_indexes_after(list1, before_length,list2):
    index = 0
    while index <= len(list1) - len(list2):
        if list1[index:index + len(list2)] == list2:
            #print(index)
            return list(range(index +before_length, index +before_length +  len(list2)))
        index += 1
    return []

def create_data_set(sentences1,sentences2,target_word,labels):
    wic_padded=[]
    for i in range(0,len(sentences1)):

        #print(words[i])
        #sentence = f"[CLS] {sentences1[i]} [SEP] {sentences2[i]} [SEP]"
        sentence = f"<s> {sentences1[i]}</s><s>{sentences2[i]}</s>"
        #print(sentence)
        tokens=tokenizer(sentence, add_special_tokens=False,pad_to_max_length=True,
                  truncation=True,max_length=512)
        input_ids = tokens["input_ids"]
        attention_mask = tokens["attention_mask"]
        #print(input_ids)
        target_word=words[i]
        target_token = tokenizer.encode(target_word)
        target_token=target_token[1:-1]
        
        sep_occurrences = [index for index, token_id in enumerate(input_ids) if token_id == tokenizer.sep_token_id]

        #print(tokenizer.sep_token_id)
        if len(sep_occurrences)!=0:
            sep_index = sep_occurrences[0]

            tokens_before_sep = input_ids[:sep_index]
            tokens_after_sep = input_ids[sep_index + 1:]
            #print(tokens_before_sep,tokens_after_sep)
            is_present1= str(target_token).replace("[","").replace("]","") in str(tokens_before_sep)
            is_present2= str(target_token).replace("[","").replace("]","") in str(tokens_after_sep)
            #print(target_token)
            #print(is_present1,is_present2)
            if is_present1!=False and is_present2!=False:
                target_word_ids_before_sep = find_indexes_before(tokens_before_sep,target_token)
                target_word_ids_after_sep = find_indexes_after(tokens_after_sep,len(tokens_before_sep)-1,target_token)


                mask_tensor_sent1 = torch.zeros_like(torch.tensor(input_ids))
                mask_tensor_sent1[target_word_ids_before_sep] = 1
                mask_tensor_sent2 = torch.zeros_like(torch.tensor(input_ids))
                mask_tensor_sent2[target_word_ids_after_sep] = 1
                sample_data = {
                                "input_ids": torch.tensor(input_ids),
                                "attention_mask": torch.tensor(attention_mask),
                                "word1_locs": mask_tensor_sent1,
                                "word2_locs": mask_tensor_sent2,
                                "labels": torch.tensor(labels[i]),
                                "sentence": sentence,
                                "target_word":words[i]
                            }

                # Append the data for the current sample to the list
                wic_padded.append(sample_data)
    return wic_padded

In [41]:
#create test dataset

wic_test_set = create_data_set(sentences1,sentences2,words,labels)



In [42]:
test_data = TensorDataset(
    torch.stack([sample["input_ids"] for sample in wic_test_set]),
    torch.stack([sample["attention_mask"] for sample in wic_test_set]),
    torch.stack([sample["word1_locs"] for sample in wic_test_set]),
    torch.stack([sample["word2_locs"] for sample in wic_test_set]),
    torch.stack([sample["labels"] for sample in wic_test_set])
)

# Create a sampler and loader
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)

In [43]:
print(len(wic_test_set))

1930


In [44]:
#load weights of best model
path = 'saved_weights_muril.pt'
class_model.load_state_dict(torch.load(path))

FileNotFoundError: ignored

In [45]:
# get predictions for test data
class_model.eval()
total_preds=[]
test_labels1=[]
with torch.no_grad():
    for batch in test_dataloader:
        batch = [t.to(device) for t in batch]
        test_ids, test_mask, test_word1, test_word2, test_labels = batch
        _,logits = class_model(test_ids, test_mask, test_word1, test_word2, test_labels)
        logits=logits.detach().cpu().numpy()
        test_labels1.append(test_labels.detach().cpu().numpy())
        #print(logits)
        preds = np.argmax(logits, axis=1).flatten()
        total_preds.append(preds)

  logits = self.softmax(self.linear_seperator(layer1_results))


In [46]:
total_preds1=[]
for i in total_preds:
    for i1 in i:
        total_preds1.append(i1)

print(len(total_preds1))

test_labels2=[]
for i in test_labels1:
    for i1 in i:
        test_labels2.append(i1)

print(len(test_labels2))

1930
1930


In [47]:
# model's performance
#preds = np.argmax(preds, axis = 1)
print(classification_report(test_labels2, total_preds1))

              precision    recall  f1-score   support

           0       0.64      0.50      0.56      1094
           1       0.49      0.63      0.55       836

    accuracy                           0.56      1930
   macro avg       0.57      0.57      0.56      1930
weighted avg       0.58      0.56      0.56      1930

