# Libraries

## Install modules if necessary

In [1]:
!pip install transformers
!pip install xmltodict
!pip install pandas
!pip install numpy
!pip3 install torch==1.10.0+cu102 torchvision==0.11.1+cu102 torchaudio===0.10.0+cu102 -f https://download.pytorch.org/whl/cu102/torch_stable.html
!pip install nltk
!pip install sklearn

Looking in links: https://download.pytorch.org/whl/cu102/torch_stable.html


## Import Modules

In [2]:
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
import torch
from torch import nn
import nltk
from nltk.corpus import wordnet as wn
from sklearn.metrics import f1_score
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hms17\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
import transformers as trf
import random, time, datetime, warnings, re
from os.path import exists as pathExists
from os import rename as renameFile
from collections import defaultdict
warnings.filterwarnings('ignore') 

In [4]:
dev = torch.device('cuda:0')

# Data Preparation

## Loading Datasets

In [5]:
# This function reads a dataset xml file and its gold.key.txt file, and 
def loadDataset(path):
  data = ET.parse(path + '.data.xml')
  with open(path + '.gold.key.txt') as file:
      labels =  [line.rstrip() for line in file.readlines()]
  root = data.getroot()
  dataset = []
  for doc in root:
    for raw_sent in doc:
      whole_sentence = []
      instances = [(i, x) for i, x in enumerate(raw_sent) if x.tag == 'instance']
      for term in raw_sent:
        whole_sentence.append(term.text.lower())
      whole_sentence = ' '.join(whole_sentence)
      for idx, inst in instances:
        gold_label = labels.pop(0).split()
        assert(gold_label[0] == inst.attrib['id'])
        all_senses = [lemma for sense in wn.synsets(inst.text) for lemma in sense.lemmas()]
        try:
          label = torch.tensor([ [x.key() for x in all_senses].index(gold_label[1]) ])
        except ValueError:
          continue

        if any(c in set(".-\\/~()") for c in inst.text): continue
        dataset.append({
            'sentence': whole_sentence,
            'idx': whole_sentence.split().index(inst.text.lower()),
            'polyseme': inst.text.lower(),
            'lemma' : inst.attrib['lemma'],
            'senses': all_senses,
            'label': label
        })
  return dataset

In [6]:
training = loadDataset('Datasets/Training/SemCor/semcor')
validation = loadDataset('Datasets/Validation/semeval2007/semeval2007')
testing = {
    'SE2': loadDataset('Datasets/Testing/senseval2/senseval2'),
    'SE3': loadDataset('Datasets/Testing/senseval3/senseval3'),
    'SE13': loadDataset('Datasets/Testing/semeval2013/semeval2013'),
    'SE15': loadDataset('Datasets/Testing/semeval2015/semeval2015')
}


## DistilBERT Model

In [7]:
# BERT-base
tokenizer = trf.DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, padding=True, do_lower_case=True)
config = trf.DistilBertConfig.from_pretrained('distilbert-base-uncased', output_hidden_states=True)
bert_model = trf.DistilBertModel.from_pretrained('distilbert-base-uncased', config=config)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Classifier

## Helper functions

In [8]:
def getEncodedIndeces(text, idx):
  encodings = [(x, tokenizer(x, add_special_tokens=False)['input_ids']) for x in text.split()]
  start, end = 0, 0
  for i, encoding in enumerate(encodings):
    if i == idx:
      start += 1
      end = start + len(encoding[1])
      return (start, end)
    else:
      start += len(encoding[1])

cached_output = ('', None)
def getFeatureVec(data):
  global cached_output
  text = data['sentence']
  if text == cached_output[0]:
    hidden_states = cached_output[1]
  else:
    encoded_input = tokenizer(text, return_tensors='pt')
    output = bert_model(**encoded_input)
    hidden_states = output[1][0][0]
    cached_output = (text, hidden_states.clone().detach())
  s, e = getEncodedIndeces(text, data['idx'])
  avg = torch.mean(hidden_states[s:e], 0)
  return avg

In [79]:
getFeatureVec(training[1]).shape

torch.Size([768])

In [9]:
def evaluate(model, dataset):
  loss_function = nn.CrossEntropyLoss().to(dev)
  loss = 0.0
  f1 = 0
  predictions = defaultdict(list)
  labels = defaultdict(list)
  model.eval().cuda()
  with torch.no_grad():
    for i, data in enumerate(dataset):
      feature_vec = getFeatureVec(data).to(dev)
      label = data['label'].to(dev)
      outputs = torch.reshape(model(feature_vec.to(dev), data), (1, -1))
      loss = loss_function(outputs, label)
      predictions[data['polyseme']].append(outputs)
      labels[data['polyseme']].append(label)
      
      loss += loss.item()

      if i % 100 == 99:
        print("{}/{} complete.".format(i+1, len(dataset)))
    
    all_f1_scores = []
    f1_weights = []
    for polyseme in labels.keys():
      preds_tensor = torch.stack(predictions[polyseme])
      preds_tensor = torch.argmax(preds_tensor.reshape((-1, preds_tensor.shape[-1])), dim=1)
      labels_tensor = torch.stack(labels[polyseme]).reshape((-1,))
      all_f1_scores.append(f1_score(labels_tensor.cpu(), preds_tensor.cpu(), average='weighted'))
      f1_weights.append(len(labels_tensor))
    
    avgLoss = loss / len(dataset)
    f1 = np.average(all_f1_scores, weights=f1_weights) * 100
    return avgLoss, f1

In [10]:
# Code for sanity-checking that getEncodedIndeces extracts the correct tokens.

# for data in training:
#     f = getFeatureVec(data)
#     # print(data['sentence'], "| Definition of:", data['polyseme'])
#     # data['senses'][data['label']].synset().definition()
#     encodings = {x:tokenizer(x, add_special_tokens=False)['input_ids'] for x in data['sentence'].split()}
#     idxs = getEncodedIndeces(data['sentence'], data['idx'])
#     # print(encodings)
#     extracted_token = tokenizer(data['sentence'])['input_ids'][idxs[0]:idxs[1]]
#     # print(extracted_token)
#     try:
#         assert(encodings[data['polyseme'].lower()] == extracted_token)
#     except:
#         print(encodings)
#         print(data['polyseme'], encodings[data['polyseme'].lower()])
#         print(extracted_token)
#         break

## MLP Neural Net Structure

In [11]:
class MLP_Classifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.H = 768
        self.L1 = nn.Linear(self.H, self.H)
        self.ReLU = nn.ReLU()
        self.L2 = nn.ModuleDict({})

    def forward(self, features, data):
        x = self.L1(features)
        x = self.ReLU(x)

        polyseme, numSenses = "polyseme_" + data['polyseme'], len(data['senses'])
        if polyseme not in self.L2: 
          self.L2.update({polyseme: nn.Linear(self.H, numSenses).to(dev)})
        
        x = self.L2[polyseme](x)
        
        return x

## Load model if it is present

In [27]:
def loadModel(path):
    if pathExists(path):
        print("Saved model found. Loading saved model")
        
        # This code makes sure the saved model is loaded properly, including the ModuleDict which stores the nn.Linear() layer for each polyseme
        combined_state_dict = torch.load(path)
        L1_state_dict = {k:v for k,v in combined_state_dict.items() if k in ['L1.weight', 'L1.bias']}
        L2_state_dict = {k:v for k,v in combined_state_dict.items() if k not in ['L1.weight', 'L1.bias']}

        mlp = MLP_Classifier().to(dev)
        loaded_L2 = nn.ModuleDict({})
        all_polysemes = set([re.search('L2\.(.*)\.', polyseme).group(1) for polyseme in L2_state_dict.keys()])
        for polyseme in all_polysemes:
            num_senses = len([lemma for sense in wn.synsets(polyseme.split('_')[1]) for lemma in sense.lemmas()])
            layer = nn.Linear(mlp.H, num_senses)
            weights = L2_state_dict['L2.{}.weight'.format(polyseme)]
            biases = L2_state_dict['L2.{}.bias'.format(polyseme)]
            layer.load_state_dict({
                'weight': weights,
                'bias': biases
            })
            loaded_L2.update({polyseme: layer.to(dev)})

        mlp.load_state_dict(L1_state_dict)
        mlp.L2 = loaded_L2

        print("Saved model loaded.")
        return mlp
    else:
        print("No saved model found.")
        return False

In [28]:
do_training = False
saved_model_path = "wsd_model_48.96_f1.pth"
mlp = loadModel(saved_model_path)
if not mlp: do_training = True

Saved model found. Loading saved model
Saved model loaded.


## Training and Validation

In [30]:
# Only do training if there is no saved model
if do_training:
  # Initialize the MLP
  mlp = MLP_Classifier().to(dev)

  # Define the loss function, optimizer, and scheduler (to reduce learning rate)
  loss_function = nn.CrossEntropyLoss().to(dev)
  optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-3)
  scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=(lambda epoch: 1/(epoch+1)))

  min_valid_loss = np.inf
  max_f1_score = 0

  # Run the training loop
  for epoch in range(10):
    print(f'Starting epoch {epoch+1}')
    
    # Each epoch, the model trains on half of the training dataset. 
    # The first epoch will train on the first half, then the second epoch will train on the second half, and so on, alternating every epoch.
    halfway_idx = int(len(training) / 2)
    if epoch % 2 == 0:
      training_subset = training[:halfway_idx]
    else:
      training_subset = training[halfway_idx:]

    # training_subset = training_subset[:1000]
    # Training loop:
    train_loss = 0.0
    batch_loss = 0.0
    batchStart = time.time()
    mlp.train().cuda()
    for i, data in enumerate(training_subset):
      feature_vec = getFeatureVec(data).to(dev)
      label = data['label'].to(dev)
      optimizer.zero_grad()
      outputs = torch.reshape(mlp(feature_vec.to(dev), data), (1, -1))
      loss = loss_function(outputs, label)
      loss.backward()
      optimizer.step()
      
      train_loss += loss.item()
      batch_loss += loss.item()
      if i % 500 == 499:
          print('Training loss after mini-batch {:5d}/{:d}: {:.3f}'.format(i + 1, len(training_subset), batch_loss / 500))
          batchTime = time.time() - batchStart
          remainingTime = ((len(training_subset) - (i + 1)) / 500) * batchTime
          print("Time remaining for current epoch: {}".format(str(datetime.timedelta(seconds=remainingTime))))
          batch_loss = 0.0
          batchStart = time.time()

    print("Training done. Validating...")
    
    # Validation loop:
    valid_loss, f1 = evaluate(mlp, validation)
    
    # Decrease learning rate (per epoch)
    scheduler.step()
    print("Learning rate:", scheduler.get_lr())

    # Epoch complete
    print("Epoch {} complete. Training loss: {:.5f}. Validation loss: {:.5f}. Validation F1 score: {:.5f}".format(epoch+1, (train_loss / len(training_subset)), (valid_loss / len(validation)), f1) )
    if f1 > max_f1_score:
      print(f'F1 score increased ({max_f1_score:.6f}--->{f1:.6f}) \t Saving The Model...')
      max_f1_score = f1
      # Saving State Dict
      torch.save(mlp.state_dict(), 'wsd_model.pth')

  # Rename the best model to include F1 score
  best_model = 'wsd_model_{:.2f}_f1.pth'.format(max_f1_score)
  renameFile('wsd_model.pth', best_model)

  # Process is complete. Load best model
  print('Training process has finished. Loading best model...')
  mlp = loadModel(best_model)
  print("Best model loaded.")

## Testing

In [31]:
testing_f1_scores = {}
for name, dataset in testing.items():
    f1 = evaluate(mlp, dataset)
    testing_f1_scores[name] = f1

100/2192 complete.
200/2192 complete.
300/2192 complete.
400/2192 complete.
500/2192 complete.
600/2192 complete.
700/2192 complete.
800/2192 complete.
900/2192 complete.
1000/2192 complete.
1100/2192 complete.
1200/2192 complete.
1300/2192 complete.
1400/2192 complete.
1500/2192 complete.
1600/2192 complete.
1700/2192 complete.
1800/2192 complete.
1900/2192 complete.
2000/2192 complete.
2100/2192 complete.
100/1730 complete.
200/1730 complete.
300/1730 complete.
400/1730 complete.
500/1730 complete.
600/1730 complete.
700/1730 complete.
800/1730 complete.
900/1730 complete.
1000/1730 complete.
1100/1730 complete.
1200/1730 complete.
1300/1730 complete.
1400/1730 complete.
1500/1730 complete.
1600/1730 complete.
1700/1730 complete.
100/1509 complete.
200/1509 complete.
300/1509 complete.
400/1509 complete.
500/1509 complete.
600/1509 complete.
700/1509 complete.
800/1509 complete.
900/1509 complete.
1000/1509 complete.
1100/1509 complete.
1200/1509 complete.
1300/1509 complete.
1400/15

In [32]:
for name, score in testing_f1_scores.items():
    print(name, "F1 score:", score[1])

SE2 F1 score: 52.73236003225463
SE3 F1 score: 54.014249905183895
SE13 F1 score: 51.13066121281534
SE15 F1 score: 51.01071706280993


# Sense Predictor

## Predictor function

In [66]:
def predictSense(sentence, polyseme):
    idx = 0
    polyseme_text = ""
    split_sentence = list(map(lambda x: re.sub('\W+', "", x), sentence.lower().split()))
    
    if isinstance(polyseme, str):
        try:
            idx = split_sentence.index(polyseme.lower())
        except ValueError:
            print("Given string not found in sentence.")
            return
        polyseme_text = polyseme.lower()
    elif isinstance(polyseme, int):
        idx = polyseme
        polyseme_text = split_sentence[polyseme].lower()
    
    data = {
        'sentence': sentence,
        'polyseme': polyseme_text,
        'idx': idx,
        'senses': [lemma for sense in wn.synsets(polyseme_text) for lemma in sense.lemmas()]
    }
    
    try:
        feature_vec = getFeatureVec(data).to(dev)
        output = nn.functional.softmax(torch.reshape(mlp(feature_vec.to(dev), data), (-1,)))
        top_preds = output.topk(5)

        print("Predicted senses for", polyseme_text + ":")
        for i in range(5):
            prob = top_preds[0][i]
            sense_idx = top_preds[1][i]
            predicted_sense = data['senses'][sense_idx].synset().definition()
            print("{:3.2%}".format(prob),"\t", predicted_sense)
    except:
        print("No sense definitions for polyseme found.")


## Try it out! 

In [67]:
predictSense("This is such a lovely day!", "lovely")

Predicted senses for lovely:
93.78% 	 appealing to the emotions as well as the eye
3.02% 	 lovable especially in a childlike or naive way
0.98% 	 a very pretty girl who works as a photographer's model
0.95% 	 lovable especially in a childlike or naive way
0.67% 	 lovable especially in a childlike or naive way


In [97]:
predictSense("Please pay with cash.", "pay")

Predicted senses for pay:
69.15% 	 give money, usually in exchange for goods or services
21.36% 	 convey, as of a compliment, regards, attention, etc.; bestow
5.62% 	 something that remunerates
0.96% 	 cancel or discharge a debt
0.54% 	 cancel or discharge a debt


In [98]:
predictSense("You better pay attention!", "pay")

Predicted senses for pay:
79.48% 	 give money, usually in exchange for goods or services
11.34% 	 convey, as of a compliment, regards, attention, etc.; bestow
6.45% 	 something that remunerates
0.85% 	 cancel or discharge a debt
0.33% 	 do or give something to somebody in return
