# Libraries

## Install modules if necessary

In [1]:
!pip install transformers
!pip install xmltodict
!pip install pandas
!pip install numpy
!pip3 install torch==1.10.0+cu102 torchvision==0.11.1+cu102 torchaudio===0.10.0+cu102 -f https://download.pytorch.org/whl/cu102/torch_stable.html
!pip install nltk
!pip install sklearn

Looking in links: https://download.pytorch.org/whl/cu102/torch_stable.html


## Import Modules

In [2]:
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
import torch
from torch import nn
import nltk
from nltk.corpus import wordnet as wn
from sklearn.metrics import f1_score
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hms17\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
import transformers as trf
import random, time, datetime, warnings
from collections import defaultdict
warnings.filterwarnings('ignore') 

In [4]:
dev = torch.device('cuda:0')

# Model Training

## Loading Datasets

In [5]:
# This function reads a dataset xml file and its gold.key.txt file, and 
def loadDataset(path):
  data = ET.parse(path + '.data.xml')
  with open(path + '.gold.key.txt') as file:
      labels =  [line.rstrip() for line in file.readlines()]
  root = data.getroot()
  dataset = []
  for doc in root:
    for raw_sent in doc:
      whole_sentence = []
      instances = [(i, x) for i, x in enumerate(raw_sent) if x.tag == 'instance']
      for term in raw_sent:
        whole_sentence.append(term.text.lower())
      whole_sentence = ' '.join(whole_sentence)
      for idx, inst in instances:
        gold_label = labels.pop(0).split()
        assert(gold_label[0] == inst.attrib['id'])
        all_senses = [lemma for sense in wn.synsets(inst.text) for lemma in sense.lemmas()]
        try:
          label = torch.tensor([ [x.key() for x in all_senses].index(gold_label[1]) ])
        except ValueError:
          continue

        if any(c in set(".-\\/~()") for c in inst.text): continue
        dataset.append({
            'sentence': whole_sentence,
            'idx': whole_sentence.split().index(inst.text.lower()),
            'polyseme': inst.text,
            'lemma' : inst.attrib['lemma'],
            'senses': all_senses,
            'label': label
        })
  return dataset

In [6]:
training = loadDataset('Datasets/Training/SemCor/semcor')

In [7]:
validation = loadDataset('Datasets/Validation/semeval2007/semeval2007')
SE2 = loadDataset('Datasets/Testing/senseval2/senseval2')
SE3 = loadDataset('Datasets/Testing/senseval2/senseval2')
SE13 = loadDataset('Datasets/Testing/semeval2013/semeval2013')
SE15 = loadDataset('Datasets/Testing/semeval2015/semeval2015')

## Fine-tuning Model

In [8]:
# BERT-base
tokenizer = trf.DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, padding=True, do_lower_case=True)
config = trf.DistilBertConfig.from_pretrained('distilbert-base-uncased', output_hidden_states=True)
model = trf.DistilBertModel.from_pretrained('distilbert-base-uncased', config=config)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
def getEncodedIndeces(text, idx):
  encodings = [(x, tokenizer(x, add_special_tokens=False)['input_ids']) for x in text.split()]
  start, end = 0, 0
  for i, encoding in enumerate(encodings):
    if i == idx:
      start += 1
      end = start + len(encoding[1])
      return (start, end)
    else:
      start += len(encoding[1])

cached_output = ('', None)
def getFeatureVec(data):
  global cached_output
  text = data['sentence']
  if text == cached_output[0]:
    hidden_states = cached_output[1]
  else:
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    hidden_states = output[1][0][0]
    cached_output = (text, hidden_states.clone().detach())
  s, e = getEncodedIndeces(text, data['idx'])
  avg = torch.mean(hidden_states[s:e], 0)
  return avg

# Classifier

In [10]:
class MLP_Classifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.H = 768
        self.L1 = nn.Linear(self.H, self.H)
        self.ReLU = nn.ReLU()
        self.L2 = nn.ModuleDict({})
        self.softmax = nn.Softmax()

    def forward(self, features, data):
        x = self.L1(features)
        x = self.ReLU(x)

        polyseme, numSenses = "polyseme_" + data['polyseme'], len(data['senses'])
        if polyseme not in self.L2: 
          self.L2.update({polyseme: nn.Linear(self.H, numSenses).to(dev)})
        
        x = self.L2[polyseme](x)

        return x

In [15]:
# Initialize the MLP
mlp = MLP_Classifier().to(dev)

# Define the loss function, optimizer, and scheduler (to reduce learning rate)
loss_function = nn.CrossEntropyLoss().to(dev)
optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=(lambda epoch: 1/epoch))

min_valid_loss = np.inf
max_f1_score = 0

# Run the training loop
for epoch in range(10):
  # Print epoch
  print(f'Starting epoch {epoch+1}')
  
  # Training loop:
  train_loss = 0.0
  batch_loss = 0.0
  batchStart = time.time()
  training_subset = training[:] # <=== Set training size here
  for i, data in enumerate(training_subset):
    feature_vec = getFeatureVec(data).to(dev)
    label = data['label'].to(dev)
    optimizer.zero_grad()
    outputs = torch.reshape(mlp(feature_vec.to(dev), data), (1, -1))
    loss = loss_function(outputs, label)
    loss.backward()
    optimizer.step()
    scheduler.step()
    
    train_loss += loss.item()
    batch_loss += loss.item()
    if i % 500 == 499:
        print('Training loss after mini-batch {:5d}/{:d}: {:.3f}'.format(i + 1, len(training_subset), batch_loss / 500))
        batchTime = time.time() - batchStart
        remainingTime = ((len(training_subset) - (i + 1)) / 500) * batchTime
        print("Time remaining for current epoch: {}".format(str(datetime.timedelta(seconds=remainingTime))))
        batch_loss = 0.0
        batchStart = time.time()

  print("Training done. Validating...")
  # Validation loop:
  valid_loss = 0.0
  predictions = defaultdict(list)
  labels = defaultdict(list)
  for i, data in enumerate(validation):
    feature_vec = getFeatureVec(data).to(dev)
    label = data['label'].to(dev)
    outputs = torch.reshape(mlp(feature_vec.to(dev), data), (1, -1))
    loss = loss_function(outputs, label)
    predictions[data['polyseme']].append(outputs)
    labels[data['polyseme']].append(label)
    
    valid_loss += loss.item()

    if i % 50 == 49:
      print("{}/{} complete.".format(i+1, len(validation)))
  
  all_f1_scores = []
  f1_weights = []
  for polyseme in labels.keys():
    preds_tensor = torch.stack(predictions[polyseme])
    preds_tensor = torch.argmax(preds_tensor.reshape((-1, preds_tensor.shape[-1])), dim=1)
    labels_tensor = torch.stack(labels[polyseme]).reshape((-1,))
    all_f1_scores.append(f1_score(labels_tensor.cpu(), preds_tensor.cpu(), average='weighted'))
    f1_weights.append(len(labels_tensor))
  
  f1 = np.average(all_f1_scores, weights=f1_weights) * 100


  print("Epoch {} complete. Training loss: {:.5f}. Validation loss: {:.5f}. Validation F1 score: {:.5f}".format(epoch, (train_loss / len(training_subset)), (valid_loss / len(validation)), f1) )
  if f1 > max_f1_score:
        print(f'F1 score increased({max_f1_score:.6f}--->{f1:.6f}) \t Saving The Model')
        max_f1_score = f1
        # Saving State Dict
        torch.save(model.state_dict(), 'saved_model.pth')

# Process is complete.
print('Training process has finished.')

Starting epoch 1
Training loss after mini-batch   500/100000: 2.676
Time remaining for current epoch: 2:37:14.765470


In [None]:
# for data in training:
#     f = getFeatureVec(data)
#     # print(data['sentence'], "| Definition of:", data['polyseme'])
#     # data['senses'][data['label']].synset().definition()
#     encodings = {x:tokenizer(x, add_special_tokens=False)['input_ids'] for x in data['sentence'].split()}
#     idxs = getEncodedIndeces(data['sentence'], data['idx'])
#     # print(encodings)
#     extracted_token = tokenizer(data['sentence'])['input_ids'][idxs[0]:idxs[1]]
#     # print(extracted_token)
#     try:
#         assert(encodings[data['polyseme'].lower()] == extracted_token)
#     except:
#         print(encodings)
#         print(data['polyseme'], encodings[data['polyseme'].lower()])
#         print(extracted_token)
#         break