In [None]:
!pip install transformers

In [None]:
cd /content/drive/MyDrive/NLP HW - 2

### Create data files

In [14]:
# Read data (.sent files)
def read_data(data_file):
  """Return every line of ``data_file`` as a list of strings.

  The lines are returned exactly as read: each one keeps its trailing newline
  (when the file has one) and nothing is stripped or filtered out.
  """
  with open(data_file) as handle:
    return handle.readlines()


# Read pointer file
def read_pointer_file(pointer_file):
  """Parse ``pointer_file`` into one list of "|"-separated fields per line.

  No stripping is done here, so the last field of every line still carries
  the line's trailing newline.
  """
  with open(pointer_file) as handle:
    return [record.split("|") for record in handle.readlines()]


#Read tuple file
def read_tuple_file(tuple_file):
  """Parse ``tuple_file`` into one list of "|"-separated fields per line.

  Fields are returned unstripped; the last field of each line keeps the
  trailing newline.
  """
  with open(tuple_file) as handle:
    return [record.split("|") for record in handle.readlines()]

# Map relations to ids
def get_label_mapping(label_file):
  """Build the relation-name -> class-index dictionary from ``label_file``.

  Each line (with surrounding whitespace removed) is assigned the size the
  dictionary had when the line was processed, so distinct names are numbered
  in file order starting at 0. An extra ``'other'`` entry is always added with
  the fixed index 29, regardless of how many names the file contains.
  """
  mapping = {}

  with open(label_file) as handle:
    for name in map(str.strip, handle.readlines()):
      mapping[name] = len(mapping)

  # add other class
  mapping['other'] = 29

  return mapping

In [15]:
# Raw sentences for the three splits (one string per line of the .sent files)
train_data, val_data, test_data = [
    read_data(file_name) for file_name in ('train.sent', 'dev.sent', 'test.sent')
]

# Pointer records: one list of "|"-separated fields per sentence
train_pointers, val_pointers, test_pointers = [
    read_pointer_file(file_name) for file_name in ('train.pointer', 'dev.pointer', 'test.pointer')
]

# Tuple records: one list of "|"-separated fields per sentence
train_tuples, val_tuples, test_tuples = [
    read_tuple_file(file_name) for file_name in ('train.tup', 'dev.tup', 'test.tup')
]

# Relation name -> class index
labels_dir = 'relations.txt'
label2idx = get_label_mapping(labels_dir)


In [16]:
# Relation to class mapping: display the dictionary returned by get_label_mapping
label2idx

{'/location/administrative_division/country': 0,
 '/location/country/capital': 1,
 '/location/country/administrative_divisions': 2,
 '/location/neighborhood/neighborhood_of': 3,
 '/location/location/contains': 4,
 '/people/person/nationality': 5,
 '/people/person/place_lived': 6,
 '/people/deceased_person/place_of_death': 7,
 '/business/person/company': 8,
 '/location/us_state/capital': 9,
 '/people/person/place_of_birth': 10,
 '/people/person/children': 11,
 '/business/company/founders': 12,
 '/business/company/place_founded': 13,
 '/sports/sports_team/location': 14,
 '/people/person/ethnicity': 15,
 '/people/ethnicity/geographic_distribution': 16,
 '/people/person/religion': 17,
 '/business/company/major_shareholders': 18,
 '/location/province/capital': 19,
 '/location/br_state/capital': 20,
 '/business/company/advisors': 21,
 '/film/film_location/featured_in_films': 22,
 '/film/film/featured_film_locations': 23,
 '/location/us_county/county_seat': 24,
 '/time/event/locations': 25,
 

In [22]:
# returns a list of other entity pairs
def get_other_entity_pairs(relations, entites):
  """Return every ordered pair of distinct entities that has no annotated relation.

  Args:
    relations: collection of (entity_1, entity_2) tuples that already carry a
      relation; only membership tests are performed on it.
    entites: iterable of entity names to pair up (it is iterated more than
      once, so it must be re-iterable, e.g. a set or list).

  Returns:
    A list of (entity_1, entity_2) tuples with entity_1 != entity_2 and the
    pair absent from ``relations``. Duplicates are removed and the list is
    sorted, so the result is identical from run to run (iterating a set of
    strings directly yields an order that can change between interpreter runs).
  """
  unique_pairs = {
      (first, second)
      for first in entites
      for second in entites
      if first != second and (first, second) not in relations
  }

  return sorted(unique_pairs)


In [27]:
# preprocess data

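#Input - sentences, pointer records and tuple records (as read from the .sent, .pointer and .tup files)
#Output - sentences with the special tokens <s1>, <e1>, <s2>, <e2> inserted around the two entities, and a multi-hot label vector for each marked sentence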

def _insert_entity_markers(tokens, span_1, span_2):
  """Return ``tokens`` joined by spaces with both entity spans wrapped in markers.

  ``span_1`` and ``span_2`` are (start, end) token indices (end inclusive) of the
  first and second entity. ``tokens`` itself is not modified.
  """
  s_1, e_1 = span_1
  s_2, e_2 = span_2

  marked = list(tokens)
  marked.insert(s_1, "<s1>")
  marked.insert(e_1 + 2, "<e1>")      # +1 for "<s1>", +1 to land after the end token

  if s_2 > e_1:
    # second entity lies after the first: its indices are shifted by the two markers above
    marked.insert(s_2 + 2, "<s2>")
    marked.insert(e_2 + 4, "<e2>")
  else:
    # otherwise the original indices are used unshifted for the start marker
    marked.insert(s_2, "<s2>")
    marked.insert(e_2 + 2, "<e2>")

  return " ".join(marked)


def preprocess_data(data, pointers_data, tuples_data, evaluation = False, num_class = 29):
  """Build marked-up sentences and multi-hot label vectors for every entity pair.

  For sentence ``i`` each pointer record is read as "s1 e1 s2 e2 relation" (token
  indices of the two entities plus the relation name) and is paired positionally
  with a tuple record read as "entity_1 ; entity_2 ; ...". The relation name is
  converted to a class index through the module-level ``label2idx`` mapping.

  Args:
    data: list of raw sentences (tokens separated by single spaces).
    pointers_data: per-sentence lists of pointer records.
    tuples_data: per-sentence lists of tuple records.
    evaluation: when False, every ordered pair of mentioned entities that has no
      annotated relation (as returned by ``get_other_entity_pairs``) is added as
      an extra sample whose label vector is all zeros.
    num_class: length of each label vector.

  Returns:
    (sentences, labels): ``sentences`` holds each distinct marked-up sentence
    once; ``labels[k]`` is a list of ``num_class`` 0/1 values for ``sentences[k]``.
    When the same marked-up sentence is produced again, the new relation is
    merged into the existing label vector instead of adding a duplicate.

  Raises:
    KeyError: if a relation name is missing from ``label2idx``.
  """
  sentences = []
  labels = []
  # marked-up sentence -> position in `sentences`; replaces repeated list scans
  sentence_index = {}

  for i in range(len(data)):
    pointer = pointers_data[i]
    tuples = tuples_data[i]

    relations = set()   # annotated (entity_1, entity_2) pairs of this sentence
    entites = set()     # every entity mentioned in this sentence
    maps = {}           # entity name -> (start, end) token span

    # for all the given entity pairs
    for item, record in zip(pointer, tuples):
      tokens = data[i].strip().split(" ")

      fields = record.strip().split(";")
      entity_1 = fields[0].strip()
      entity_2 = fields[1].strip()

      relations.add((entity_1, entity_2))
      entites.add(entity_1)
      entites.add(entity_2)

      item = item.strip().split(" ")
      span_1 = (int(item[0]), int(item[1]))
      span_2 = (int(item[2]), int(item[3]))

      maps[entity_1] = span_1
      maps[entity_2] = span_2

      # get label for the relation
      label = label2idx[item[4]]

      line = _insert_entity_markers(tokens, span_1, span_2)

      index = sentence_index.get(line)
      if index is None:
        y = [0] * num_class
        y[label] = 1
        sentence_index[line] = len(sentences)
        sentences.append(line)
        labels.append(y)
      else:
        # same marked-up sentence seen before: add this relation to its labels
        labels[index][label] = 1

    if not evaluation:
      tokens = data[i].strip().split(" ")

      # entity pairs without an annotated relation become all-zero ("other") samples
      for other_pair in get_other_entity_pairs(relations, entites):
        entity_1 = other_pair[0].strip()
        entity_2 = other_pair[1].strip()

        line = _insert_entity_markers(tokens, maps[entity_1], maps[entity_2])

        if line not in sentence_index:
          sentence_index[line] = len(sentences)
          sentences.append(line)
          labels.append([0] * num_class)

  return sentences, labels


In [28]:
# Training split: evaluation=False (the default) also generates the all-zero samples
# for entity pairs without an annotated relation.
train_sentences, train_labels = preprocess_data(
    data=train_data, pointers_data=train_pointers, tuples_data=train_tuples, evaluation=False)

# Validation and test splits keep only the annotated entity pairs.
val_sentences, val_labels = preprocess_data(
    data=val_data, pointers_data=val_pointers, tuples_data=val_tuples, evaluation=True)

test_sentences, test_labels = preprocess_data(
    data=test_data, pointers_data=test_pointers, tuples_data=test_tuples, evaluation=True)

### Save data files

In [32]:
import pickle

# Persist each split as a (sentences, labels) tuple so later cells can reload it
# without repeating the preprocessing.
for pkl_name, pkl_payload in (
    ('train.pkl', (train_sentences, train_labels)),
    ('val.pkl', (val_sentences, val_labels)),
    ('test.pkl', (test_sentences, test_labels)),
):
  with open(pkl_name, 'wb') as f:
    pickle.dump(pkl_payload, f)


### Imports

In [28]:
import pickle
import pandas as pd
import torch
import torch.nn as nn
from tqdm import tqdm
from time import sleep
import warnings
import torch.nn.functional as F

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from transformers import RobertaTokenizer, RobertaModel

warnings.filterwarnings("ignore") 

In [29]:
# cd /content/drive/MyDrive/NLP HW - 2

In [30]:
# Check GPU
# GPU index 1 is preferred (the device used in the recorded run). Fall back to GPU 0
# when the machine exposes a single GPU, and to the CPU when CUDA is unavailable,
# instead of failing later with an invalid device ordinal.
PREFERRED_GPU_INDEX = 1

if torch.cuda.is_available():
    gpu_index = PREFERRED_GPU_INDEX if PREFERRED_GPU_INDEX < torch.cuda.device_count() else 0
    device = torch.device('cuda:{}'.format(gpu_index))
else:
    device = torch.device('cpu')
print(device)

torch.backends.cudnn.benchmark = True
torch.backends.cudnn.enabled   = True

print(torch.cuda.device_count())
# Only claim a GPU when one was actually selected
print('GPU Allocated' if device.type == 'cuda' else 'No GPU available - using CPU')

cuda:1
8
GPU Allocated


In [31]:
# device = 'cpu'

### Load saved data files

In [32]:
# load train, val, test files
# Each pickle holds one (sentences, labels) tuple; they are read in train, val, test order.
loaded_splits = []
for pkl_name in ('train.pkl', 'val.pkl', 'test.pkl'):
  with open(pkl_name, 'rb') as f:
    loaded_splits.append(pickle.load(f))

(train_sentences, train_labels), (val_sentences, val_labels), (test_sentences, test_labels) = loaded_splits

### Hyper parameters

In [34]:
## Specify the Hyper parameters 

BATCH_SIZE = 16        # training batch size; the val/test loaders use BATCH_SIZE//2
EPOCHS = 4             # number of train + validation passes
LEARNING_RATE = 3e-5   # initial learning rate handed to the optimizer
CLIP = 2               # max gradient norm passed to clip_grad_norm_ during training


hidden_size = 64       # output width of the classifier's first linear layer
vocab_size = 4         # number of rows in the entity-mask embedding (mask values 0-3)
embed_dim = 256        # mask-embedding width and pooled sentence width (model comments give the encoder output as 256)
num_class =  29        # number of outputs of the final linear layer / length of a label vector


### Dataset and Dataloader

In [35]:
# Import transformer tokenizer, model and optimizer

from transformers import ElectraTokenizer, ElectraModel, AdamW
# Tokenizer that matches the "google/electra-small-discriminator" checkpoint
tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")

# add special tokens
# Register the four entity markers that preprocessing inserts into the sentences.
# num_added_toks stores the return value of add_special_tokens (presumably the number
# of tokens added); it is not read anywhere else in the visible code.
special_tokens_dict = {'additional_special_tokens': ['<s1>','<e1>','<s2>','<e2>']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)





In [36]:
class Dataset(torch.utils.data.Dataset):
    """
    Custom dataset that converts a marked-up sentence and its label vector into tensors.

    Each item is a dict with three entries:
      "label": tensor built from ``labels[idx]``
      "token": input ids from the module-level ``tokenizer``, padded/truncated to ``max_length``
      "mask":  tensor of ``max_length`` integers describing each position:
               1 = from the <s1> marker through the <e1> marker,
               2 = from the <s2> marker through the <e2> marker,
               3 = positions at or beyond the length of the untruncated encoding (padding),
               0 = everything else.
               When ranges overlap, 3 takes precedence over 2, and 2 over 1.
    """

    # Marker tokens wrapped around the two entities: (start 1, end 1, start 2, end 2)
    ENTITY_MARKERS = ('<s1>', '<e1>', '<s2>', '<e2>')

    def __init__(self, labels, text, max_length=128):
        """
        Args:
            labels: sequence of label vectors, aligned with ``text``.
            text: sequence of sentences that contain all four entity markers.
            max_length: padded/truncated length of the token ids and of the mask.
        """
        self.labels = labels
        self.text = text
        self.max_length = max_length

    def __getitem__(self, idx):
        """Return the sample dict for position ``idx``.

        Raises:
            ValueError: if one of the entity markers is missing from the encoded sentence.
        """
        text = self.text[idx]

        # Electra tokenizer to tokenize the text
        indexed_tokens = tokenizer.encode_plus(text,
                                               add_special_tokens=True,   # Adds [CLS] and [SEP] token to every input text
                                               max_length=self.max_length,
                                               truncation=True,
                                               return_tensors='pt',
                                               padding="max_length")['input_ids']

        # Untruncated, unpadded ids: used to locate the markers and the padding boundary
        tokens = tokenizer.encode(text)

        # Look the marker ids up in the tokenizer instead of hard-coding vocabulary positions
        marker_ids = tokenizer.convert_tokens_to_ids(list(self.ENTITY_MARKERS))
        s_1, e_1, s_2, e_2 = (tokens.index(marker_id) for marker_id in marker_ids)

        n_tokens = len(tokens)
        entity_mask = [0] * self.max_length   # create entity mask

        for i in range(self.max_length):
            if i >= n_tokens:
                entity_mask[i] = 3
            elif s_2 <= i <= e_2:
                entity_mask[i] = 2
            elif s_1 <= i <= e_1:
                entity_mask[i] = 1

        # Built without a try/except: a failure here should stop the run rather than
        # hand an incomplete sample to the DataLoader.
        sample = {
            "label": torch.tensor(self.labels[idx]),
            "token": indexed_tokens,
            "mask": torch.tensor(entity_mask),
        }

        return sample

    def __len__(self):
        """Number of samples (one per label vector)."""
        return len(self.labels)
        

In [37]:
# Create train, test and val datasets (one Dataset wrapper per split)
train_data_object = Dataset(labels=train_labels, text=train_sentences)

test_data_object = Dataset(labels=test_labels, text=test_sentences)

val_data_object = Dataset(labels=val_labels, text=val_sentences)

In [38]:
# Display the ids the tokenizer assigned to the additional special tokens
tokenizer.additional_special_tokens_ids     # token ids of inserted tokens

[30522, 30523, 30524, 30525]

In [39]:

## We call the dataloader class
def _build_loader(dataset, batch_size, shuffle):
    """Wrap ``dataset`` in a DataLoader with the settings shared by all three splits."""
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        pin_memory=True,
        num_workers=2,
        shuffle=shuffle,
        # NOTE(review): drop_last discards the final partial batch, so up to
        # batch_size - 1 samples per split are never seen. It is kept as-is here;
        # confirm that every consumer copes with a smaller (possibly size-1) final
        # batch before switching it off for evaluation.
        drop_last=True
    )


# Only the training data is shuffled. The evaluation loaders keep a fixed order so
# the samples removed by drop_last - and therefore the reported metrics - are the
# same on every run.
train_loader = _build_loader(train_data_object, BATCH_SIZE, shuffle=True)

test_loader = _build_loader(test_data_object, BATCH_SIZE//2, shuffle=False)

val_loader = _build_loader(val_data_object, BATCH_SIZE//2, shuffle=False)

dataloaders = {'Train': train_loader, 'Test': test_loader, 'Val': val_loader}



### Model

In [40]:
# Load the pretrained ELECTRA-small discriminator encoder
electra = ElectraModel.from_pretrained("google/electra-small-discriminator")
# Resize the token-embedding table to the tokenizer's current length, which includes
# the added entity markers (the cell output shows the resulting Embedding(30526, 128)).
electra.resize_token_embeddings(len(tokenizer))

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Embedding(30526, 128)

In [41]:
# Encoded Entity representation

class EntityPair(nn.Module):
    """Encode a per-token entity mask into one fixed-size vector per sentence.

    Each mask value is embedded, the embeddings are max-pooled over the sequence
    axis, and the pooled vector goes through a linear layer, ReLU and dropout.

    Args:
        vocab_size: number of distinct mask values (rows of the embedding table).
        embed_dim: width of the mask embedding; the output width is ``embed_dim // 2``.
    """

    def __init__(self, vocab_size, embed_dim):
        super(EntityPair, self).__init__()

        # Mask value 3 is declared the padding index of the embedding
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx = 3)
        self.embedding.weight.requires_grad = True

        self.fc1 = nn.Linear(embed_dim, embed_dim//2)

        # Output size (1, embed_dim): collapses the sequence axis, keeps the embedding axis
        self.adaptive = nn.AdaptiveMaxPool2d((1, embed_dim))

        self.dropout = nn.Dropout(0.3)

    def forward(self, mask):
        """Return a ``[batch, embed_dim // 2]`` encoding of ``mask`` (``[batch, seq_len]`` integer tensor)."""
        embed_output = self.embedding(mask)             #out shape = [batch, seq_len, embed_dim]

        # squeeze(1) removes only the pooled axis. A bare squeeze() would also remove
        # the batch axis when the batch holds a single sample.
        out = self.adaptive(embed_output).squeeze(1)    #out shape = [batch, embed_dim]

        out = self.dropout(F.relu(self.fc1(out)))       #out shape = [batch, embed_dim/2]

        return out


In [42]:
class RelationClassification(nn.Module):
    """Multi-label relation classifier built on a transformer encoder and an entity-mask encoder.

    The encoder's last hidden states are max-pooled over the sequence axis,
    concatenated with the ``EntityPair`` encoding of the entity mask, and passed
    through two linear layers. The output is one sigmoid probability per class.

    Args:
        electra: encoder module; calling it on the input ids must return an object
            exposing ``last_hidden_state``.
        hidden_size: output width of the first linear layer.
        vocab_size: number of distinct entity-mask values (forwarded to ``EntityPair``).
        embed_dim: pooled sentence width; assumed to equal the encoder's hidden size,
            because the first linear layer expects ``embed_dim + embed_dim // 2`` inputs.
        num_class: number of output classes.
    """

    def __init__(self, electra, hidden_size, vocab_size, embed_dim, num_class):

        super(RelationClassification, self).__init__()

        self.electra = electra

        self.entity_encoder = EntityPair(vocab_size, embed_dim)

        # Output size (1, embed_dim): collapses the sequence axis of the encoder output
        self.adaptive = nn.AdaptiveMaxPool2d((1, embed_dim))

        self.fc1 = nn.Linear(embed_dim + embed_dim//2, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_class)

        self.dropout = nn.Dropout(0.3)

    def forward(self, input, mask):
        """Return class probabilities of shape ``[batch, num_class]``.

        Args:
            input: token ids, ``[batch, seq_len]``.
            mask: entity mask, ``[batch, seq_len]``.
        """
        electra_output = self.electra(input).last_hidden_state   #out shape = [batch, seq_len, 256]

        # squeeze(1) drops only the pooled axis, so a batch of one keeps its batch axis
        sentence_encoded = self.adaptive(electra_output).squeeze(1)  #out shape = [batch, embed_dim ]

        entity_encoded = self.entity_encoder(mask)     #out shape =   [batch, embed_dim/2 ]

        concat = torch.cat( (sentence_encoded, entity_encoded), 1)    #out_shape = [batch, embed_dim + embed_dim/2]

        concat_output = self.dropout(F.relu(self.fc1(concat)))      # out shape = batch, hidden_size

        # torch.sigmoid replaces the deprecated F.sigmoid; the values are identical
        final_output = torch.sigmoid(self.fc2(concat_output))       #out shape = batch, num_class

        return final_output
        
    
        

In [43]:
# Build the classifier around the loaded encoder and move all of its parameters to `device`
model = RelationClassification(
    electra=electra,
    hidden_size=hidden_size,
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    num_class=num_class,
).to(device)

In [44]:
#optimizer
# torch.optim.AdamW is used in place of the deprecated transformers.AdamW.
# weight_decay=0.0 is set explicitly because that was the default of the transformers
# implementation, whereas torch's own default is 0.01.
optimizer = torch.optim.AdamW(model.parameters(), lr = LEARNING_RATE, eps=1e-8, weight_decay=0.0)
#Loss function
# Binary cross-entropy on probabilities: the model already applies a sigmoid to its outputs
criterion = nn.BCELoss()

### Training

In [45]:
# Checkpoint file: the training loop saves the best model's state_dict here and the
# inference section loads it back
PATH = 'electra_embed.pt'

In [46]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

# StepLR is configured with step_size=2 and gamma=0.5; the training loop calls
# scheduler.step() once per epoch after validation, and the logged learning rate
# halves after every second epoch.
# NOTE(review): ReduceLROnPlateau is imported above but not used in the visible code.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.5, verbose = True)


Adjusting learning rate of group 0 to 3.0000e-05.


In [47]:
# Train for EPOCHS epochs. Every epoch runs a training pass followed by a validation
# pass; the model with the best validation micro-F1 so far is saved to PATH.
best_valid_f1 = 0

for epoch in range(0, EPOCHS):

    print('-'*50)
    print('Epoch {}/{}'.format(epoch+1, EPOCHS))

    for phase in ['Train', 'Val']:

        is_training = (phase == 'Train')

        batch_loss = 0.0   # running sum of the batch losses of this phase

        y_true = []
        y_pred = []

        # train(True) puts the model in training mode, train(False) is equivalent to eval()
        model.train(is_training)

        with tqdm(dataloaders[phase], unit="batch") as tepoch:

            for batch in tepoch:
                labels = batch["label"].to(device)
                text = batch["token"].to(device)
                mask = batch['mask'].to(device)

                labels = labels.to(torch.float32)
                # Drop only the singleton axis at position 1 (presumably left by the
                # tokenizer's per-sample batch dimension). A bare squeeze() would also
                # drop the batch axis for a batch of one.
                text = text.squeeze(1)

                # No autograd graph is needed for validation, which saves memory and time
                with torch.set_grad_enabled(is_training):
                    output = model(text, mask)

                    loss = criterion(output, labels)

                if is_training:

                    #zero gradients
                    optimizer.zero_grad()

                    # Backward pass  (calculates the gradients)
                    loss.backward()

                    # gradient clipping
                    nn.utils.clip_grad_norm_(model.parameters(), CLIP)

                    optimizer.step()             # Updates the weights

                labels_numpy = labels.detach().cpu().numpy()
                # Probabilities are rounded to hard 0/1 predictions (threshold 0.5)
                hard_preds = torch.round(output).detach().cpu().numpy()

                y_pred.extend(hard_preds.tolist())
                y_true.extend(labels_numpy.tolist())

                batch_loss += loss.item()

            epoch_loss = batch_loss / (len(dataloaders[phase]))

            print(phase + ":")
            pre = precision_score(y_true, y_pred, average='micro')
            recall = recall_score(y_true, y_pred, average='micro')
            f1 = f1_score(y_true, y_pred, average='micro')

            print("Micro F1: {:.8f}, Precision: {:.8f}, Recall : {:.8f}, Loss: {:.8f}.".format(f1, pre, recall, epoch_loss))
            print()

            if phase == 'Val':

                # Keep the checkpoint with the best validation micro-F1
                if f1 > best_valid_f1:
                    best_valid_f1 = f1

                    torch.save(model.state_dict(), PATH)
                    print('Model Saved!')

                # One scheduler step per epoch, after validation
                scheduler.step()
                    
                
        

--------------------------------------------------
Epoch 1/4


100%|██████████| 9148/9148 [10:06<00:00, 15.07batch/s]


Train:
Micro F1: 0.56899173, Precision: 0.58695945, Recall : 0.55209138, Loss: 0.05155015.



100%|██████████| 1017/1017 [00:16<00:00, 62.03batch/s]


Val:
Micro F1: 0.92561305, Precision: 0.95254769, Recall : 0.90015974, Loss: 0.01758949.

Model Saved!
Adjusting learning rate of group 0 to 3.0000e-05.
--------------------------------------------------
Epoch 2/4


100%|██████████| 9148/9148 [09:48<00:00, 15.55batch/s]


Train:
Micro F1: 0.89111778, Precision: 0.91863983, Recall : 0.86519685, Loss: 0.01220640.



100%|██████████| 1017/1017 [00:15<00:00, 64.29batch/s]


Val:
Micro F1: 0.93733994, Precision: 0.93505166, Recall : 0.93963943, Loss: 0.01264862.

Model Saved!
Adjusting learning rate of group 0 to 1.5000e-05.
--------------------------------------------------
Epoch 3/4


100%|██████████| 9148/9148 [09:51<00:00, 15.45batch/s]


Train:
Micro F1: 0.92246886, Precision: 0.94128346, Recall : 0.90439165, Loss: 0.00810605.



100%|██████████| 1017/1017 [00:16<00:00, 62.01batch/s]


Val:
Micro F1: 0.94745259, Precision: 0.94853614, Recall : 0.94637152, Loss: 0.01072068.

Model Saved!
Adjusting learning rate of group 0 to 1.5000e-05.
--------------------------------------------------
Epoch 4/4


100%|██████████| 9148/9148 [09:48<00:00, 15.53batch/s]


Train:
Micro F1: 0.93492243, Precision: 0.94995295, Recall : 0.92036015, Loss: 0.00671957.



100%|██████████| 1017/1017 [00:17<00:00, 56.71batch/s]


Val:
Micro F1: 0.95460782, Precision: 0.95657654, Recall : 0.95264719, Loss: 0.00982205.

Model Saved!
Adjusting learning rate of group 0 to 7.5000e-06.


### Inference


In [48]:
# Rebuild the classifier and restore the weights saved at PATH.
# NOTE(review): the same `electra` module object is passed in again; its weights are
# overwritten by load_state_dict below.
model = RelationClassification(electra, hidden_size, vocab_size, embed_dim, num_class)

model = model.to(device)

# map_location=device lets the checkpoint load even when the device it was saved
# from is not available in the current session
model.load_state_dict(torch.load(PATH, map_location=device))

<All keys matched successfully>

In [49]:
# Test on Test loader
# Runs the model over test_loader without gradients and reports micro-averaged
# F1 / precision / recall together with the mean batch loss.

batch_loss = 0.0   # running sum of the batch losses

y_true = []
y_pred = []

# set the model to evaluation mode
model.eval()

with tqdm(test_loader, unit="batch") as tepoch:
    for batch in tepoch:
        labels = batch["label"].to(device)
        text = batch["token"].to(device)
        mask = batch['mask'].to(device)

        labels = labels.to(torch.float32)

        # Drop only the singleton axis at position 1; a bare squeeze() would also
        # drop the batch axis for a batch of one
        text = text.squeeze(1)

        with torch.no_grad():

            output = model(text, mask)

            loss = criterion(output, labels)

        labels_numpy = labels.detach().cpu().numpy()
        # Probabilities are rounded to hard 0/1 predictions (threshold 0.5)
        hard_preds = torch.round(output).detach().cpu().numpy()

        y_pred.extend(hard_preds.tolist())
        y_true.extend(labels_numpy.tolist())

        batch_loss += loss.item()


epoch_loss = batch_loss / (len(test_loader))

print('')
print("Inference:")
print("")

pre = precision_score(y_true, y_pred, average='micro')
recall = recall_score(y_true, y_pred, average='micro')
f1 = f1_score(y_true, y_pred, average='micro')


print("")


print("F1: {:.8f}, Precision: {:.8f}, Recall : {:.8f}, Loss: {:.8f}.".format(f1, pre, recall, epoch_loss))

100%|██████████| 611/611 [00:10<00:00, 59.05batch/s]



Inference:


F1: 0.88892952, Precision: 0.95689825, Recall : 0.82997610, Loss: 0.05899484.
