# NER4ID at SemEval-2022 Task 2: Named Entity Recognition for Idiomaticity Detection
### Tedeschi and Navigli (2022)
----------------------------------------------------------------------------------------------------

## Dependencies

In [1]:
! pip install transformers
! pip install scikit-learn



In [2]:
import os

from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
 
from pprint import pprint
import random
import numpy as np

from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

from transformers import BertTokenizer, BertModel, BertConfig

In [3]:
# Move the working directory one level up; the relative "data/", "checkpoints/"
# and "predictions/" paths used in the cells below are resolved from there.
# NOTE: this cell is not idempotent -- every re-run moves one more level up.
os.chdir("../")

In [4]:
# Runtime sanity check: print the PyTorch version and query the CUDA device.
# Only the last expression (the name of device 0) is shown as the cell output.
print(torch.__version__)
torch.cuda.current_device()
torch.cuda.get_device_name(0)

1.8.1+cu102


GeForce RTX 3090 with CUDA capability sm_86 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70.
If you want to use the GeForce RTX 3090 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



'GeForce RTX 3090'

## Read Data

In [5]:
# Ask interactively whether to train or only evaluate; `mode` gates the training cell below.
# NOTE: because of input(), the notebook cannot be run unattended.
mode = input("Do you want to train a model from scratch or do you want to evaluate a pretrained model? Possible answers: ['train', 'eval']\n> ")
assert mode in ['train', 'eval']

# `setting` selects which training files are read and names the checkpoint file.
setting = input("\nIn which setting do you want to train/evaluate your model? Possible answers: ['zero-shot', 'one-shot']\n>")
assert setting in ['zero-shot', 'one-shot']

In [6]:
# Training files: the zero-shot file is read in both settings, the one-shot file
# is read in addition when setting == "one-shot".
train_file_zero_shot = "data/train_zero_shot.csv" 
train_file_one_shot = "data/train_one_shot.csv"

# Development inputs and their gold labels live in separate files (joined on the id column).
dev_file = "data/dev.csv"
dev_file_gold = "data/dev_gold.csv"
# Test inputs (no gold labels are read for this split).
test_file = "data/test.csv"

In [7]:
import spacy
import en_core_web_sm
import pt_core_news_sm
from spacy.cli.download import download as spacy_download

# Download the small English pipeline, then load it without its "ner" and "parser" components.
# NOTE(review): the `import en_core_web_sm` / `import pt_core_news_sm` lines above run
# before these downloads, so they presumably fail on an environment where the model
# packages are not installed yet -- confirm on a fresh environment.
spacy_download("en_core_web_sm")
spacy_tagger_en = spacy.load("en_core_web_sm", exclude=["ner", "parser"])

# Same for the small Portuguese pipeline.
# NOTE(review): spacy_tagger_en / spacy_tagger_pt are not referenced by any later cell
# visible in this notebook; the dataset cell loads its own pipelines via
# en_core_web_sm.load() / pt_core_news_sm.load().
spacy_download("pt_core_news_sm")
spacy_tagger_pt = spacy.load("pt_core_news_sm", exclude=["ner", "parser"])

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)




[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Collecting pt-core-news-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.2.0/pt_core_news_sm-3.2.0-py3-none-any.whl (22.2 MB)




[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')


In [8]:
import csv


def _read_train_csv(path, data, next_idx):
    """Append every row of a training CSV file to ``data``.

    Each row is stored under a running integer key as a dict with the keys
    "id" (column 0), "lang" (column 1), "expression" (column 2, stripped),
    "text" (column 5, stripped) and "idiomatic" (True when column 7 is "0").
    No row is skipped, so a header line, if present, is stored like any other row.

    The original cell also built a concatenation of columns 4, 5 and 6 but never
    used it; only column 5 ends up in "text", which is what is kept here.

    Args:
        path: path of the CSV file to read.
        data: dict that receives the entries (modified in place).
        next_idx: first integer key to use.

    Returns:
        The next free integer key after the last inserted row.
    """
    # Explicit UTF-8 so that the multilingual text is decoded the same way on every platform.
    with open(path, newline='', encoding='utf-8') as csvfile:
        for row in tqdm(csv.reader(csvfile, delimiter=',')):
            data[next_idx] = {
                "id": row[0],
                "lang": row[1],
                "expression": row[2].strip(),
                "text": row[5].strip(),
                "idiomatic": row[7] == "0",
            }
            next_idx += 1
    return next_idx


train_data = {}
idx = 0

predictions = []
labels = []

# The zero-shot training file is used in both settings ...
if setting in ("zero-shot", "one-shot"):
    idx = _read_train_csv(train_file_zero_shot, train_data, idx)

# ... and the one-shot file is appended to it in the one-shot setting.
if setting == "one-shot":
    idx = _read_train_csv(train_file_one_shot, train_data, idx)

print(len(train_data))

4492it [00:00, 120087.79it/s]

4492





In [9]:
# Gold labels of the development set, keyed by column 0 (the sample id).
# A label is True when column 3 of the gold file is "0".
labels_dev = {}

# Explicit UTF-8 so that the multilingual text is decoded the same way on every platform.
with open(dev_file_gold, newline='', encoding='utf-8') as csvfile:
    for row in csv.reader(csvfile, delimiter=','):
        labels_dev[row[0]] = row[3] == "0"

# Development entries under a running integer key, in the same format as the training entries.
# The original cell also concatenated columns 3, 4 and 5 into an unused variable;
# only column 4 is stored as "text", which is what is kept here.
dev_data = {}
idx = 0

with open(dev_file, newline='', encoding='utf-8') as csvfile:
    for row in csv.reader(csvfile, delimiter=','):
        dev_data[idx] = {
            "id": row[0],
            "lang": row[1],
            "expression": row[2].strip(),
            "text": row[4].strip(),
            # Raises KeyError when an id has no entry in the gold file.
            "idiomatic": labels_dev[row[0]],
        }
        idx += 1

print(len(dev_data))

740


In [10]:
# Test entries under a running integer key, in the same format as the training entries.
# The original cell also concatenated columns 3, 4 and 5 into an unused variable;
# only column 4 is stored as "text", which is what is kept here.
test_data = {}
idx = 0

# Explicit UTF-8 so that the multilingual text is decoded the same way on every platform.
with open(test_file, newline='', encoding='utf-8') as csvfile:
    for row in csv.reader(csvfile, delimiter=','):
        test_data[idx] = {
            "id": row[0],
            "lang": row[1],
            "expression": row[2].strip(),
            "text": row[4].strip(),
            # Placeholder label: it only keeps the entry structure identical to
            # the other splits and could just as well be False.
            "idiomatic": True,
        }
        idx += 1

print(len(test_data))

2343


## Fix Random Seed

In [11]:
SEED = 2  # fixed seed so that results can be replicated

# Seed Python's, NumPy's and PyTorch's random number generators with the same value.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)

# Ask cuDNN for deterministic algorithms.
torch.backends.cudnn.deterministic = True

## Load BERT Model

In [12]:
# Name of the pretrained checkpoint shared by the tokenizer and by both encoders of DualEncoder.
model_name = 'bert-base-multilingual-cased'
 
# Configuration with output_hidden_states enabled; reused by DualEncoder when it loads its encoders.
bert_config = BertConfig.from_pretrained(model_name, output_hidden_states=True)
bert_tokenizer = BertTokenizer.from_pretrained(model_name)
# NOTE(review): `bert_model` is not referenced by any later cell visible in this notebook
# (DualEncoder loads its own two BertModel instances).
bert_model = BertModel.from_pretrained(model_name, config=bert_config)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Dataset

In [13]:
# spaCy pipelines loaded with their default components; IdiomDataset below reads
# `doc.ents` from them to detect named entities in the text.
nlp_en = en_core_web_sm.load()
nlp_pt = pt_core_news_sm.load()

class IdiomDataset(Dataset):
    """Pre-tokenized (expression, context) pairs for the dual encoder.

    Every entry of ``dataset`` (a dict with the keys "id", "lang", "expression",
    "text" and "idiomatic") is encoded once at construction time into the tuple

        (idx, expression, context, tokenized_expression, tokenized_context,
         label, id, lang)

    where ``label`` is ``tensor([-1.0])`` for idiomatic entries and
    ``tensor([1.0])`` otherwise.

    Expression/context construction:
      * if the expression is one of the (lower-cased) named entities spaCy finds
        in the text, or the expression does not occur in the text, both the
        expression and the context are set to the full text;
      * otherwise the context is the text with the first occurrence of the
        expression removed.

    Entries whose final expression equals "MWE" or whose language is not in
    ``languages`` are dropped.

    Args:
        dataset: dict mapping an index to an entry dict (see above).
        tokenizer: object exposing ``encode(text, add_special_tokens=...)``.
        languages: collection of language codes to keep.
        device: stored on the instance; not used by the dataset itself.
    """

    def __init__(self,
                 dataset,
                 tokenizer,
                 languages,
                 device="cuda",
                ) -> None:

        self.encoded_data = []

        self.dataset = dataset
        self.tokenizer = tokenizer
        self.languages = languages
        self.device = device
        self.__init_encoded_data()

    def __init_encoded_data(self):
        """Fill ``self.encoded_data`` from ``self.dataset`` (see the class docstring)."""
        for idx in tqdm(self.dataset):
            entry = self.dataset[idx]
            sample_id = entry["id"]
            lang = entry["lang"]
            e = entry["expression"]
            label = entry["idiomatic"]
            context_tmp = entry["text"]

            # English texts go through the English pipeline, everything else
            # through the Portuguese one.
            nlp = nlp_en if lang == "EN" else nlp_pt
            ents = [ent.text.lower() for ent in nlp(context_tmp).ents]

            # Search once and reuse the position for both the check and the slicing.
            # The previous code checked with `find(e)` but sliced at
            # `lower().find(e)`; for an expression containing upper-case
            # characters the latter is -1 and the context was corrupted.
            pos = context_tmp.find(e)

            if e in ents or pos == -1:
                # Expression is a named entity, or it cannot be located in the text.
                e = context_tmp
                context = context_tmp
            else:
                context = context_tmp[:pos] + context_tmp[pos + len(e):]

            # Filter before tokenizing so that dropped entries cost nothing.
            if e == "MWE" or lang not in self.languages:
                continue

            tokenized_e = torch.tensor(self.tokenize_mention(e, self.tokenizer, True))
            tokenized_context = torch.tensor(self.tokenize_mention(context, self.tokenizer, True))

            self.encoded_data.append((idx,
                                      e,
                                      context,
                                      tokenized_e,
                                      tokenized_context,
                                      torch.tensor([-1.0]) if label == True else torch.tensor([1.0]),
                                      sample_id,
                                      lang))

    def tokenize_mention(self, sent, tokenizer, special_tokens):
        """Encode ``sent`` with ``tokenizer`` and keep at most the first 500 token ids.

        ``special_tokens`` is forwarded as ``add_special_tokens``.
        """
        encoded_sentence = tokenizer.encode(sent, add_special_tokens = special_tokens)
        return encoded_sentence[:500]

    def tokenize_description(self, sent, tokenizer, window):
        """Encode ``sent`` with special tokens and no truncation (``window`` is unused)."""
        encoded_sentence = tokenizer.encode(sent, add_special_tokens = True)
        return encoded_sentence

    def __len__(self):
        """Number of encoded entries that passed the filter."""
        return len(self.encoded_data)

    def __getitem__(self, idx: int):
        """Return the encoded tuple at position ``idx``."""
        return self.encoded_data[idx]

Create the dataset:

In [14]:
# Encode the three splits; entries whose language is not in `languages` are dropped by IdiomDataset.
train_dataset = IdiomDataset(train_data, bert_tokenizer, languages = ["EN", "PT", "GL"])
dev_dataset = IdiomDataset(dev_data, bert_tokenizer, languages =["EN", "PT", "GL"])
test_dataset = IdiomDataset(test_data, bert_tokenizer, languages = ["EN","PT", "GL"])

# Number of entries kept in each split.
print(len(train_dataset))
print(len(dev_dataset))
print(len(test_dataset))

100%|██████████| 4492/4492 [00:23<00:00, 187.69it/s]
100%|██████████| 740/740 [00:03<00:00, 198.66it/s]
100%|██████████| 2343/2343 [00:11<00:00, 201.53it/s]

4491
739
2342





Create the dataloader:

In [15]:
def collate(elems: tuple) -> tuple:
    """Turn a list of IdiomDataset tuples into one padded batch.

    Returns an 8-tuple
    ``(ids, expressions_text, contexts_text, padded_expressions,
    padded_contexts, padded_labels, ids, langs)``.

    Both ``ids`` slots carry the sample ids (element 6 of each dataset tuple);
    the dataset index (element 0) is discarded. Token-id tensors are
    right-padded with 0 and stay on their original device, while the label
    tensor is moved to CUDA.
    """
    # Transpose the batch: one tuple per field instead of one tuple per sample.
    _, e, texts, expressions, contexts, labels, ids, langs = zip(*elems)

    def _pad(sequences):
        return pad_sequence(sequences, batch_first=True, padding_value=0)

    return (ids,
            e,
            texts,
            _pad(expressions),
            _pad(contexts),
            _pad(labels).cuda(),
            ids,
            langs)


# Batches of 8 samples; only the training split is shuffled.
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate)
dev_dataloader = DataLoader(dev_dataset, batch_size=8, shuffle=False, collate_fn=collate)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate)

# Number of batches per split.
print(len(train_dataloader))
print(len(dev_dataloader))
print(len(test_dataloader))

562
93
293


## Dual-Encoder Architecture

In [16]:
class DualEncoder(nn.Module):
    """Two BERT encoders scored against each other with cosine similarity.

    * ``expression_encoder`` encodes the expression; its representation is the
      sum of the last-layer hidden states over all token positions.
    * ``context_encoder`` encodes the context; its representation is the
      last-layer hidden state at position 0. Its parameters are frozen.

    ``forward`` returns one cosine similarity per batch element.

    Args:
        hparams: object exposing a ``dropout`` attribute.
    """

    def __init__(self, hparams):
        super(DualEncoder, self).__init__()
        # Print the hyper-parameters that were actually passed in (previously
        # this read the module-level `params` variable instead of the argument).
        pprint(hparams)

        self.hparams = hparams

        self.expression_encoder = BertModel.from_pretrained(model_name, config=bert_config)
        self.context_encoder = BertModel.from_pretrained(model_name, config=bert_config)

        self.cosine_similarity = nn.CosineSimilarity(dim=-1, eps=1e-6)

        # NOTE: `dropout` and `tanh` are created but not applied in forward().
        self.dropout = nn.Dropout(hparams.dropout)

        self.tanh = nn.Tanh()

        # Only the expression encoder is trained.
        for param in self.context_encoder.parameters():
            param.requires_grad = False

    def forward(self, expression, context, mask1, mask2):
        """Return the cosine similarity between expression and context encodings.

        Args:
            expression: token ids of the expressions, one row per sample.
            context: token ids of the contexts, one row per sample.
            mask1: attention mask for ``expression``.
            mask2: attention mask for ``context``.

        Returns:
            Tensor with one similarity value per sample.
        """
        # Run on whatever device the model lives on instead of hard-coding CUDA;
        # for a model moved to the GPU this is the same as before.
        device = next(self.parameters()).device

        context_states = self.context_encoder(context.to(device),
                                              attention_mask=mask2.to(device))[0]
        # Hidden state of the first token of every context.
        embedding_context = context_states[:, 0, :]

        expression_states = self.expression_encoder(expression.to(device),
                                                    attention_mask=mask1.to(device))[0]
        # NOTE(review): the sum runs over every position, padded ones included
        # (the mask is only given to the encoder) -- confirm this is intended.
        embedding_expression = torch.sum(expression_states, 1)

        return self.cosine_similarity(embedding_expression, embedding_context)

In [17]:
class HParams():
    """Hyper-parameter container passed to DualEncoder."""
    dropout = 0.25  # probability handed to nn.Dropout in DualEncoder.__init__
    
params = HParams()

Instantiate the model: 

In [18]:
# Build the dual encoder and move it to the GPU; the bare `my_model` on the last
# line displays the module structure as the cell output.
my_model = DualEncoder(params).cuda()
my_model

<__main__.HParams object at 0x7f5cbda7b0a0>


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dens

DualEncoder(
  (expression_encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

## Trainer

In [19]:
class Trainer():
    """Training / evaluation loop for a model that returns one similarity per sample.

    Args:
        model: module called as ``model(expressions, contexts, mask1, mask2)``.
        loss_function: callable applied to ``(similarities, labels)``.
        optimizer: optimizer over the model parameters.
        gradient_accumulation_steps: number of batches whose gradients are
            accumulated before one optimizer step.
    """

    def __init__(self,
                model:nn.Module,
                loss_function,
                optimizer,
                gradient_accumulation_steps):

        self.model = model
        self.loss_function = loss_function
        self.optimizer = optimizer
        self.gradient_accumulation_steps = gradient_accumulation_steps

    def padding_mask(self, batch):
        """Return a uint8 mask that is 1 where ``batch`` is non-zero and 0 elsewhere."""
        return (batch != 0).type(torch.uint8)

    def train(self,
            train_dataset:Dataset,
            valid_dataset:Dataset,
            epochs:int=1,
            patience:int=10,
            modelname=setting):
        """Train with early stopping on the validation macro-F1.

        After every epoch the model is evaluated on ``valid_dataset``; whenever
        the F1 improves, the weights are saved to
        ``./checkpoints/<modelname>.pt`` and the patience counter is reset,
        otherwise the counter is decreased. Training stops once it reaches 0.

        Args:
            train_dataset: iterable of batches as produced by ``collate``.
            valid_dataset: iterable of batches used for validation.
            epochs: maximum number of epochs.
            patience: number of consecutive non-improving epochs tolerated.
            modelname: file stem of the checkpoint.

        Returns:
            Average training loss of the last epoch that ran (0.0 if none ran).
        """
        print("\nTraining...")

        record_dev = 0.0
        full_patience = patience
        # Guard the modulo below against 0 / negative values.
        accumulation_steps = max(1, self.gradient_accumulation_steps)
        # Defined up-front so the return below also works when no epoch runs.
        avg_epoch_loss = 0.0

        for epoch in range(epochs):
            if patience <= 0:
                # Early stopping: leave the loop instead of idling through the
                # remaining epochs.
                break

            print(" Epoch {:03d}".format(epoch + 1))

            epoch_loss = 0.0
            self.model.train()

            count_batches = 0
            self.optimizer.zero_grad()

            for _, _, _, expressions, contexts, labels, _, _ in tqdm(train_dataset):
                mask1 = self.padding_mask(expressions)
                mask2 = self.padding_mask(contexts)

                similarities = self.model(expressions, contexts, mask1, mask2)

                labels = labels.view(-1)
                batch_loss = self.loss_function(similarities, labels)
                # Detach so the running sum does not keep every batch's graph alive.
                epoch_loss += batch_loss.detach()

                # Scale so that the accumulated gradient is the mean over the
                # accumulated batches (a no-op when accumulation_steps == 1).
                (batch_loss / accumulation_steps).backward()

                # The counter was never incremented before, so the optimizer
                # stepped on every batch regardless of the configured value.
                count_batches += 1
                if count_batches % accumulation_steps == 0:
                    self.optimizer.step()
                    self.optimizer.zero_grad()

            # Apply gradients left over from an incomplete accumulation window.
            if count_batches % accumulation_steps != 0:
                self.optimizer.step()
                self.optimizer.zero_grad()

            avg_epoch_loss = epoch_loss / len(train_dataset)
            print('[E: {:2d}] train loss = {:0.4f}'.format(epoch+1, avg_epoch_loss))

            valid_loss, f1 = self.evaluate(valid_dataset, epoch)

            if f1>record_dev:
                record_dev = f1
                # Make sure the target directory exists before saving.
                os.makedirs("./checkpoints", exist_ok=True)
                torch.save(self.model.state_dict(), "./checkpoints/"+modelname+".pt")
                patience = full_patience
            else:
                patience -= 1

            print('\t[E: {:2d}] valid loss = {:0.4f}, f1-score = {:0.4f}, patience: {:2d}'.format(epoch+1, valid_loss, f1, patience))

        print("...Done!")

        return avg_epoch_loss

    def evaluate(self, valid_dataset, epoch):
        """Evaluate the model and print a classification report.

        A sample is predicted as class 1 when its similarity is greater than 0
        and as class 0 otherwise; the gold class is 1 when the label equals 1
        and 0 otherwise.

        Args:
            valid_dataset: iterable of batches as produced by ``collate``.
            epoch: current epoch (unused).

        Returns:
            Tuple ``(average loss per batch, macro F1)``.
        """
        valid_loss = 0.0
        all_predictions = list()
        all_labels = list()

        self.model.eval()

        for _, _, _, expressions, contexts, labels, _, _ in tqdm(valid_dataset):
            mask1 = self.padding_mask(expressions)
            mask2 = self.padding_mask(contexts)

            with torch.no_grad():
                similarities = self.model(expressions, contexts, mask1, mask2)
                labels = labels.view(-1)
                valid_loss += self.loss_function(similarities, labels)

            # One device-to-host transfer per batch instead of one per element.
            all_predictions.extend(1 if value > 0 else 0 for value in similarities.tolist())
            all_labels.extend(1 if value == 1 else 0 for value in labels.tolist())

        f1 = f1_score(all_labels, all_predictions, average= 'macro')
        print(classification_report(all_labels, all_predictions, digits=4))

        return valid_loss / len(valid_dataset), f1

Instantiate the trainer:

In [20]:
# MSE between the cosine similarity and the -1.0 / 1.0 labels, optimized with Adam;
# gradient_accumulation_steps=1 means one optimizer step per batch.
trainer = Trainer(model = my_model,
                    loss_function = nn.MSELoss(),
                    optimizer = optim.Adam(my_model.parameters(), lr=0.00001),
                    gradient_accumulation_steps=1)

Train the system:

In [21]:
# Training only runs in "train" mode; `epochs` is an upper bound, the effective
# stop criterion is the patience of 5 epochs without F1 improvement on the dev set.
if mode == "train":
    trainer.train(train_dataloader, dev_dataloader, epochs = 1000, patience = 5)

## Predict the test set

Load the best model checkpoint:

In [22]:
# Checkpoint written by Trainer.train for the selected setting.
checkpoint_path = f"checkpoints/{setting}.pt"

# Fail with an actionable message when there is nothing to load, e.g. when the
# notebook is run in "eval" mode before any model has been trained.
if not os.path.isfile(checkpoint_path):
    raise FileNotFoundError(
        f"No such file or directory: '{checkpoint_path}'. "
        "Run the notebook with mode 'train' first, or place a pretrained checkpoint at this path."
    )

my_model.load_state_dict(torch.load(checkpoint_path))

FileNotFoundError: [Errno 2] No such file or directory: 'checkpoints/zero-shot.pt'

and then use the model to predict the test set:

In [None]:
def padding_mask(batch):
    """Return a uint8 tensor shaped like ``batch``: 1 where the entry is non-zero, 0 where it is 0."""
    is_non_zero = batch != 0
    return is_non_zero.type(torch.uint8)

def _write_submission(predictions, target_setting):
    """Write predictions/task2_subtaska.csv from data/test_submission_format.csv.

    The first three columns of every template row are copied. The fourth column
    is the prediction for the row's id when the row's third column equals
    ``target_setting`` or the row is the header (id "ID"); otherwise it is empty.
    """
    # Make sure the output directory exists before opening the file.
    os.makedirs("predictions", exist_ok=True)

    with open("data/test_submission_format.csv") as csvfile, \
         open("predictions/task2_subtaska.csv", "w") as out:
        for row in csv.reader(csvfile, delimiter=','):
            if row[2] == target_setting or row[0] == "ID":
                label = str(predictions[row[0]])
            else:
                label = ""
            out.write(row[0] + "," + row[1] + "," + row[2] + "," + label + "\n")


def predict(model, test_dataset):
    """Predict every batch of ``test_dataset`` and write the submission file.

    A sample is labelled 1 when its similarity is greater than 0 and 0
    otherwise. Only rows of the selected setting (module-level ``setting``)
    receive a label in the output file.

    Args:
        model: module called as ``model(expressions, contexts, mask1, mask2)``.
        test_dataset: iterable of batches as produced by ``collate``.
    """
    predictions = {}

    model.eval()

    for ids, _, _, expressions, contexts, _, _, _ in tqdm(test_dataset):
        mask1 = padding_mask(expressions)
        mask2 = padding_mask(contexts)

        with torch.no_grad():
            similarities = model(expressions, contexts, mask1, mask2)

        for sample_id, similarity in zip(ids, similarities.tolist()):
            predictions[sample_id] = 1 if similarity > 0 else 0

    # Lets the header row of the template go through the same lookup as data rows.
    predictions["ID"] = "Label"

    # Value of the template's third column for the selected setting.
    target_setting = "zero_shot" if setting == "zero-shot" else "one_shot"
    _write_submission(predictions, target_setting)


# Run inference on the test split and write predictions/task2_subtaska.csv.
predict(my_model, test_dataloader)

The output file (saved in the `predictions/` folder) contains the predictions in the standard format specified by the competition rules. You can now upload the output file to CodaLab to see the scores obtained by the system on the test set.