## Data Processing



## Move this notebook into the main project folder before execution (it imports `hw3` and reads from `data/` using relative paths)

### tokenizer and embedder model from BERT

In [25]:
from transformers import AutoTokenizer,AutoModel


# Marker tokens wrapped around the mentions in every sentence (following the
# reference paper): <a>...</a> delimits entity A, <b>...</b> delimits entity B
# and <p> is placed right before the pronoun.
delimitaror = ["<a>", "</a>", "<b>", "</b>", "<p>"]

# Cased BERT tokenizer; the markers are registered as special tokens so that
# they are kept as single tokens instead of being split into word pieces.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-cased",
    never_split=delimitaror,
)
tokenizer.add_tokens(delimitaror, special_tokens=True)

# Pre-trained encoder, configured to also expose every layer's hidden states.
auto_model = AutoModel.from_pretrained(
    "bert-base-cased",
    output_hidden_states=True,
)



Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Dataset 

In [26]:
from torch.utils.data import Dataset
import os
from  typing import Dict,List,Tuple
import pandas as pd
from hw3.evaluate import read_dataset
import torch
import numpy as np

class CoreferenceDataset(Dataset):
    """Pronoun coreference dataset framed as a three-way choice.

    Every row is tagged with the markers ``<a>``/``</a>``, ``<b>``/``</b>`` and
    ``<p>``, tokenized, and expanded into three candidate sequences::

        [CLS] tagged text [SEP] <pronoun> is neither    [SEP]   -> label 0
        [CLS] tagged text [SEP] <pronoun> is <entity A> [SEP]   -> label 1
        [CLS] tagged text [SEP] <pronoun> is <entity B> [SEP]   -> label 2

    Args:
        tokenizer: object exposing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token`` and ``sep_token``.
        modality: split name; ``".tsv"`` is appended to build the file name.
        data_path: folder containing the split file.
        truncate_up_to_pron: if True, the tagged text is cut right after the
            last inserted tag and the pronoun is appended.
        labeled: stored on the instance; not consulted by the visible code.
        inference: if True, the rows are read but not pre-processed; use
            ``prapare_batch`` to encode rows on demand.
    """

    def __init__(
        self,
        tokenizer,
        modality: str,
        data_path: str,
        truncate_up_to_pron: bool = True,
        labeled: bool = True,
        inference: bool = False
    ):
        modality = modality + ".tsv"
        self.folder = os.path.join(data_path, modality)
        self.truncate_up_to_pron = truncate_up_to_pron
        self.labeled = labeled
        self.tokenizer = tokenizer

        self.data = read_dataset(self.folder)

        # Plain strings: the original trailing commas silently turned the
        # first four attributes into 1-tuples.
        self.pronoun = "<p>"
        self.A_start = "<a>"
        self.A_finish = "</a>"
        self.B_start = "<b>"
        self.B_finish = "</b>"

        CLS = [self.tokenizer.cls_token]
        SEP = [self.tokenizer.sep_token]

        self.CLS = CLS
        self.SEP = SEP

        if inference:
            # Nothing is pre-computed in inference mode; keep the attribute
            # defined so that __len__/__getitem__ do not raise AttributeError.
            self.dataset = []
        else:
            self.dataset = self.pre_processing(CLS, SEP)

    def _encode_row(self, row: Dict, CLS: List[str], SEP: List[str]) -> Dict:
        """Tokenize one row and build the ids of its three candidate sequences."""
        tokens, offsets = self.tokenize_sentence(row)

        # offsets["<p>"] / offsets["<a>"] / offsets["<b>"] point at the first
        # token after the tag, offsets["</a>"] / offsets["</b>"] at the closing
        # tag itself, so the slices below hold exactly the entity tokens.
        pronoun = tokens[offsets["<p>"][0]]
        A_entity = tokens[offsets["<a>"][0]:offsets["</a>"][0]]
        B_entity = tokens[offsets["<b>"][0]:offsets["</b>"][0]]

        candidates = [
            CLS + tokens + SEP + [pronoun, "is", "neither"] + SEP,
            CLS + tokens + SEP + [pronoun, "is"] + A_entity + SEP,
            CLS + tokens + SEP + [pronoun, "is"] + B_entity + SEP,
        ]

        return {
            'tokens': [self.tokenizer.convert_tokens_to_ids(c) for c in candidates],
            'offsets': self._get_offsets_list(offsets),
        }

    @staticmethod
    def _pad_token_ids(sequences: List[List[int]], pad: int, truncate: int) -> np.ndarray:
        """Right-pad (and clip to ``truncate``) id lists into one int64 matrix."""
        max_len = min(max(len(ids) for ids in sequences), truncate)
        padded = np.full((len(sequences), max_len), pad, dtype=np.int64)

        for row_idx, ids in enumerate(sequences):
            # Clip first: assigning a longer list into the slice would raise.
            # NOTE(review): clipping drops the tail of the sequence, which is
            # where the "<pronoun> is ..." hypothesis lives — confirm that
            # inputs above `truncate` tokens are acceptable to lose this way.
            ids = ids[:max_len]
            padded[row_idx, :len(ids)] = ids

        return padded

    def pre_processing(self, CLS, SEP):
        """Encode every row of ``self.data`` and attach its ground-truth label.

        Returns:
            List of dicts with keys ``tokens`` (three lists of ids),
            ``offsets`` (see ``_get_offsets_list``) and ``labels``
            (1 = entity A, 2 = entity B, 0 = neither).
        """
        dataset = []

        for row in self.data:
            elements = self._encode_row(row, CLS, SEP)

            # Ground truth: the flags may be stored as the string "TRUE" or as
            # a real boolean.
            if row['is_coref_A'] in ["TRUE", True]:
                elements['labels'] = 1
            elif row['is_coref_B'] in ["TRUE", True]:
                elements['labels'] = 2
            else:
                elements['labels'] = 0

            dataset.append(elements)
        return dataset

    def _get_offsets_list(self, offsets: Dict[str, List[int]]) -> List[int]:
        """Return ``[pronoun, A_start, A_end, B_start, B_end]`` token offsets.

        Every value is shifted by 1 to account for the CLS token that is
        prepended to each candidate sequence.
        """
        offsets_A = [offsets["<a>"][0] + 1, offsets["</a>"][0] + 1]
        offsets_B = [offsets["<b>"][0] + 1, offsets["</b>"][0] + 1]

        return [offsets["<p>"][0] + 1] + offsets_A + offsets_B

    def _insert_tag(self, text: str, offsets: Tuple[int, int],
                    start_tag: str, end_tag: str = None) -> str:
        """Insert ``start_tag`` (and optionally ``end_tag``) into ``text``.

        ``offsets`` is ``(start, end)`` in characters; ``end`` is ignored when
        ``end_tag`` is None.
        """
        start_off, end_off = offsets

        # Starting tag only
        if end_tag is None:
            text = text[:start_off] + start_tag + text[start_off:]
            return text

        text = text[:start_off] + start_tag + text[start_off:end_off] + end_tag + text[end_off:]
        return text

    def tokenize_sentence(self, row: Dict):
        """Insert the five marker tags into ``row['text']`` and tokenize it.

        Returns:
            ``(tokens, offsets)`` where ``tokens`` includes the tags and
            ``offsets`` maps each tag to a list of token indices: for ``<p>``,
            ``<a>`` and ``<b>`` the index of the token right after the tag, for
            ``</a>`` and ``</b>`` the index of the tag itself.
        """
        tag_labels = {
            "pronoun_tag": "<p>",
            "start_A_tag": "<a>",
            "end_A_tag": "</a>",
            "start_B_tag": "<b>",
            "end_B_tag": "</b>"
        }
        known_tags = set(tag_labels.values())

        tokens = []
        offsets = {tag: [] for tag in tag_labels.values()}

        text = row['text']
        pronoun = row['pron']
        A_entity = row['entity_A']
        B_entity = row['entity_B']

        # Sort the character offsets in ascending order
        break_points = sorted([
            (tag_labels["pronoun_tag"], row['p_offset']),
            (tag_labels["start_A_tag"], row['offset_A']),
            (tag_labels["end_A_tag"], row['offset_A'] + len(A_entity)),
            (tag_labels["start_B_tag"], row['offset_B']),
            (tag_labels["end_B_tag"], row['offset_B'] + len(B_entity)),
        ], key=lambda x: x[1])

        # When a new tag is inserted, the offset of every following tag
        # grows by the length of the inserted tag.
        len_added_tags = 0
        shifted_offset, tag = 0, ""
        for tag, offset in break_points:
            shifted_offset = offset + len_added_tags
            text = self._insert_tag(text, (shifted_offset, None), tag)
            len_added_tags += len(tag)

        # Cut the text right after the last inserted tag and append the pronoun
        if self.truncate_up_to_pron:
            text = text[:shifted_offset + len(tag)] + pronoun

        # The tags are kept among the tokens and their positions recorded
        for token in self.tokenizer.tokenize(text):
            tokens.append(token)

            if token in known_tags:
                if "/" in token:  # closing tag: index of the tag itself
                    offsets[token].append(len(tokens) - 1)
                else:  # opening tag / pronoun tag: index of the next token
                    offsets[token].append(len(tokens))

        return tokens, offsets

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return self.dataset[idx]

    def prapare_batch(self, batch, device, pad: int = 0, truncate: int = 512):
        """Encode raw rows at inference time into a padded id tensor.

        Args:
            batch: iterable of raw rows (same schema consumed by
                ``tokenize_sentence``).
            device: device on which the tensor is created.
            pad: id used for right padding.
            truncate: maximum sequence length; longer sequences are clipped.

        Returns:
            ``{"tokens": LongTensor of shape (len(batch), 3, max_len)}``.

        Raises:
            ValueError: if ``batch`` is empty.
        """
        # The original re-bound `batch = []` here, discarding the caller's
        # rows and then failing on max() of an empty sequence.
        rows = list(batch)
        if not rows:
            raise ValueError("prapare_batch received an empty batch")

        encoded = [self._encode_row(instance, self.CLS, self.SEP) for instance in rows]

        # Flatten to batch_size x 3 sequences, keeping the candidate order.
        sequences = []
        for sample in encoded:
            sequences.append(sample["tokens"][0])
            sequences.append(sample["tokens"][1])
            sequences.append(sample["tokens"][2])

        padded = self._pad_token_ids(sequences, pad, truncate)

        tokens_padded = torch.tensor(padded, device=device)
        tokens_padded = tokens_padded.view(len(encoded), 3, padded.shape[1])

        return {"tokens": tokens_padded}

    # Correctly spelled alias; the original (misspelled) name is kept for
    # existing callers.
    prepare_batch = prapare_batch
        


### Testing the pre-processing and `CoreferenceDataset`

In [27]:
# Build the train / dev datasets from the TSV files stored under `path_data`.
path_data = 'data'
train_ds = CoreferenceDataset(tokenizer, "train", path_data)
dev_ds = CoreferenceDataset(tokenizer, "dev", path_data)

# Sanity check of the pre-processing: decode candidate 1 ("<pronoun> is
# <entity A>") of the first training sample back into text.
test = train_ds[0]
tokenizer.decode(test["tokens"][1])


'[CLS] He grew up in Evanston, Illinois the second oldest of five children including his brothers, Fred and Gordon and sisters, Marge ( Peppy ) and Marilyn. His high school days were spent at New Trier High School in Winnetka, Illinois. <a> MacKenzie </a> studied with <b> Bernard Leach </b> from 1949 to 1952. <p> His [SEP] His is MacKenzie [SEP]'

In [28]:
# Same sanity check for candidate 0 ("<pronoun> is neither") of the first sample.
test = train_ds[0]
tokenizer.decode(test["tokens"][0])

'[CLS] He grew up in Evanston, Illinois the second oldest of five children including his brothers, Fred and Gordon and sisters, Marge ( Peppy ) and Marilyn. His high school days were spent at New Trier High School in Winnetka, Illinois. <a> MacKenzie </a> studied with <b> Bernard Leach </b> from 1949 to 1952. <p> His [SEP] His is neither [SEP]'

### DataLoader and custom collate function

In [29]:
from torch.utils.data import DataLoader

# Number of dataset samples per DataLoader batch; collate_function expands each
# sample into 3 candidate sequences, so a batch holds batch_size * 3 sequences.
batch_size = 16


import torch
import numpy as np


def collate_function(data:list, device: str="cuda:0", pad: int=0, truncate: int=512):
    """Collate dataset samples into padded tensors.

    Args:
        data: list of samples, each a dictionary with keys "tokens" (three
            lists of token ids), "offsets" (unused here) and "labels".
        device: device on which the tensors are created (default "cuda:0").
        pad: value used for right padding; 0 is BERT's padding token id.
        truncate: maximum sequence length; longer sequences are clipped.

    Returns:
        Dictionary with
        "tokens": int64 tensor of shape (batch_size, 3, max_len), where
            max_len = min(longest sequence in the batch, truncate);
        "labels": uint8 tensor of shape (batch_size,).

    Raises:
        ValueError: if ``data`` is empty.
    """
    if not data:
        raise ValueError("collate_function received an empty batch")

    batch_size = len(data)

    # Flatten to batch_size x 3 sequences, keeping the candidate order.
    sequences = []
    labels = []
    for sample in data:
        sequences.append(sample["tokens"][0])
        sequences.append(sample["tokens"][1])
        sequences.append(sample["tokens"][2])
        labels.append(sample["labels"])

    max_len = min(max(len(ids) for ids in sequences), truncate)

    padded = np.full((batch_size * 3, max_len), pad, dtype=np.int64)

    # Copy each id list into its row. Clip first: assigning a list longer than
    # max_len into the slice would raise a broadcasting error.
    # NOTE(review): clipping drops the tail of the sequence (where the
    # "<pronoun> is ..." hypothesis lives) — confirm this is acceptable for
    # inputs longer than `truncate`.
    for row_idx, ids in enumerate(sequences):
        ids = ids[:max_len]
        padded[row_idx, :len(ids)] = ids

    tokens_padded = torch.tensor(padded, device=device)
    tokens_padded = tokens_padded.view(batch_size, 3, max_len)

    # dtype kept as uint8 for backward compatibility with existing callers.
    labels_tensor = torch.tensor(labels, dtype=torch.uint8, device=device)

    return {"tokens": tokens_padded, "labels": labels_tensor}

# Disabled smoke test for the DataLoaders, kept as a bare string literal so it
# is not executed. Because the string is the last expression of the cell, the
# notebook echoes it as the cell output.
"""
train_dataloader = DataLoader(train_ds, batch_size=batch_size, 
                              collate_fn=collate_function, shuffle=True)

valid_dataloader = DataLoader(dev_ds, batch_size=batch_size, 
                              collate_fn=collate_function, shuffle=False)

for data in train_dataloader:
    print(data["tokens"].size())
"""   

'\ntrain_dataloader = DataLoader(train_ds, batch_size=batch_size, \n                              collate_fn=collate_function, shuffle=True)\n\nvalid_dataloader = DataLoader(dev_ds, batch_size=batch_size, \n                              collate_fn=collate_function, shuffle=False)\n\nfor data in train_dataloader:\n    print(data["tokens"].size())\n'

## Coreference Resolution models and Training Strategy

### Model Classifier

In [30]:
import torch.nn as nn


class ConferenceResolution(nn.Module):
    """Scores the three candidate sequences of each sample with a BERT encoder.

    Every candidate is encoded, its pooled output is passed through batch
    normalisation, dropout and a two-layer head producing one score; the three
    scores of a sample form the logits of a 3-class problem.

    Args:
        bert_model: encoder exposing ``resize_token_embeddings`` and returning
            an object with a ``pooler_output`` attribute.
        tokenizer: tokenizer whose ``len()`` is the full vocabulary size,
            added tokens included.
        config: unused in this deployment.
    """

    def __init__(self, bert_model,tokenizer,config):
        super().__init__()
        # note  : config is not needed with this deployment case
        self.bert_model = bert_model

        # len(tokenizer) is the documented way to get the vocabulary size
        # including added tokens; `tokenizer.vocab` can miss them on slow
        # tokenizers, which would leave the new tag ids out of range.
        self.bert_model.resize_token_embeddings(len(tokenizer))
        self.criterion= torch.nn.CrossEntropyLoss()

        # Read the width from the encoder when available, 768 otherwise.
        hidden_size = getattr(getattr(bert_model, "config", None), "hidden_size", 768)

        self.normalize = nn.BatchNorm1d(hidden_size)
        self.dropout = nn.Dropout(0.5)
        # Not used by forward(); kept so existing state_dicts keep their keys.
        self.classifier = nn.Linear(hidden_size, 1)

        self.classifier0 = nn.Linear(hidden_size, hidden_size)
        self.classifier1 = nn.Linear(hidden_size, 1)
        self.relu = nn.ReLU()
        self.dropout0 = nn.Dropout(0.3)

    def forward(self, sample):
        """Compute the logits and, when labels are given, the loss.

        Args:
            sample: dict with "tokens" of shape [Batch, 3, Tokens] and,
                optionally, "labels" of shape [Batch].

        Returns:
            ``(output, loss)``: ``output`` has shape [Batch, 3]; ``loss`` is
            the cross-entropy against ``sample["labels"]`` or None when no
            labels are provided.
        """
        #[Batch,3,Tokens]
        bert_input = sample['tokens']

        b,_,l = bert_input.size()

        #[Batch*3,Tokens]
        bert_input = bert_input.view(-1,l)

        # Padding uses id 0, so every non-zero position is attended to.
        bert_outputs = self.bert_model(bert_input, attention_mask=(bert_input > 0).long(),token_type_ids=None, output_hidden_states=True)

        #[Batch*3,hidden_size]
        out = bert_outputs.pooler_output

        pooled_output = self.normalize(out)
        pooled_output = self.dropout(pooled_output)

        #[Batch*3,1]
        logits = self.classifier0(pooled_output)
        logits = self.relu(logits)
        logits = self.dropout0(logits)
        logits = self.classifier1(logits)

        #[Batch,3]
        output = logits.view(-1, 3)
        loss = None

        # The original tested `self.train`, a bound method that is always
        # truthy, so label-free inference raised KeyError. The loss is now
        # computed whenever labels are present (training and validation).
        if 'labels' in sample:
            # CrossEntropyLoss expects class indices as int64.
            labels = sample['labels'].long()
            loss = self.criterion(output, labels)

        return output,loss

### Training loop and Evaluation

In [32]:
from sklearn.metrics import f1_score
from torch.optim.lr_scheduler import ExponentialLR
from torch.cuda.amp import GradScaler
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import confusion_matrix
# TensorBoard writer for the per-epoch loss / F1 curves.
writer = SummaryWriter()

config = {}
model = ConferenceResolution(auto_model,tokenizer,config).cuda()

optimizer = torch.optim.Adam(model.parameters(),lr = 0.000004)
scheduler = ExponentialLR(optimizer, gamma=0.96)


train_dataloader = DataLoader(train_ds, batch_size=batch_size,
                              collate_fn=collate_function, shuffle=True)
valid_dataloader = DataLoader(dev_ds, batch_size=batch_size,
                              collate_fn=collate_function, shuffle=False)

# Gradient scaler for mixed-precision training.
scaler = GradScaler()


EPOCHS = 20
patience_counter = 0
patience = 8
# Best (lowest) validation loss seen so far; the name is kept from the
# original cell.
max_val_loss = float("inf")

for epoch in range(EPOCHS):

    #TRAINING
    p = []
    g = []
    model.train()
    losses = 0.0
    losses_eval = 0.0
    for i_batch, sample_batched in enumerate(train_dataloader):
        optimizer.zero_grad()

        # Only the forward pass and the loss run under autocast; the backward
        # pass and the optimizer step must stay outside of it.
        with torch.autocast(device_type="cuda"):
            output,loss = model(sample_batched)

        scaler.scale(loss).backward()
        # Unscale before clipping so the threshold applies to the true gradients.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(),0.6)

        ### Update weights ###
        scaler.step(optimizer)
        scaler.update()

        predicted = torch.argmax(output, dim=1)
        labels = sample_batched['labels']
        p += predicted.tolist()
        g += labels.tolist()

        losses += loss.item()

    #-------------------------RESULTS----------------------------------
    print("Epochs n.", epoch)

    f1_ =  f1_score(g, p, average=None)
    f1_avg =  f1_score(g, p, average="weighted")
    print("F1 train:",f1_)
    scheduler.step()
    # Each batch loss is already a mean over its samples, so the epoch average
    # divides by the number of batches (not by the batch size).
    losses = losses/max(len(train_dataloader), 1)
    writer.add_scalar("Loss/train", losses, epoch)
    writer.add_scalar("result/train", f1_avg, epoch)

    #EVALUATION
    p = []
    g = []
    model.eval()
    with torch.no_grad():
        for i_batch, sample_batched in enumerate(valid_dataloader):
            output,loss = model(sample_batched)

            predicted = torch.argmax(output, dim=1)
            labels = sample_batched['labels']
            p += predicted.tolist()
            g += labels.tolist()
            losses_eval += loss.item()

    #-------------------------RESULTS----------------------------------
    f1_ =  f1_score(g, p, average=None)
    f1_avg =  f1_score(g, p, average="weighted")
    print("F1 eval:",f1_)
    losses_eval = losses_eval/max(len(valid_dataloader), 1)
    writer.add_scalar("Loss/eval", losses_eval, epoch)
    writer.add_scalar("result/eval", f1_avg, epoch)

    # Early stopping on the validation loss.
    if max_val_loss > losses_eval:
        max_val_loss = losses_eval
        # Reset on improvement so `patience` counts consecutive bad epochs.
        patience_counter = 0
        # Checkpoint the best model so far. The original saved only when early
        # stopping fired and wrote both objects to the same path, so the
        # second save overwrote the first.
        # NOTE(review): the encoder now goes to "model_bert.pth" (the path in
        # the previously commented-out code) — confirm against the loader.
        torch.save(model.state_dict(), "hw3/saved/model.pth")
        torch.save(model.bert_model, "hw3/saved/model_bert.pth")
    else:
        patience_counter += 1

    if patience_counter > patience:
        break

    # Alternative criterion (not enabled): track the best weighted F1
    # (`f1_avg`) instead of the validation loss.

# Confusion matrix of the last evaluated epoch; computed after the loop so
# that `cm` exists even when early stopping never triggers.
cm = confusion_matrix(g, p)
writer.flush()


Epochs n. 0
F1 train: [0.69752066 0.87819549 0.88465763]
F1 eval: [0.61261261 0.8        0.8056872 ]
Epochs n. 1
F1 train: [0.86731392 0.92865232 0.93001099]
F1 eval: [0.61261261 0.79365079 0.79713604]
Epochs n. 2
F1 train: [0.87797147 0.95065913 0.94907749]
F1 eval: [0.6        0.79795396 0.80095923]
Epochs n. 3
F1 train: [0.90393701 0.96015038 0.96038504]
F1 eval: [0.6        0.78740157 0.79156909]
Epochs n. 4
F1 train: [0.92068429 0.96676737 0.97227357]
F1 eval: [0.60377358 0.78306878 0.79716981]
Epochs n. 5
F1 train: [0.94573643 0.97917456 0.98081181]
F1 eval: [0.6440678  0.79144385 0.79807692]
Epochs n. 6
F1 train: [0.96583851 0.98715042 0.98891353]
F1 eval: [0.66115702 0.77628032 0.80769231]
Epochs n. 7
F1 train: [0.95193798 0.98451077 0.98446746]
F1 eval: [0.64285714 0.80104712 0.80193237]
Epochs n. 8
F1 train: [0.97017268 0.99132403 0.98818316]
F1 eval: [0.61261261 0.79274611 0.79318735]
Epochs n. 9
F1 train: [0.96865204 0.99322289 0.99481865]
F1 eval: [0.60952381 0.81038961 0.

In [1]:
import matplotlib.pyplot as plt
# Render the confusion matrix as a grayscale image. `cm` is produced by the
# training cell, so that cell must have been run first in the same kernel.
plt.imshow(cm, cmap='binary')

NameError: name 'cm' is not defined

In [9]:
import seaborn as sns
import matplotlib.pyplot as plt     

# Draw the confusion matrix `cm` as an annotated heatmap.
ax = plt.subplot()
sns.heatmap(
    cm,
    annot=True,  # write the count inside every cell
    fmt='g',     # general number format, avoids scientific notation
    ax=ax,
)

# Axis captions and title
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')

# Class names, in label order (0, 1, 2)
ax.xaxis.set_ticklabels(['Nothing', 'Entity A', "Entity B"])
ax.yaxis.set_ticklabels(['Nothing', 'Entity A', "Entity B"])

ImportError: cannot import name '_docstring' from 'matplotlib' (/home/mv/miniconda3/envs/nlp2022-hw2/lib/python3.9/site-packages/matplotlib/__init__.py)