## BERT and TOKENIZER 

In [2]:
from transformers import AutoModel,AutoTokenizer

auto_model = AutoModel.from_pretrained("bert-base-cased",output_hidden_states=True)

print(f"\nmodel class is      : {type(auto_model)}")


tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
print(f"\nmodel class is      : {type(tokenizer)}")




Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



model class is      : <class 'transformers.models.bert.modeling_bert.BertModel'>

model class is      : <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>


## Semantic Role Labelling Dataset

In [3]:
import os
import json
import logging
import torch
from torch.utils.data import DataLoader,Dataset
import random
from typing import Dict

class SRL(Dataset):
 
    def __init__(self,language,path) -> None:

        self.path_root = 'data'
        self.load_data(language,path)
        self.args_roles,self.list_broken_id = self.list_arg_roles()
        self.pos_list,_ = self.list_pos()
        self.device = device 

    def load_data(self,language,mode):
        
        mode = mode+".json"
        path = os.path.join(self.path_root,language,mode)
        data_file = open(path)
       
        data_ = json.load(data_file)

        list_data = []

        for data in data_:
            list_data.append(data_[data])
        

        self.data = list_data
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, id : int):

        flag = False
        if id in self.list_broken_id :
            flag = True
            while flag == True:

                rand_id = random.randint(0, len(self.data))
                
                if rand_id in self.list_broken_id :
                    pass
                else :
                    flag = False
                    id = rand_id        


        data = self.pre_processing(self.data[id])
        data = self.processig(data)
        return data
        
    def pre_processing(self, data:dict):
        data_list = []
        for role in data["roles"]:
            dictionary = dict()
            dictionary["words"] = data["words"]
            dictionary["role"] = data["roles"][role]
            dictionary["pre_idx"] = role
            
            dictionary["pos_tags"] = data["pos_tags"]
            data_list.append(dictionary)    
        return data_list
    
    def processig(self,data_list:list):
        
        for dictionary in data_list:

            #dictionary["words"] = data["words"]
            dictionary["gt_arg_identification"] = self.arg_id(dictionary["role"])
            dictionary["gt_arg_classification"] = self.arg_class(dictionary["role"])
            dictionary["pos_idx"] = self.pos_idx(dictionary["pos_tags"])
        
        return data_list
   
    def list_arg_roles(self):
        list_roles = []
        list_broken_id = []
        for i,element in enumerate(self.data):
            flag = True
            try : roles = element["roles"]
            except : flag = False
            if flag :
                for e in roles:
                    sentence = element["roles"][e]

                    for word in sentence:
                        
                        list_roles.append(word)
                list_roles = list(set(list_roles))
            else : 
                list_broken_id.append(i)
        return list_roles,list_broken_id

    def list_pos(self):
        list_pos = []
        list_broken_id = []
        for i,element in enumerate(self.data):
            flag = True
            try : pos = element["pos_tags"]
            except : flag = False
            if flag :
                for e in pos:
                    list_pos.append(e)
                list_pos = list(set(list_pos))
            else : 
                list_broken_id.append(i)
        return list_pos,list_broken_id
  
    def arg_class(self,role:list):
        list_idxs = []
        for element in role:
            list_idxs.append(self.args_roles.index(element))
        

        return torch.tensor(list_idxs, dtype=torch.int64)

    def arg_id(self,role:dict):
        list_idxs = []
        for element in role:
            if element == "_":
                list_idxs.append(0)
            else :
                list_idxs.append(1)

        

        return torch.tensor(list_idxs, dtype=torch.int64)

    def pos_idx(self,pos_tags:dict):
        list_idxs = []
        for element in pos_tags:
            list_idxs.append(self.pos_list.index(element))
    
        return torch.tensor(list_idxs, dtype=torch.int64)
    
# here we define our collate function
def collate_fn(batch) -> Dict[str, torch.Tensor]:
    #print(batch)
    input = dict() 
    batch_sentence = [] 
    #print(len(batch))
    for period in batch:
        for sentence in period :
        
            #print(len(sentence[0]["words"]))
            pre_idx = int(sentence["pre_idx"])
            

            predicate = sentence["words"][pre_idx]

            text = " ".join(sentence["words"])
            tokens: list[str] = text.split()
            predicate: list[str] = predicate.split()

            #text = sentence[0]["words"]
            
            t = (tokens,predicate)

            batch_sentence.append(t)
            #print(batch_sentence)

    batch_output = tokenizer.batch_encode_plus(batch_sentence,padding=True,is_split_into_words=True, truncation=True,return_offsets_mapping=True, return_tensors="pt")
    #print(batch_output.keys())


    gt = dict()
    
    


    for period in batch:

        list_positional_predicate_encoding = []
        list_arg_gt = []
        list_predicate_index = [] 
        list_pos_index = [] 

        for sentence in period:
            #positional_encoding
            #+2 per il CLS iniziale ad SEP finale
            sentence_words_lenght =  len(sentence["words"])
            positional_predicate_encoding = torch.zeros(1,sentence_words_lenght+2)
            #+1 per il CLS iniziale
            pre_idx = int(sentence["pre_idx"])
            positional_predicate_encoding[:,pre_idx+1] = 1
            list_positional_predicate_encoding.append(positional_predicate_encoding)
            #print("positional_prefix_encoding",positional_predicate_encoding)
            list_predicate_index.append(pre_idx)




            pos = torch.unsqueeze(sentence["pos_idx"],dim = 0)
            list_pos_index.append(pos)






            #note CLS and SEP are discharder after Bi-LSTM, the Classifier takes in input only wokrds hidden state embedding
            arg_gt = torch.unsqueeze(sentence["gt_arg_classification"],dim = 0)
            list_arg_gt.append(arg_gt)
        


        


    list_arg_gt = torch.cat(list_arg_gt,dim = 0)
    list_pos_index = torch.cat(list_pos_index,dim = 0)
    list_positional_predicate_encoding = torch.cat(list_positional_predicate_encoding,dim = 0)



    gt["arg_gt"] = list_arg_gt
    input["predicate_index"] = list_predicate_index
    input["pos_index"] = list_pos_index.long()


    offset = batch_output.pop("offset_mapping")


    input["BERT_input"] = batch_output
    input["positional_encoding"] = list_positional_predicate_encoding.long()
    input["offset_mapping"] = offset
    input["gt"] = gt

   


    return input



## Training Argument Identification and Classification

In [7]:
from hw2.stud.arg import Arg_Classifier 
from sklearn.metrics import f1_score
from torch.optim.lr_scheduler import ExponentialLR

model = Arg_Classifier("EN").cuda()

optimizer = torch.optim.Adam(model.parameters())
scheduler = ExponentialLR(optimizer, gamma=0.9)

train_dataset = SRL("EN","train")
dev_dataset = SRL("EN","dev")
logSotfMax = torch.nn.LogSoftmax(dim=1)
nll_loss = torch.nn.NLLLoss()

dataloader_train = DataLoader(train_dataset, batch_size=1, shuffle=False, sampler=None,
           batch_sampler=None, num_workers=0, collate_fn=collate_fn,
           pin_memory=False, drop_last=False, timeout=0,
           worker_init_fn=None, prefetch_factor=2,
           persistent_workers=False)

dataloader_dev = DataLoader(dev_dataset, batch_size=1, shuffle=False, sampler=None,
           batch_sampler=None, num_workers=0, collate_fn=collate_fn,
           pin_memory=False, drop_last=False, timeout=0,
           worker_init_fn=None, prefetch_factor=2,
           persistent_workers=False)

auto_model.eval()

EPOCHS = 20

for epoch in range(EPOCHS):

    #TRAINING
    p = []
    g = []
    model.train()
    for i_batch, sample_batched in enumerate(dataloader_train):
        optimizer.zero_grad()
       
        #----------------------PREPARE INPUT/OUTPUT-------------------------------
        input_bert = sample_batched["BERT_input"]
        input_bert['input_ids'] = input_bert['input_ids'].cuda()
        input_bert['token_type_ids'] = input_bert['token_type_ids'].cuda()
        input_bert['attention_mask'] = input_bert['attention_mask'].cuda()
        sample_batched["positional_encoding"] = sample_batched["positional_encoding"].cuda() 
        sample_batched["pos_index"] = sample_batched["pos_index"].cuda() 
        #prepare gt
        gt = torch.flatten(sample_batched["gt"]["arg_gt"]).cuda()
        offset = sample_batched["offset_mapping"]
        #-----------------BERT EMBEDDING---------------------------
        with torch.no_grad():
            output = auto_model(**input_bert)
            output_hidden_states_sum = torch.stack(output.hidden_states[-4:], dim=0).sum(dim=0)
            b,n,h = output_hidden_states_sum.size()
    
        #------------------FILTERING SUB-WORDS----------------------
        subtoken_mask = torch.unsqueeze(offset[:,:, 0] != 0,dim =-1)
        word_emebedding = []
        for i in range(n): 
            subwords_embedding = torch.unsqueeze(output_hidden_states_sum[:,i,:],dim = 1)
            flag = subtoken_mask[0,i,0]
            if flag :
                continue
            else :
                word_emebedding.append(subwords_embedding)
        word_emebedding = torch.cat(word_emebedding,dim = 1)
        #-------------------------FORWARD/BACKWARD----------------------------------
        x = model.forward(subwords_embeddings = output_hidden_states_sum,perdicate_positional_encoding = sample_batched["positional_encoding"], predicate_index = sample_batched["predicate_index"],pos_index_encoding = sample_batched["pos_index"])        
        b,n = sample_batched["gt"]["arg_gt"].size()
        loss = nll_loss(logSotfMax(x),gt)
        loss.backward()
        optimizer.step()

        #-------------------------RESULT STORING----------------------------------
        predicted = torch.argmax(x, dim=1)
        p += predicted.tolist()
        g += gt.tolist()
    

    #-------------------------RESULTS----------------------------------
    print("Epochs n.", epoch)
    print("F1 train:",f1_score(g, p, average=None))
    scheduler.step()



    #EVALUATION
    p = []
    g = []
    model.eval()
    for i_batch, sample_batched in enumerate(dataloader_dev):
    
      #----------------------PREPARE INPUT/OUTPUT-------------------------------
        input_bert = sample_batched["BERT_input"]
        input_bert['input_ids'] = input_bert['input_ids'].cuda()
        input_bert['token_type_ids'] = input_bert['token_type_ids'].cuda()
        input_bert['attention_mask'] = input_bert['attention_mask'].cuda()
        sample_batched["positional_encoding"] = sample_batched["positional_encoding"].cuda() 
        #prepare gt
        gt = torch.flatten(sample_batched["gt"]["arg_gt"]).cuda()
        offset = sample_batched["offset_mapping"]
        #-----------------BERT EMBEDDING---------------------------
        with torch.no_grad():
            output = auto_model(**input_bert)
            output_hidden_states_sum = torch.stack(output.hidden_states[-4:], dim=0).sum(dim=0)
            b,n,h = output_hidden_states_sum.size()
    
        #------------------FILTERING SUB-WORDS----------------------
        subtoken_mask = torch.unsqueeze(offset[:,:, 0] != 0,dim =-1)
        word_emebedding = []
        for i in range(n): 
            subwords_embedding = torch.unsqueeze(output_hidden_states_sum[:,i,:],dim = 1)
            flag = subtoken_mask[0,i,0]
            if flag :
                continue
            else :
                word_emebedding.append(subwords_embedding)
        word_emebedding = torch.cat(word_emebedding,dim = 1)
        #-------------------------FORWARD----------------------------------
        x = model.forward(subwords_embeddings = output_hidden_states_sum,perdicate_positional_encoding = sample_batched["positional_encoding"], predicate_index = sample_batched["predicate_index"])        
        b,n = sample_batched["gt"]["arg_gt"].size()
        #-------------------------RESULT STORING----------------------------------
        predicted = torch.argmax(x, dim=1)
        p += predicted.tolist()
        g += gt.tolist()
    
    #-------------------------RESULTS----------------------------------
    print("F1 train:",f1_score(g, p, average=None))









    


Epochs n. 0
F1 train: [0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.46126188 0.         0.03106009 0.         0.         0.
 0.         0.08500401 0.         0.         0.         0.97514247
 0.         0.         0.02791625]
F1 dev: [0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.59618209 0.         0.         0.         0.         0.
 0.         0.26600496 0.         0.         0.         0.9814664
 0.         0.         0.1106383 ]
Epochs n. 1
F1 train: [0.         0.         0.         0.         0.         0.
 0.         0.         0.07119021 0.         0.         0.06365372
 0.65973435 0.         0.36798438 0.         0.         0.
 0.         0.2863485  0.         0.01008403 0.         0.98086138
 0.         0.         0.15438596]
F1 dev: [0.         0.         0.         0.         0.         0.
 0.         0.         0.16931217 0. 

KeyboardInterrupt: 

## Embedder

In [None]:
from typing import Dict
import torch
from datasets import load_dataset


# we will use with Distil-BERT
language_model_name = "distilbert-base-uncased"
# this GPU should be enough for this task to handle 32 samples per batch
batch_size = 32
# we keep num_workers = min(4 * number of GPUs, number of cores)
# tells the data loader how many sub-processes to use for data loading
num_workers = 2
# optim
learning_rate = 2e-4
weight_decay = 0.0
transformer_learning_rate = 1e-5
transformer_weight_decay = 0.0
# training
epochs = 3
device = "cuda" if torch.cuda.is_available() else "cpu"

# load our dataset
ner_dataset = load_dataset("conll2003")

# let's instantiate the tokenizer
tokenizer = AutoTokenizer.from_pretrained(language_model_name)

# here we define a vocab dict to map the labels to int (and vice versa)
label_list = ner_dataset["train"].features["ner_tags"].feature.names
label_to_id = {n: i for i, n in enumerate(ner_dataset["train"].features["ner_tags"].feature.names)}
id_to_label = {i: n for n, i in label_to_id.items()}

# here we define our collate function
def collate_fn(batch) -> Dict[str, torch.Tensor]:
    batch_out = tokenizer(
        [sentence["tokens"] for sentence in batch],
        return_tensors="pt",
        padding=True,
        # We use this argument because the texts in our dataset are lists of words.
        is_split_into_words=True,
    )
    labels = []
    ner_tags = [sentence["ner_tags"] for sentence in batch]
    for i, label in enumerate(ner_tags):
      # obtains the word_ids of the i-th sentence
      word_ids = batch_out.word_ids(batch_index=i)
      previous_word_idx = None
      label_ids = []
      for word_idx in word_ids:
        # Special tokens have a word id that is None. We set the label to -100 so they are automatically
        # ignored in the loss function.
        if word_idx is None:
          label_ids.append(-100)
        # We set the label for the first token of each word.
        elif word_idx != previous_word_idx:
          label_ids.append(label[word_idx])
        # For the other tokens in a word, we set the label to -100 so they are automatically
        # ignored in the loss function.
        else:
          label_ids.append(-100)
        previous_word_idx = word_idx
      labels.append(label_ids)
    
    # pad the labels with -100
    batch_max_length = len(max(labels, key=len))
    labels = [l + ([-100] * abs(batch_max_length - len(l))) for l in labels]
    batch_out["labels"] = torch.as_tensor(labels)
    return batch_out