In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer
from pytorch_lightning import Trainer
from pytorch_lightning import LightningModule
import json
import torchmetrics
from torchmetrics import F1Score
#import evaluate
import copy
from torch.nn.utils.rnn import pad_sequence
#from seqeval.metrics import f1_score

In [2]:
import os
from tqdm.auto import tqdm

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
class LoadData(Dataset):
    def __init__(self, trainingDataPath, testDataPath, validationDataPath,coarseMapsDataPath):
        self.trainingDataPath = trainingDataPath
        self.testDataPath = testDataPath
        self.validationDataPath = validationDataPath
        self.coarseMapsDataPath = coarseMapsDataPath

    def readData(self, dataPath):
        # Read the JSON file
        with open(dataPath, "r") as f:
            json_data = json.load(f)

        # Some transformation so that we have data in the form {i:object{},i+1:object{},...}. where i is an index (0,...,N)
        json_data = {k: json_data[k] for k in list(json_data)}
        json_data = {index: json_data[k] for index,k in enumerate(list(json_data))}
        return json_data
        
    def retrieveHominimyMaps(self):
        # Read the JSON file
        with open(self.coarseMapsDataPath, "r") as f:
            json_data = json.load(f)

        homonimyCluster_IdxToCluster = {index+1:key for index, (key, _) in enumerate(json_data.items()) }
        homonimyCluster_ClusterToIdx = {value:key for (key, value) in homonimyCluster_IdxToCluster.items() }

        homonimyCluster_ClusterToIdx["unk"] =  0
        homonimyCluster_IdxToCluster[0] = "unk"
        
        return [ homonimyCluster_ClusterToIdx, homonimyCluster_IdxToCluster ]
            
    def retrieveDataSet(self):
        trainingData = self.readData(self.trainingDataPath)
        testData = self.readData(self.testDataPath)
        validationData = self.readData(self.validationDataPath)
        #coarseMapsData = readJson(self, self.validationDataPath)

        return [ trainingData, testData, validationData ] 
    
        


In [5]:
# Specify the path to your JSON file
trainingDataPath = 'train_coarse_grained.json'
testDataPath = 'test_coarse_grained.json'
validationDataPath = 'val_coarse_grained.json'
coarseMapsDataPath = 'coarse_fine_defs_map.json'

manager_dataset = LoadData(trainingDataPath, testDataPath, validationDataPath,coarseMapsDataPath)

trainingData, testData, validationData  = manager_dataset.retrieveDataSet()
homonimyCluster_ClusterToIdx, homonimyCluster_IdxToCluster = manager_dataset.retrieveHominimyMaps()



In [6]:
class TokenClassificationDataset(Dataset):
    def __init__(self, data, tokenizer,homonimyClusterClusterToIdx):
        self.tokenizer = tokenizer
        self.homonimyClusterClusterToIdx = homonimyClusterClusterToIdx
        self.data = data
        #self.data = {k: data[k] for k in list(data)[:]}
        #self.data1 = {index: batch[k] for index,k in enumerate(list(batch)[:4])}

    def __len__(self):
        #return max(len(sub_array["lemmas"]) for _,sub_array in self.data.items())
        return len(self.data)
        
    def is_int_convertible(self, variable):
        try:
            int(variable)
            return True
        except (ValueError, TypeError):
            return False
            
    def __getitem__(self, data_index):
      batch = self.data[data_index]
      inputs_tokenized = self.tokenizer.batch_encode_plus(
            [batch["lemmas"]],
            add_special_tokens=True,  # Disable adding [CLS] and [SEP] tokens
            max_length=512,
            truncation=True,
            padding='max_length',
            is_split_into_words=True,
            return_tensors='pt'
        )
      output_tokenized = copy.copy(batch["senses"])

# convert senses in the correspondig number
      for (key, clusters) in output_tokenized.items():
        for index,cluster in enumerate(clusters):
          if not self.is_int_convertible(cluster):
              try:
                output_tokenized[key][index] = self.homonimyClusterClusterToIdx[cluster]
              except KeyError:
                # key "cluster" does not exist (last element)
                output_tokenized[key][index] = self.homonimyClusterClusterToIdx["unk"]
              
      candidates = copy.copy(batch["candidates"])
# convert candidates in the correspondig number
      for (key, clusters) in candidates.items():
        for index,cluster in enumerate(clusters):
            
          #if we cannot convert it in int it means that cluster is a string
            if not self.is_int_convertible(cluster):
              try:
                candidates[key][index] = self.homonimyClusterClusterToIdx[cluster]
              except KeyError:
                # key "cluster" does not exist (last element)
                candidates[key][index] = self.homonimyClusterClusterToIdx["unk"]
              
              
    # for every sense we have its number
      inputs_tokenized["senses"]= output_tokenized

    #indexes in which i need to assign label =! -100
      target_word_positions = [ inputs_tokenized.word_ids(batch_index=0)[idx] for idx in [ int(key) for key,_ in inputs_tokenized["senses"].items() ]  ]
      target_word_idx = [ int(key) for key,_ in inputs_tokenized["senses"].items() ]
      target_label = [ value[0] for key,value in inputs_tokenized["senses"].items() ]

      
    
      labels = []
      idx_label = 0
      #the position of the target token after tokenization
      position_target_token = [] 
        
      for iterator, idx in enumerate(inputs_tokenized.word_ids()):
            if idx in target_word_idx:
              labels.append(target_label[idx_label])
              position_target_token.append(iterator)
              idx_label+=1
              target_word_idx.remove(idx)
            else:
              labels.append(-100)

    #inputs_tokenized["labels"] = labels
      candidates_list = []
      for iterator,(_,value) in enumerate(candidates.items()):
              candidates_list.append( torch.tensor(value) )


      #candidates = { position_target_token[iterator]: value for iterator,(_,value) in enumerate(candidates.items()) }
      #candidates_list = torch.cat([ torch.tensor(elements) for elements in candidates_list.values()]).view(-1)
      candidates_list = torch.cat(candidates_list).view(-1)
      #candidates_list_padded = nn.ConstantPad1d((0, 50 - len(candidates_list)),0)(candidates_list)
      #batch["input_ids"] = inputs_tokenized["input_ids"]
      #batch["attention_mask"] = inputs_tokenized["attention_mask"]
      #batch["labels"] = torch.tensor(labels)
      #batch["decoder_attention_mask"] = outputs.attention_mask
        # "position_target_token": nn.ConstantPad1d((0, 512 - len(position_target_token)),-1)(torch.tensor([position_target_token],dtype=torch.int16))
      #print(candidates)
      return {"input_ids" : inputs_tokenized["input_ids"].squeeze(), "attention_mask": inputs_tokenized["attention_mask"].squeeze(),
              "labels":torch.tensor(labels, dtype=torch.long), "index": nn.ConstantPad1d((0, 512 - 1),-1)(torch.tensor([data_index])),
                "candidates": nn.ConstantPad1d((0, 512 - len(candidates_list)),-1)(candidates_list) } 


In [11]:

class CoarseClassifier(LightningModule):
    def __init__(self, num_classes, model_name_or_path, learning_rate=2e-5):
        super(CoarseClassifier, self).__init__()
        
        self.num_classes = num_classes
        self.bert = AutoModel.from_pretrained(model_name_or_path, output_hidden_states=True)
        
        # Freeze the BERT model
        for param in self.bert.parameters():
            param.requires_grad = False
            
        self.dropout = nn.Dropout(0.4)
        #self.bert.config.hidden_size
        self.classifier1 = nn.Linear(self.bert.config.hidden_size, num_classes )
        #+1 because we have unknown label
        self.learning_rate = learning_rate
        #self.f1score = F1Score(task="multiclass",num_classes=self.num_classes ,ignore_index=-100)
        self.save_hyperparameters()

    def forward(self, input_ids, attention_mask):
        bert_output = self.bert(input_ids, attention_mask) #,pooled_output
        avarage_hidden_state = torch.stack(bert_output.hidden_states[-4:], dim=0).sum(dim=0)
        #hidden_state = outputs[0]  # (bs, seq_len, dim)
        #pooled_output = hidden_state[:, 0]  # (bs, dim
        #candidates = candidates.unsqueeze(1).expand(-1, 512, -1)
        #output = torch.cat((avarage_hidden_state, candidates), dim=2)
        output = self.dropout(avarage_hidden_state)
        output = self.classifier1(output)
    

        return output

    def training_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        #get the candidates (ignore all the values -1)
        #candidates  = torch.stack([candidate[:torch.nonzero(candidate == -1)[0][0]] for candidate in batch['candidates']])
        #candidates  = torch.stack([candidate for candidate in batch['candidates']])
        logits = self.forward(input_ids, attention_mask)

        loss = nn.CrossEntropyLoss(ignore_index=-100)(logits.view(-1, logits.shape[-1]), labels.view(-1))
        self.log('train_loss', loss,prog_bar=True, logger=True, on_step=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx=None):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        #candidates  = torch.stack([candidate for candidate in batch['candidates']])
        #get the candidates (ignore all the values -1)

        
        logits = self.forward(input_ids, attention_mask)
        loss = nn.CrossEntropyLoss(ignore_index=-100)(logits.view(-1, logits.shape[-1]), labels.view(-1))
        self.log('val_loss', loss,prog_bar=True, logger=True, on_step=True, on_epoch=True)


        
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)



In [12]:
#classifier.training_step(trainDataset[1])

In [13]:
# Save the trained model in the current path
current_path = os.getcwd()

model_path = os.path.join(current_path, "saved_model")
model_name_or_path = 'kanishka/GlossBERT'
#model_name_or_path = "prajjwal1/bert-mini"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
batch_size = 16
#batch = {k: train_data[k] for k in list(train_data)[:4]}


trainDataset = TokenClassificationDataset( trainingData, tokenizer, homonimyCluster_ClusterToIdx)
testDataset = TokenClassificationDataset( testData, tokenizer, homonimyCluster_ClusterToIdx)
valDataset = TokenClassificationDataset( validationData, tokenizer, homonimyCluster_ClusterToIdx)



trainLoader = DataLoader(trainDataset, batch_size=batch_size, shuffle=False)
testLoader = DataLoader(testDataset, batch_size=batch_size, shuffle=False)
valLoader = DataLoader(valDataset, batch_size=batch_size, shuffle=False)


#for (key, label) in list(enumerate(trainLoader))[:10]:
#  print(label)
#trainingData, testData, validationData  = dataset.retrieveDataSet()
#homonimyCluster_ClusterToIdx, homonimyCluster_IdxToCluster = dataset.retrieveHominimyMaps()

#trainer = Trainer(max_epochs=3)
#trainer.fit(classifier, dataloader)

In [14]:
num_classes = len(homonimyCluster_ClusterToIdx)

In [15]:

classifier = CoarseClassifier(num_classes=num_classes, model_name_or_path=model_name_or_path)
classifier = classifier.load_from_checkpoint(os.path.join(model_path, "model7.ckpt"),num_classes=len(homonimyCluster_ClusterToIdx), model_name_or_path=model_name_or_path)
                                                                                                                                       
                                                                                                                                       
                                                                                                                                       

Some weights of the model checkpoint at kanishka/GlossBERT were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at kanishka/GlossBERT were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are

In [16]:
classifier.to(device)

CoarseClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [158]:
trainer = Trainer(max_epochs=6,accelerator='gpu')

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [159]:
trainer.fit(classifier, train_dataloaders = trainLoader, val_dataloaders= valLoader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type      | Params
------------------------------------------
0 | bert        | BertModel | 109 M 
1 | dropout     | Dropout   | 0     
2 | classifier1 | Linear    | 1.7 M 
------------------------------------------
1.7 M     Trainable params
109 M     Non-trainable params
111 M     Total params
444.570   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

tensor([[[ 0.1572, -0.2892, -0.3265,  ..., -0.0175, -0.0349,  0.1516],
         [ 0.5714,  0.6474,  0.3030,  ...,  0.3620,  0.4584,  0.2265],
         [ 0.3530, -0.5867,  0.5659,  ..., -0.6741,  0.3178, -0.6737],
         ...,
         [ 0.5385, -0.4636, -0.0584,  ..., -0.0342, -0.4318, -0.4826],
         [ 0.6379, -0.6538,  0.4555,  ..., -0.1910, -0.6936, -0.4940],
         [ 0.1934, -0.9702,  0.2464,  ...,  0.2335,  0.1424, -1.5309]],

        [[ 0.1572, -0.2892, -0.3265,  ..., -0.0175, -0.0349,  0.1516],
         [-0.2717, -0.1493,  0.2506,  ...,  0.3020,  0.5897, -0.9890],
         [-0.7113,  0.3207, -0.1196,  ...,  0.6679, -0.3092, -0.7156],
         ...,
         [ 0.5385, -0.4636, -0.0584,  ..., -0.0342, -0.4318, -0.4826],
         [ 0.6379, -0.6538,  0.4555,  ..., -0.1910, -0.6936, -0.4940],
         [ 0.1934, -0.9702,  0.2464,  ...,  0.2335,  0.1424, -1.5309]],

        [[ 0.1572, -0.2892, -0.3265,  ..., -0.0175, -0.0349,  0.1516],
         [-0.0224,  0.6208, -0.4091,  ..., -0

IndexError: tuple index out of range

In [None]:
trainer.test(classifier,testLoader)

In [53]:
#trainer.save_checkpoint(os.path.join(model_path, "model7.ckpt"))

In [17]:
def get_position_token(labels):
    #get the position of target token AFTER tokenization
    result = []
    for iterator, label in enumerate(labels):
        if label != -100:
            #result.append({iterator:label})
            result.append(iterator)
    return result
#original_data can be testData, trainingData or validationData
def evaluate_model(classifier, loader, original_data):
    # Set the model to evaluation mode
    classifier.eval()
    result_labels_target = []
    result_predicted_labels = []
    
    with torch.no_grad():
        with tqdm(enumerate( loader ),desc="Batch", leave=False) as data:
            for step, samples in data:
                index = [ index_data[0] for index_data in samples['index'] ]

                result_prediction = classifier(samples['input_ids'].cuda(),samples['attention_mask'].cuda())
                #result_prediction = [ result[0] for result in result_prediction ]
                sample_non_tokanized = [ original_data[ index_data.tolist() ] for index_data in index ] 
                
                position_token = [get_position_token(labels) for labels in samples['labels'] ]

                for idx_sample in range(len(sample_non_tokanized)):
                    #print(position_token[idx_sample])
                    #print(result_prediction[idx_sample][0])
                    predicted_labels_idx = []
                    for iterator, (target_token_idx, candidates) in enumerate(sample_non_tokanized[idx_sample]["candidates"].items()):
                            #print(candidates)
                            # Get the predicted labels considering only the candidates
                            predicted_labels_idx.append(torch.argmax( 
                                torch.tensor([ result_prediction[idx_sample][ position_token[idx_sample][iterator] ][target_prob_idx] for target_prob_idx in candidates ]), dim=-1) )
                    labels_target = [ value[0] for key,value in sample_non_tokanized[idx_sample]["senses"].items() ]
                    predicted_labels = [ candidates[predicted_labels_idx[i]] for i,(key,candidates) in enumerate(sample_non_tokanized[idx_sample]["candidates"].items()) ]
                    result_labels_target.append(labels_target)
                    result_predicted_labels.append(predicted_labels)
                    
    result_labels_target =  [element for sublist in result_labels_target for element in sublist]
    result_predicted_labels =  [element for sublist in result_predicted_labels for element in sublist]
    
    return result_labels_target,result_predicted_labels




In [18]:
result_labels_target,result_predicted_labels = evaluate_model(classifier,testLoader, testData)

Batch: 0it [00:00, ?it/s]

In [19]:
F1Score(avarege='micro',task="multiclass",num_classes=num_classes)(torch.tensor(result_labels_target),torch.tensor(result_predicted_labels))

tensor(0.9484)

In [24]:
train_acc = torch.sum(torch.tensor(result_labels_target) == torch.tensor(result_predicted_labels))

In [None]:
result_labels_target > num_classes e result_predicted_labels == 0

In [25]:
train_acc/len(result_labels_target)

tensor(0.9484)