In [1]:
#------# Import libraries and datasets #------#

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
import datasets as dts
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import re
import gc
%matplotlib inline

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import ClassifierChain
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import RandomUnderSampler

from transformers import BertModel,AutoModel
from sklearn.metrics import f1_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score

In [2]:
dataset = dts.load_dataset('lex_glue','unfair_tos')

Found cached dataset lex_glue (/home/anas/.cache/huggingface/datasets/lex_glue/unfair_tos/1.0.0/8a66420941bf6e77a7ddd4da4d3bfb7ba88ef48c1d55302a568ac650a095ca3a)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
train_dataset = pd.DataFrame.from_dict(dataset["train"])
val_dataset = pd.DataFrame.from_dict(dataset["validation"])
test_dataset = pd.DataFrame.from_dict(dataset["test"])

stop_words = list(stopwords.words('english'))

In [4]:
definitions = {"Limitation of liability": ["This clause stipulates that the duty to pay damages is limited or excluded, for certain kind of losses, under certain conditions. "]
               , "Unilateral termination": ["This clause gives provider the right to suspend and/or terminate the service and/or the contract, and sometimes details the circumstances under which the provider claims to have a right to do so."]
               , "Unilateral change": ["This clause specifies the conditions under which the service provider could amend and modify the terms of service and/or the service itself."]
               , "Content removal": ["This clause gives the provider a right to modify/delete user’s content, including in-app purchases, and sometimes specifies the conditions under which the service provider may do so."]
               , "Contract by using": ["This clause stipulates that the consumer is bound by the terms of use of a specific service, simply by using the service, without even being required to mark that he or she has read and accepted them."]
               , "Choice of law": ["This clause specifies what law will govern the contract, meaning also what law will be applied in potential adjudication of a dispute arising under the contract."]
               , "Jurisdiction": ["This selection clause requires or allows the parties to resolve their disputes through an arbitration process, before the case could go to court."]
               , "Arbitration": ["This forum selection clause requires or allows the parties to resolve their disputes through an arbitration process, before the case could go to court however, such a clause may or may not specify that arbitration should occur within a specific jurisdiction. "]}

entailment_con = ["entails"]

### Default with 8 + 1 classes

In [115]:
class BERTClassifier(nn.Module):
    def __init__(self, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")
        #self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(768, 8)
        self.sig = torch.nn.GELU()
        
    def forward(self, input_ids, attention_mask):
        _ , pooled_output = self.bert(input_ids=input_ids, attention_mask =attention_mask,return_dict=False)
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        prediction = self.sig(logits)
        
        return logits

learning_rate = 3e-5
num_classes = 8
base_model = BERTClassifier(num_classes)
#loss_function = nn.CrossEntropyLoss()
loss_function = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(base_model.parameters(), lr=learning_rate)
print (base_model)

Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [116]:
class CustomDataset(Dataset):
    def __init__(self, dataset,num_classes,tokenizer):
        
        self.dataset = dataset
        self.texts = self.dataset["text"]
        self.labels = self.dataset["labels"]
        self.num_classes = num_classes
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]
        
        # Tokenize the text
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=64,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        #print ("original label: ",label)
        # Convert label to one-hot encoding
        multi_label = torch.zeros(self.num_classes, dtype=torch.float32)
        multi_label[label] = 1
        #print ("one hot multi : ",multi_label)
        return {'input_ids':input_ids, 'attention_mask':attention_mask, 'multi_label':multi_label}

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
batch_size = 8
train_custom = CustomDataset(train_dataset, num_classes,tokenizer)
train_dataloader = DataLoader(train_custom, batch_size=batch_size, shuffle=True)
valid_custom = CustomDataset(val_dataset, num_classes,tokenizer)
val_dataloader = DataLoader(valid_custom, batch_size=batch_size, shuffle=True)
def find_metrics(targets,prediction):
    final_pred = ((torch.sigmoid(prediction) >= 0.5) * 1.0)
    np_tar = targets.cpu().detach().numpy()
    np_pred = final_pred.cpu().detach().numpy()
    
    avg_f1_mic = f1_score(np_tar.flatten(), np_pred.flatten(), average='micro',zero_division=0)
    avg_f1_mac = f1_score(np_tar, np_pred, average='macro',zero_division=1)
    avg_acc = accuracy_score(np_tar, np_pred)
    del np_tar
    del np_pred
    del final_pred
    return avg_f1_mic, avg_f1_mac, avg_acc


In [101]:
# overfitting on one example:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model.to(device)
one_example = None
while one_example == None:
    curr_batch = next(iter(train_dataloader))
    if torch.sum(curr_batch["multi_label"]) > 1:
        one_example = curr_batch

def overfit_one(base_model,fit_example):
    num_epochs = 1000
    running_loss = []
    iteration = 0
    for epoch in range(num_epochs):
        base_model.train()  # Set the model to training mode            
        
        iteration +=1
        if iteration % 20 == 0:
            print ("eval epoch")
            base_model.eval()
        input_ids = fit_example['input_ids'].to(device)
        attention_mask = fit_example['attention_mask'].to(device)
        targets = fit_example['multi_label'].to(device)


        outputs = base_model(input_ids,attention_mask)
        loss = loss_function(outputs.to(device), targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        #print (iteration)
        running_loss.append(loss.item())
        if len(running_loss) > 20:
            running_loss.pop(0)
        #print (outputs,"||||",targets)
        #print ("debug metrics : ",find_metrics(targets,targets))
        print (f"Epoch : {epoch} ,Iteration : {iteration}, training loss: {loss:.4f} , running loss:{sum(running_loss)/len(running_loss)}",find_metrics(targets,outputs))

        if iteration % 20 == 0:
            base_model.train()

        
        # freeing up excess memory
        del loss, outputs
        gc.collect()
        torch.cuda.empty_cache()
        
    return base_model

base_model = overfit_one(base_model,one_example)


Epoch : 0 ,Iteration : 1, training loss: 1.6184 , running loss:1.6184499263763428 (0.875, 0.875, 0.0)
Epoch : 1 ,Iteration : 2, training loss: 2.1937 , running loss:1.9060617685317993 (0.875, 0.875, 0.0)
Epoch : 2 ,Iteration : 3, training loss: 1.5850 , running loss:1.7990306615829468 (0.75, 0.75, 0.0)
Epoch : 3 ,Iteration : 4, training loss: 1.8905 , running loss:1.8219015300273895 (0.875, 0.875, 0.0)
Epoch : 4 ,Iteration : 5, training loss: 1.6284 , running loss:1.7831977367401124 (0.75, 0.75, 0.0)
Epoch : 5 ,Iteration : 6, training loss: 1.6870 , running loss:1.7671671311060588 (0.75, 0.75, 0.0)
Epoch : 6 ,Iteration : 7, training loss: 1.5712 , running loss:1.7391750812530518 (0.875, 0.875, 0.0)
Epoch : 7 ,Iteration : 8, training loss: 1.4853 , running loss:1.707445204257965 (1.0, 1.0, 1.0)
Epoch : 8 ,Iteration : 9, training loss: 1.4633 , running loss:1.680318898624844 (0.875, 0.875, 0.0)
Epoch : 9 ,Iteration : 10, training loss: 1.4691 , running loss:1.6592008233070374 (1.0, 1.0, 

Epoch : 83 ,Iteration : 84, training loss: 1.3963 , running loss:1.4075821936130524 (1.0, 1.0, 1.0)
Epoch : 84 ,Iteration : 85, training loss: 1.4088 , running loss:1.4083540916442872 (1.0, 1.0, 1.0)
Epoch : 85 ,Iteration : 86, training loss: 1.4261 , running loss:1.409983789920807 (1.0, 1.0, 1.0)
Epoch : 86 ,Iteration : 87, training loss: 1.3923 , running loss:1.407001680135727 (1.0, 1.0, 1.0)
Epoch : 87 ,Iteration : 88, training loss: 1.3982 , running loss:1.4067601144313813 (1.0, 1.0, 1.0)
Epoch : 88 ,Iteration : 89, training loss: 1.3960 , running loss:1.4069125294685363 (1.0, 1.0, 1.0)
Epoch : 89 ,Iteration : 90, training loss: 1.4050 , running loss:1.407490473985672 (1.0, 1.0, 1.0)
Epoch : 90 ,Iteration : 91, training loss: 1.4073 , running loss:1.4074879586696625 (1.0, 1.0, 1.0)
Epoch : 91 ,Iteration : 92, training loss: 1.4480 , running loss:1.4085281789302826 (1.0, 1.0, 1.0)
Epoch : 92 ,Iteration : 93, training loss: 1.3930 , running loss:1.4082599461078644 (1.0, 1.0, 1.0)
Epo

KeyboardInterrupt: 

In [118]:
from sklearn.metrics import f1_score, accuracy_score
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model.to(device)
def compute_f1_score(true_labels, predicted_labels):
    num_labels = true_labels.shape[1]
    f1_scores = []

    for label_idx in range(num_labels):
        true_positives = np.sum(np.logical_and(true_labels[:, label_idx], predicted_labels[:, label_idx]))
        false_positives = np.sum(np.logical_and(np.logical_not(true_labels[:, label_idx]), predicted_labels[:, label_idx]))
        false_negatives = np.sum(np.logical_and(true_labels[:, label_idx], np.logical_not(predicted_labels[:, label_idx])))

        precision = true_positives / (true_positives + false_positives + 1e-16)
        recall = true_positives / (true_positives + false_negatives + 1e-16)
        f1 = 2 * (precision * recall) / (precision + recall + 1e-16)

        f1_scores.append(f1)

    macro_f1_score = np.mean(f1_scores)

    return macro_f1_score
def gpt_imp(predicted_labels,true_labels):
    # Calculate true positives, false positives, and false negatives
    true_positives = torch.logical_and(predicted_labels, true_labels).sum().item()
    false_positives = (predicted_labels.logical_not() & true_labels).sum().item()
    false_negatives = (predicted_labels & true_labels.logical_not()).sum().item()

    # Calculate precision, recall, and micro F1 score
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    micro_f1_score = 2 * (precision * recall) / (precision + recall + 0.000000000001)
    return micro_f1_score
    
def find_metrics1(targets,prediction):
    final_pred = ((torch.sigmoid(prediction) >= 0.5) * 1.0)
    #final_pred = prediction
    
    append_out = torch.cat((final_pred,((torch.sum(final_pred,dim=1) < 1.0)  * 1.0).unsqueeze(1)),dim=1)
    append_tar = torch.cat((targets,((torch.sum(targets,dim=1) < 1.0)  * 1.0).unsqueeze(1)),dim=1)

    np_tar = append_tar.cpu().detach().numpy()
    np_pred = append_out.cpu().detach().numpy()
    
    avg_f1_mic = f1_score(np_tar, np_pred, average='micro',zero_division=0)
    avg_f1_mac = f1_score(np_tar.flatten(), np_pred.flatten(), average='macro',zero_division=0)
    avg_acc = accuracy_score(np_tar, np_pred)
    waise_hi = gpt_imp(append_tar.to(torch.int64),append_out.to(torch.int64))
    waise_hi2 = compute_f1_score(np_tar,np_pred)
    del np_tar
    del np_pred
    del final_pred
    
    return avg_f1_mic, avg_f1_mac, avg_acc, waise_hi,waise_hi2


def train(base_model,train_dataloader,optimizer,loss_function):
    # Training loop
    
    max_mac = 0
    max_mic = 0
    acc_step = 2
    num_epochs = 20
    valid_interval = 10  # Perform validation and save model every 10 iterations
    iteration = 0
    
    stop_criterion = 2000000

    running_loss = []
    for epoch in range(num_epochs):
        base_model.train()  # Set the model to training mode
        for curr_batch in train_dataloader:
            
            if iteration > stop_criterion:
                break
            
            input_ids = curr_batch['input_ids'].to(device)
            attention_mask = curr_batch['attention_mask'].to(device)
            targets = curr_batch['multi_label'].to(device)


            outputs = base_model(input_ids,attention_mask)
            loss = loss_function(outputs.to(device), targets)
            
            if iteration % acc_step == 0:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            #print (iteration)
            running_loss.append(loss.item())
            if len(running_loss) > 20:
                running_loss.pop(0)
            #print (f"Epoch : {epoch} ,Iteration : {iteration}, training loss: {loss:.4f} , running loss:{sum(running_loss)/len(running_loss)}")            
            print (f"Epoch : {epoch} ,Iteration : {iteration}, training loss: {loss:.4f} , running loss:{sum(running_loss)/len(running_loss)}",find_metrics1(targets,outputs))
            
            # freeing up excess memory
            del loss, outputs
            gc.collect()
            torch.cuda.empty_cache()
            
            # Validation and model saving
            if iteration % valid_interval == 0:
                base_model.eval()  # Set the model to evaluation mode

                with torch.no_grad():
                    total_loss = []
                    
                    full_metric_tar = []
                    full_metric_out = []
                    
                    for val_batch in val_dataloader:
                        val_input_ids = val_batch['input_ids'].to(device)
                        val_attention_mask = val_batch['attention_mask'].to(device)
                        val_targets = val_batch['multi_label'].to(device)


                        outputs = base_model(val_input_ids,val_attention_mask)
                        loss = loss_function(outputs.to(device), val_targets)                        
                        
                        total_loss.append(loss.item())
                        if full_metric_out == []:
                            full_metric_out = outputs.detach().clone()
                            full_metric_tar = val_targets.detach().clone()
                        else:
                            full_metric_out = torch.cat((full_metric_out,outputs.detach()),dim=0)
                            full_metric_tar = torch.cat((full_metric_tar,val_targets.detach()),dim=0)

                        # emptying memory
                        del loss, outputs, val_targets, val_input_ids
                        gc.collect()
                        torch.cuda.empty_cache()
                        
                    avg_loss = sum(total_loss)/len(total_loss)
                    val_out_met = find_metrics1(full_metric_tar,full_metric_out)
                    print (f"Validation loss : {sum(total_loss)/len(total_loss)} and metrics(micro,macro,acc) :{val_out_met}")
                    if val_out_met[1] > max_mac :
                        torch.save(base_model.state_dict(),f"model_trained/model_macro_{iteration}.pth")
                        max_mac = val_out_met[1]
                    if val_out_met[0] > max_mic:
                        torch.save(base_model.state_dict(),f"model_trained/model_micro_{iteration}.pth")
                        max_mic = val_out_met[0]

                    del total_loss

                base_model.train()  # Set the model back to training mode
            
            iteration += 1
    return base_model, train_dataloader, optimizer, loss_function

base_model, train_dataloader, optimizer, loss_function = train(base_model,train_dataloader,optimizer,loss_function)

Epoch : 0 ,Iteration : 0, training loss: 0.6343 , running loss:0.6342571377754211 (0.0, 0.38983050847457623, 0.0, 0.0, 0.0)
Validation loss : 0.5753532129421569 and metrics(micro,macro,acc) :(0.026863226863226862, 0.4281826537530369, 0.01010989010989011, 0.026863226862757734, 0.008497135698349046)
Epoch : 0 ,Iteration : 1, training loss: 0.5940 , running loss:0.614126592874527 (0.0, 0.40495867768595045, 0.0, 0.0, 0.0)
Epoch : 0 ,Iteration : 2, training loss: 0.5730 , running loss:0.6004253029823303 (0.0, 0.41463414634146345, 0.0, 0.0, 0.0)
Epoch : 0 ,Iteration : 3, training loss: 0.5347 , running loss:0.5840025097131729 (0.11111111111111112, 0.49206349206349204, 0.125, 0.11111111111061728, 0.024691358024691353)
Epoch : 0 ,Iteration : 4, training loss: 0.4699 , running loss:0.5611724853515625 (0.5, 0.71875, 0.5, 0.49999999999949996, 0.07407407407407407)
Epoch : 0 ,Iteration : 5, training loss: 0.4463 , running loss:0.5420268177986145 (0.875, 0.9296875, 0.875, 0.8749999999995, 0.10370370

Epoch : 0 ,Iteration : 52, training loss: 0.1609 , running loss:0.13630967028439045 (0.75, 0.859375, 0.75, 0.7499999999995, 0.09523809523809523)
Epoch : 0 ,Iteration : 53, training loss: 0.0858 , running loss:0.13207568302750589 (1.0, 1.0, 1.0, 0.9999999999995, 0.1111111111111111)
Epoch : 0 ,Iteration : 54, training loss: 0.0872 , running loss:0.12837861701846123 (1.0, 1.0, 1.0, 0.9999999999995, 0.1111111111111111)
Epoch : 0 ,Iteration : 55, training loss: 0.1128 , running loss:0.12583119608461857 (0.875, 0.9296875, 0.875, 0.8749999999995, 0.1037037037037037)
Epoch : 0 ,Iteration : 56, training loss: 0.1222 , running loss:0.12552982419729233 (0.875, 0.9296875, 0.875, 0.8749999999995, 0.1037037037037037)
Epoch : 0 ,Iteration : 57, training loss: 0.1231 , running loss:0.12550104968249798 (0.875, 0.9296875, 0.875, 0.8749999999995, 0.1037037037037037)
Epoch : 0 ,Iteration : 58, training loss: 0.1167 , running loss:0.1254751469939947 (0.875, 0.9296875, 0.875, 0.8749999999995, 0.103703703703

Epoch : 0 ,Iteration : 104, training loss: 0.1860 , running loss:0.09005487617105246 (0.7058823529411765, 0.8332561371005095, 0.75, 0.7058823529406782, 0.09523809523809523)
Epoch : 0 ,Iteration : 105, training loss: 0.0376 , running loss:0.08719795253127813 (1.0, 1.0, 1.0, 0.9999999999995, 0.1111111111111111)
Epoch : 0 ,Iteration : 106, training loss: 0.1372 , running loss:0.0890637831762433 (0.823529411764706, 0.8999536822603057, 0.875, 0.8235294117642077, 0.1037037037037037)
Epoch : 0 ,Iteration : 107, training loss: 0.0893 , running loss:0.08891997430473567 (0.875, 0.9296875, 0.875, 0.8749999999995, 0.1037037037037037)
Epoch : 0 ,Iteration : 108, training loss: 0.0396 , running loss:0.08842589240521193 (1.0, 1.0, 1.0, 0.9999999999995, 0.1111111111111111)
Epoch : 0 ,Iteration : 109, training loss: 0.1871 , running loss:0.09537457320839167 (0.7777777777777777, 0.873015873015873, 0.875, 0.7777777777772839, 0.1037037037037037)
Epoch : 0 ,Iteration : 110, training loss: 0.0831 , running 

Epoch : 0 ,Iteration : 156, training loss: 0.0822 , running loss:0.079834402538836 (0.875, 0.9296875, 0.875, 0.8749999999995, 0.1037037037037037)
Epoch : 0 ,Iteration : 157, training loss: 0.0772 , running loss:0.07927316445857287 (0.875, 0.9296875, 0.875, 0.8749999999995, 0.1037037037037037)
Epoch : 0 ,Iteration : 158, training loss: 0.0876 , running loss:0.08222005628049374 (0.875, 0.9296875, 0.875, 0.8749999999995, 0.1037037037037037)
Epoch : 0 ,Iteration : 159, training loss: 0.0265 , running loss:0.07964166663587094 (1.0, 1.0, 1.0, 0.9999999999995, 0.1111111111111111)
Epoch : 0 ,Iteration : 160, training loss: 0.0265 , running loss:0.07145281489938497 (1.0, 1.0, 1.0, 0.9999999999995, 0.1111111111111111)
Validation loss : 0.07309549920783753 and metrics(micro,macro,acc) :(0.895163055373167, 0.9409984211199691, 0.8989010989010989, 0.8951630553726669, 0.1051954732510288)
Epoch : 0 ,Iteration : 161, training loss: 0.0250 , running loss:0.06362899634987115 (1.0, 1.0, 1.0, 0.99999999999

Epoch : 0 ,Iteration : 208, training loss: 0.0834 , running loss:0.05770089123398066 (0.875, 0.9296875, 0.875, 0.8749999999995, 0.1037037037037037)
Epoch : 0 ,Iteration : 209, training loss: 0.0911 , running loss:0.06117957672104239 (0.875, 0.9296875, 0.875, 0.8749999999995, 0.1037037037037037)
Epoch : 0 ,Iteration : 210, training loss: 0.0195 , running loss:0.06111033335328102 (1.0, 1.0, 1.0, 0.9999999999995, 0.1111111111111111)
Validation loss : 0.07193113210562029 and metrics(micro,macro,acc) :(0.895163055373167, 0.9409984211199691, 0.8989010989010989, 0.8951630553726669, 0.1051954732510288)
Epoch : 0 ,Iteration : 211, training loss: 0.1395 , running loss:0.06095629334449768 (0.823529411764706, 0.8999536822603057, 0.875, 0.8235294117642077, 0.1037037037037037)
Epoch : 0 ,Iteration : 212, training loss: 0.0772 , running loss:0.0575767207890749 (0.875, 0.9296875, 0.875, 0.8749999999995, 0.1037037037037037)
Epoch : 0 ,Iteration : 213, training loss: 0.3740 , running loss:0.069226996228

KeyboardInterrupt: 

### Find out memory leaks

In [92]:
import torch
gc.collect()
torch.cuda.empty_cache()

In [93]:
# prints currently alive Tensors and Variables
import torch
import gc
print (len(gc.get_objects()))
for obj in gc.get_objects():
    try:
        if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
            print(type(obj), obj.size())
    except:
        pass

427178
<class 'torch.nn.parameter.Parameter'> torch.Size([30522, 768])
<class 'torch.nn.parameter.Parameter'> torch.Size([512, 768])
<class 'torch.nn.parameter.Parameter'> torch.Size([2, 768])
<class 'torch.nn.parameter.Parameter'> torch.Size([768])
<class 'torch.nn.parameter.Parameter'> torch.Size([768])
<class 'torch.nn.parameter.Parameter'> torch.Size([768, 768])
<class 'torch.nn.parameter.Parameter'> torch.Size([768])
<class 'torch.nn.parameter.Parameter'> torch.Size([3072, 768])
<class 'torch.nn.parameter.Parameter'> torch.Size([3072])
<class 'torch.nn.parameter.Parameter'> torch.Size([768, 3072])
<class 'torch.nn.parameter.Parameter'> torch.Size([768])
<class 'torch.nn.parameter.Parameter'> torch.Size([768])
<class 'torch.nn.parameter.Parameter'> torch.Size([768])
<class 'torch.nn.parameter.Parameter'> torch.Size([3072, 768])
<class 'torch.nn.parameter.Parameter'> torch.Size([3072])
<class 'torch.nn.parameter.Parameter'> torch.Size([768, 3072])
<class 'torch.nn.parameter.Paramete

<class 'torch.Tensor'> torch.Size([768])
<class 'torch.Tensor'> torch.Size([768, 768])
<class 'torch.Tensor'> torch.Size([768])
<class 'torch.Tensor'> torch.Size([768, 768])
<class 'torch.Tensor'> torch.Size([768])
<class 'torch.Tensor'> torch.Size([768, 768])
<class 'torch.Tensor'> torch.Size([768])
<class 'torch.Tensor'> torch.Size([768])
<class 'torch.Tensor'> torch.Size([768])
<class 'torch.Tensor'> torch.Size([3072, 768])
<class 'torch.Tensor'> torch.Size([3072])
<class 'torch.Tensor'> torch.Size([768, 3072])
<class 'torch.Tensor'> torch.Size([768])
<class 'torch.Tensor'> torch.Size([768])
<class 'torch.Tensor'> torch.Size([768])
<class 'torch.Tensor'> torch.Size([768, 768])
<class 'torch.Tensor'> torch.Size([768])
<class 'torch.Tensor'> torch.Size([768, 768])
<class 'torch.Tensor'> torch.Size([768])
<class 'torch.Tensor'> torch.Size([768, 768])
<class 'torch.Tensor'> torch.Size([768])
<class 'torch.Tensor'> torch.Size([768, 768])
<class 'torch.Tensor'> torch.Size([768])
<class 't



In [24]:
import sys
# Print local variable names with memory space
print("Local variables:")
for name, value in locals().items():
    print(f"{name}: {sys.getsizeof(value)} bytes")

# Print global variable names with memory space
print("Global variables:")
for name, value in globals().items():
    print(f"{name}: {sys.getsizeof(value)} bytes")

Local variables:
__name__: 57 bytes
__doc__: 113 bytes
__package__: 16 bytes
__loader__: 16 bytes
__spec__: 16 bytes
__builtin__: 72 bytes
__builtins__: 72 bytes
_ih: 312 bytes
_oh: 232 bytes
_dh: 64 bytes
In: 312 bytes
Out: 232 bytes
get_ipython: 64 bytes
exit: 48 bytes
quit: 48 bytes
_: 49 bytes
__: 49 bytes
___: 49 bytes
_i: 385 bytes
_ii: 341 bytes
_iii: 99 bytes
_i1: 1634 bytes
np: 72 bytes
torch: 72 bytes
nn: 72 bytes
optim: 72 bytes
Dataset: 1064 bytes
DataLoader: 1472 bytes
BertTokenizer: 2008 bytes
dts: 72 bytes
pd: 72 bytes
sns: 72 bytes
plt: 72 bytes
nltk: 72 bytes
re: 72 bytes
gc: 72 bytes
stopwords: 48 bytes
CountVectorizer: 1064 bytes
TfidfVectorizer: 1064 bytes
WordCloud: 1064 bytes
STOPWORDS: 8408 bytes
SnowballStemmer: 1064 bytes
train_test_split: 136 bytes
TfidfTransformer: 1064 bytes
MultinomialNB: 1064 bytes
OneVsRestClassifier: 1064 bytes
LinearSVC: 1064 bytes
LogisticRegression: 1064 bytes
Pipeline: 1064 bytes
MultiLabelBinarizer: 1064 bytes
BinaryRelevance: 1064 

### Balanced Dataset

In [116]:
def list_it(curr):
    return [curr]

train_dataset['text_2d'] = train_dataset["text"].apply(list_it)
train_dataset['str_labels'] = train_dataset["labels"].apply(str)

rus = RandomUnderSampler(sampling_strategy='majority')
X_train_resampled, y_train_resampled = rus.fit_resample(np.array(train_dataset["text_2d"]).reshape(-1, 1),train_dataset['str_labels'])

In [122]:
def delist_it(curr):
    new_list = []
    for i in curr:
        new_list.append(i[0][0])
    return new_list
def str2list(curr):
    if curr=="[]":
        return []
    else:
        return [int(x) for x in curr[1:-1].split(',')]
x_train_weight = {}
x_train_weight["text"] = delist_it(X_train_resampled)
x_train_weight["labels"] = y_train_resampled.apply(str2list)

In [125]:
train_custom_w = CustomDataset(x_train_weight, num_classes,tokenizer)
train_dataloader_w = DataLoader(train_custom_w, batch_size=batch_size, shuffle=True)

In [128]:
from sklearn.metrics import f1_score, accuracy_score
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model.to(device)



def train_w(base_model,train_dataloader,optimizer,loss_function):
    # Training loop
    num_epochs = 20
    valid_interval = 10  # Perform validation and save model every 10 iterations
    iteration = 0
    acc_step = 2
    stop_criterion = 2000000

    running_loss = []
    for epoch in range(num_epochs):
        base_model.train()  # Set the model to training mode
        for curr_batch in train_dataloader:
            
            if iteration > stop_criterion:
                break
            
            input_ids = curr_batch['input_ids'].to(device)
            attention_mask = curr_batch['attention_mask'].to(device)
            targets = curr_batch['multi_label'].to(device)


            outputs = base_model(input_ids,attention_mask)
            loss = loss_function(outputs.to(device), targets)
            
            if iteration % acc_step == 0:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            #print (iteration)
            running_loss.append(loss.item())
            if len(running_loss) > 20:
                running_loss.pop(0)
            #print (f"Epoch : {epoch} ,Iteration : {iteration}, training loss: {loss:.4f} , running loss:{sum(running_loss)/len(running_loss)}")            
            print (f"Epoch : {epoch} ,Iteration : {iteration}, training loss: {loss:.4f} , running loss:{sum(running_loss)/len(running_loss)}",find_metrics(targets,outputs))
            
            # freeing up excess memory
            del loss, outputs
            gc.collect()
            torch.cuda.empty_cache()
            
            # Validation and model saving
            if iteration % valid_interval == 0:
                base_model.eval()  # Set the model to evaluation mode

                with torch.no_grad():
                    total_loss = []
                    f1_micro = []
                    f1_macro = []
                    f1_avg = []
                    for val_batch in val_dataloader:
                        val_input_ids = val_batch['input_ids'].to(device)
                        val_attention_mask = val_batch['attention_mask'].to(device)
                        val_targets = val_batch['multi_label'].to(device)


                        outputs = base_model(val_input_ids,val_attention_mask)
                        loss = loss_function(outputs.to(device), val_targets)                        
                        
                        total_loss.append(loss.item())
                        val_out = find_metrics(val_targets,outputs)
                        f1_micro.append(val_out[0])
                        f1_macro.append(val_out[1])
                        f1_avg.append(val_out[2])
                        
                        # emptying memory
                        del val_out, loss, outputs
                        gc.collect()
                        torch.cuda.empty_cache()
                        
                    avg_acc = sum(f1_avg)/len(f1_avg)
                    avg_f1mic = sum(f1_micro)/len(f1_micro)
                    avg_f1mac = sum(f1_macro)/len(f1_macro)
                    avg_loss = sum(total_loss)/len(total_loss)
                    print (f"Validation loss : {sum(total_loss)/len(total_loss)} ", ' ,acc : ',avg_acc," ,f1-micro : ",avg_f1mic," ,f1-macro : ",avg_f1mac)
                    torch.save(base_model.state_dict(),f"model_trained/model_w_{iteration}.pth")
                    #wandb.log({"Validation Loss": sum(total_loss)/len(total_loss)})
                    del total_loss, f1_micro, f1_macro, f1_avg

                base_model.train()  # Set the model back to training mode
            
            iteration += 1
    return base_model, train_dataloader, optimizer, loss_function

base_model, train_dataloader_w, optimizer, loss_function = train_w(base_model,train_dataloader_w ,optimizer,loss_function)

Epoch : 0 ,Iteration : 0, training loss: 0.7228 , running loss:0.7227567434310913 (0.53125, 0.23402777777777778, 0.0)
Validation loss : 0.6665522667399624   ,acc :  0.016228070175438595  ,f1-micro :  0.764437134502924  ,f1-macro :  0.41703390420495645
Epoch : 0 ,Iteration : 1, training loss: 0.6695 , running loss:0.696148693561554 (0.703125, 0.22499999999999998, 0.0)
Epoch : 0 ,Iteration : 2, training loss: 0.5594 , running loss:0.6505704720815023 (0.796875, 0.3409090909090909, 0.125)
Epoch : 0 ,Iteration : 3, training loss: 0.5836 , running loss:0.6338293254375458 (0.8125, 0.5056818181818181, 0.125)
Epoch : 0 ,Iteration : 4, training loss: 0.5987 , running loss:0.6268131971359253 (0.78125, 0.425, 0.125)
Epoch : 0 ,Iteration : 5, training loss: 0.5837 , running loss:0.6196288466453552 (0.8125, 0.6333333333333333, 0.25)
Epoch : 0 ,Iteration : 6, training loss: 0.5572 , running loss:0.6107116256441388 (0.8125, 0.425, 0.25)
Epoch : 0 ,Iteration : 7, training loss: 0.5589 , running loss:0.

Epoch : 0 ,Iteration : 69, training loss: 0.3168 , running loss:0.33141190111637114 (0.90625, 0.375, 0.25)
Epoch : 0 ,Iteration : 70, training loss: 0.2569 , running loss:0.32981830537319184 (0.875, 0.6666666666666667, 0.125)
Validation loss : 0.19155651388461128   ,acc :  0.8747076023391812  ,f1-micro :  0.9832419590643274  ,f1-macro :  0.8840643274853802
Epoch : 0 ,Iteration : 71, training loss: 0.2903 , running loss:0.3269809141755104 (0.875, 0.4375, 0.125)
Epoch : 0 ,Iteration : 72, training loss: 0.3663 , running loss:0.32460304349660873 (0.875, 0.375, 0.0)
Epoch : 0 ,Iteration : 73, training loss: 0.3073 , running loss:0.3194793462753296 (0.875, 0.5625, 0.125)
Epoch : 0 ,Iteration : 74, training loss: 0.3051 , running loss:0.31375869959592817 (0.875, 0.35, 0.25)
Epoch : 0 ,Iteration : 75, training loss: 0.3889 , running loss:0.31918873339891435 (0.859375, 0.25, 0.0)
Epoch : 0 ,Iteration : 76, training loss: 0.2957 , running loss:0.3162463799118996 (0.875, 0.375, 0.0)
Epoch : 0 ,I

KeyboardInterrupt: 

### Computing metrics from the whole dataset

In [122]:
# compute metrics for normal dataset
# computer metrics for the entire dataset
def compute_f1_score(true_labels, predicted_labels):
    num_labels = true_labels.shape[1]
    f1_scores = []

    for label_idx in range(num_labels):
        true_positives = np.sum(np.logical_and(true_labels[:, label_idx], predicted_labels[:, label_idx]))
        false_positives = np.sum(np.logical_and(np.logical_not(true_labels[:, label_idx]), predicted_labels[:, label_idx]))
        false_negatives = np.sum(np.logical_and(true_labels[:, label_idx], np.logical_not(predicted_labels[:, label_idx])))

        precision = true_positives / (true_positives + false_positives + 1e-16)
        recall = true_positives / (true_positives + false_negatives + 1e-16)
        f1 = 2 * (precision * recall) / (precision + recall + 1e-16)

        f1_scores.append(f1)

    macro_f1_score = np.mean(f1_scores)

    return macro_f1_score

def gpt_imp(predicted_labels,true_labels):
    # Calculate true positives, false positives, and false negatives
    true_positives = torch.logical_and(predicted_labels, true_labels).sum().item()
    false_positives = (predicted_labels.logical_not() & true_labels).sum().item()
    false_negatives = (predicted_labels & true_labels.logical_not()).sum().item()

    # Calculate precision, recall, and micro F1 score
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    micro_f1_score = 2 * (precision * recall) / (precision + recall)
    return micro_f1_score
    
def find_metrics1(targets,prediction):
    final_pred = ((torch.sigmoid(prediction) >= 0.5) * 1.0)
    #final_pred = prediction
    
    append_out = torch.cat((final_pred,((torch.sum(final_pred,dim=1) < 1.0)  * 1.0).unsqueeze(1)),dim=1)
    append_tar = torch.cat((targets,((torch.sum(targets,dim=1) < 1.0)  * 1.0).unsqueeze(1)),dim=1)

    np_tar = append_tar.cpu().detach().numpy()
    np_pred = append_out.cpu().detach().numpy()
    
    avg_f1_mic = f1_score(np_tar, np_pred, average='micro',zero_division=0)
    avg_f1_mac = f1_score(np_tar.flatten(), np_pred.flatten(), average='macro',zero_division=0)
    avg_acc = accuracy_score(np_tar, np_pred)
    waise_hi = gpt_imp(append_tar.to(torch.int64),append_out.to(torch.int64))
    waise_hi3 = per_label(np_tar,np_pred)
    del np_tar
    del np_pred
    del final_pred
    
    return avg_f1_mic, avg_f1_mac, avg_acc, waise_hi, waise_hi3

def compute_metrics(targets,prediction):
    # Fix gold labels
    y_true = np.zeros((targets.shape[0], p.label_ids.shape[1] + 1), dtype=np.int32)
    y_true[:, :-1] = p.label_ids
    y_true[:, -1] = (np.sum(p.label_ids, axis=1) == 0).astype('int32')
    # Fix predictions
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = (expit(logits) > 0.5).astype('int32')
    y_pred = np.zeros((p.label_ids.shape[0], p.label_ids.shape[1] + 1), dtype=np.int32)
    y_pred[:, :-1] = preds
    y_pred[:, -1] = (np.sum(preds, axis=1) == 0).astype('int32')
    # Compute scores
    macro_f1 = f1_score(y_true=y_true, y_pred=y_pred, average='macro', zero_division=0)
    micro_f1 = f1_score(y_true=y_true, y_pred=y_pred, average='micro', zero_division=0)
    return {'macro-f1': macro_f1, 'micro-f1': micro_f1}


def per_label(targets,prediction):
    
    output_per_lab = []
    for i in range(0,9):
        samples = targets[:, i] == 1
        per_lab_tar = targets[samples] 
        per_lab_out = prediction[samples]
        avg_f1_mic = f1_score(per_lab_tar, per_lab_out, average='micro',zero_division=0)
        avg_f1_mac = f1_score(per_lab_tar.flatten(), per_lab_out.flatten(), average='macro',zero_division=0)
        avg_acc = accuracy_score(per_lab_tar, per_lab_out)
        
        output_per_lab.append((avg_f1_mac,avg_f1_mic,avg_acc))
    return output_per_lab
        
def test_whole_data(test_data):
    curr_model = BERTClassifier(num_classes)
    curr_model.load_state_dict(torch.load("/home/anas/Desktop/code/legal_tech_GR/baseline/model_trained/model_0.pth"))
    curr_model.eval()
    print ("Starting to test the whole dataset")
    
    
    full_metric_out = []
    full_metric_tar = []
    curr_model.to(device)
    iteri = 0
    for curr_batch in test_data:
        input_ids = curr_batch['input_ids'].to(device)
        attention_mask = curr_batch['attention_mask'].to(device)
        targets = curr_batch['multi_label'].to(device)
        
        outputs = curr_model(input_ids,attention_mask)
        iteri+=1
        if full_metric_out == []:
            full_metric_out = outputs.detach().clone()
            full_metric_tar = targets.detach().clone()
        else:
            full_metric_out = torch.cat((full_metric_out,outputs.detach()),dim=0)
            full_metric_tar = torch.cat((full_metric_tar,targets.detach()),dim=0)
        del outputs,targets,input_ids
    
    #full_metric_out = (torch.sigmoid(full_metric_out) >= 0.5) * 1.0
        
    print (find_metrics1(full_metric_tar, full_metric_out))

    
test_custom = CustomDataset(test_dataset, num_classes,tokenizer)
test_dataloader = DataLoader(test_custom, batch_size=batch_size, shuffle=True)
test_whole_data(test_dataloader)

Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Starting to test the whole dataset
(0.8909541511771994, 0.9386282935822179, 0.8948350964530181, 0.8909541511771994, [(0.4790555978674791, 0.07692307692307693, 0.07894736842105263), (0.42809364548494977, 0.0, 0.0), (0.4318936877076412, 0.0, 0.0), (0.4207920792079208, 0.0, 0.0), (0.4359673024523161, 0.0, 0.0), (0.4320388349514563, 0.0, 0.0), (0.43529411764705883, 0.0, 0.0), (0.4375, 0.0, 0.0), (1.0, 1.0, 1.0)])
