In [1]:
#------# Import libraries and datasets #------#

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
import datasets as dts
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import re
import gc
%matplotlib inline

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import ClassifierChain
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import RandomUnderSampler

from transformers import BertModel
from sklearn.metrics import f1_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score

In [2]:
dataset = dts.load_dataset('lex_glue','unfair_tos')

Found cached dataset lex_glue (/home/anas/.cache/huggingface/datasets/lex_glue/unfair_tos/1.0.0/8a66420941bf6e77a7ddd4da4d3bfb7ba88ef48c1d55302a568ac650a095ca3a)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
train_dataset = pd.DataFrame.from_dict(dataset["train"])
val_dataset = pd.DataFrame.from_dict(dataset["validation"])
test_dataset = pd.DataFrame.from_dict(dataset["test"])

stop_words = list(stopwords.words('english'))

In [4]:
definitions = {"Limitation of liability": ["This clause stipulates that the duty to pay damages is limited or excluded, for certain kind of losses, under certain conditions. "]
               , "Unilateral termination": ["This clause gives provider the right to suspend and/or terminate the service and/or the contract, and sometimes details the circumstances under which the provider claims to have a right to do so."]
               , "Unilateral change": ["This clause specifies the conditions under which the service provider could amend and modify the terms of service and/or the service itself."]
               , "Content removal": ["This clause gives the provider a right to modify/delete userâ€™s content, including in-app purchases, and sometimes specifies the conditions under which the service provider may do so."]
               , "Contract by using": ["This clause stipulates that the consumer is bound by the terms of use of a specific service, simply by using the service, without even being required to mark that he or she has read and accepted them."]
               , "Choice of law": ["This clause specifies what law will govern the contract, meaning also what law will be applied in potential adjudication of a dispute arising under the contract."]
               , "Jurisdiction": ["This selection clause requires or allows the parties to resolve their disputes through an arbitration process, before the case could go to court."]
               , "Arbitration": ["This forum selection clause requires or allows the parties to resolve their disputes through an arbitration process, before the case could go to court however, such a clause may or may not specify that arbitration should occur within a specific jurisdiction. "]}

entailment_con = ["entails"]

### Default with 8 + 1 classes

In [45]:
class BERTClassifier(nn.Module):
    def __init__(self, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(768, 8)
        
    def forward(self, input_ids, attention_mask):
        _ , pooled_output = self.bert(input_ids=input_ids, attention_mask =attention_mask,return_dict=False)
        #pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

learning_rate = 1e-5
num_classes = 8
base_model = BERTClassifier(num_classes)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(base_model.parameters(), lr=learning_rate)
print (base_model)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [57]:
class CustomDataset(Dataset):
    def __init__(self, dataset,num_classes,tokenizer):
        
        self.dataset = dataset
        self.texts = self.dataset["text"]
        self.labels = self.dataset["labels"]
        self.num_classes = num_classes
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]
        
        # Tokenize the text
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=64,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        #print ("original label: ",label)
        # Convert label to one-hot encoding
        multi_label = torch.zeros(self.num_classes, dtype=torch.float32)
        multi_label[label] = 1
        #print ("one hot multi : ",multi_label)
        return {'input_ids':input_ids, 'attention_mask':attention_mask, 'multi_label':multi_label}

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
batch_size = 32
train_custom = CustomDataset(train_dataset, num_classes,tokenizer)
train_dataloader = DataLoader(train_custom, batch_size=batch_size, shuffle=True)
#valid_custom = CustomDataset(val_dataset, num_classes,tokenizer)
#val_dataloader = DataLoader(valid_custom, batch_size=batch_size, shuffle=True)

In [58]:
# overfitting on one example:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model.to(device)
one_example = next(iter(dataloader))    

def overfit_one(base_model,fit_example):
    num_epochs = 100:
    running_loss = []
    for epoch in range(num_epochs):
        base_model.train()  # Set the model to training mode            

        input_ids = fit_example['input_ids'].to(device)
        attention_mask = fit_example['attention_mask'].to(device)
        targets = fit_example['multi_label'].to(device)


        outputs = base_model(input_ids,attention_mask)
        loss = loss_function(outputs.to(device), targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        #print (iteration)
        running_loss.append(loss.item())
        if len(running_loss) > 20:
            running_loss.pop(0)
        print (f"Epoch : {epoch} ,Iteration : {iteration}, training loss: {loss:.4f} , running loss:{sum(running_loss)/len(running_loss)}",find_metrics(targets,outputs))

        # freeing up excess memory
        del loss, outputs
        gc.collect()
        torch.cuda.empty_cache()
    


In [59]:
from sklearn.metrics import f1_score, accuracy_score
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model.to(device)

def find_metrics(targets,prediction):
    final_pred = (torch.sigmoid(prediction) > 0.5) * 1.0
    np_tar = targets.cpu().detach().numpy()
    np_pred = final_pred.cpu().detach().numpy()
    
    avg_f1_mic = f1_score(np_tar.flatten(), np_pred.flatten(), average='micro',zero_division=1)
    avg_f1_mac = f1_score(np_tar, np_pred, average='macro',zero_division=1)
    avg_acc = accuracy_score(np_tar, np_pred)
    del np_tar
    del np_pred
    del final_pred
    return avg_f1_mic, avg_f1_mac, avg_acc

    

def train(base_model,train_dataloader,optimizer,loss_function):
    # Training loop
    num_epochs = 20
    valid_interval = 10  # Perform validation and save model every 10 iterations
    iteration = 0
    
    stop_criterion = 2000000

    running_loss = []
    for epoch in range(num_epochs):
        base_model.train()  # Set the model to training mode
        for curr_batch in train_dataloader:
            
            if iteration > stop_criterion:
                break
            
            input_ids = curr_batch['input_ids'].to(device)
            attention_mask = curr_batch['attention_mask'].to(device)
            targets = curr_batch['multi_label'].to(device)


            outputs = base_model(input_ids,attention_mask)
            loss = loss_function(outputs.to(device), targets)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            #print (iteration)
            running_loss.append(loss.item())
            if len(running_loss) > 20:
                running_loss.pop(0)
            #print (f"Epoch : {epoch} ,Iteration : {iteration}, training loss: {loss:.4f} , running loss:{sum(running_loss)/len(running_loss)}")            
            print (f"Epoch : {epoch} ,Iteration : {iteration}, training loss: {loss:.4f} , running loss:{sum(running_loss)/len(running_loss)}",find_metrics(targets,outputs))
            
            # freeing up excess memory
            del loss, outputs
            gc.collect()
            torch.cuda.empty_cache()
            
            # Validation and model saving
            '''if iteration % valid_interval == 0:
                base_model.eval()  # Set the model to evaluation mode

                with torch.no_grad():
                    total_loss = []
                    f1_micro = []
                    f1_macro = []
                    f1_avg = []
                    for val_batch in val_dataloader:
                        val_input_ids = val_batch['input_ids'].to(device)
                        val_attention_mask = val_batch['attention_mask'].to(device)
                        val_targets = val_batch['multi_label'].to(device)


                        outputs = base_model(val_input_ids,val_attention_mask)
                        loss = loss_function(outputs.to(device), val_targets)                        
                        
                        total_loss.append(loss.item())
                        val_out = find_metrics(val_targets,outputs)
                        f1_micro.append(val_out[0])
                        f1_macro.append(val_out[1])
                        f1_avg.append(val_out[2])
                        
                        # emptying memory
                        del val_out, loss, outputs
                        gc.collect()
                        torch.cuda.empty_cache()
                        
                    avg_acc = sum(f1_avg)/len(f1_avg)
                    avg_f1mic = sum(f1_micro)/len(f1_micro)
                    avg_f1mac = sum(f1_macro)/len(f1_macro)
                    avg_loss = sum(total_loss)/len(total_loss)
                    print (f"Validation loss : {sum(total_loss)/len(total_loss)} ", ' ,acc : ',avg_loss," ,f1-micro : ",avg_f1mic," ,f1-macro : ",avg_f1mac)
                    torch.save(base_model.state_dict(),f"model_trained/model_{iteration}.pth")
                    #wandb.log({"Validation Loss": sum(total_loss)/len(total_loss)})
                    del total_loss, f1_micro, f1_macro, f1_avg '''

                #base_model.train()  # Set the model back to training mode
            
            iteration += 1
    return base_model, train_dataloader, optimizer, loss_function

base_model, train_dataloader, optimizer, loss_function = train(base_model,train_dataloader,optimizer,loss_function)

Epoch : 0 ,Iteration : 0, training loss: 0.0737 , running loss:0.0737270712852478 (0.59765625, 0.125, 0.0)
Epoch : 0 ,Iteration : 1, training loss: 0.3944 , running loss:0.2340606153011322 (0.5859375, 0.20806451612903226, 0.0)
Epoch : 0 ,Iteration : 2, training loss: 0.3834 , running loss:0.2838270664215088 (0.51953125, 0.191017316017316, 0.0)
Epoch : 0 ,Iteration : 3, training loss: 0.2444 , running loss:0.2739791050553322 (0.46484375, 0.034722222222222224, 0.0)
Epoch : 0 ,Iteration : 4, training loss: 0.3722 , running loss:0.29361444115638735 (0.453125, 0.059375000000000004, 0.0)
Epoch : 0 ,Iteration : 5, training loss: 0.4274 , running loss:0.3159133394559224 (0.4609375, 0.06567887931034483, 0.0)
Epoch : 0 ,Iteration : 6, training loss: 0.2001 , running loss:0.29936590577874866 (0.4453125, 0.03333333333333333, 0.0)
Epoch : 0 ,Iteration : 7, training loss: 0.3787 , running loss:0.3092879597097635 (0.4609375, 0.023973607038123165, 0.0)
Epoch : 0 ,Iteration : 8, training loss: 0.1462 ,

KeyboardInterrupt: 

### Entailment training

In [56]:
import torch
gc.collect()
torch.cuda.empty_cache()

In [60]:
# prints currently alive Tensors and Variables
import torch
import gc
print (len(gc.get_objects()))
for obj in gc.get_objects():
    try:
        if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
            print(type(obj), obj.size())
    except:
        pass

337545
<class 'torch.Tensor'> torch.Size([0])
<class 'torch.nn.parameter.Parameter'> torch.Size([30522, 768])
<class 'torch.nn.parameter.Parameter'> torch.Size([512, 768])
<class 'torch.nn.parameter.Parameter'> torch.Size([2, 768])
<class 'torch.nn.parameter.Parameter'> torch.Size([768])
<class 'torch.nn.parameter.Parameter'> torch.Size([768])
<class 'torch.nn.parameter.Parameter'> torch.Size([768, 768])
<class 'torch.nn.parameter.Parameter'> torch.Size([768])
<class 'torch.nn.parameter.Parameter'> torch.Size([3072, 768])
<class 'torch.nn.parameter.Parameter'> torch.Size([3072])
<class 'torch.nn.parameter.Parameter'> torch.Size([768, 3072])
<class 'torch.nn.parameter.Parameter'> torch.Size([768])
<class 'torch.nn.parameter.Parameter'> torch.Size([768])
<class 'torch.nn.parameter.Parameter'> torch.Size([768])
<class 'torch.nn.parameter.Parameter'> torch.Size([3072, 768])
<class 'torch.nn.parameter.Parameter'> torch.Size([3072])
<class 'torch.nn.parameter.Parameter'> torch.Size([768, 307

In [28]:
del val_dataset
del test_dataset

NameError: name 'val_dataset' is not defined

In [24]:
import sys
# Print local variable names with memory space
print("Local variables:")
for name, value in locals().items():
    print(f"{name}: {sys.getsizeof(value)} bytes")

# Print global variable names with memory space
print("Global variables:")
for name, value in globals().items():
    print(f"{name}: {sys.getsizeof(value)} bytes")

Local variables:
__name__: 57 bytes
__doc__: 113 bytes
__package__: 16 bytes
__loader__: 16 bytes
__spec__: 16 bytes
__builtin__: 72 bytes
__builtins__: 72 bytes
_ih: 312 bytes
_oh: 232 bytes
_dh: 64 bytes
In: 312 bytes
Out: 232 bytes
get_ipython: 64 bytes
exit: 48 bytes
quit: 48 bytes
_: 49 bytes
__: 49 bytes
___: 49 bytes
_i: 385 bytes
_ii: 341 bytes
_iii: 99 bytes
_i1: 1634 bytes
np: 72 bytes
torch: 72 bytes
nn: 72 bytes
optim: 72 bytes
Dataset: 1064 bytes
DataLoader: 1472 bytes
BertTokenizer: 2008 bytes
dts: 72 bytes
pd: 72 bytes
sns: 72 bytes
plt: 72 bytes
nltk: 72 bytes
re: 72 bytes
gc: 72 bytes
stopwords: 48 bytes
CountVectorizer: 1064 bytes
TfidfVectorizer: 1064 bytes
WordCloud: 1064 bytes
STOPWORDS: 8408 bytes
SnowballStemmer: 1064 bytes
train_test_split: 136 bytes
TfidfTransformer: 1064 bytes
MultinomialNB: 1064 bytes
OneVsRestClassifier: 1064 bytes
LinearSVC: 1064 bytes
LogisticRegression: 1064 bytes
Pipeline: 1064 bytes
MultiLabelBinarizer: 1064 bytes
BinaryRelevance: 1064 