In [1]:
### This notebook establishes a baseline for Longformer.
### In this folder, we use LegalBERT to run masked language modelling (MLM) on
### ContractNLI and see what score it gets.
### What metric does the LegalBERT paper use? The training loss, so the default Hugging Face loss should be used.

In [2]:
### get the contractnli dataset

### load contractnli

from datasets import load_dataset, DatasetDict, Dataset
import json
from transformers import PerceiverTokenizer, PerceiverModel, PerceiverConfig, PerceiverPreTrainedModel, PerceiverForSequenceClassification, TrainingArguments, Trainer, \
    DataCollatorWithPadding, AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling


import re
import os
from tqdm import tqdm
import torch

# Root directory under which the ContractNLI data files are stored.
ROOT_PATH = "/home/yan_xu_uk_qbe_com/scc_yan/"

# Parse the raw training split once at notebook scope.
# The encoding is given explicitly: JSON files are UTF-8, and relying on the
# locale default can mis-decode non-ASCII contract text on some machines.
with open(os.path.join(ROOT_PATH, "ignored_dir/data/contract-nli/train.json"), encoding="utf-8") as train_json_f:
    train_json = json.load(train_json_f)

# Mapping between ContractNLI choice names and integer class ids.
# `id2label` is derived from `label2id` so the two can never drift apart
# (the hand-written copy previously misspelled "NotMentioned").
label2id = {"Entailment": 0, "Contradiction": 1, "NotMentioned": 2}
id2label = {class_id: label for label, class_id in label2id.items()}

def load_dataset_custom(dataset_name):
    """Load a dataset by name and return it as a `DatasetDict`.

    Only "contract-nli" is supported. Its train/dev/test JSON files are read
    from `<ROOT_PATH>/ignored_dir/data/contract-nli` and flattened so that each
    example is one (document, hypothesis) pair taken from the document's first
    annotation set.

    Args:
        dataset_name: Name of the dataset to load.

    Returns:
        A `DatasetDict` with "train", "validation" and "test" splits, or `None`
        when `dataset_name` is not recognised.
    """
    if dataset_name == "contract-nli":
        def read_json(filepath):
            """Parse one JSON file, decoding it explicitly as UTF-8."""
            # Do not rely on the locale's default encoding for contract text.
            with open(filepath, encoding="utf-8") as f:
                return json.load(f)

        def contract_nli_iterator(data):
            """Yield one flat example per (document, annotation) pair."""
            documents, labels = data['documents'], data['labels']
            for document in documents:
                # Fields shared by every example generated from this document.
                doc_fields = {
                    "id": document['id'],
                    "file_name": document['file_name'],
                    "text": document['text'],
                    "spans": document['spans'],
                    "document_type": document['document_type'],
                    "url": document['url'],
                }
                # Only the first annotation set of each document is used.
                annotations = document['annotation_sets'][0]['annotations']
                for annotation_id, annotation_content in annotations.items():
                    yield {
                        **doc_fields,
                        "hypothesis": labels[annotation_id]['hypothesis'],
                        # Map the textual choice to its integer class id.
                        "labels": label2id[annotation_content['choice']],
                    }

        base_filepath = os.path.join(ROOT_PATH, "ignored_dir/data/contract-nli")
        train_data = read_json(os.path.join(base_filepath, "train.json"))
        validation_data = read_json(os.path.join(base_filepath, "dev.json"))
        test_data = read_json(os.path.join(base_filepath, "test.json"))
        data = {
            "train": Dataset.from_generator(lambda: contract_nli_iterator(train_data)),
            "validation": Dataset.from_generator(lambda: contract_nli_iterator(validation_data)),
            "test": Dataset.from_generator(lambda: contract_nli_iterator(test_data)),
        }
        return DatasetDict(data)
    return None

# Build the ContractNLI DatasetDict (train / validation / test splits) used by the cells below.
contractnli_dataset = load_dataset_custom("contract-nli")

In [3]:
### get the tokenizer for LegalBERT (the model itself is loaded in a later cell)

# Tokenizer for the "nlpaueb/legal-bert-base-uncased" checkpoint.
legalbert_tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")



In [4]:
### chunk contractnli dataset into sizes of 512

def chunk_contractnli(ds, cs, os):
    """Split a ContractNLI split into overlapping token windows.

    The `text` and `hypothesis` of every example are joined with the
    tokenizer's separator token into one long string, which is tokenized in a
    single call and then sliced into windows of `cs` token ids. Consecutive
    windows share `os` token ids; the final window holds whatever remains and
    may be shorter than `cs`.

    NOTE(review): the tokenizer is called once on the whole concatenation, so
    any special tokens it adds are presumably present only at the very start
    and end of the stream, not per window - confirm this is intended.

    Args:
        ds: Iterable of examples exposing `text` and `hypothesis` fields.
        cs: Chunk size, in tokens.
        os: Overlap between consecutive chunks, in tokens. (The name shadows
            the `os` module inside this function only; the module is not used
            here.)

    Returns:
        A `Dataset` with a single `input_ids` column, one row per chunk.

    Raises:
        ValueError: If `cs <= os`; the window would never advance and the
            loop would not terminate.
    """
    stride = cs - os
    if stride <= 0:
        raise ValueError(
            f"chunk size ({cs}) must be larger than the overlap ({os})"
        )

    sep_token = legalbert_tokenizer.sep_token
    concat_text = sep_token.join(
        sep_token.join([e['text'], e['hypothesis']]) for e in ds
    )
    tokenized_text = legalbert_tokenizer(concat_text).input_ids
    n_tokens = len(tokenized_text)

    i = 0
    chunks = []
    # The bar counts tokens consumed so far. `tqdm.update` takes an increment,
    # so it is fed the distance advanced in this iteration, not the running
    # position.
    with tqdm(total=n_tokens) as pbar:
        while i < n_tokens:
            if i + cs >= n_tokens:
                # Final (possibly short) window: take everything that is left.
                chunks.append(tokenized_text[i:])
                advanced = n_tokens - i
            else:
                chunks.append(tokenized_text[i: i + cs])
                advanced = stride
            i += advanced
            pbar.update(advanced)
    return Dataset.from_dict({'input_ids': chunks})

# Window size in tokens; 512 matches legalbert_tokenizer.model_max_length.
cs = 512
# Number of tokens shared by consecutive windows.
# Deliberately NOT named `os`: that would rebind the imported `os` module to an
# int at notebook scope and break any later (re-)run of cells using `os.path`.
chunk_overlap = 50
contractnli_chunked = DatasetDict({
    mode: chunk_contractnli(contractnli_dataset[mode], cs, chunk_overlap)
    for mode in ['train', 'validation', 'test']
})

In [None]:
# not using this for now
# trainer_args = TrainingArguments(output_dir="baseline_dir", deepspeed="ds_config.json")

In [None]:
# Load the "nlpaueb/legal-bert-base-uncased" checkpoint with a masked-language-modelling head.
legalbert_model = AutoModelForMaskedLM.from_pretrained("nlpaueb/legal-bert-base-uncased")

### legalbert_tokenizer.model_max_length shows 512

In [None]:
# Collator that prepares masked-language-modelling batches using the LegalBERT
# tokenizer (all other collator options are left at their defaults).
data_collator = DataCollatorForLanguageModeling(tokenizer=legalbert_tokenizer)

# Evaluation-only Trainer: no TrainingArguments and no train dataset are
# supplied, only the chunked test split to score.
trainer = Trainer(
    model=legalbert_model,
    data_collator=data_collator,
    eval_dataset=contractnli_chunked['test'],
)
# Variant with explicit TrainingArguments (e.g. for DeepSpeed):
# trainer = Trainer(model=legalbert_model, args=trainer_args, eval_dataset=contractnli_chunked['test'], data_collator=data_collator)

In [None]:
# Run evaluation on the dataset the Trainer was constructed with.
ret = trainer.evaluate()

# Keep only the loss from the returned metrics dict.
eval_loss = ret['eval_loss']

# Append (mode "a") so results from repeated runs accumulate in baseline.txt
# instead of overwriting each other.
with open("baseline.txt", "a") as f:
    f.write(f"LegalBERT eval loss: {eval_loss}\n")

### on validation set
### eval_loss: 0.5897765159606934
## with deepspeed eval_loss gives 0.5909032821656273

### on test set
### without deepspeed: 
# {'eval_loss': 0.6284850239753723,
# 'eval_runtime': 97.6967,
# 'eval_samples_per_second': 104.405,
# 'eval_steps_per_second': 1.638}
### with deepspeed: 0.6288514733314514,