In [1]:
from datasets import load_dataset, DatasetDict, Dataset
import json
from transformers import PerceiverTokenizer, PerceiverModel, PerceiverConfig, PerceiverPreTrainedModel, PerceiverForSequenceClassification, TrainingArguments, Trainer, \
    DataCollatorWithPadding
import re
import os
from tqdm import tqdm
import torch

In [2]:
ROOT_PATH = ".."

### Load Dataset

In [3]:
with open(os.path.join(ROOT_PATH, "ignored_dir/data/contract-nli/train.json")) as train_json_f:
    train_json = json.load(train_json_f)

In [4]:
print(f"type(train_json): {type(train_json)}")
print(f"train_json.keys(): {train_json.keys()}")
print(f"type(train_json['documents']): {type(train_json['documents'])}")
print(f"type(train_json['labels']): {type(train_json['labels'])}")
print(f"len(train_json['documents']): {len(train_json['documents'])}")
print(f"train_json['documents'][0].keys(): {train_json['documents'][0].keys()}")
print(f"type(train_json['documents'][0]['annotation_sets']): {type(train_json['documents'][0]['annotation_sets'])}")
print(f"len(train_json['documents'][0]['annotation_sets']): {len(train_json['documents'][0]['annotation_sets'])}")
print(f"type(train_json['documents'][0]['annotation_sets][0]): {type(train_json['documents'][0]['annotation_sets'][0])}")
print(f"train_json['documents'][0]['annotation_sets'][0].keys(): {train_json['documents'][0]['annotation_sets'][0].keys()}")
print(f"type(train_json['documents'][0]['annotation_sets'][0]['annotations']): {train_json['documents'][0]['annotation_sets'][0]['annotations'].keys()}")
print(f"type(train_json['documents'][0]['annotation_sets'][0]['annotations']['nda-11']): {type(train_json['documents'][0]['annotation_sets'][0]['annotations']['nda-11'])}")
print(f"train_json['documents'][0]['annotation_sets'][0]['annotations']['nda-11'].keys(): {train_json['documents'][0]['annotation_sets'][0]['annotations']['nda-11'].keys()}")
print(f"type(train_json['documents'][0]['annotation_sets'][0]['annotations']['nda-11']['choice']): {type(train_json['documents'][0]['annotation_sets'][0]['annotations']['nda-11']['choice'])}")
print(f"type(train_json['documents'][0]['annotation_sets'][0]['annotations']['nda-11']['spans']): {type(train_json['documents'][0]['annotation_sets'][0]['annotations']['nda-11']['spans'])}")
print(f"len(train_json['documents'][0]['annotation_sets'][0]['annotations']['nda-11']['spans']): {len(train_json['documents'][0]['annotation_sets'][0]['annotations']['nda-11']['spans'])}")
print(f"train_json['labels'].keys(): {train_json['labels'].keys()}")
print(f"type(train_json['labels']['nda-11']): {type(train_json['labels']['nda-11'])}")
print(f"train_json['labels']['nda-11'].keys(): {train_json['labels']['nda-11'].keys()}")
print(f"type(train_json['labels']['nda-11']['short_description']): {type(train_json['labels']['nda-11']['short_description'])}")
print(f"type(train_json['labels']['nda-11']['hypothesis']): {type(train_json['labels']['nda-11']['hypothesis'])}")

type(train_json): <class 'dict'>
train_json.keys(): dict_keys(['documents', 'labels'])
type(train_json['documents']): <class 'list'>
type(train_json['labels']): <class 'dict'>
len(train_json['documents']): 423
train_json['documents'][0].keys(): dict_keys(['id', 'file_name', 'text', 'spans', 'annotation_sets', 'document_type', 'url'])
type(train_json['documents'][0]['annotation_sets']): <class 'list'>
len(train_json['documents'][0]['annotation_sets']): 1
type(train_json['documents'][0]['annotation_sets][0]): <class 'dict'>
train_json['documents'][0]['annotation_sets'][0].keys(): dict_keys(['annotations'])
type(train_json['documents'][0]['annotation_sets'][0]['annotations']): dict_keys(['nda-11', 'nda-16', 'nda-15', 'nda-10', 'nda-2', 'nda-1', 'nda-19', 'nda-12', 'nda-20', 'nda-3', 'nda-18', 'nda-7', 'nda-17', 'nda-8', 'nda-13', 'nda-5', 'nda-4'])
type(train_json['documents'][0]['annotation_sets'][0]['annotations']['nda-11']): <class 'dict'>
train_json['documents'][0]['annotation_sets'][

In [5]:
id2label = {0: "Entailment", 1: "Contradiction", 2: "NotMnetioned"}
label2id = {"Entailment": 0, "Contradiction": 1, "NotMentioned": 2}

In [6]:
def load_dataset_custom(dataset_name):
    if dataset_name == "contract-nli":
        def contract_nli_iterator(data):
            documents, labels = data['documents'], data['labels']
            for document in documents:
                id = document['id']
                file_name = document['file_name']
                text = document['text']
                spans = document['spans']
                annotation_sets = document['annotation_sets']
                document_type = document['document_type']
                url = document['url']
                for annotation_id, annotation_content in annotation_sets[0]['annotations'].items():
                    hypothesis = labels[annotation_id]['hypothesis']
                    choice = annotation_content['choice']
                    yield {
                        "id": id,
                        "file_name": file_name,
                        "text": text,
                        "spans": spans,
                        "document_type": document_type,
                        "url": url,
                        "hypothesis": hypothesis,
                        "labels": label2id[choice],
                    }            
        base_filepath = os.path.join(ROOT_PATH, "ignored_dir/data/contract-nli")
        train_filepath = os.path.join(base_filepath, "train.json")
        validation_filepath = os.path.join(base_filepath, "dev.json")
        test_filepath = os.path.join(base_filepath, "test.json")
        with open(train_filepath) as f:
            train_data = json.load(f)
        with open(validation_filepath) as f:
            validation_data = json.load(f)
        with open(test_filepath) as f:
            test_data = json.load(f)
        data = {
            "train": Dataset.from_generator(lambda: contract_nli_iterator(train_data)),
            "validation": Dataset.from_generator(lambda: contract_nli_iterator(validation_data)),
            "test": Dataset.from_generator(lambda: contract_nli_iterator(test_data)),
        }
        return DatasetDict(data)
    return None

In [7]:
contract_nli_dataset = load_dataset_custom("contract-nli")

In [8]:
model_max_length = 60000 # set a big number here for now

In [9]:
perceiver_tokenizer = PerceiverTokenizer(model_max_length=model_max_length)

In [10]:
def process_contract_nli_dataset(element):
    return perceiver_tokenizer(element['text'], element['hypothesis'], truncation=True)

In [11]:
contract_nli_dataset_processed = contract_nli_dataset.map(process_contract_nli_dataset, batched=True)

In [12]:
print(contract_nli_dataset_processed)

DatasetDict({
    train: Dataset({
        features: ['id', 'file_name', 'text', 'spans', 'document_type', 'url', 'hypothesis', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 7191
    })
    validation: Dataset({
        features: ['id', 'file_name', 'text', 'spans', 'document_type', 'url', 'hypothesis', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1037
    })
    test: Dataset({
        features: ['id', 'file_name', 'text', 'spans', 'document_type', 'url', 'hypothesis', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 2091
    })
})


In [13]:
'''
# for loop to figure out how long the tokenized sentences are
record = []
for mode in ['train', 'validation', 'test']:
    mode_dataset = contract_nli_dataset_processed[mode]
    for i in tqdm(range(len(mode_dataset)), total=len(mode_dataset)):
        record.append(len(mode_dataset[i]['input_ids']))
print(f"max length of tokenized dataset element is: {max(record)}") # 55122
'''

'\n# for loop to figure out how long the tokenized sentences are\nrecord = []\nfor mode in [\'train\', \'validation\', \'test\']:\n    mode_dataset = contract_nli_dataset_processed[mode]\n    for i in tqdm(range(len(mode_dataset)), total=len(mode_dataset)):\n        record.append(len(mode_dataset[i][\'input_ids\']))\nprint(f"max length of tokenized dataset element is: {max(record)}") # 55122\n'

In [14]:
perceiver_config = PerceiverConfig(model_max_length=model_max_length, num_labels=3, max_position_embeddings=model_max_length + 1000, num_self_attends_per_block=2)

In [15]:
#checkpoint_path = os.path.join(ROOT_PATH, "ignored_dir/training_outputs/perceiver_contract_nli/run_5/checkpoint-309")
# perceiver_model = PerceiverForSequenceClassification.from_pretrained(checkpoint_path, config=perceiver_config, ignore_mismatched_sizes=True)
# perceiver_model = PerceiverForSequenceClassification(config=perceiver_config)
perceiver_model = PerceiverForSequenceClassification.from_pretrained("deepmind/language-perceiver", config=perceiver_config, offload_state_dict=True, torch_dtype=torch.float16, ignore_mismatched_sizes=True)

Some weights of PerceiverForSequenceClassification were not initialized from the model checkpoint at deepmind/language-perceiver and are newly initialized: ['perceiver.decoder.decoder.decoding_cross_attention.attention.output.dense.bias', 'perceiver.decoder.decoder.decoding_cross_attention.attention.output.dense.weight', 'perceiver.decoder.decoder.decoding_cross_attention.attention.self.key.bias', 'perceiver.decoder.decoder.decoding_cross_attention.attention.self.key.weight', 'perceiver.decoder.decoder.decoding_cross_attention.attention.self.layernorm1.bias', 'perceiver.decoder.decoder.decoding_cross_attention.attention.self.layernorm1.weight', 'perceiver.decoder.decoder.decoding_cross_attention.attention.self.layernorm2.bias', 'perceiver.decoder.decoder.decoding_cross_attention.attention.self.layernorm2.weight', 'perceiver.decoder.decoder.decoding_cross_attention.attention.self.query.bias', 'perceiver.decoder.decoder.decoding_cross_attention.attention.self.query.weight', 'perceiver.de

In [16]:
perceiver_data_collator = DataCollatorWithPadding(tokenizer=perceiver_tokenizer)

In [17]:
ignored_dir_path = os.path.join(ROOT_PATH, "ignored_dir")
if not os.path.exists(ignored_dir_path):
    os.mkdir(ignored_dir_path)
training_outputs_path = os.path.join(ignored_dir_path, "training_outputs")
if not os.path.exists(training_outputs_path):
    os.mkdir(training_outputs_path)
output_path = os.path.join(training_outputs_path, "perceiver_contract_nli")
if not os.path.exists(output_path):
    os.mkdir(output_path)
output_path_content = os.listdir(output_path)
pattern = "^run_([0-9]+)"
output_path_legal_content = [e for e in output_path_content if re.match(pattern, e)]
run_output_path = os.path.join(output_path, f"run_{len(output_path_legal_content) + 1}")
print(f"saving to {run_output_path}")
perceiver_training_arguments = TrainingArguments(
    run_output_path,
    do_train=True,
    do_eval=True,
    num_train_epochs=20,
    learning_rate = 1e-6,
    gradient_accumulation_steps=8,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    fp16=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
)

saving to ../ignored_dir/training_outputs/perceiver_contract_nli/run_6


In [18]:
import evaluate

accuracy = evaluate.load("accuracy")

In [19]:
import numpy as np

def perceiver_compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [20]:
perceiver_trainer = Trainer(
    model=perceiver_model,
    args=perceiver_training_arguments,
    train_dataset=contract_nli_dataset_processed['train'],
    eval_dataset=contract_nli_dataset_processed["validation"],
    tokenizer=perceiver_tokenizer,
    data_collator=perceiver_data_collator,
   #  compute_metrics=perceiver_compute_metrics,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [21]:
# ret = perceiver_trainer.evaluate()

In [22]:
# print(ret)

In [23]:
perceiver_trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Could not estimate the number of tokens of the input, floating-point operations will not be computed


PerceiverFCS forward gives PerceiverModelOutput(logits=tensor([[ 0.2544, -0.0618, -0.6348],
        [ 0.2549, -0.0594, -0.6353]], device='cuda:1', dtype=torch.float16,
       grad_fn=<SliceBackward0>), last_hidden_state=tensor([[[ 1.3027,  0.4028, -0.6045,  ...,  2.5684,  1.4375, -1.0410],
         [ 0.6514,  1.4385, -0.7861,  ...,  0.5073, -2.1895,  0.8115],
         [ 1.8955, -0.3945,  0.0125,  ...,  0.1781,  0.7720,  0.2490],
         ...,
         [ 0.8218, -0.1284,  0.4758,  ...,  0.0783,  0.0542, -0.5781],
         [ 0.8018, -0.5693, -0.2341,  ...,  0.7026, -0.0247,  0.3159],
         [ 2.4434, -0.0592, -0.1635,  ...,  0.3896,  0.3745, -0.1616]],

        [[ 1.3096,  0.3799, -0.6372,  ...,  2.5898,  1.4375, -1.0459],
         [ 0.6768,  1.3838, -0.7490,  ...,  0.4561, -2.1387,  0.8149],
         [ 1.8711, -0.4089,  0.0217,  ...,  0.1956,  0.7988,  0.2097],
         ...,
         [ 0.8345, -0.1356,  0.5376,  ...,  0.0743,  0.1079, -0.6328],
         [ 0.7803, -0.5854, -0.2334,  ..

Epoch,Training Loss,Validation Loss


PerceiverFCS forward gives PerceiverModelOutput(logits=tensor([[nan, nan, nan],
        [nan, nan, nan]], device='cuda:0', dtype=torch.float16,
       grad_fn=<SliceBackward0>), last_hidden_state=tensor([[[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]]], device='cuda:0',
       dtype=torch.float16, grad_fn=<AddBackward0>), hidden_states=None, attentions=None, cross_attentions=None)
PerceiverFCS forward gives PerceiverModelOutput(logits=tensor([[nan, nan, nan]