In [26]:
from datasets import load_dataset, DatasetDict, Dataset
import json
from transformers import PerceiverTokenizer, PerceiverModel, PerceiverConfig, PerceiverPreTrainedModel, PerceiverForSequenceClassification, TrainingArguments, Trainer, \
    DataCollatorWithPadding, PerceiverForMaskedLM
import re
import os
from tqdm import tqdm
import torch
import datasets

In [27]:
# Project root relative to this notebook; it is joined with "ignored_dir" further down
# to build the directory that training outputs are written to.
ROOT_PATH = ".."

### Load Dataset

In [28]:
# Class index -> class name for the three NLI classes, plus the inverse lookup.
# NOTE(review): assumed to follow the label order of the SNLI dataset — confirm against the dataset card.
id2label = dict(enumerate(("entailment", "neutral", "contradiction")))
label2id = {name: index for index, name in id2label.items()}

In [29]:
# Load the SNLI dataset by its Hub identifier; the 'train', 'validation' and 'test'
# splits of the result are used in the cells below.
snli_dataset = datasets.load_dataset("stanfordnlp/snli")

In [30]:
# Rename the target column to 'labels' and drop rows whose label is -1
# (presumably examples without a gold label — confirm against the dataset card).
# The rename is guarded so the cell can be re-run: without the guard a second
# execution fails because the 'label' column no longer exists.
for mode in ['train', 'validation', 'test']:
    split = snli_dataset[mode]
    if 'label' in split.column_names:
        split = split.rename_column('label', 'labels')
    snli_dataset[mode] = split.filter(lambda e: e['labels'] != -1)

In [31]:
# Tokenizer loaded from the same 'deepmind/language-perceiver' checkpoint as the models below.
perceiver_tokenizer = PerceiverTokenizer.from_pretrained('deepmind/language-perceiver')

In [32]:
def process_snli_dataset(element):
    """Tokenize premise/hypothesis pairs with ``perceiver_tokenizer``.

    Args:
        element: Mapping with 'premise' and 'hypothesis' entries. This notebook
            applies the function with ``batched=True``, so each entry is
            presumably a list of strings — confirm if reused elsewhere.

    Returns:
        The tokenizer output for the (premise, hypothesis) pair, produced with
        ``truncation=True``.
    """
    premises = element['premise']
    hypotheses = element['hypothesis']
    return perceiver_tokenizer(premises, hypotheses, truncation=True)

In [33]:
# Tokenize every split in batches; the result keeps the original columns and adds
# whatever process_snli_dataset returns (e.g. the 'input_ids' read further down).
snli_dataset_processed = snli_dataset.map(process_snli_dataset, batched=True)

In [34]:
# Longest tokenized example across all splits.
# The hard-coded value is presumably the result of an earlier full scan; set
# max_len to None to recompute it (slow: every example is visited one by one).
max_len = 559
if max_len is None:
    record = []
    for mode in ('train', 'validation', 'test'):
        mode_dataset = snli_dataset_processed[mode]
        n_examples = len(mode_dataset)
        record.extend(
            len(mode_dataset[idx]['input_ids'])
            for idx in tqdm(range(n_examples), total=n_examples)
        )
    max_len = max(record)
print(f"max length of tokenized dataset element is: {max_len}")

max length of tokenized dataset element is: 559


### Try different strategies for initializing the Perceiver

In [35]:
### Strategy 1: load the pretrained checkpoint straight into a sequence-classification model.

# num_labels=3 sizes the classification head for the three NLI classes.
# id2label / label2id (defined in the "Load Dataset" section) are stored in the config so
# that saved checkpoints and predictions carry the class names instead of generic labels.
# NOTE(review): num_latents=512 presumably differs from the checkpoint's own value, which
# would be why ignore_mismatched_sizes=True is needed below — confirm.
perceiver_config_1 = PerceiverConfig.from_pretrained(
    'deepmind/language-perceiver',
    num_labels=3,
    num_latents=512,
    id2label=id2label,
    label2id=label2id,
)
perceiver_model_1 = PerceiverForSequenceClassification.from_pretrained(
    'deepmind/language-perceiver',
    config=perceiver_config_1,
    ignore_mismatched_sizes=True,
)

Some weights of PerceiverForSequenceClassification were not initialized from the model checkpoint at deepmind/language-perceiver and are newly initialized: ['perceiver.decoder.decoder.decoding_cross_attention.attention.output.dense.bias', 'perceiver.decoder.decoder.decoding_cross_attention.attention.output.dense.weight', 'perceiver.decoder.decoder.decoding_cross_attention.attention.self.key.bias', 'perceiver.decoder.decoder.decoding_cross_attention.attention.self.key.weight', 'perceiver.decoder.decoder.decoding_cross_attention.attention.self.layernorm1.bias', 'perceiver.decoder.decoder.decoding_cross_attention.attention.self.layernorm1.weight', 'perceiver.decoder.decoder.decoding_cross_attention.attention.self.layernorm2.bias', 'perceiver.decoder.decoder.decoding_cross_attention.attention.self.layernorm2.weight', 'perceiver.decoder.decoder.decoding_cross_attention.attention.self.query.bias', 'perceiver.decoder.decoder.decoding_cross_attention.attention.self.query.weight', 'perceiver.de

In [36]:
### Strategy 2: build the classification model with the checkpoint's default config, then
### take input_preprocessor, embeddings and encoder from the pretrained masked-LM model.

perceiver_config_2 = PerceiverConfig.from_pretrained('deepmind/language-perceiver', num_labels=3)
# perceiver_model_2_ = PerceiverForSequenceClassification(config=perceiver_config_2)
perceiver_model_2 = PerceiverForSequenceClassification.from_pretrained(
    'deepmind/language-perceiver',
    config=perceiver_config_2,
)
perceiver_model_for_masked_lm_2 = PerceiverForMaskedLM.from_pretrained('deepmind/language-perceiver')

# The submodules are assigned by reference, so both models share the same objects
# (no copy is made).
for submodule_name in ("input_preprocessor", "embeddings", "encoder"):
    setattr(
        perceiver_model_2.perceiver,
        submodule_name,
        getattr(perceiver_model_for_masked_lm_2.perceiver, submodule_name),
    )


Some weights of PerceiverForSequenceClassification were not initialized from the model checkpoint at deepmind/language-perceiver and are newly initialized: ['perceiver.decoder.decoder.decoding_cross_attention.attention.output.dense.bias', 'perceiver.decoder.decoder.decoding_cross_attention.attention.output.dense.weight', 'perceiver.decoder.decoder.decoding_cross_attention.attention.self.key.bias', 'perceiver.decoder.decoder.decoding_cross_attention.attention.self.key.weight', 'perceiver.decoder.decoder.decoding_cross_attention.attention.self.layernorm1.bias', 'perceiver.decoder.decoder.decoding_cross_attention.attention.self.layernorm1.weight', 'perceiver.decoder.decoder.decoding_cross_attention.attention.self.layernorm2.bias', 'perceiver.decoder.decoder.decoding_cross_attention.attention.self.layernorm2.weight', 'perceiver.decoder.decoder.decoding_cross_attention.attention.self.query.bias', 'perceiver.decoder.decoder.decoding_cross_attention.attention.self.query.weight', 'perceiver.de

In [37]:
#perceiver_config = PerceiverConfig(num_labels=3, d_latents=5120)

In [38]:
#checkpoint_path = os.path.join(ROOT_PATH, "ignored_dir/training_outputs/perceiver_contract_nli/run_5/checkpoint-309")
# perceiver_model = PerceiverForSequenceClassification.from_pretrained(checkpoint_path, config=perceiver_config, ignore_mismatched_sizes=True)
#perceiver_model = PerceiverForSequenceClassification(config=perceiver_config)
#perceiver_model_for_masked_lm = PerceiverForMaskedLM.from_pretrained('deepmind/language-perceiver')
# perceiver_model.perceiver = perceiver_model_for_masked_lm.perceiver
# perceiver_model = PerceiverForSequenceClassification.from_pretrained("deepmind/language-perceiver", config=perceiver_config, offload_state_dict=True, torch_dtype=torch.float16, ignore_mismatched_sizes=True)

In [39]:
#perceiver_model.perceiver.input_preprocessor = perceiver_model_for_masked_lm.perceiver.input_preprocessor
#perceiver_model.perceiver.embeddings = perceiver_model_for_masked_lm.perceiver.embeddings
#perceiver_model.perceiver.encoder = perceiver_model_for_masked_lm.perceiver.encoder

In [40]:
#print(perceiver_model.perceiver)

In [41]:
# print(perceiver_model_for_masked_lm.perceiver.input_preprocessor)

In [42]:
# Select which initialisation strategy is handed to the Trainer below (here: Strategy 1).
perceiver_model = perceiver_model_1

In [43]:
# Batch collator built around perceiver_tokenizer; it is passed to the Trainer as data_collator.
perceiver_data_collator = DataCollatorWithPadding(tokenizer=perceiver_tokenizer)

In [44]:
# Output location: <ROOT_PATH>/ignored_dir/training_outputs/perceiver_snli/run_<k>.
# makedirs(exist_ok=True) creates every missing level in one call and does not fail
# when the directory already exists.
ignored_dir_path = os.path.join(ROOT_PATH, "ignored_dir")
training_outputs_path = os.path.join(ignored_dir_path, "training_outputs")
output_path = os.path.join(training_outputs_path, "perceiver_snli")
os.makedirs(output_path, exist_ok=True)

# Choose the next run index as (highest existing index + 1). Counting the existing
# run_* entries instead would reuse an index — and therefore an existing run
# directory — as soon as an earlier run has been deleted or the numbering has gaps.
output_path_content = os.listdir(output_path)
pattern = "^run_([0-9]+)"
output_path_legal_content = [e for e in output_path_content if re.match(pattern, e)]
existing_run_ids = [int(re.match(pattern, e).group(1)) for e in output_path_legal_content]
next_run_id = max(existing_run_ids, default=0) + 1
run_output_path = os.path.join(output_path, f"run_{next_run_id}")
print(f"saving to {run_output_path}")

perceiver_training_arguments = TrainingArguments(
    run_output_path,
    do_train=True,
    do_eval=True,
    num_train_epochs=20,
    learning_rate=1e-6,
    # 14 examples per device x 16 accumulation steps = 224 examples per optimizer step per device.
    gradient_accumulation_steps=16,
    per_device_train_batch_size=14,
    per_device_eval_batch_size=28,
    fp16=True,
    # Evaluate, checkpoint and log once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
)

saving to ../ignored_dir/training_outputs/perceiver_snli/run_18


In [45]:
import evaluate

# Accuracy metric object; perceiver_compute_metrics calls its compute() method.
accuracy = evaluate.load("accuracy")

In [46]:
import numpy as np

def perceiver_compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` is reduced
            with an argmax over axis 1, so it is presumably a
            ``(num_examples, num_classes)`` score array — confirm against the
            Trainer's output.

    Returns:
        The result of ``accuracy.compute`` for the predicted class ids against
        the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [47]:
# Wire model, arguments, data and metric together. Training uses the 'train' split,
# evaluation uses the 'validation' split; the 'test' split is not touched here.
# NOTE(review): newer transformers releases reportedly prefer `processing_class=` over
# `tokenizer=` — confirm against the installed version before changing it.
perceiver_trainer = Trainer(
    model=perceiver_model,
    args=perceiver_training_arguments,
    train_dataset=snli_dataset_processed['train'],
    eval_dataset=snli_dataset_processed["validation"],
    tokenizer=perceiver_tokenizer,
    data_collator=perceiver_data_collator,
    compute_metrics=perceiver_compute_metrics,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [48]:
# Baseline: evaluate on the validation split before any training step has run.
ret = perceiver_trainer.evaluate()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


In [49]:
# Show the pre-training evaluation metrics returned by evaluate().
print(ret)

{'eval_loss': 1.1004080772399902, 'eval_accuracy': 0.331233489128226, 'eval_runtime': 195.4954, 'eval_samples_per_second': 50.344, 'eval_steps_per_second': 0.9}


In [50]:
# Run training with the arguments configured above (per-epoch evaluation, checkpointing and logging).
perceiver_trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Epoch,Training Loss,Validation Loss
