In [1]:
#Set the task
task = 'text'

#Set Seed
seed = 42

In [2]:
from datasets import load_dataset
#from Nystromformer.LRA.datasets import text
import pickle, numpy as np
import torch
import random
import os

In [3]:
from data.Nystromformer.LRA.code import lra_config
from data.Nystromformer.LRA.code.dataset import LRADataset
#from Nystromformer.LRA.code.run_tasks import training_config
from torch.utils.data import DataLoader, RandomSampler

#get training config
training_config = lra_config.config[task]["training"]

#Update training config
#training_config["learning_rate"] = 0.05
#training_config["weight_decay"] = 0.1
#training_config["eval_frequency"] = 1000

#Check Train Config
print('Training Config: ', training_config)

#get pre-defined model config
model_config = lra_config.config[task]['model']

#Check model Config
print('Model Config: ', model_config)

#Get the dataset
piaynTaskDataDir = f"data/{task}"
os.makedirs(f"output1/{task}", exist_ok = True)
piaynTaskModelDir = "./output"

#Get the dataset
train_dataset = LRADataset(piaynTaskDataDir + f"/{task}.train.pickle", False)
val_dataset = LRADataset(piaynTaskDataDir + f"/{task}.dev.pickle", False)
test_dataset = LRADataset(piaynTaskDataDir + f"/{task}.test.pickle", False)

#Create DataLoader iterators
ds_iter = {
    "train":enumerate(DataLoader(train_dataset, 
                                 #Sample batches randomly for number of specified steps
                                 sampler = RandomSampler(train_dataset, 
                                                         replacement=True, 
                                                         num_samples= training_config["num_train_steps"]*lra_config.config[task]['dataset']['train']), 
                                 batch_size = training_config["batch_size"], 
                                 drop_last = True)),
    "dev":enumerate(DataLoader(val_dataset, batch_size = training_config["batch_size"], drop_last = True)),
    "test":enumerate(DataLoader(test_dataset, batch_size = training_config["batch_size"], drop_last = True)),
}


Training Config:  {'batch_size': 32, 'learning_rate': 0.0001, 'warmup': 8000, 'lr_decay': 'linear', 'weight_decay': 0, 'eval_frequency': 500, 'num_train_steps': 20000, 'num_eval_steps': 781}
Model Config:  {'learn_pos_emb': True, 'tied_weights': False, 'embedding_dim': 64, 'transformer_dim': 64, 'transformer_hidden_dim': 128, 'head_dim': 32, 'num_head': 2, 'num_layers': 2, 'vocab_size': 512, 'max_seq_len': 4000, 'dropout_prob': 0.1, 'attention_dropout': 0.1, 'pooling_mode': 'MEAN', 'num_classes': 2}
Loaded data/text/text.train.pickle... size=25000
Loaded data/text/text.dev.pickle... size=25000
Loaded data/text/text.test.pickle... size=25000


In [4]:
#Check sizes of batches
batch = next((ds_iter['train']))
for k,v in batch[1].items():
  print(k,v.shape)

input_ids_0 torch.Size([32, 1024])
mask_0 torch.Size([32, 1024])
label torch.Size([32])


## Define model

Next, we define our model, and put it on the GPU.

In [5]:
from transformers import PerceiverForSequenceClassification
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [6]:
from transformers import PerceiverConfig
#get default perceiver config
configuration = PerceiverConfig()

#Update the Perceiver configurations with Preset model configs
#configuration.update(model_config)

#Print Updated Perceiver Configuration
print(configuration)

PerceiverConfig {
  "attention_probs_dropout_prob": 0.1,
  "audio_samples_per_frame": 1920,
  "cross_attention_shape_for_attention": "kv",
  "cross_attention_widening_factor": 1,
  "d_latents": 1280,
  "d_model": 768,
  "hidden_act": "gelu",
  "image_size": 56,
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 2048,
  "model_type": "perceiver",
  "num_blocks": 1,
  "num_cross_attention_heads": 8,
  "num_frames": 16,
  "num_latents": 256,
  "num_self_attends_per_block": 26,
  "num_self_attention_heads": 8,
  "output_shape": [
    1,
    16,
    224,
    224
  ],
  "qk_channels": null,
  "samples_per_patch": 16,
  "self_attention_widening_factor": 1,
  "train_size": [
    368,
    496
  ],
  "transformers_version": "4.16.2",
  "use_query_residual": true,
  "v_channels": null,
  "vocab_size": 262
}



In [7]:
def initialize_model(config):
  #Initialize Model
  model = PerceiverForSequenceClassification(config)
  
  #Get Model Parameter Counts
  pytorch_total_params = sum(p.numel() for p in model.parameters())
  pytorch_total_params_Trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
  print('Total Parameters: ', pytorch_total_params, '\nTrainable Parameters: ', pytorch_total_params_Trainable)  

  return model#.to(device)                                          

In [8]:
#Change the perceiver configurations to get total parameters within 10% of BERT
configuration.num_labels = 2
configuration.num_self_attends_per_block = 2
configuration.d_latents = 512
configuration.num_latents = 512
configuration.d_model = 512

#if max_length>2000:
#  configuration.max_position_embeddings = max_length + 24*np.power(2, int(max_length/1000)-1)

In [9]:
import torch.nn as nn
class IterativePerceiver(nn.Module):
    def __init__(self,config):
        super(IterativePerceiver, self).__init__()
        self.perceiver1 = initialize_model(config)
        self.perceiver2 = initialize_model(config)
        self.encoder = self.perceiver2.perceiver.encoder
        self.classifier = self.perceiver2.perceiver.decoder
        self.activation = {}
        self.perceiver1.perceiver.embeddings.register_forward_hook(self.get_activation('embedding'))
        self.perceiver1.perceiver.input_preprocessor.register_forward_hook(self.get_activation('inp_proc'))
    def get_activation(self, name):
        def hook(model, input, output):
            if type(output) is tuple:
                self.activation[name] = []
                for o in output:
                    if o is not None:
                        self.activation[name].append(o.detach())
            else:
                self.activation[name] = output.detach()
        return hook
    def forward(self,inputs, attention_mask, labels):
        perceiver1_final_state = self.perceiver1(inputs = inputs, attention_mask = attention_mask, output_hidden_states = True)[-1][0]
        encoder_final_state = self.encoder(perceiver1_final_state, inputs = self.activation['inp_proc'][0], attention_mask = None)[0]
        logits = self.classifier(self.classifier.decoder_query(encoder_final_state), encoder_final_state).logits
        return logits

In [10]:
model = IterativePerceiver(configuration)
batch = next(ds_iter['train'])[1]
inputs = batch["input_ids_0"]
attention_mask = batch["mask_0"]
labels = batch["label"]
logits = model(inputs = inputs, attention_mask = attention_mask, labels = labels)

Total Parameters:  7760386 
Trainable Parameters:  7760386
Total Parameters:  7760386 
Trainable Parameters:  7760386


## Train the model

Here we train the model using native PyTorch.

In [None]:
from transformers import AdamW
#from torch.optim import Adam
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score
from datasets import load_metric
import pandas as pd

best_score = 0 
prev_score = 0
maxPatience = 100
currentPatience = 0

#steps = int(training_config["num_train_steps"]/20000)
steps = training_config["num_train_steps"]

optimizer = AdamW(model.parameters(), 
                  lr = 0.05,
                  betas = (0.9, 0.999), 
                  eps = 1e-6, 
                  weight_decay = training_config["weight_decay"])

lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer = optimizer,
    max_lr = training_config["learning_rate"],
    pct_start = training_config["warmup"] / training_config["num_train_steps"],
    anneal_strategy = training_config["lr_decay"],
    total_steps = training_config["num_train_steps"]
)

#amp_scaler = torch.cuda.amp.GradScaler() if model_config["mixed_precision"] else None

#initialize training summary
trainingSummary = pd.DataFrame(columns=['step', 'mean_train_loss', 'mean_train_acc', 'val_acc'])


model.to(device)
loss_fn = nn.CrossEntropyLoss()
#initialize training accuracy metric and loss list
train_accuracy = load_metric("accuracy")
loss_list = list()

for step in tqdm(range(steps)):  # Perform gradient updates for multiple steps
    
    model.train()
    
    #print("Step:", step)
    #for batch in tqdm(train_dataloader):
    batch = next(ds_iter['train'])[1]

    # get the inputs; 
    inputs = batch["input_ids_0"].to(device)
    attention_mask = batch["mask_0"].to(device)
    labels = batch["label"].to(device)

    # zero the parameter gradients
    optimizer.zero_grad()

    # forward + backward + optimize
    logits = model(inputs=inputs, attention_mask=attention_mask, labels=labels)
    loss = loss_fn(logits, labels)
    loss.backward()
    optimizer.step()
    lr_scheduler.step()

    # evaluate
    predictions = logits.argmax(-1).cpu().detach().numpy()
    accuracy = accuracy_score(y_true=batch["label"].numpy(), y_pred=predictions)
    references = batch["label"].numpy()
    train_accuracy.add_batch(predictions=predictions, references=references)
    
    #Add to loss list
    loss_list.append(loss.item())

    #print(f"Loss: {loss.item()}, Accuracy: {accuracy}")

    #delete intermediate variables to free up GPU space
    del loss, inputs, attention_mask, labels, predictions, accuracy


    #Every 1000 steps validate and save model
    if (step+1)%training_config['eval_frequency']  == 0:
    #if (step+1)%2  == 0:
      
      model.eval()

      print('Validating at Step: ', step)

      val_accuracy = load_metric("accuracy")

      #reset dev iterator
      ds_iter['dev'] = enumerate(DataLoader(val_dataset, batch_size = 32, drop_last = True))

      with torch.no_grad():
        for i, batch in tqdm(ds_iter['dev']):
              
          # get the inputs; 
          inputs = batch["input_ids_0"].to(device)
          attention_mask = batch["mask_0"].to(device)
          labels = batch["label"].to(device)

          # forward pass
          logits = model(inputs=inputs, attention_mask=attention_mask, labels=labels) 
          predictions = logits.argmax(-1).cpu().detach().numpy()
          references = batch["label"].numpy()
          val_accuracy.add_batch(predictions=predictions, references=references)

          #delete intermediate variables to free up GPU space
          del logits, inputs, attention_mask, labels, predictions, references
      
      #Compute val accuracy
      final_val_score = val_accuracy.compute()['accuracy']

      #Compute training accuracy till now
      train_score = train_accuracy.compute()['accuracy']

      #Compute training loss till now
      train_loss = sum(loss_list)/len(loss_list)

      #Add to trainingSummary
      trainingSummary.loc[len(trainingSummary.index)] = [step+1, train_loss, train_score, final_val_score]

      #save training summary
      trainingSummary.to_csv(piaynTaskModelDir + '/trainingSummaryToken.csv')

      #print progress
      print('Step: ', step+1, "\n\tAverage Train Loss: ", train_loss, "\n\tAverage Train Accuracy: ", train_score, "\n\tValidation Accuracy: ", final_val_score)

      #Save if performance better than best model
      if final_val_score >= best_score:
        best_score = final_val_score
        torch.save(model.to('cpu').state_dict(), piaynTaskModelDir + '/trainedPerceiverClassifier.pkl')
        model.to(device)
      else:
        pass  

      #Stop training if patience limit reached
      if final_val_score <= prev_score:
        currentPatience += 1
        if currentPatience >= maxPatience:
          print('Patience Limit reached! Stopping early!')
          torch.save(model.to('cpu').state_dict(), piaynTaskModelDir + '/trainedPerceiverClassifierStep_' + str(step + 1) + '.pkl')
          break  
      else:
        currentPatience = 0
      
      #Update prev_score
      prev_score = final_val_score

  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)


  0%|          | 0/20000 [00:00<?, ?it/s]

Validating at Step:  499


0it [00:00, ?it/s]

Step:  500 
	Average Train Loss:  0.6850523625612259 
	Average Train Accuracy:  0.5485625 
	Validation Accuracy:  0.5943101792573624
Validating at Step:  999


0it [00:00, ?it/s]

Step:  1000 
	Average Train Loss:  0.6777168549299241 
	Average Train Accuracy:  0.589 
	Validation Accuracy:  0.5760643405889885
Validating at Step:  1499


0it [00:00, ?it/s]

Step:  1500 
	Average Train Loss:  0.6734429421623548 
	Average Train Accuracy:  0.6011875 
	Validation Accuracy:  0.6160371318822023
Validating at Step:  1999


0it [00:00, ?it/s]

Step:  2000 
	Average Train Loss:  0.6686147816032171 
	Average Train Accuracy:  0.6199375 
	Validation Accuracy:  0.6235195262483995
Validating at Step:  2499


0it [00:00, ?it/s]

Step:  2500 
	Average Train Loss:  0.6633905837416649 
	Average Train Accuracy:  0.63275 
	Validation Accuracy:  0.629121318822023
Validating at Step:  2999


0it [00:00, ?it/s]

Step:  3000 
	Average Train Loss:  0.6582119697729747 
	Average Train Accuracy:  0.641125 
	Validation Accuracy:  0.6414452624839949
Validating at Step:  3499


0it [00:00, ?it/s]

Step:  3500 
	Average Train Loss:  0.6544266347203936 
	Average Train Accuracy:  0.6408125 
	Validation Accuracy:  0.6342829705505761
Validating at Step:  3999


0it [00:00, ?it/s]

Step:  4000 
	Average Train Loss:  0.6509401421621441 
	Average Train Accuracy:  0.6548125 
	Validation Accuracy:  0.6248799615877081
Validating at Step:  4499


0it [00:00, ?it/s]

Step:  4500 
	Average Train Loss:  0.6469876473678483 
	Average Train Accuracy:  0.6675 
	Validation Accuracy:  0.6371638924455826
Validating at Step:  4999


0it [00:00, ?it/s]

Step:  5000 
	Average Train Loss:  0.6433139230906963 
	Average Train Accuracy:  0.6761875 
	Validation Accuracy:  0.5996718950064021
Validating at Step:  5499


0it [00:00, ?it/s]

Step:  5500 
	Average Train Loss:  0.640317226220261 
	Average Train Accuracy:  0.6726875 
	Validation Accuracy:  0.5951504481434059
Validating at Step:  5999


0it [00:00, ?it/s]

Step:  6000 
	Average Train Loss:  0.6369470079938571 
	Average Train Accuracy:  0.6830625 
	Validation Accuracy:  0.621919014084507
Validating at Step:  6499


0it [00:00, ?it/s]

Step:  6500 
	Average Train Loss:  0.6333176508316627 
	Average Train Accuracy:  0.6925625 
	Validation Accuracy:  0.6257202304737516
Validating at Step:  6999


0it [00:00, ?it/s]

Step:  7000 
	Average Train Loss:  0.6298373585045338 
	Average Train Accuracy:  0.6981875 
	Validation Accuracy:  0.6077944942381562


In [None]:
!nvidia-smi

## Evaluate the model

Finally, we evaluate the model on the test set. We use the Datasets library to compute the accuracy.

In [10]:
from tqdm.notebook import tqdm
from datasets import load_metric

accuracy = load_metric("accuracy")

#load best performing model checkpoint
model = IterativePerceiver(configuration)
model.load_state_dict(torch.load(piaynTaskModelDir + '/trainedPerceiverClassifier.pkl'))
model.to(device)
model.eval()

with torch.no_grad():
  for i, batch in tqdm(ds_iter['test']):
        
        # get the inputs; 
        inputs = batch["input_ids_0"].to(device)
        attention_mask = batch["mask_0"].to(device)
        labels = batch["label"].to(device)

        # forward pass
        #outputs1 = model1(inputs=inputs, attention_mask=attention_mask, labels=labels, output_hidden_states = True)
        #final_state = outputs1[-1][-1].mean(axis=0)
        #model2.perceiver.set_input_embeddings(torch.nn.Parameter(final_state))
        logits = model(inputs=inputs, attention_mask=attention_mask, labels=labels)  
        predictions = logits.argmax(-1).cpu().detach().numpy()
        references = batch["label"].numpy()
        accuracy.add_batch(predictions=predictions, references=references)

        #delete intermediate variables to free up GPU space
        del logits, inputs, attention_mask, labels, predictions, references

final_score = accuracy.compute()
print("Accuracy on test set:", final_score['accuracy'])

  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)


Total Parameters:  7760386 
Trainable Parameters:  7760386
Total Parameters:  7760386 
Trainable Parameters:  7760386


0it [00:00, ?it/s]

Accuracy on test set: 0.6414452624839949
