In [1]:
import numpy as np

import torch
from torch import nn
from transformers import BertPreTrainedModel, BertConfig, BertModel, BertTokenizer, AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import DataLoader, RandomSampler, Dataset, SequentialSampler

from load_data_ensemble import initialize_data
from reading_datasets import read_task5
from labels_to_ids import task5_labels_to_ids

import pickle
import json
import csv

In [7]:
class BertEnsemble(BertPreTrainedModel):
  def __init__(self, config, *args, **kwargs):
      super().__init__(config)

      # model 1
      self.bert_model_1 = AutoModelForSequenceClassification.from_pretrained('saved_models/Archive/bert-base-multilingual-cased0')
      # model 2
      self.bert_model_2 = AutoModelForSequenceClassification.from_pretrained('saved_models/Archive/bert-base-multilingual-cased1')
      # combine the 2 models into 1
      self.cls = nn.Linear(self.config.hidden_size, 3)
      self.init_weights()

  def forward(
          self,
          input_ids=None,
          attention_mask=None,
          token_type_ids=None,
          position_ids=None,
          head_mask=None,
          inputs_embeds=None,
          labels=None,
  ):
    outputs = []
    
    input_ids_1 = input_ids[0].to(device, dtype = torch.long)
    attention_mask_1 = attention_mask[0].to(device, dtype = torch.long)
    labels_1 = labels[0].to(device, dtype = torch.long)
    outputs.append(self.bert_model_1(input_ids_1,
                                     attention_mask=attention_mask_1, labels =labels_1))

    input_ids_2 = input_ids[1].to(device, dtype = torch.long)
    attention_mask_2 = attention_mask[1].to(device, dtype = torch.long)
    labels_2 = labels[1].to(device, dtype = torch.long)
    outputs.append(self.bert_model_2(input_ids_2,
                                     attention_mask=attention_mask_2, labels =labels_2))
    print(outputs)

    # just get the [CLS] embeddings
    last_hidden_states = torch.cat([output[1] for output in outputs], dim=1)
    print("Printing last hidden states")
    print(last_hidden_states)
    logits = self.cls(last_hidden_states)
    return logits

In [8]:
device = torch.device("cuda")

config = BertConfig()
model = BertEnsemble(config)
model.to(device)
learning_rate = 1e-05

no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [{
  "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
  }]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

# Training code

In [9]:
# standard pytorch way of doing things
# 1. create a custom Dataset 
# 2. pass the dataset to a dataloader
# 3. iterate the dataloader and pass the inputs to the model

max_len = 256
batch_size = 6
grad_step = 1
initialization_input = (max_len, batch_size)

model1_tokenizer = AutoTokenizer.from_pretrained('saved_models/Archive/bert-base-multilingual-cased0')
model2_tokenizer = AutoTokenizer.from_pretrained('saved_models/Archive/bert-base-multilingual-cased1')

#Reading datasets and initializing data loaders
dataset_location = '../2022.07.07_task5/'

#Gives us tweet_id, sentence, and label for each dataset.
train_data = read_task5(dataset_location , split = 'train')
#test_data = read_task5(dataset_location , split = 'dev')#load test set
labels_to_ids = task5_labels_to_ids
#input_data = (train_data, dev_data, labels_to_ids)

dataloader_m1 = initialize_data(model1_tokenizer, initialization_input, train_data, labels_to_ids, shuffle = True)
dataloader_m2 = initialize_data(model2_tokenizer, initialization_input, train_data, labels_to_ids, shuffle = True)

epochs = 5
for epoch in range(epochs):
  # iterate the QA and the AQ inputs simultaneously
  for step, combined_batch in enumerate(zip(dataloader_m1, dataloader_m2)):
    batch_1, batch_2 = combined_batch
    # training so, dropout needed to avoid overfitting
    model.train()
    # move input to GPU
    inputs = {
        "input_ids": [batch_1['input_ids'], batch_2['input_ids']],
        "attention_mask": [batch_1['attention_mask'], batch_2['attention_mask']],
        "labels": [batch_1['labels'], batch_2['labels']]}
    print(type(inputs))
    
    output = model(**inputs)
#     # model outputs are always tuple in transformers (see doc)
#     print(loss)
#     exit()
    
#     # backpass
#     loss.backward()
#     print(f"epoch:{epoch}, loss:{loss}")
    
#     # re-calculate the weights
#     optimizer.step()
#     # again set the grads to 0 for next epoch
#     model.zero_grad()
  
  print("\n")

<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.1354, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0180, -0.1941,  0.0535],
        [ 0.3210, -0.3704, -0.1389],
        [-0.0154, -0.3609, -0.1357],
        [-0.1497, -0.0103, -0.1480],
        [ 0.0241, -0.2400,  0.0163],
        [-0.1895, -0.2245, -0.0981]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(1.1745, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.4437,  0.0876,  0.1342],
        [-0.2217, -0.1233,  0.1770],
        [-0.3411,  0.0071,  0.1863],
        [-0.5134,  0.2617,  0.0532],
        [-0.2329,  0.0977, -0.0128],
        [-0.2595,  0.1988, -0.1119]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[-0.0180, -0.1941,  0.0535, -0.4437,  0.0876,  0.1342],
        [ 0.3210, -0.3704, -0.1389, -0.2217, -0.1233,  0.1770],
        

RuntimeError: mat1 and mat2 shapes cannot be multiplied (6x6 and 768x3)

# Test Code

In [10]:
# standard pytorch way of doing things
# 1. create a custom Dataset 
# 2. pass the dataset to a dataloader
# 3. iterate the dataloader and pass the inputs to the model

input_ids_m1, attention_masks_m1, labels_m1 = prepare_data(dataset)
train_dataset_m1 = dataset(input_ids_m1, attention_masks_m1, labels_m1)

input_ids_m2, attention_masks_m2, labels_m2 = prepare_data(dataset)
train_dataset_m2 = dataset(input_ids_m2, attention_masks_m2, labels_m2)

dataloader_m1 =  DataLoader(dataset=train_dataset_m1, 
                            batch_size=10, 
                            sampler=SequentialSampler(train_dataset_m1))
dataloader_m2 =  DataLoader(dataset=train_dataset_m2, 
                            batch_size=10, 
                            sampler=SequentialSampler(train_dataset_m2))

complete_outputs, complete_label_ids = [], []

# iterate the QA and the AQ inputs simultaneously
for step, combined_batch in enumerate(zip(dataloader_m1, dataloader_m2)):
  # only forward pass so no dropout
  model.eval()
  batch_1, batch_2 = combined_batch

  # move input to GPU
  batch_1 = tuple(t.to(device) for t in batch_1)
  batch_2 = tuple(t.to(device) for t in batch_2)

  # no back pass so no need to track variables for differentiation
  with torch.no_grad():
    inputs = {
        "input_ids": [batch_1[0], batch_2[0]],
        "attention_mask": [batch_1[1], batch_2[1]],
        "labels": [batch_1[2], batch_2[2]]
    }
    outputs = model(**inputs)
    tmp_eval_loss, logits = outputs[:2]
    logits = logits.detach().cpu().numpy()
    outputs = np.argmax(logits, axis=1)
    label_ids = inputs["labels"].detach().cpu().numpy()
  complete_outputs.extend(outputs)
  complete_label_ids.extend(label_ids)

print(complete_outputs, complete_label_ids)

NameError: name 'prepare_data' is not defined