In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import DataLoader, Dataset
import collections
from transformers import RobertaTokenizer, RobertaForTokenClassification
from torch.utils.data.sampler import SubsetRandomSampler
from datasets import load_dataset
from data_loader import EventSentenceLoader
from tqdm import tqdm
import re
import numpy as np
import argparse

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
use_freeze_model = False
validation_run = True
learning_rate = 0.00001

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


def tokenize(batched_text):
    return tokenizer(batched_text['sentence'], padding=True, truncation=True, max_length=128)


In [4]:
class TextClassificationDataset(Dataset):
    def __init__(self, data):
            self.data_points = data

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        data = self.data_points[idx]
        return {'tokens': data['tokens'].flatten(), 'attention_mask': data['attention'].flatten(), 'labels': torch.tensor(data['labels'])}

In [5]:
def load_roberta_data():
    filepath = "events.txt"
    tokenizer_name = "bert-base-cased"
    loader = EventSentenceLoader(filepath, tokenizer_name)
    train_data = TextClassificationDataset(loader.load_data())

    train_data = list(map(lambda d: (torch.tensor(d['tokens']), d['labels']), train_data))

    return train_data

In [6]:
def test(model, data):
    confusion_matrix_size = 4
    confusion_matrix = []

    total = 0
    correct = 0

    for i in range(confusion_matrix_size):
        row = []
        for j in range(confusion_matrix_size):
            row.append(0)
        confusion_matrix.append(row)

    for i, batch in tqdm(enumerate(data, 0), total=len(data), leave=False):
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)
            
        outputs = model(inputs).logits.to(device)

        _, predicted = torch.max(outputs, 1)

        for j in range(len(predicted)):
            confusion_matrix[labels[j].item()][predicted[j].item()] += 1
        
        total += len(predicted)
        correct += predicted.eq(labels).sum().item()
    print("Test Accuracy: {:.3f}".format(correct/total))
    print(confusion_matrix)
    return confusion_matrix

In [29]:
def train(model, data, epochs):
    n = len(data)

    if validation_run:
        # This mode is to test hyperparameters
        # To make it less time consuming epochs for this mode is set to 1
        epochs = 1
        
    # Define metrics to monitor change in performance during execution
    accuracy_history_epoch = []
    accuracy_history_step = []


    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
    
    for epoch in range(1, epochs + 1):
        correct = 0
        total = 0
        for i, batch in tqdm(enumerate(data, 0), total=len(data), leave=False):
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)
            
            outputs = model(inputs).logits.to(device)
            predicted = torch.argmax(outputs, 2)

            optimizer.zero_grad()
            loss = loss_func(outputs.flatten(start_dim=0, end_dim=1), F.one_hot(labels).float().flatten(start_dim=0, end_dim=1))
            loss.backward()
            optimizer.step()
            
            total += len(predicted)
            print(predicted.shape)
            print(labels.shape)
            correct += predicted.eq(labels).sum().item()
            accuracy_history_step.append((i+1, correct/total))

        accuracy_history_epoch.append(correct / total)
        print("Epoch: {:>3d} Accuracy: {:.3f}".format(epoch, accuracy_history_epoch[-1]))

    return accuracy_history_epoch, accuracy_history_step

In [8]:
print(train_data[0][0].shape)
print(train_data[0][1].shape)

NameError: name 'train_data' is not defined

In [30]:
filepath = "events.txt"
tokenizer_name = "bert-base-cased"
loader = EventSentenceLoader(filepath, tokenizer_name)
train_data = TextClassificationDataset(loader.load_data())

train_data = load_roberta_data()

  return {'tokens': data['tokens'].flatten(), 'attention_mask': data['attention'].flatten(), 'labels': torch.tensor(data['labels'])}
  train_data = list(map(lambda d: (torch.tensor(d['tokens']), d['labels']), train_data))


In [31]:
model = RobertaForTokenClassification.from_pretrained("roberta-base").to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able

In [32]:
indices = list(range(len(train_data)))

train_dataloader = DataLoader(train_data, batch_size=64, num_workers=0, shuffle=True)

train(model, train_dataloader, 3)

  2%|▏         | 1/53 [00:40<34:42, 40.04s/it]

torch.Size([64, 128])
torch.Size([64, 128])


  4%|▍         | 2/53 [01:23<35:56, 42.28s/it]

torch.Size([64, 128])
torch.Size([64, 128])


  6%|▌         | 3/53 [02:07<35:52, 43.05s/it]

torch.Size([64, 128])
torch.Size([64, 128])


  8%|▊         | 4/53 [02:52<35:43, 43.75s/it]

torch.Size([64, 128])
torch.Size([64, 128])


  9%|▉         | 5/53 [03:38<35:27, 44.32s/it]

torch.Size([64, 128])
torch.Size([64, 128])


                                              

torch.Size([64, 128])
torch.Size([64, 128])




RuntimeError: stack expects each tensor to be equal size, but got [128] at entry 0 and [168] at entry 36