In [1]:
import pandas as pd
import os
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM #, BertTokenizerFast
import torch
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import StepLR
import torch.optim as optim
from tqdm import tqdm

In [6]:
LEARNING_RATE = 5e-3
EPOCHS = 10
BATCH_SIZE = 2
label_all_tokens = False
# read data, this setting is for training and testing on original data, change the data file to
# joined_train and joined_val to test on the new dataset
path_to_data = os.path.join('Data', 'rebel_v2', 'data')
df_train = pd.read_csv(path_to_data+'/original_train.csv')
df_train['tag'] = df_train['tag_2'].str.replace('O', '0')

df_val = pd.read_csv(path_to_data+'/original_val.csv')
df_val['tag'] = df_val['tag_2'].str.replace('O', '0')

labels = [word_tokenize(i) for i in df_train['tag'].values.tolist()] #To split the tokens

# Check how many labels are there in the dataset
unique_labels = set()

for lb in labels:
    [unique_labels.add(i) for i in lb if i not in unique_labels]

labels_to_ids={'0': 0, 'trigger1': 1, 'effect': 2}

# Map each label into its id representation and vice versa
labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}
ids_to_labels = {v: k for v, k in enumerate(sorted(unique_labels))}

In [7]:
model_checkpoint = "Babelscape/rebel-large"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
#bert_tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased') #Is this supposed to do this!

In [8]:
def align_label(texts, labels): #this is used to allign the labels with the generated tokens from the bert tokenizer
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]])
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

In [9]:
class DataSequence(torch.utils.data.Dataset):

    def __init__(self, df):
        lb = [word_tokenize(i) for i in df['tag'].values.tolist()]
        txt = df['sentence'].values.tolist()
        self.texts = [tokenizer(str(i),
                                padding='max_length', max_length=512, truncation=True, return_tensors="pt") for i in
                      txt]
        self.labels = [align_label(i, j) for i, j in zip(txt, lb)]

    def __len__(self):
        return len(self.labels)

    def get_batch_data(self, idx):
        return self.texts[idx]

    def get_batch_labels(self, idx):
        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):
        batch_data = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)

        return batch_data, batch_labels

In [10]:
def train_loop(model, df_train, df_val):
    train_dataset = DataSequence(df_train)
    val_dataset = DataSequence(df_val)

    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # optimizer = SGD(model.parameters(), lr=LEARNING_RATE)
    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    # create a scheduler that reduces the learning rate by a factor of 0.1 every 10 epochs
    scheduler = StepLR(optimizer, step_size=3, gamma=0.1)

    if use_cuda:
        model = model.cuda()
    # Define the weights for each class
    weights = torch.tensor([0.2, 1.0, 1.0])
    weights = weights.to(device)
    # Define the loss function with the weights
    criterion = torch.nn.CrossEntropyLoss(weight=weights)
    best_acc = 0
    best_loss = 1000

    for epoch_num in range(EPOCHS):

        total_acc_train = 0
        total_loss_train = 0

        model.train()

        for train_data, train_label in tqdm(train_dataloader):
            print(f"Length data: {len(train_data['input_ids'])}")
            print(f"Length labels: {len(train_label)}")

            train_label = train_label.to(device)
            mask = train_data['attention_mask'].squeeze(1).to(device)
            input_id = train_data['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            outputs = model(input_id, mask, train_label)

            logits = outputs.view(-1, model.config.vocab_size)
            print(len(logits))
            #train_label.view(-1)
            for i in range(logits.shape[0]):
                logits_clean = logits[i][train_label[i] != -100]
                print('cleaned logits')
                print(logits_clean)
                label_clean = train_label[i][train_label[i] != -100]
                print('clea_label')
                print(label_clean)
                # Compute the loss
                loss = criterion(logits_clean, label_clean)
                print('loss for one example')
                print(loss)



In [11]:
train_loop(model, df_train, df_val)

  0%|          | 0/244 [00:00<?, ?it/s]

Length data: 2
Length labels: 2


  0%|          | 0/244 [00:07<?, ?it/s]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.