In [2]:
import numpy as np
import argparse
import pandas as pd

import torch
import torch.nn as nn

from torch.optim import lr_scheduler, AdamW

from dataset import create_dataset_object, load_agnews_dataset, load_imdb_dataset, load_topic_dataset, load_yelp_dataset
from dataloader import get_dataloaders

from prompt import PROMPTEmbedding
from model import APT, Prompt_Head
from utils import get_accuracy, count_parameters, freeze_params

from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaModel
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [3]:
dataset = 'topic'   #imdb

model_type = 'roberta-base'   #roberta

number_of_tokens = 20

mode = 'finetune'


batch_size = 16

learning_rate = 2e-5

epochs = 10

In [5]:
tokenizer = RobertaTokenizer.from_pretrained(model_type)

train_text, train_labels, test_text, test_labels, valid_text, valid_labels = load_topic_dataset(dataset)

train_data_object = create_dataset_object(train_text, train_labels, number_of_tokens, tokenizer, dataset, mode)

test_data_object  = create_dataset_object(test_text, test_labels, number_of_tokens, tokenizer, dataset, mode)

val_data_object = create_dataset_object(valid_text, valid_labels, number_of_tokens, tokenizer, dataset, mode)


dataloaders = get_dataloaders(train_data_object, test_data_object, val_data_object, batch_size)

num_labels = 10

In [6]:
model = RobertaForSequenceClassification.from_pretrained(model_type, 
                                                    num_labels=num_labels,
                                                    output_attentions=False,
                                                    output_hidden_states=False)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

In [7]:
device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')

#Loss function
criterion = nn.CrossEntropyLoss()


optimizer = AdamW(model.parameters(), lr = learning_rate, eps=1e-8)

# Defining LR Scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=len(dataloaders['Train'])*epochs/15, 
    num_training_steps=len(dataloaders['Train'])*epochs
)

In [8]:
def train_model(config_train):

    dataset = config_train['dataset']
    dataloaders = config_train['dataloaders']
    model = config_train['model']
    device = config_train['device']
    criterion = config_train['criterion']
    optimizer = config_train['optimizer']
    mode = config_train['mode']
    scheduler = config_train['scheduler']
    epochs = config_train['epochs']
    save_checkpoint = config_train['save_checkpoint']

    checkpoint = config_train['checkpoint']


    model = model.to(device)

    best_valid_f1 = 0.0

    if save_checkpoint:
        if checkpoint != None:
            saved_model_path = checkpoint
        
        else:
            saved_model_path = dataset + "_" + mode + '.pt'

    for epoch in range(0, epochs):
        print('Epoch {}/{}'.format(epoch+1, epochs))

        for phase in ['Train', 'Val']:
            
            batch_loss = 0.0000   #live loss
            batch_acc = 0.0000   #live accuracy

            y_true = []
            y_pred = []

            if phase == 'Train':
                model.train()
            else:
                model.eval()
            
            with tqdm(dataloaders[phase], unit="batch", desc=phase) as tepoch:

                for idx, (data, labels) in enumerate(tepoch):

                    input_ids =  data['input_ids'].squeeze(1).to(device)
                    attention_mask = data['attention_mask'].squeeze(1).to(device)
                    
                    
                    labels = labels.to(device)


                    if mode =='apt':
                        output = model(input_ids = input_ids, attention_mask = attention_mask)

                    elif mode == 'finetune':
                         output = model(input_ids).logits
                    else:
                        output = model(input_ids = input_ids, attention_mask = attention_mask).logits

                    loss = criterion(output, labels)

                    if phase == 'Train':

                        #zero gradients
                        optimizer.zero_grad() 

                        # Backward pass  (calculates the gradients)
                        loss.backward()   

                        optimizer.step()             # Updates the weights
                        
                        scheduler.step()
                        
                    batch_loss += loss.item()
                        
                    _, preds = output.data.max(1)
                    y_pred.extend(preds.tolist())
                    y_true.extend(labels.tolist())
                    
                    batch_acc = get_accuracy(y_pred, y_true)
                    
                    tepoch.set_postfix(loss = batch_loss/(idx+1), accuracy = batch_acc )

                pre = precision_score(y_true, y_pred, average='weighted')
                recall = recall_score(y_true, y_pred, average='weighted')
                f1 = f1_score(y_true, y_pred, average='weighted')
                

                print("F1: {:.4f}, Precision: {:.4f}, Recall : {:.4f}.".format(f1, pre, recall))

                if save_checkpoint:
                
                    if phase == 'Val':
                        if f1 > best_valid_f1:
                            best_valid_f1 = f1
                            torch.save(model.state_dict(), saved_model_path)
                            print('Model Saved!')
                
                print()


In [9]:
config_train = {
    
    'dataset': dataset,
    'dataloaders':dataloaders, 
    'model': model, 
    'device': device, 
    'criterion':criterion, 
    'optimizer':optimizer, 
    'mode':mode, 
    'scheduler': scheduler,
    'epochs': epochs,
    'save_checkpoint': True,
    'checkpoint': None
}

train_model(config_train)

Epoch 1/10


Train: 100%|██████████| 4062/4062 [1:20:37<00:00,  1.19s/batch, accuracy=0.592, loss=1.24]


F1: 0.5884, Precision: 0.5910, Recall : 0.5917.



Val: 100%|██████████| 750/750 [05:07<00:00,  2.44batch/s, accuracy=0.727, loss=0.863]


F1: 0.7234, Precision: 0.7245, Recall : 0.7273.
Model Saved!

Epoch 2/10


Train:   0%|          | 0/4062 [00:07<?, ?batch/s]


RuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 3; 47.46 GiB total capacity; 21.78 GiB already allocated; 84.31 MiB free; 21.96 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF