## Prompt + head layer tuning

### Imports

In [2]:
import os


import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn

from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaModel
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

import torch.nn.functional as F

import warnings
warnings.filterwarnings("ignore")

### Prompt Embeddings

In [3]:
class PROMPTEmbedding(nn.Module):
    def __init__(self, 
                wte: nn.Embedding,
                n_tokens: int = 10, 
                random_range: float = 0.5,
                initialize_from_vocab: bool = True):
        super(PROMPTEmbedding, self).__init__()
        self.wte = wte
        self.n_tokens = n_tokens
        self.learned_embedding = nn.parameter.Parameter(self.initialize_embedding(wte,
                                                                               n_tokens, 
                                                                               random_range, 
                                                                               initialize_from_vocab))
            
    def initialize_embedding(self, 
                             wte: nn.Embedding,
                             n_tokens: int = 10, 
                             random_range: float = 0.5, 
                             initialize_from_vocab: bool = True):
        if initialize_from_vocab:
            return self.wte.weight[:n_tokens].clone().detach()
        return torch.FloatTensor(wte.weight.size(1), n_tokens).uniform_(-random_range, random_range)
            
    def forward(self, tokens):
        input_embedding = self.wte(tokens[:, self.n_tokens:])
        learned_embedding = self.learned_embedding.repeat(input_embedding.size(0), 1, 1)
        return torch.cat([learned_embedding, input_embedding], 1)


### Check GPU

In [4]:
# Check GPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

torch.backends.cudnn.benchmark = True
torch.backends.cudnn.enabled   = True

cuda:0


### Load Data

In [5]:
train_data = pd.read_csv('Train.csv')
test_data = pd.read_csv('Test.csv')
valid_data = pd.read_csv('Valid.csv')

train_text = train_data['text']
train_labels = train_data['label']

valid_text = valid_data['text']
valid_labels = valid_data['label']

test_text = test_data['text']
test_labels = test_data['label']

#print total length of dataset
len(train_text), len(train_labels), len(test_labels), len(test_text), len(valid_text), len(valid_labels)

(40000, 40000, 5000, 5000, 5000, 5000)

### Hyper parameters

In [8]:
## Specify the Hyper parameters 

BATCH_SIZE = 16
NUM_LABELS = 2
EPOCHS = 15
LEARNING_RATE = 8e-5
N_Tokens = 20


In [9]:
## load the Roberta tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

### Dataset class

In [10]:
class Dataset(torch.utils.data.Dataset):
    """
    This is our custom dataset class which will load the text and their corresponding labels into Pytorch tensors
    """
    def __init__(self, labels, text):
        self.labels = labels
        self.text = text
        
        self.n_tokens = N_Tokens

    def __getitem__(self, idx):
        sample = {}
        text = self.text[idx]

        #Roberta Tokenizer to tokenize the text
        inputs = tokenizer.encode_plus(text, 
                                        add_special_tokens=True,   # Adds [CLS] and [SEP] token to every input text
                                        max_length=492, 
                                        truncation=True, 
                                        return_tensors='pt',
                                        padding="max_length")
        
        inputs['input_ids'] = torch.cat([torch.full((1,self.n_tokens), 5256), inputs['input_ids']], 1)
        inputs['attention_mask'] = torch.cat([torch.full((1, self.n_tokens), 1), inputs['attention_mask']], 1)

        
        return inputs, torch.tensor(self.labels[idx])
    
    def __len__(self):
        return len(self.labels)
        
        
# Create train, test and val datasets
train_data_object = Dataset(
    labels = train_labels,
    text = train_text,
)

test_data_object = Dataset(
    labels = test_labels,
    text = test_text,
)

val_data_object = Dataset(
    labels = valid_labels,
    text = valid_text,
)


## We call the dataloader class
train_loader = torch.utils.data.DataLoader(
    train_data_object,
    batch_size=BATCH_SIZE,
    pin_memory=True,
    num_workers= 64,
    shuffle=True,
    drop_last=True
 )

test_loader = torch.utils.data.DataLoader(
    test_data_object,
    batch_size=BATCH_SIZE,
    pin_memory=True,
    num_workers=64,
    shuffle=True,
    drop_last=True
 )

val_loader = torch.utils.data.DataLoader(
    val_data_object,
    batch_size=BATCH_SIZE,
    pin_memory=True,
    num_workers=64,
    shuffle=True,
    drop_last=True
 )

dataloaders = {'Train': train_loader, 'Test': test_loader, 'Val': val_loader}

### Model

In [8]:
class Model(torch.nn.Module):
    
    def __init__(self, bert, num_classes):
        """
        In the constructor we instantiate two nn.Linear modules and assign them as
        member variables.
        """
        super(Model, self).__init__()
        
        self.bert = bert
        
        
        
        self.fcbert1 = nn.Linear(768, 128)
        self.fcbert2 = nn.Linear(128, 16)
        self.fcbert3 = nn.Linear(16, num_classes)

        
        self.dropout = nn.Dropout(0.3)
        


    def forward(self, input_ids, attention_mask):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        # 1, 768
        output_bert = self.bert(input_ids = input_ids, attention_mask = attention_mask).pooler_output
        
        
        output = self.dropout(F.leaky_relu(self.fcbert1(output_bert), .1))    #1, 128
         
        output = self.dropout(F.leaky_relu(self.fcbert2(output), 0.1))     #1, 16
        
        output = self.fcbert3(output)    #1, 3
         
        return output

In [11]:
model = RobertaForSequenceClassification.from_pretrained('roberta-base', 
                                                         num_labels=2,
                                                         output_attentions=False,
                                                         output_hidden_states=False)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

In [None]:
model.roberta

In [14]:

for param in model.roberta.parameters():
    param.requires_grad = False


### Add prompt embedding

In [15]:
prompt_emb = PROMPTEmbedding(model.roberta.get_input_embeddings(), 
                      n_tokens= N_Tokens, 
                      initialize_from_vocab=True)

prompt_emb

PROMPTEmbedding(
  (wte): Embedding(50265, 768, padding_idx=1)
)

In [None]:
model.roberta.set_input_embeddings(prompt_emb)

model.to(device)

### few checks

In [12]:
print(model.bert.embeddings.word_embeddings.learned_embedding.shape)
print()

for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)

torch.Size([20, 768])

bert.embeddings.word_embeddings.learned_embedding tensor([[ 0.1476, -0.0365,  0.0753,  ..., -0.0023,  0.0172, -0.0016],
        [ 0.0156,  0.0076, -0.0118,  ..., -0.0022,  0.0081, -0.0156],
        [-0.0347, -0.0873, -0.0180,  ...,  0.1174, -0.0098, -0.0355],
        ...,
        [-0.1332, -0.0391, -0.0661,  ..., -0.0450, -0.0546,  0.0156],
        [ 0.0358,  0.0647, -0.1526,  ..., -0.1164, -0.0242, -0.0792],
        [-0.0911, -0.1117, -0.0304,  ..., -0.0569, -0.1113, -0.1200]],
       device='cuda:0')
fcbert1.weight tensor([[ 0.0113, -0.0122, -0.0056,  ...,  0.0317,  0.0335,  0.0181],
        [ 0.0041,  0.0299,  0.0168,  ...,  0.0254,  0.0282, -0.0151],
        [-0.0133, -0.0197,  0.0121,  ..., -0.0112, -0.0184,  0.0258],
        ...,
        [ 0.0023,  0.0245,  0.0175,  ...,  0.0328,  0.0109, -0.0280],
        [-0.0139, -0.0232, -0.0149,  ...,  0.0152,  0.0285, -0.0148],
        [-0.0111, -0.0249, -0.0346,  ..., -0.0162,  0.0322,  0.0188]],
       device='cuda:

In [13]:
# print input check
tokenizer.decode(train_data_object[700][0]['input_ids'].detach().cpu().numpy().tolist()[0])

' prompted prompted prompted prompted prompted prompted prompted prompted prompted prompted prompted prompted prompted prompted prompted prompted prompted prompted prompted prompted<s>Usually musicals in the 1940\'s were of a set formula - and if you studied films you know what I\'m talking about - a certain running lenghth, very "showy" performances that were great on the surface but never got into the real personalities of the characters etc.<br /><br />THIS ONE IS DIFFERENT - and light years better and well worth it\'s nomination for best picture of the year - 1945 (although had no chance of beating the eventual winner - Lost Weekend).<br /><br />Gene Kelly was probably in the best form of his career - yes I know about "American in Paris" and "Singing in the Rain". This one is different. He really gets into his character of a "sea wolf" thinking (at first) that "picking up any girl while on leave" is nothing more than a lark. And if you had to make up a "story" to get her - so be it

In [14]:

#optimizer
optimizer = AdamW(model.parameters(), lr = LEARNING_RATE, eps=1e-8)

#Loss function
criterion = nn.CrossEntropyLoss()

# Defining LR Scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0, 
    num_training_steps=len(train_loader)*EPOCHS
)

In [15]:
#Print trainable parameters

from prettytable import PrettyTable

def count_parameters(model):
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad: continue
        param = parameter.numel()
        table.add_row([name, param])
        total_params+=param
    print(table)
    print(f"Total Trainable Params: {total_params}")
    
count_parameters(model)

+---------------------------------------------------+------------+
|                      Modules                      | Parameters |
+---------------------------------------------------+------------+
| bert.embeddings.word_embeddings.learned_embedding |   15360    |
|                   fcbert1.weight                  |   98304    |
|                    fcbert1.bias                   |    128     |
|                   fcbert2.weight                  |    2048    |
|                    fcbert2.bias                   |     16     |
|                   fcbert3.weight                  |     32     |
|                    fcbert3.bias                   |     2      |
+---------------------------------------------------+------------+
Total Trainable Params: 115890


In [16]:
#to calculate accuracy

def get_accuracy(preds, labels):
  total_acc = 0.0
  
  for i in range(len(labels)):
    if labels[i] == preds[i]:
      total_acc+=1.0
  
  return total_acc / len(labels)

### Train

In [None]:
best_valid_f1 = 0.0
PATH = 'prompt_head_ft_imdb.pt'

for epoch in range(0, EPOCHS):
  

    print('-'*10)
    print('Epoch {}/{}'.format(epoch+1, EPOCHS))

    for phase in ['Train', 'Val']:

        batch_loss = 0.0000   #live loss
        batch_acc = 0.0000   #live accuracy

        y_true = []
        y_pred = []

        if phase == 'Train':
            model.train()
        else:
            model.eval()
        
        with tqdm(dataloaders[phase], unit="batch", desc=phase) as tepoch:

          for idx, (data, labels) in enumerate(tepoch):
            input_ids =  data['input_ids'].squeeze(1).to(device)
            attention_mask = data['attention_mask'].squeeze(1).to(device)
            
            
            labels = labels.to(device)

            output = model(input_ids = input_ids, attention_mask = attention_mask)

            loss = criterion(output, labels)

            if phase == 'Train':

                #zero gradients
                optimizer.zero_grad() 

                # Backward pass  (calculates the gradients)
                loss.backward()   

                optimizer.step()             # Updates the weights
                
                scheduler.step()
                
                
            batch_loss += loss.item()
                
            _, preds = output.data.max(1)
            y_pred.extend(preds.tolist())
            y_true.extend(labels.tolist())
            
            batch_acc = get_accuracy(y_pred, y_true)
            
            tepoch.set_postfix(loss = batch_loss/(idx+1), accuracy = batch_acc )

          pre = precision_score(y_true, y_pred, average='weighted')
          recall = recall_score(y_true, y_pred, average='weighted')
          f1 = f1_score(y_true, y_pred, average='weighted')
          

          print("F1: {:.4f}, Precision: {:.4f}, Recall : {:.4f}.".format(f1, pre, recall))
        
          if phase == 'Val':
            if f1 > best_valid_f1:
                best_valid_f1 = f1
                torch.save(model.state_dict(), PATH)
                print('Model Saved!')
        
          print()


In [18]:
model.load_state_dict(torch.load('prompt_head_ft_imdb.pt'))

<All keys matched successfully>

### Test

In [19]:
batch_loss = 0.0   #batch loss
batch_acc = 0.0   #batch accuracy

y_true = []
y_pred = []

# set the model to evaluation mode            
model.eval()

phase = 'Test'

with tqdm(test_loader, unit="batch", desc=phase) as tepoch:
    
    for idx, (data, labels) in enumerate(tepoch):
        
        input_ids =  data['input_ids'].squeeze(1).to(device)
        attention_mask = data['attention_mask'].squeeze(1).to(device)

        labels = labels.to(device)
        
        with torch.no_grad():

            output = model(input_ids = input_ids, attention_mask = attention_mask)

            loss = criterion(output, labels)
            
            _, preds = output.data.max(1)
            y_pred.extend(preds.tolist())
            y_true.extend(labels.tolist())
            
            batch_acc = get_accuracy(y_pred, y_true)
            batch_loss += loss.item()
            
        tepoch.set_postfix(loss = batch_loss/(idx+1), accuracy = batch_acc )


pre = precision_score(y_true, y_pred, average='macro')
recall = recall_score(y_true, y_pred, average='macro')
f1 = f1_score(y_true, y_pred, average='macro')
print("")

print("F1: {:.6f}, Precision: {:.6f}, Recall : {:.6f}".format(f1, pre, recall))

Test: 100%|██████████| 312/312 [01:03<00:00,  4.91batch/s, accuracy=0.926, loss=0.214]


F1: 0.926482, Precision: 0.926483, Recall : 0.926485





## Only prompt

### Imports

In [20]:
import os

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn

from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

import torch.nn.functional as F

import warnings
warnings.filterwarnings("ignore")

### prompts emb

In [21]:
class PROMPTEmbedding(nn.Module):
    def __init__(self, 
                wte: nn.Embedding,
                n_tokens: int = 10, 
                random_range: float = 0.5,
                initialize_from_vocab: bool = True):
        super(PROMPTEmbedding, self).__init__()
        self.wte = wte
        self.n_tokens = n_tokens
        self.learned_embedding = nn.parameter.Parameter(self.initialize_embedding(wte,
                                                                               n_tokens, 
                                                                               random_range, 
                                                                               initialize_from_vocab))
            
    def initialize_embedding(self, 
                             wte: nn.Embedding,
                             n_tokens: int = 10, 
                             random_range: float = 0.5, 
                             initialize_from_vocab: bool = True):
        if initialize_from_vocab:
            return self.wte.weight[:n_tokens].clone().detach()
        return torch.FloatTensor(wte.weight.size(1), n_tokens).uniform_(-random_range, random_range)
            
    def forward(self, tokens):
        input_embedding = self.wte(tokens[:, self.n_tokens:])
        learned_embedding = self.learned_embedding.repeat(input_embedding.size(0), 1, 1)
        return torch.cat([learned_embedding, input_embedding], 1)


In [22]:

# Check GPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

torch.backends.cudnn.benchmark = True
torch.backends.cudnn.enabled   = True

cuda:0


### Load data

In [23]:
train_data = pd.read_csv('Train.csv')
test_data = pd.read_csv('Test.csv')
valid_data = pd.read_csv('Valid.csv')

train_text = train_data['text']
train_labels = train_data['label']

valid_text = valid_data['text']
valid_labels = valid_data['label']

test_text = test_data['text']
test_labels = test_data['label']

#print total length of dataset
len(train_text), len(train_labels), len(test_labels), len(test_text), len(valid_text), len(valid_labels)

(40000, 40000, 5000, 5000, 5000, 5000)

### Hyper parameters

In [24]:
## Specify the Hyper parameters 

BATCH_SIZE = 16
NUM_LABELS = 2  # since we have two labels -  positive and negative i.e 0 and 1
EPOCHS = 15
LEARNING_RATE = 1e-4
N_Tokens = 20

In [25]:
## load the Roberta tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

### Dataset and Dataloader

In [26]:
class Dataset(torch.utils.data.Dataset):
    """
    This is our custom dataset class which will load the text and their corresponding labels into Pytorch tensors
    """
    def __init__(self, labels, text):
        self.labels = labels
        self.text = text
        
        self.n_tokens = N_Tokens

    def __getitem__(self, idx):
        sample = {}
        text = self.text[idx]

        #Roberta Tokenizer to tokenize the text
        inputs = tokenizer.encode_plus(text, 
                                        add_special_tokens=True,   # Adds [CLS] and [SEP] token to every input text
                                        max_length=492, 
                                        truncation=True, 
                                        return_tensors='pt',
                                        padding="max_length")
        
        inputs['input_ids'] = torch.cat([torch.full((1,self.n_tokens), 5256), inputs['input_ids']], 1)
        inputs['attention_mask'] = torch.cat([torch.full((1, self.n_tokens), 1), inputs['attention_mask']], 1)

        
        return inputs, torch.tensor(self.labels[idx])
    
    def __len__(self):
        return len(self.labels)
        
        
# Create train, test and val datasets
train_data_object = Dataset(
    labels = train_labels,
    text = train_text,
)

test_data_object = Dataset(
    labels = test_labels,
    text = test_text,
)

val_data_object = Dataset(
    labels = valid_labels,
    text = valid_text,
)


## We call the dataloader class
train_loader = torch.utils.data.DataLoader(
    train_data_object,
    batch_size=BATCH_SIZE,
    pin_memory=True,
    num_workers= 64,
    shuffle=True,
    drop_last=True
 )

test_loader = torch.utils.data.DataLoader(
    test_data_object,
    batch_size=BATCH_SIZE,
    pin_memory=True,
    num_workers=64,
    shuffle=True,
    drop_last=True
 )

val_loader = torch.utils.data.DataLoader(
    val_data_object,
    batch_size=BATCH_SIZE,
    pin_memory=True,
    num_workers=64,
    shuffle=True,
    drop_last=True
 )

dataloaders = {'Train': train_loader, 'Test': test_loader, 'Val': val_loader}

### Model

In [27]:

model = RobertaForSequenceClassification.from_pretrained('roberta-base', 
                                                         num_labels=NUM_LABELS,
                                                         output_attentions=False,
                                                         output_hidden_states=False)

for param in model.parameters():
    param.requires_grad = False


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

### Add prompt emb

In [28]:
prompt_emb = PROMPTEmbedding(model.get_input_embeddings(), 
                      n_tokens= N_Tokens, 
                      initialize_from_vocab=True)

print(prompt_emb)

model.set_input_embeddings(prompt_emb)

model.to(device)


model.roberta.embeddings.word_embeddings.learned_embedding.shape

PROMPTEmbedding(
  (wte): Embedding(50265, 768, padding_idx=1)
)


torch.Size([20, 768])

### Optimizer

In [29]:

#optimizer
optimizer = AdamW([model.roberta.embeddings.word_embeddings.learned_embedding], lr = LEARNING_RATE, eps=1e-8)

#Loss function
criterion = nn.CrossEntropyLoss()

# Defining LR Scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0, 
    num_training_steps=len(train_loader)*EPOCHS
)

In [30]:
#Print trainable parameters

from prettytable import PrettyTable

def count_parameters(model):
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad: continue
        param = parameter.numel()
        table.add_row([name, param])
        total_params+=param
    print(table)
    print(f"Total Trainable Params: {total_params}")
    
count_parameters(model)

+------------------------------------------------------+------------+
|                       Modules                        | Parameters |
+------------------------------------------------------+------------+
| roberta.embeddings.word_embeddings.learned_embedding |   15360    |
+------------------------------------------------------+------------+
Total Trainable Params: 15360


### Train

In [28]:
#to calculate accuracy

def get_accuracy(preds, labels):
  total_acc = 0.0
  
  for i in range(len(labels)):
    if labels[i] == preds[i]:
      total_acc+=1.0
  
  return total_acc / len(labels)

In [None]:
best_valid_f1 = 0.0
PATH = 'only_prompt_imdb.pt'

for epoch in range(0, EPOCHS):
  

    print('-'*10)
    print('Epoch {}/{}'.format(epoch+1, EPOCHS))

    for phase in ['Train', 'Val']:

        batch_loss = 0.0000   #live loss
        batch_acc = 0.0000   #live accuracy

        y_true = []
        y_pred = []

        if phase == 'Train':
            model.train()
        else:
            model.eval()
        
        with tqdm(dataloaders[phase], unit="batch", desc=phase) as tepoch:

          for idx, (data, labels) in enumerate(tepoch):
            input_ids =  data['input_ids'].squeeze(1).to(device)
            attention_mask = data['attention_mask'].squeeze(1).to(device)
            
            labels = labels.to(device)

            output = model(input_ids = input_ids, attention_mask = attention_mask).logits

            loss = criterion(output, labels)

            if phase == 'Train':

                #zero gradients
                optimizer.zero_grad() 

                # Backward pass  (calculates the gradients)
                loss.backward()   

                optimizer.step()             # Updates the weights
                
                scheduler.step()
                
                
            batch_loss += loss.item()
                
            _, preds = output.data.max(1)
            y_pred.extend(preds.tolist())
            y_true.extend(labels.tolist())
            
            batch_acc = get_accuracy(y_pred, y_true)
            
            tepoch.set_postfix(loss = batch_loss/(idx+1), accuracy = batch_acc )

          pre = precision_score(y_true, y_pred, average='weighted')
          recall = recall_score(y_true, y_pred, average='weighted')
          f1 = f1_score(y_true, y_pred, average='weighted')
        
          print("F1: {:.4f}, Precision: {:.4f}, Recall : {:.4f}.".format(f1, pre, recall))
        
        
                    
          if phase == 'Val':
            if f1 > best_valid_f1:
                best_valid_f1 = f1
                torch.save(model.state_dict(), PATH)
                print('Model Saved!')
        
          print()

### train few more epochs

In [None]:
best_valid_f1 = 0.8069
PATH = 'only_prompt_imdb.pt'

model.load_state_dict(torch.load(PATH))

In [None]:
# For IMDB datasets

for epoch in range(0, EPOCHS):
  

    print('-'*10)
    print('Epoch {}/{}'.format(epoch+1, EPOCHS))

    for phase in ['Train', 'Val']:

        batch_loss = 0.0000   #live loss
        batch_acc = 0.0000   #live accuracy

        y_true = []
        y_pred = []

        if phase == 'Train':
            model.train()
        else:
            model.eval()
        
        with tqdm(dataloaders[phase], unit="batch", desc=phase) as tepoch:

          for idx, (data, labels) in enumerate(tepoch):
            input_ids =  data['input_ids'].squeeze(1).to(device)
            attention_mask = data['attention_mask'].squeeze(1).to(device)
            
            labels = labels.to(device)

            output = model(input_ids = input_ids, attention_mask = attention_mask).logits

            loss = criterion(output, labels)

            if phase == 'Train':

                #zero gradients
                optimizer.zero_grad() 

                # Backward pass  (calculates the gradients)
                loss.backward()   

                optimizer.step()             # Updates the weights
                
                scheduler.step()
                
                
            batch_loss += loss.item()
                
            _, preds = output.data.max(1)
            y_pred.extend(preds.tolist())
            y_true.extend(labels.tolist())
            
            batch_acc = get_accuracy(y_pred, y_true)
            
            tepoch.set_postfix(loss = batch_loss/(idx+1), accuracy = batch_acc )
        


          print(confusion_matrix(y_true, y_pred))
          pre = precision_score(y_true, y_pred, average='weighted')
          recall = recall_score(y_true, y_pred, average='weighted')
          f1 = f1_score(y_true, y_pred, average='weighted')
        
          print("F1: {:.4f}, Precision: {:.4f}, Recall : {:.4f}.".format(f1, pre, recall))
        
        
                    
          if phase == 'Val':
            if f1 > best_valid_f1:
                best_valid_f1 = f1
                torch.save(model.state_dict(), PATH)
                print('Model Saved!')
        
          print()

### Test

In [31]:
model.load_state_dict(torch.load('only_prompt_imdb.pt'))

<All keys matched successfully>

In [32]:
batch_loss = 0.0   #batch loss
batch_acc = 0.0   #batch accuracy

y_true = []
y_pred = []

# set the model to evaluation mode            
model.eval()

phase = 'Test'

with tqdm(test_loader, unit="batch", desc=phase) as tepoch:
    
    for idx, (data, labels) in enumerate(tepoch):
        
        input_ids =  data['input_ids'].squeeze(1).to(device)
        attention_mask = data['attention_mask'].squeeze(1).to(device)

        labels = labels.to(device)
        
        with torch.no_grad():

            output = model(input_ids = input_ids, attention_mask = attention_mask).logits

            loss = criterion(output, labels)
            
            _, preds = output.data.max(1)
            y_pred.extend(preds.tolist())
            y_true.extend(labels.tolist())
            
            batch_acc = get_accuracy(y_pred, y_true)
            batch_loss += loss.item()
            
        tepoch.set_postfix(loss = batch_loss/(idx+1), accuracy = batch_acc )

pre = precision_score(y_true, y_pred, average='macro')
recall = recall_score(y_true, y_pred, average='macro')
f1 = f1_score(y_true, y_pred, average='macro')
print("")

print("F1: {:.6f}, Precision: {:.6f}, Recall : {:.6f}".format(f1, pre, recall))

Test: 100%|██████████| 312/312 [01:03<00:00,  4.90batch/s, accuracy=0.893, loss=0.327]


F1: 0.893024, Precision: 0.893075, Recall : 0.893022





## Prompt + Adapter

In [33]:
import os

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn

from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

import torch.nn.functional as F

import warnings
warnings.filterwarnings("ignore")

### Prompt Embeddding

In [34]:
class PROMPTEmbedding(nn.Module):
    def __init__(self, 
                wte: nn.Embedding,
                n_tokens: int = 10, 
                random_range: float = 0.5,
                initialize_from_vocab: bool = True):
        super(PROMPTEmbedding, self).__init__()
        self.wte = wte
        self.n_tokens = n_tokens
        self.learned_embedding = nn.parameter.Parameter(self.initialize_embedding(wte,
                                                                               n_tokens, 
                                                                               random_range, 
                                                                               initialize_from_vocab))
            
    def initialize_embedding(self, 
                             wte: nn.Embedding,
                             n_tokens: int = 10, 
                             random_range: float = 0.5, 
                             initialize_from_vocab: bool = True):
        if initialize_from_vocab:
            return self.wte.weight[:n_tokens].clone().detach()
        return torch.FloatTensor(wte.weight.size(1), n_tokens).uniform_(-random_range, random_range)
            
    def forward(self, tokens):
        input_embedding = self.wte(tokens[:, self.n_tokens:])
        learned_embedding = self.learned_embedding.repeat(input_embedding.size(0), 1, 1)
        return torch.cat([learned_embedding, input_embedding], 1)

In [35]:

# Check GPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

torch.backends.cudnn.benchmark = True
torch.backends.cudnn.enabled   = True

cuda:0


### Load Data

In [36]:
train_data = pd.read_csv('Train.csv')
test_data = pd.read_csv('Test.csv')
valid_data = pd.read_csv('Valid.csv')

train_text = train_data['text']
train_labels = train_data['label']

valid_text = valid_data['text']
valid_labels = valid_data['label']

test_text = test_data['text']
test_labels = test_data['label']

#print total length of dataset
len(train_text), len(train_labels), len(test_labels), len(test_text), len(valid_text), len(valid_labels)

(40000, 40000, 5000, 5000, 5000, 5000)


### Hyper parameters

In [37]:
## Specify the Hyper parameters 

BATCH_SIZE = 16
NUM_LABELS = 2   # since we have two labels -  positive and negative i.e 0 and 1
EPOCHS = 15
LEARNING_RATE = 1e-4
N_Tokens = 20


adapter_hidden = 16

In [38]:
## load the Roberta tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [39]:
class Dataset(torch.utils.data.Dataset):
    """
    This is our custom dataset class which will load the text and their corresponding labels into Pytorch tensors
    """
    def __init__(self, labels, text):
        self.labels = labels
        self.text = text
        
        self.n_tokens = N_Tokens

    def __getitem__(self, idx):
        sample = {}
        text = self.text[idx]

        #Roberta Tokenizer to tokenize the text
        inputs = tokenizer.encode_plus(text, 
                                        add_special_tokens=True,   # Adds [CLS] and [SEP] token to every input text
                                        max_length=492, 
                                        truncation=True, 
                                        return_tensors='pt',
                                        padding="max_length")
        
        inputs['input_ids'] = torch.cat([torch.full((1,self.n_tokens), 5256), inputs['input_ids']], 1)
        inputs['attention_mask'] = torch.cat([torch.full((1, self.n_tokens), 1), inputs['attention_mask']], 1)

        
        return inputs, torch.tensor(self.labels[idx])
    
    def __len__(self):
        return len(self.labels)
        
        
        
# Create train, test and val datasets
train_data_object = Dataset(
    labels = train_labels,
    text = train_text,
)

test_data_object = Dataset(
    labels = test_labels,
    text = test_text,
)

val_data_object = Dataset(
    labels = valid_labels,
    text = valid_text,
)



## We call the dataloader class
train_loader = torch.utils.data.DataLoader(
    train_data_object,
    batch_size=BATCH_SIZE,
    pin_memory=True,
    num_workers=2,
    shuffle=True,
    drop_last=True
 )

test_loader = torch.utils.data.DataLoader(
    test_data_object,
    batch_size=BATCH_SIZE,
    pin_memory=True,
    num_workers=2,
    shuffle=True,
    drop_last=True
 )

val_loader = torch.utils.data.DataLoader(
    val_data_object,
    batch_size=BATCH_SIZE,
    pin_memory=True,
    num_workers=2,
    shuffle=True,
    drop_last=True
 )

dataloaders = {'Train': train_loader, 'Test': test_loader, 'Val': val_loader}

In [40]:

roberta = RobertaForSequenceClassification.from_pretrained('roberta-base', 
                                                         num_labels=NUM_LABELS,
                                                         output_attentions=False,
                                                         output_hidden_states=False)

for param in roberta.parameters():
    param.requires_grad = False


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

### Add prompt emb

In [41]:
prompt_emb = PROMPTEmbedding(roberta.get_input_embeddings(), 
                      n_tokens= N_Tokens, 
                      initialize_from_vocab=True)

print(prompt_emb)

roberta.set_input_embeddings(prompt_emb)


roberta.roberta.embeddings.word_embeddings.learned_embedding.shape

PROMPTEmbedding(
  (wte): Embedding(50265, 768, padding_idx=1)
)


torch.Size([20, 768])


### Model

In [42]:
class Model(torch.nn.Module):
    
    def __init__(self, bert, adapter_hidden):

        super(Model, self).__init__()
        
        
        self.bert = bert
        
        self.adapter_1 = nn.Linear(768, adapter_hidden)
        
        self.adapter_2 = nn.Linear(adapter_hidden, 768)

            


    def forward(self, input_ids, attention_mask):
        
        # get output embeddings

        output_embed = self.bert.roberta.embeddings(input_ids = input_ids)
        
        roberta_text = output_embed
        
        
        # pass the output of embeddings into first 4 encoder layers
        for i in range(4):
            
            roberta_text = self.bert.roberta.encoder.layer[i](roberta_text)[0]
            
            
        # pass the ouput of 4th encoder layer to adapter layers
        roberta_text = self.adapter_1(roberta_text)
        
        roberta_text = self.adapter_2(roberta_text)
        
        
        # output of adapter layer to 5th encoder layer and so on
            
        for i in range(4, 12):
            roberta_text = self.bert.roberta.encoder.layer[i](roberta_text)[0]
        
        
        # final output to classifier head
        output = self.bert.classifier(roberta_text)
        
        # return final oitput
        return output

In [43]:
model = Model(roberta, adapter_hidden).to(device)

In [44]:
#Print trainable parameters

from prettytable import PrettyTable

def count_parameters(model):
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad: continue
        param = parameter.numel()
        table.add_row([name, param])
        total_params+=param
    print(table)
    print(f"Total Trainable Params: {total_params}")
    
count_parameters(model)

+-----------------------------------------------------------+------------+
|                          Modules                          | Parameters |
+-----------------------------------------------------------+------------+
| bert.roberta.embeddings.word_embeddings.learned_embedding |   15360    |
|                      adapter_1.weight                     |   12288    |
|                       adapter_1.bias                      |     16     |
|                      adapter_2.weight                     |   12288    |
|                       adapter_2.bias                      |    768     |
+-----------------------------------------------------------+------------+
Total Trainable Params: 40720


In [45]:

#optimizer
optimizer = AdamW(model.parameters(), lr = LEARNING_RATE, eps=1e-8)

#Loss function
criterion = nn.CrossEntropyLoss()

# Defining LR Scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0, 
    num_training_steps=len(train_loader)*EPOCHS
)

In [46]:
#to calculate accuracy

def get_accuracy(preds, labels):
  total_acc = 0.0
  
  for i in range(len(labels)):
    if labels[i] == preds[i]:
      total_acc+=1.0
  
  return total_acc / len(labels)

In [None]:
# Test pass


for (data, labels) in train_loader:

    input_ids =  data['input_ids'].squeeze(1).to(device)
    attention_mask = data['attention_mask'].squeeze(1).to(device)


    labels = labels.to(device)

    output = model(input_ids = input_ids, attention_mask = attention_mask)
    
    print(output)
    
    break


In [None]:
best_valid_f1 = 0.0
PATH = 'prompt+adapter_imdb.pt'

for epoch in range(0, EPOCHS):
  

    print('-'*10)
    print('Epoch {}/{}'.format(epoch+1, EPOCHS))

    for phase in ['Train', 'Val']:

        batch_loss = 0.0000   #live loss
        batch_acc = 0.0000   #live accuracy

        y_true = []
        y_pred = []

        if phase == 'Train':
            model.train()
        else:
            model.eval()
        
        with tqdm(dataloaders[phase], unit="batch", desc=phase) as tepoch:

          for idx, (data, labels) in enumerate(tepoch):
            input_ids =  data['input_ids'].squeeze(1).to(device)
            attention_mask = data['attention_mask'].squeeze(1).to(device)
            
            labels = labels.to(device)

            output = model(input_ids = input_ids, attention_mask = attention_mask)

            loss = criterion(output, labels)

            if phase == 'Train':

                #zero gradients
                optimizer.zero_grad() 

                # Backward pass  (calculates the gradients)
                loss.backward()   

                optimizer.step()             # Updates the weights
                
                scheduler.step()
                
                
            batch_loss += loss.item()
                
            _, preds = output.data.max(1)
            y_pred.extend(preds.tolist())
            y_true.extend(labels.tolist())
            
            batch_acc = get_accuracy(y_pred, y_true)
            
            tepoch.set_postfix(loss = batch_loss/(idx+1), accuracy = batch_acc )
        


          print(confusion_matrix(y_true, y_pred))
          pre = precision_score(y_true, y_pred, average='weighted')
          recall = recall_score(y_true, y_pred, average='weighted')
          f1 = f1_score(y_true, y_pred, average='weighted')
        
          print("F1: {:.4f}, Precision: {:.4f}, Recall : {:.4f}.".format(f1, pre, recall))
        
        
                    
          if phase == 'Val':
            if f1 > best_valid_f1:
                best_valid_f1 = f1
                torch.save(model.state_dict(), PATH)
                print('Model Saved!')
        
          print()

### Test

In [47]:
model.load_state_dict(torch.load('prompt+adapter_imdb.pt'))

<All keys matched successfully>

In [48]:
batch_loss = 0.0   #batch loss
batch_acc = 0.0   #batch accuracy

y_true = []
y_pred = []

# set the model to evaluation mode            
model.eval()

phase = 'Test'

with tqdm(test_loader, unit="batch", desc=phase) as tepoch:
    
    for idx, (data, labels) in enumerate(tepoch):
        
        input_ids =  data['input_ids'].squeeze(1).to(device)
        attention_mask = data['attention_mask'].squeeze(1).to(device)

        labels = labels.to(device)
        
        with torch.no_grad():

            output = model(input_ids = input_ids, attention_mask = attention_mask)

            loss = criterion(output, labels)
            
            _, preds = output.data.max(1)
            y_pred.extend(preds.tolist())
            y_true.extend(labels.tolist())
            
            batch_acc = get_accuracy(y_pred, y_true)
            batch_loss += loss.item()
            
        tepoch.set_postfix(loss = batch_loss/(idx+1), accuracy = batch_acc )


pre = precision_score(y_true, y_pred, average='macro')
recall = recall_score(y_true, y_pred, average='macro')
f1 = f1_score(y_true, y_pred, average='macro')
print("")

print("F1: {:.6f}, Precision: {:.6f}, Recall : {:.6f}".format(f1, pre, recall))

Test: 100%|██████████| 312/312 [00:53<00:00,  5.79batch/s, accuracy=0.919, loss=0.231]


F1: 0.919271, Precision: 0.919290, Recall : 0.919278



