In [1]:
!pip install transformers
import torch
from torch import nn
import pandas as pd
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.metrics import classification_report
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from IPython.display import clear_output
from transformers import logging
logging.set_verbosity_error()

import sklearn
from sklearn.linear_model import OrthogonalMatchingPursuit

from google.colab import drive
drive.mount('/content/drive')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
cuda


In [2]:
path = '/content/drive/MyDrive/NLP-Final/'

train_data = pd.read_csv(path + 'train.csv')
test_data = pd.read_csv(path + 'test.csv')

train_data = train_data[:2000]
test_data = test_data[:500]

train_data = train_data.to_dict(orient='records')
test_data = test_data.to_dict(orient='records')
type(train_data)

train_texts, train_labels = list(zip(*map(lambda d: (d['text'], d['sentiment']), train_data)))
test_texts, test_labels = list(zip(*map(lambda d: (d['text'], d['sentiment']), test_data)))

len(train_texts), len(train_labels), len(test_texts), len(test_labels)

(2000, 2000, 500, 500)

In [3]:
BATCH_SIZE = 4

def process_data(tokenizer):
    train_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:510] + ['[SEP]'], train_texts))
    test_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:510] + ['[SEP]'], test_texts))
    train_tokens_ids = pad_sequences(list(map(tokenizer.convert_tokens_to_ids, train_tokens)), maxlen=512, truncating="post", padding="post", dtype="int")
    test_tokens_ids = pad_sequences(list(map(tokenizer.convert_tokens_to_ids, test_tokens)), maxlen=512, truncating="post", padding="post", dtype="int")

    train_y = np.array(train_labels) == 'pos'
    test_y = np.array(test_labels) == 'pos'
    # test_y = np.append(test_y, np.array(test_labels) != 'pos')

    train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]
    test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]
    
    train_tokens_tensor = torch.tensor(train_tokens_ids)
    train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()

    test_tokens_tensor = torch.tensor(test_tokens_ids)
    test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()

    train_masks_tensor = torch.tensor(train_masks)
    test_masks_tensor = torch.tensor(test_masks)

    train_dataset = TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

    test_dataset = TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)
    
    return train_dataloader, test_dataloader, test_y

In [4]:
EPOCHS = 10

def train_model(model, train_dataloader):
  param_optimizer = list(model.named_parameters()) 
  optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
  optimizer = Adam(model.parameters(), lr=3e-6)
  torch.cuda.empty_cache()   # Clearing Cache space for a fresh Model run

  for epoch_num in range(EPOCHS):
    model.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
        print(str(torch.cuda.memory_allocated(device)/1000000 ) + 'M')

        tmp = torch.eq(labels, torch.zeros_like(labels))
        labels_for_loss = torch.cat((labels, tmp), dim=1)
        
        logits = model(token_ids, masks).logits
        
        loss_func = nn.CrossEntropyLoss()
        batch_loss = loss_func(logits, labels_for_loss)
        train_loss += batch_loss.item()
        
        model.zero_grad()
        batch_loss.backward()

        clip_grad_norm_(parameters=model.parameters(), max_norm=1.0)
        optimizer.step()
        
        clear_output(wait=True)
        print('Epoch: ', epoch_num + 1)
        print("\r" + "{0}/{1} loss: {2} ".format(step_num, len(train_data) / BATCH_SIZE, train_loss / (step_num + 1)))
  return model

In [4]:
def eval_model(model, test_dataloader, test_y):
  model_predicted = []
  all_logits = []
  with torch.no_grad():
      for step_num, batch_data in enumerate(test_dataloader):
          token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
          tmp = torch.eq(labels, torch.zeros_like(labels))
          labels_for_loss = torch.cat((labels, tmp), dim=1)

          logits = model(token_ids, masks).logits

          loss_func = nn.CrossEntropyLoss()
          loss = loss_func(logits, labels_for_loss)
          numpy_logits = logits.cpu().detach().numpy()
          
          model_predicted += list(numpy_logits[:, 0] > 0.5)
          all_logits += list(numpy_logits[:, 0])

          token_ids, masks, labels = tuple(t.to("cpu") for t in batch_data) # PATCH- move back to cpu
  print(type(test_y)) 
  print(type(model_predicted))        
  return classification_report(test_y, model_predicted)

In [None]:
# roberta without finetuning
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_dataloader, test_dataloader, test_y = process_data(roberta_tokenizer)
roberta = RobertaForSequenceClassification.from_pretrained('roberta-base')
report = eval_model(roberta, test_dataloader, test_y)
print(report)

In [None]:
# finetune Roberta
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_dataloader, test_dataloader, test_y = process_data(roberta_tokenizer)
roberta = RobertaForSequenceClassification.from_pretrained('roberta-base')
roberta = roberta.cuda()
roberta_w_finetuning = train_model(roberta, train_dataloader)
report = eval_model(roberta_w_finetuning, test_dataloader, test_y)
print(report)

STATE_PATH = '/content/drive/MyDrive/NLP-Final/roberta_state_w_finetuning'
roberta_st = roberta_w_finetuning.state_dict()
torch.save(roberta_st, STATE_PATH)

In [6]:
# this is how to load the model
roberta_check = RobertaForSequenceClassification.from_pretrained('roberta-base')
roberta_check = roberta_check.cuda()
roberta_check.load_state_dict(torch.load(STATE_PATH))
eval_model(roberta_check, test_dataloader, test_y)

<class 'numpy.ndarray'>
<class 'list'>


'              precision    recall  f1-score   support\n\n       False       0.93      0.92      0.93       246\n        True       0.92      0.94      0.93       254\n\n    accuracy                           0.93       500\n   macro avg       0.93      0.93      0.93       500\nweighted avg       0.93      0.93      0.93       500\n'

In [None]:
def print_layers_params(model):
  for name,x in model.named_parameters():
    if "intermediate.dense.weight" in name: # intermidiate is the feed forward part of the layer 
      print(name)