# Imports

In [None]:
# Install the Hugging Face `transformers` package, which the import cell below relies on.
!pip install --quiet transformers
# Fetch the LaRoSeDa repository; the JSON files under LaRoSeDa/data_splitted/ are read further down.
!git clone https://github.com/ancatache/LaRoSeDa.git

In [None]:
import json
import numpy as np
from transformers import BertTokenizer, AutoModel,Adafactor
import torch 
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Running on gpu' if device.type == 'cuda' else 'Running on cpu')

Running on gpu


In [None]:
# Hugging Face identifier shared by the tokenizer and the pre-trained encoder
MODEL_NAME = "dumitrescustefan/bert-base-romanian-cased-v1"

# Tokenizer that turns raw text into the inputs expected by BERT
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
# BERT model pre-trained on a Romanian corpus
BERT = AutoModel.from_pretrained(MODEL_NAME)

# Training hyper-parameters: samples per batch and number of passes over the data
batch_size, epochs = 3, 3

# Loss function (binary cross-entropy on probabilities)
binary_cross_entropy = nn.BCELoss()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=385.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=500386589.0, style=ProgressStyle(descri…




# Data preparation

In [None]:
TEST_FILE = 'LaRoSeDa/data_splitted/laroseda_test.json'

#testing split:
testing_label   = []
testing_title   = []
testing_content = []

#TESTING DATA SET
# The file is read explicitly as UTF-8 so the result does not depend on the
# platform's default encoding.
with open(TEST_FILE, encoding='utf-8') as f:
  reviews = json.load(f)['reviews']

for review in reviews:
  title = str(review['title'])
  content = str(review['content'])

  # BERT accepts at most 512 tokens; longer reviews are left out.
  if len(tokenizer(title, content)['input_ids']) > 512:
    continue

  testing_title.append(title)
  testing_content.append(content)
  # Binary target: 1 for ratings of 3 stars and above, 0 otherwise.
  # The classifier ends in a single sigmoid trained with nn.BCELoss, so the
  # targets must lie in [0, 1]; the previous mapping produced 0..3.
  testing_label.append(1 if int(review['starRating']) >= 3 else 0)


In [None]:
TRAIN_FILE = 'LaRoSeDa/data_splitted/laroseda_train.json'

#training split:
training_label   = []
training_title   = []
training_content = []
# Alias kept for the original misspelt name; both names refer to the same list.
trinaing_content = training_content

#validation split:
validation_label   = []
validation_title   = []
validation_content = []


def _append_reviews(reviews, titles, contents, labels):
  """Append every review that fits into BERT's input to the given lists.

  Args:
    reviews: iterable of dicts with 'title', 'content' and 'starRating' keys.
    titles: list extended in place with the review titles.
    contents: list extended in place with the review bodies.
    labels: list extended in place with the binary targets
      (1 for ratings of 3 stars and above, 0 otherwise).
  """
  for review in reviews:
    title = str(review['title'])
    content = str(review['content'])

    # BERT accepts at most 512 tokens; longer reviews are left out.
    if len(tokenizer(title, content)['input_ids']) > 512:
      continue

    titles.append(title)
    contents.append(content)
    # The classifier ends in a single sigmoid trained with nn.BCELoss, so the
    # targets must lie in [0, 1]; the previous mapping produced 0..3.
    labels.append(1 if int(review['starRating']) >= 3 else 0)


#TRAINING DATA SET
# The file is read explicitly as UTF-8 so the result does not depend on the
# platform's default encoding.
with open(TRAIN_FILE, encoding='utf-8') as f:
  reviews = json.load(f)['reviews']

# The first 15% of the training file becomes the validation split,
# the remaining reviews form the training split.
val_stop = int(1.5/10*len(reviews))

_append_reviews(reviews[:val_stop], validation_title, validation_content, validation_label)
_append_reviews(reviews[val_stop:], training_title, training_content, training_label)


In [None]:
# Tokenise all (title, content) training pairs in one call: sequences are
# padded to a common length, truncated to at most 512 tokens and returned
# as PyTorch tensors.
training_batch = tokenizer(
    training_title,
    training_content,
    add_special_tokens=True,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='pt',
)


In [None]:
def _encode_pairs(titles, contents):
  """Tokenise (title, content) pairs with the same settings as the training batch."""
  return tokenizer(titles, contents, add_special_tokens=True, padding=True,
                   truncation=True, max_length=512, return_tensors='pt')


def _make_dataset(encoded, labels):
  """Bundle input ids, attention masks, token type ids and labels into one dataset."""
  return TensorDataset(encoded['input_ids'], encoded['attention_mask'],
                       encoded['token_type_ids'], torch.tensor(labels))


# Training batches are reshuffled every epoch so the model does not see the
# reviews in file order.
train_data = _make_dataset(training_batch, training_label)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# The validation split was never tokenised before being used here, which
# raised a NameError on a fresh kernel; it is encoded explicitly now.
validation_batch = _encode_pairs(validation_title, validation_content)
val_data = _make_dataset(validation_batch, validation_label)
val_dataloader = DataLoader(val_data, batch_size=batch_size)

# The testing cell iterates over `test_dataloader`, so it is built here from
# the test split collected in the data-preparation cells.
test_batch = _encode_pairs(testing_title, testing_content)
test_data = _make_dataset(test_batch, testing_label)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

# Model preparation

In [None]:
#the model which will be fine-tunned

class BERT_Model(nn.Module):
  """Binary classifier: a wrapped encoder followed by a small trainable head.

  Head layout: second element of the encoder output (called the pooled layer
  here) -> dropout(0.2) -> LeakyReLU -> Linear(768, 1) -> sigmoid.
  """

  def __init__(self, bert):
    """Store the encoder ``bert`` and create the layers of the head."""
    super().__init__()
    self.bert = bert

    self.dropout = nn.Dropout(0.2)
    self.relu = nn.LeakyReLU()
    self.layer = nn.Linear(768, 1)
    self.sigmoid = nn.Sigmoid()

  def forward_pass(self, id, mask, token_type_id):
    """Return one sigmoid score per input row.

    Args:
      id: token ids passed to the encoder as its first positional argument.
      mask: passed to the encoder as ``attention_mask``.
      token_type_id: passed to the encoder as ``token_type_ids``.

    Returns:
      The output of the final sigmoid applied to the Linear(768, 1) layer.
    """
    encoder_outputs = self.bert(id, attention_mask=mask, token_type_ids=token_type_id)
    pooled_layer = encoder_outputs[1]

    activated = self.relu(self.dropout(pooled_layer))
    return self.sigmoid(self.layer(activated))


# Build the classifier around the pre-trained encoder and move it to the
# selected device (GPU when available).
model = BERT_Model(BERT).to(device)

# Training function

In [None]:
#variables for the scheduler
# len(train_dataloader) is already the number of batches, i.e. the number of
# optimizer steps per epoch, so it must not be divided by batch_size again.
# With the previous value the linear schedule reached a learning rate of 0
# long before training finished.
num_train_steps = len(train_dataloader) * epochs
# Warm up over the first 10% of all optimizer steps.
num_warmup_steps = int(0.1 * num_train_steps)


optimizer = AdamW(model.parameters(), lr=3e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_train_steps)

#... trained on every epoch ...
def train():
  """Run one training epoch over ``train_dataloader``.

  Uses the module-level ``model``, ``optimizer``, ``scheduler``,
  ``binary_cross_entropy`` and ``device``.

  Returns:
    The mean batch loss of the epoch as a Python float.
  """
  #function to put the model in to training state
  model.train()
  #total loss of the epoch, kept as a plain float so no autograd graph is retained
  total_loss = 0.0

  for step, batch in enumerate(train_dataloader):
    #moving the data on same device as the model
    ids, masks, tokens, labels = [sample.to(device) for sample in batch]

    model.zero_grad()

    #output of the model
    y = model.forward_pass(ids, masks, tokens).to(torch.float32)

    #BCELoss expects targets with the same dtype and shape as the predictions
    labels = labels.to(torch.float32).view_as(y)

    loss = binary_cross_entropy(y, labels)
    batch_loss = loss.item()
    total_loss += batch_loss

    #backpropagation stage
    loss.backward()

    #clipping must happen after backward(): before it there are no fresh
    #gradients to clip
    torch.nn.utils.clip_grad_norm_(model.parameters(), 2.0)

    optimizer.step()
    scheduler.step()

    if step % 100 == 0 and not step == 0:
      print(' Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))
      print(' Total loss over this batch: {}'.format(batch_loss))
      print('-------')

  return total_loss/len(train_dataloader)

# Evaluation function

In [None]:
def convert(outputs):
  """Binarise scores in place using a 0.5 decision threshold.

  Only the first entry of every row of ``outputs`` is read and overwritten:
  it becomes 1 when it is at least 0.5 and 0 when it is below 0.5. An entry
  for which neither comparison holds (e.g. NaN) is left untouched.
  Nothing is returned.
  """
  threshold = 0.5
  for row in outputs:
    score = row[0]
    if score < threshold:
      row[0] = 0
    elif score >= threshold:
      row[0] = 1



#evaluation function 
def evaluate():
  """Evaluate the model on ``val_dataloader``.

  Accuracy and macro F1 are computed once over the whole validation split and
  printed. Scoring each 3-sample batch separately and dividing accuracy by F1
  was not a meaningful metric and divided by zero whenever a batch had F1 = 0.

  Uses the module-level ``model``, ``binary_cross_entropy``, ``device`` and
  ``convert``.

  Returns:
    The mean batch loss over the validation split as a Python float.
  """
  print('Evaluation...')

  model.eval()
  total_loss = 0.0
  all_targets = []
  all_predictions = []

  with torch.no_grad():
    for step, batch in enumerate(val_dataloader):
      ids, mask, token, label = [sample.to(device) for sample in batch]

      output = model.forward_pass(ids, mask, token).to(torch.float32)
      #BCELoss expects targets with the same dtype and shape as the predictions
      target = label.to(torch.float32).view_as(output)

      total_loss += binary_cross_entropy(output, target).item()

      #threshold a CPU copy of the scores so the loss inputs stay untouched
      predictions = output.detach().cpu().numpy()
      convert(predictions)

      all_predictions.extend(predictions.ravel().astype(int).tolist())
      all_targets.extend(target.cpu().numpy().ravel().astype(int).tolist())

      if step % 50 == 0 and not step == 0:
        print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

  acc_score = accuracy_score(all_targets, all_predictions)
  f_score = f1_score(all_targets, all_predictions, average='macro')
  print('  Validation accuracy: {:.4f}'.format(acc_score))
  print('  Validation F1 (macro): {:.4f}'.format(f_score))

  return total_loss / len(val_dataloader)


# Training

In [None]:
# Lowest validation loss seen so far, plus the per-epoch loss history.
best_loss = float('inf')
total_train_losses, total_val_losses = [], []
k = 0

for epoch in range(epochs):
  # k is the 1-based number of the current epoch
  k = epoch + 1
  print('\n\n Epoch {:} / {:}'.format(k, epochs))

  train_loss = train()
  val_loss = evaluate()

  # Keep the weights of the epoch with the lowest validation loss.
  if val_loss < best_loss:
    best_loss = val_loss
    torch.save(model.state_dict(), 'saved_weights.pt')

  total_train_losses.append(train_loss)
  total_val_losses.append(val_loss)

  print(f'\nTraining Loss: {train_loss:.3f}')
  print(f'Validation Loss: {val_loss:.3f}')

  # Early stop: from the 4th epoch on, quit once the training loss moved by
  # at most 0.01 in each of the last two epoch-to-epoch steps.
  if k >= 4:
    latest, previous, before_previous = total_train_losses[-1], total_train_losses[-2], total_train_losses[-3]
    if abs(latest - previous) <= 0.01 and abs(previous - before_previous) <= 0.01:
      break


# Testing (needs to be modified)

In [None]:
# Restore the checkpoint written during training. The weights are mapped onto
# the device selected at the top of the notebook, so loading also works when
# the notebook runs on the CPU fallback instead of 'cuda:0'.
PATH = 'saved_weights.pt'
model.load_state_dict(torch.load(PATH, map_location=device))

In [None]:
# testing...
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score
# accuracy
# precision
# recall
# F1(macro)

def convert(outputs):
  """Turn scores into hard 0/1 decisions in place (threshold 0.5).

  For each row of ``outputs`` only the first entry is inspected and replaced:
  values of at least 0.5 become 1, values below 0.5 become 0. Entries that
  satisfy neither comparison (e.g. NaN) stay as they are. Returns nothing.
  """
  threshold = 0.5
  for row in outputs:
    score = row[0]
    if score < threshold:
      row[0] = 0
    elif score >= threshold:
      row[0] = 1
  

def test():
  """Score the model on ``test_dataloader``.

  Predictions and targets are collected over the whole test split and the
  metrics are computed once on all of them. Averaging per-batch scores of
  3-sample batches is not the dataset-level metric, and the per-class F1
  arrays returned by ``average=None`` varied in length between batches.

  Uses the module-level ``model``, ``device`` and ``convert``.

  Returns:
    A tuple ``(accuracy, precision, recall, f1)``; precision, recall and F1
    are macro-averaged.
  """
  print('Testing...')
  model.eval()
  all_targets = []
  all_predictions = []

  def _scores():
    """Compute the four metrics on everything collected so far."""
    return (accuracy_score(all_targets, all_predictions),
            precision_score(all_targets, all_predictions, average='macro'),
            recall_score(all_targets, all_predictions, average='macro'),
            f1_score(all_targets, all_predictions, average='macro'))

  with torch.no_grad():
    for step, batch in enumerate(test_dataloader):
      ids, mask, token, label = [sample.to(device) for sample in batch]

      output = model.forward_pass(ids, mask, token).detach().cpu().numpy()
      #threshold the scores at 0.5 in place
      convert(output)

      all_predictions.extend(output.ravel().astype(int).tolist())
      all_targets.extend(label.detach().cpu().numpy().ravel().astype(int).tolist())

      if step % 50 == 0 and step != 0:
        print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(test_dataloader)))
        #running scores over all samples seen so far
        print('  Scores: {},{},{},{}'.format(*_scores()))
        print('------')

  return _scores()


def testing():
  """Run test() and print the four scores it returns, one per line."""
  print('Scores: Accuracy score, Precision score, Recall score, F1 score')

  accuracy, precision, recall, f1 = test()

  report = (
      ('  Average accuracy score: {}', accuracy),
      ('  Average precision score: {}', precision),
      ('  Average recall score: {}', recall),
      ('  Average F1 score: {}', f1),
  )
  for template, value in report:
    print(template.format(value))


testing()


