In [4]:
import os
import nltk
import random as rand
import re

import numpy as np
import pandas as pd


import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

from transformers import AutoTokenizer, AutoModel, AlbertTokenizer, AlbertModel
from transformers import BertTokenizer, BertModel
import torch 
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.sampler import RandomSampler, SequentialSampler
import torch.nn.functional as F
from torch.optim import Adam

from classifier import BertClassifier

from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Model

In [None]:
model_options = {
    "biobert": "dmis-lab/biobert-v1.1",
    "pubmed_abstract": "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
    "scibert": "allenai/scibert_scivocab_uncased",
    "pubmed_fulltext": "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
    "medbert": "Charangan/MedBERT",
    "basebert": "bert-base-uncased",
    "tinybert": "prajjwal1/bert-tiny",
    "minibert": "prajjwal1/bert-mini",
    "smallbert": "prajjwal1/bert-small",
    "mediumbert": "prajjwal1/bert-medium"
}

In [None]:
# pick the model and create the tokenizer
current_model = model_options['biobert']
tokenizer = AutoTokenizer.from_pretrained(current_model)

Downloading (…)okenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# read the training & validation data
train_data = pd.read_csv('/content/gdrive/MyDrive/ESA/data70-10-20/cns_train_aug.csv')
val_data = pd.read_csv('/content/gdrive/MyDrive/ESA/data70-10-20/cns_val.csv')

In [None]:
labels = {
    'Excluded':0,
    'Included':1,
}

In [None]:
class Dataset(torch.utils.data.Dataset):
    """PyTorch Dataset class for our systematic review datasets.
    """
    def __init__(self, df):
        """Creates the dataset
              Params:
                df: dataset in a dataframe 
        """
        self.labels = [labels[label] for label in df['decision']]
        self.texts = [tokenizer(text, padding='max_length', max_length = 512, truncation=True,
                      return_tensors="pt") for text in df['titleabstract']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

In [None]:
def train(model, train_data, val_data, learning_rate, epochs):
    """ Function to train the model.
        Params:
          - model: the model to be trained
          - train_data: traing data (Pandas DataFrame format)
          - val_data: validation data (Pandas DataFrame format)
          - learning_rate: learning rate
          - epochs: the number of epochs for training
    """
    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=8, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=8)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    loss_weights = torch.Tensor([1., 17.]) #pick the weights 
    criterion = nn.CrossEntropyLoss(weight=loss_weights)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
      model = model.cuda()
      criterion = criterion.cuda()

    for epoch_num in range(epochs):
      tp_t = 0
      fn_t = 0

      tp_v = 0
      fn_v = 0
      fp_v = 0

      total_acc_train = 0
      total_loss_train = 0
      total_recall_train = 0

      for train_input, train_label in train_dataloader:
        train_label = train_label.to(device)
        mask = train_input['attention_mask'].to(device)
        input_id = train_input['input_ids'].squeeze(1).to(device)

        output, attentions = model(input_id, mask)
                
        batch_loss = criterion(output, train_label.long())
        total_loss_train += batch_loss.item()
                
        acc = (output.argmax(dim=1) == train_label).sum().item()
        total_acc_train += acc

        for ind, out in enumerate(output.argmax(dim=1)):
          if out == train_label[ind] and train_label[ind] == 1:
            tp_t += 1
          elif out != train_label[ind] and train_label[ind] == 1:
            fn_t += 1
                
        model.zero_grad()
        batch_loss.backward()
        optimizer.step()
            
      total_acc_val = 0
      total_loss_val = 0

      with torch.no_grad():
        for val_input, val_label in val_dataloader:
          val_label = val_label.to(device)
          mask = val_input['attention_mask'].to(device)
          input_id = val_input['input_ids'].squeeze(1).to(device)

          output, attentions = model(input_id, mask)

          batch_loss = criterion(output, val_label.long())
          total_loss_val += batch_loss.item()
                    
          acc = (output.argmax(dim=1) == val_label).sum().item()

          for ind, out in enumerate(output.argmax(dim=1)):
            if out == val_label[ind] and val_label[ind] == 1:
              tp_v += 1
            elif out != val_label[ind] and val_label[ind] == 1:
              fn_v += 1
            elif val_label[ind] == 0 and out == 1:
              fp_v += 1

          total_acc_val += acc

      if tp_t + fn_t > 0:
        recall_t = tp_t/(tp_t + fn_t)
      else:
        recall_t = 0

      if tp_v + fn_v > 0:
        recall_v = tp_v/(tp_v+ fn_v)
      else:
        recall_v = 0
              
      if tp_v + fp_v > 0:
        precision_v = tp_v/(tp_v + fp_v)
      else:
        precision_v = 0

      print('EPOCH ', epoch_num)
      print("Train loss", {total_loss_train / len(train_dataloader)})
      print("Train Accuracy", {total_acc_train / len(train_data)})
      print("Train Recall", recall_t)
      print("Validation loss", {total_loss_val / len(val_data)})
      print("Validation Accuracy", {total_acc_val / len(val_data)})
      print("Validation Recall", recall_v)
      print('Val precision', precision_v)
      print('val tp', tp_v, 'fp', fp_v, 'fn', fn_v)

      model_name = "biobert" + str(epoch_num ) + ".pt"
      torch.save(model.state_dict(), "/content/" + model_name)

In [None]:
EPOCHS = 5
model = BertClassifier(hidden=768, model_type=current_model);
LR = 2e-5

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/433M [00:00<?, ?B/s]

In [None]:
train(model, train_data, val_data, LR, EPOCHS)

  final_layer = self.softmax(linear_output)


EPOCH  0
Train loss {0.05198394814345001}
Train Accuracy {0.49423142178486595}
Train Recall 0.9939024390243902
Validation loss {0.0708329515152068}
Validation Accuracy {0.6587677725118484}
Validation Recall 1.0
Val precision 0.2
val tp 36 fp 144 fn 0


In [None]:
torch.save(model.state_dict(), "bio.pt")

In [None]:
model.load_state_dict(torch.load("bio.pt"))

<All keys matched successfully>