### Importing packages

In [1]:
# Libraries

# Reading in files
import pandas as pd
import numpy as np

# Progress bar
from tqdm.auto import tqdm

# Torch modules
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast as AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup

# Lightning modules
import pytorch_lightning as pl
from torchmetrics.functional import accuracy, auroc
from torchmetrics import F1Score
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

# Split dataset/validation
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix

### Reading in data

In [2]:
train_df = pd.read_csv("data/csv/train.csv")
val_df = pd.read_csv("data/csv/test.csv")
test_df = pd.read_csv("data/csv/dev.csv")

In [3]:
train_df

Unnamed: 0,string,label
0,"However, how frataxin interacts with the Fe-S ...",background
1,"In the study by Hickey et al. (2012), spikes w...",background
2,"The drug also reduces catecholamine secretion,...",background
3,By clustering with lowly aggressive close kin ...,background
4,Ophthalmic symptoms are rare manifestations of...,background
...,...,...
8238,"Importantly, the results of Pascalis et al. (2...",background
8239,"As suggested by Nguena et al, there is a need ...",background
8240,Skeletal muscle is also a primary site of dise...,background
8241,ACTIVATION OF TRANSCRIPTION FACTORS Roles for ...,method


### Tokenizer 

In [4]:
SCIBERT_MODEL_NAME = 'allenai/scibert_scivocab_uncased'
tokenizer = AutoTokenizer.from_pretrained(SCIBERT_MODEL_NAME)

In [5]:
sample_row = train_df.iloc[16]
sample_comment = sample_row["string"]
sample_labels = sample_row["label"]

In [6]:
label_to_idx = {"method":0, "background": 1, "result": 2}
idx_to_label = {0:"method",  1:"background", 2:"result"}

In [7]:
label_to_idx[sample_labels]

1

In [8]:
MAX_TOKEN_COUNT = 512

encoding = tokenizer.encode_plus(
  sample_comment,
  add_special_tokens = True,
  max_length= MAX_TOKEN_COUNT,
  return_token_type_ids = False,
  padding = "max_length",
  return_attention_mask = True,
  return_tensors = 'pt',
)

encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [9]:
class SciBertDataset(Dataset):
  
  def __init__(self, data: pd.DataFrame, tokenizer: AutoTokenizer, max_token_len: int = 512, mapping = label_to_idx):
    self.tokenizer = tokenizer
    self.data = data
    self.max_token_len = max_token_len
    self.mapping = mapping
  
  def __len__(self):
    return len(self.data)
  
  def __getitem__(self, index: int):
    
    data_row = self.data.iloc[index]
    text = data_row["string"]
    labels = self.mapping[data_row["label"]]
    
    encoding = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_token_len,
      return_token_type_ids=False,
      padding="max_length",
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return dict(
      text=text,
      input_ids=encoding["input_ids"].flatten(),
      attention_mask=encoding["attention_mask"].flatten(),
      labels=labels
    )

In [10]:
train_dataset = SciBertDataset(train_df, tokenizer, max_token_len = 512, mapping = label_to_idx)
val_dataset = SciBertDataset(val_df, tokenizer, max_token_len = 512, mapping = label_to_idx)
test_dataset = SciBertDataset(test_df, tokenizer, max_token_len = 512, mapping = label_to_idx)

In [23]:
class SciBertDataModule(pl.LightningDataModule):
  def __init__(self, train_df, test_df, val_df, tokenizer, batch_size=8, max_token_len=512):
    
    super().__init__()
    self.batch_size = batch_size
    self.train_df = train_df
    self.test_df = test_df
    self.val_df = val_df
    self.tokenizer = tokenizer
    self.max_token_len = max_token_len
  
  def setup(self, stage=None):
    self.train_dataset = SciBertDataset(
      self.train_df,
      self.tokenizer,
      self.max_token_len
    )
    self.test_dataset = SciBertDataset(
      self.test_df,
      self.tokenizer,
      self.max_token_len
    )
    self.val_dataset = SciBertDataset(
      self.val_df,
      self.tokenizer,
      self.max_token_len
    )
  
  
  def train_dataloader(self):
    return DataLoader(
      self.train_dataset,
      batch_size=self.batch_size,
      shuffle=True
    )
  
  def val_dataloader(self):
    return DataLoader(
      self.test_dataset,
      batch_size=self.batch_size
    )
  
  def test_dataloader(self):
    return DataLoader(
      self.test_dataset,
      batch_size=self.batch_size
    )

In [25]:
N_EPOCHS = 2
BATCH_SIZE = 8

data_module = SciBertDataModule(
  train_df,
  test_df,
  val_df,
  tokenizer,
  batch_size=BATCH_SIZE,
  max_token_len=MAX_TOKEN_COUNT
)

### Modelling

In [28]:
sample_batch = next(iter(DataLoader(train_dataset, batch_size=8)))
sample_batch

{'text': ['However, how frataxin interacts with the Fe-S cluster biosynthesis components remains unclear as direct one-to-one interactions with each component were reported (IscS [12,22], IscU/Isu1 [6,11,16] or ISD11/Isd11 [14,15]).',
  'In the study by Hickey et al. (2012), spikes were sampled from the field at the point of physiological\nrobinson et al.: genomic regions influencing root traits in barley 11 of 13\nmaturity, dried, grain threshed by hand, and stored at −20C to preserve grain dormancy before germination testing.',
  'The drug also reduces catecholamine secretion, thereby reducing stress and leading to a modest (10-20%) reduction in heart rate and blood pressure, which may be particularly beneficial in patients with cardiovascular disease.(7) Unlike midazolam, dexmedetomidine does not affect the ventilatory response to carbon dioxide.',
  'By clustering with lowly aggressive close kin (King 1989a,b; Viblanc et al. 2010; Arnaud, Dobson & Murie 2012), breeding females may 

In [26]:
SCIBERT_MODEL_NAME = "allenai/scibert_scivocab_uncased"

In [40]:
class SciBertClassModel(pl.LightningModule):
  
  def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
    
    super().__init__()
    self.scibert = AutoModel.from_pretrained(SCIBERT_MODEL_NAME, return_dict=True)
    self.classifier = nn.Linear(self.scibert.config.hidden_size, n_classes)
    self.n_training_steps = n_training_steps
    self.n_warmup_steps = n_warmup_steps
    self.criterion = nn.CrossEntropyLoss()
  
  def forward(self, input_ids, attention_mask, labels=None):
    output = self.scibert(input_ids, attention_mask=attention_mask)
    output = self.classifier(output.pooler_output)
    loss = 0
    if labels is not None:
        loss = self.criterion(output, labels)
    return loss, output
  
  def training_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("train_loss", loss, prog_bar=True, logger=True)
    return {"loss": loss, "predictions": outputs, "labels": labels}
  
  def validation_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("val_loss", loss, prog_bar=True, logger=True)
    return loss
  
  def test_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("test_loss", loss, prog_bar=True, logger=True)
    return loss
  
  def training_epoch_end(self, outputs):
    labels = []
    predictions = []
    
    for output in outputs:
      for out_labels in output["labels"].detach().cpu():
        labels.append(out_labels)
      for out_predictions in output["predictions"].detach().cpu():
        predictions.append(out_predictions)
    
    labels = torch.stack(labels).int()
    predictions = torch.stack(predictions)

    print(labels)
    print(predictions)
    
    for i, name in enumerate(LABEL_COLUMNS):
      class_roc_auc = auroc(predictions[:, i], labels[:, i])
      self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", class_roc_auc, self.current_epoch)
  
  def configure_optimizers(self):
    optimizer = AdamW(self.parameters(), lr=2e-5)
    scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=self.n_warmup_steps,
      num_training_steps=self.n_training_steps
    )
    return dict(
      optimizer = optimizer,
      lr_scheduler = dict(
        scheduler = scheduler,
        interval = 'step'
      )
    )

In [41]:
checkpoint_callback = ModelCheckpoint(
  dirpath = "checkpoints",
  filename = "best-checkpoint",
  save_top_k = 1,
  verbose = True,
  monitor = "val_loss",
  mode = "min"
)

logger = TensorBoardLogger("lightning_logs", name="sciBert")

early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)

trainer = pl.Trainer(
  logger = logger,
  callbacks = [checkpoint_callback, early_stopping_callback],
  max_epochs = N_EPOCHS,
  accelerator = "auto"
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [42]:
steps_per_epoch=len(train_df) // BATCH_SIZE
total_training_steps = steps_per_epoch * N_EPOCHS
warmup_steps = total_training_steps // 5
warmup_steps, total_training_steps

(412, 2060)

In [43]:
model = SciBertClassModel(
  n_classes = 3,
  n_warmup_steps = warmup_steps,
  n_training_steps = total_training_steps
)

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [44]:
trainer.fit(model, data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type             | Params
------------------------------------------------
0 | scibert    | BertModel        | 109 M 
1 | classifier | Linear           | 2.3 K 
2 | criterion  | CrossEntropyLoss | 0     
------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
439.683   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

tensor([1, 1, 1,  ..., 1, 2, 1], dtype=torch.int32)
tensor([[ 0.5595,  0.5880, -0.2772],
        [ 0.4827,  0.6871, -0.4410],
        [ 0.3551,  0.4746, -0.4078],
        ...,
        [ 1.9280,  1.7423, -2.5922],
        [-1.5197, -0.4256,  1.1477],
        [-0.2428,  3.1069, -2.0503]])


NameError: name 'LABEL_COLUMNS' is not defined