In [1]:
# Libraries

import matplotlib.pyplot as plt
import pandas as pd
import torch
import json 

# Preliminaries
from datasets import load_dataset

# Models

import torch.nn as nn
from transformers import AutoTokenizer, BertForSequenceClassification

# Training

import torch.optim as optim

# Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns


## Reading in data

In [2]:
dataset = load_dataset("scicite")

Using custom data configuration default
Reusing dataset scicite (C:\Users\Benjamin Aw\.cache\huggingface\datasets\scicite\default\1.0.0\cb102eb6c2a9e96c287c1723c1399156ba699ec30d435bfa9b07c7745d73c820)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['string', 'sectionName', 'label', 'citingPaperId', 'citedPaperId', 'excerpt_index', 'isKeyCitation', 'label2', 'citeEnd', 'citeStart', 'source', 'label_confidence', 'label2_confidence', 'id'],
        num_rows: 8194
    })
    validation: Dataset({
        features: ['string', 'sectionName', 'label', 'citingPaperId', 'citedPaperId', 'excerpt_index', 'isKeyCitation', 'label2', 'citeEnd', 'citeStart', 'source', 'label_confidence', 'label2_confidence', 'id'],
        num_rows: 916
    })
    test: Dataset({
        features: ['string', 'sectionName', 'label', 'citingPaperId', 'citedPaperId', 'excerpt_index', 'isKeyCitation', 'label2', 'citeEnd', 'citeStart', 'source', 'label_confidence', 'label2_confidence', 'id'],
        num_rows: 1859
    })
})

## Tokenising text

Using Tokerniser to create a dataset class for training of the model with:
1. Labels: The label of the training data
2. Text: The training data

In [7]:
import torch
import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

labels = {'method':0,
          'background':1,
          'result':2
          }

class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = [labels[label] for label in df['category']]
        self.texts = [tokenizer(text,
                                padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):
        # Get a set of labels and training data based on the index provided
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [12]:
from torch import nn
from transformers import  AutoModel

class SciBertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(SciBertClassifier, self).__init__()

        self.scibert = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 5)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.scibert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [14]:
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), 
                                     [int(.8*len(df)), int(.9*len(df))])

In [19]:
from torch.optim import Adam
from tqdm import tqdm

def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} | Train Accuracy: {total_acc_train / len(train_data): .3f} | Val Loss: {total_loss_val / len(val_data): .3f} | Val Accuracy: {total_acc_val / len(val_data): .3f}')
                  
EPOCHS = 5
model = SciBertClassifier()
LR = 1e-6
              
train(model, df_train, df_val, LR, EPOCHS)
PATH = './Model_Parameters/scibert.pth'
torch.save(model.state_dict(), PATH)

100%|██████████| 890/890 [01:05<00:00, 13.66it/s]


Epochs: 1 | Train Loss:  0.022 | Train Accuracy:  0.994 | Val Loss:  0.042 | Val Accuracy:  0.986


100%|██████████| 890/890 [01:04<00:00, 13.75it/s]


Epochs: 2 | Train Loss:  0.012 | Train Accuracy:  0.998 | Val Loss:  0.038 | Val Accuracy:  0.982


100%|██████████| 890/890 [01:03<00:00, 13.94it/s]


Epochs: 3 | Train Loss:  0.008 | Train Accuracy:  0.999 | Val Loss:  0.038 | Val Accuracy:  0.982


100%|██████████| 890/890 [03:57<00:00,  3.74it/s]


Epochs: 4 | Train Loss:  0.006 | Train Accuracy:  0.999 | Val Loss:  0.035 | Val Accuracy:  0.982


100%|██████████| 890/890 [04:44<00:00,  3.13it/s]


Epochs: 5 | Train Loss:  0.005 | Train Accuracy:  0.999 | Val Loss:  0.039 | Val Accuracy:  0.977


In [18]:
def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)

              acc = (output.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    
evaluate(model, df_test)

Test Accuracy:  0.973
