In [None]:
# Mount Google Drive into the Colab runtime; the CSV files read later in this
# notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import re

In [None]:
# Folder on the mounted Drive that holds the three dataset splits. Kept in a
# single constant so the notebook can be pointed at another copy of the data
# by editing one line.
DATA_DIR = "/content/drive/MyDrive/archive"

train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
val_df = pd.read_csv(f"{DATA_DIR}/validation.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")

In [None]:
# Count how many training rows carry each raw section name,
# most frequent name first.
frequency_table = train_df['sectionName'].value_counts(sort=True, ascending=False)

# Show the counts as the cell output
frequency_table

sectionName
Discussion                                         1227
Introduction                                        833
Methods                                             789
DISCUSSION                                          481
Results                                             359
                                                   ... 
9. Conclusion and future work                         1
2.3 Methods                                           1
5 Computational Study                                 1
Protein samples                                       1
2.2 Concepts for improved model version control       1
Name: count, Length: 1146, dtype: int64

In [None]:
def preprocess_sectionName(sectionName):
    """Collapse a raw section heading into one of a few canonical names.

    The heading is converted to text, lower-cased, and matched by substring
    against an ordered list of keyword groups. The first group with a hit
    decides the result, so "Results and Discussion" becomes "results"
    because the result keywords are tested before "discussion".

    Args:
        sectionName: Raw heading. Any value is accepted because it is passed
            through ``str()`` first; a missing value (NaN / None) therefore
            becomes the text "nan" / "none" and is reported as "unspecified".

    Returns:
        str: "introduction", "results", "method", "discussion", "background",
        "experiment", "related work", "evaluation", "implementation",
        "conclusion", "limitation", "appendix", "analysis", or "unspecified"
        when no keyword occurs in the heading.
    """
    # Order matters: earlier groups win when a heading contains keywords
    # from several groups.
    keyword_groups = (
        ("introduction", ("introduction", "preliminaries")),
        ("results", ("result", "finding")),
        ("method", ("method", "approach")),
        ("discussion", ("discussion",)),
        ("background", ("background",)),
        ("experiment", ("experiment", "setup", "set-up", "set up")),
        ("related work", ("related work", "relatedwork", "prior work", "literature review")),
        ("evaluation", ("evaluation",)),
        ("implementation", ("implementation",)),
        ("conclusion", ("conclusion",)),
        ("limitation", ("limitation",)),
        ("appendix", ("appendix",)),
        # NOTE(review): "future work" / "extension" headings are folded into
        # "appendix" rather than getting a category of their own -- confirm
        # this is intended.
        ("appendix", ("future work", "extension")),
        ("analysis", ("analysis",)),
    )

    heading = str(sectionName).lower()
    for canonical, keywords in keyword_groups:
        if any(keyword in heading for keyword in keywords):
            return canonical
    return "unspecified"

In [None]:
# Replace the raw section names with their canonical form in every split
for split_df in (train_df, val_df, test_df):
    split_df["sectionName"] = split_df["sectionName"].apply(preprocess_sectionName)

# Recount the training split now that the names are normalized
frequency_table = train_df['sectionName'].value_counts()

# Show the counts as the cell output
frequency_table

sectionName
discussion        2009
introduction      1686
unspecified       1580
method            1532
results            764
experiment         291
background         124
implementation      68
related work        61
analysis            22
conclusion          20
evaluation          18
appendix            14
limitation           5
Name: count, dtype: int64

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from transformers import AutoModel,BertTokenizerFast

# Checkpoint shared by the encoder and its tokenizer.
# (Commented-out alternative: "allenai/scibert_scivocab_uncased".)
pretrained_name = 'bert-base-uncased'

# Pre-trained BERT-base encoder
bert = AutoModel.from_pretrained(pretrained_name)

# Fast tokenizer belonging to the same checkpoint
tokenizer = BertTokenizerFast.from_pretrained(pretrained_name)

In [None]:
import transformers
from transformers import AutoModel,BertTokenizerFast
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# runtime without CUDA instead of failing at the first `.to(device)` call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Canonical section names; each name's position in this list is its integer id.
_section_names = [
    "discussion", "introduction", "unspecified", "method",
    "results", "experiment", "background", "implementation",
    "related work", "analysis", "conclusion", "evaluation",
    "appendix", "limitation",
]
sec_name_mapping = {name: idx for idx, name in enumerate(_section_names)}

In [None]:
# Keep only the text, section name and target columns of each split
_kept_columns = ["string", "sectionName", "label"]
train_df_clean = train_df[_kept_columns]
val_df_clean = val_df[_kept_columns]
test_df_clean = test_df[_kept_columns]

In [None]:
# Per split: the 'string' column is the model input, the 'label' column the
# target. The section names are not turned into a model input here (the
# section-id encoding via sec_name_mapping is currently disabled).
train_text, train_labels = train_df_clean['string'], train_df_clean["label"]
val_text, val_labels = val_df_clean['string'], val_df_clean["label"]
test_text, test_labels = test_df_clean['string'], test_df_clean["label"]

In [None]:
# Token length every input is padded or truncated to when the tokenizer
# encodes the train / validation / test texts.
max_seq_len = 75

In [None]:
def _encode(texts):
    """Tokenize a Series of texts, padding/truncating each to max_seq_len tokens."""
    return tokenizer.batch_encode_plus(
        texts.tolist(),
        max_length=max_seq_len,
        padding='max_length',
        truncation=True,
        return_token_type_ids=False,
    )


# Encode the training, validation and test texts with identical settings
tokens_train = _encode(train_text)
tokens_val = _encode(val_text)
tokens_test = _encode(test_text)

In [None]:
def _as_tensors(encoded, labels):
    """Return (input ids, attention mask, labels) as tensors for one split."""
    return (
        torch.tensor(encoded['input_ids']),
        torch.tensor(encoded['attention_mask']),
        torch.tensor(labels.tolist()),
    )


# One (ids, mask, labels) triple per split
train_seq, train_mask, train_y = _as_tensors(tokens_train, train_labels)
val_seq, val_mask, val_y = _as_tensors(tokens_val, val_labels)
test_seq, test_mask, test_y = _as_tensors(tokens_test, test_labels)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Number of examples per mini-batch
batch_size = 32

# Training split: bundle ids / mask / labels and draw batches in random order
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation split: same bundling, but batches are read in stored order
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

In [None]:
# Freeze the encoder: turn off gradient tracking for every BERT parameter.
# The trailing semicolon keeps the notebook from echoing the module repr.
bert.requires_grad_(False);

In [None]:
class BERT_Arch(nn.Module):
    """Small classification head stacked on a BERT encoder.

    The second element of the encoder's output tuple (768 features per
    example) is passed through
    Linear(768, 512) -> ReLU -> Dropout(0.1) -> Linear(512, 3) -> LogSoftmax,
    so ``forward`` returns log-probabilities over 3 classes.
    """

    def __init__(self, bert):
        """Store the encoder and create the head layers.

        Args:
            bert: Encoder module; it is called as
                ``bert(sent_id, attention_mask=mask, return_dict=False)`` and
                must return a 2-tuple.
        """
        super().__init__()

        self.bert = bert

        # Regularization and non-linearity used between the two dense layers
        self.dropout = nn.Dropout(0.1)
        self.relu = nn.ReLU()

        # Hidden layer followed by the 3-way output layer
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 3)

        # Not used by forward() (the section-id branch is disabled); retained
        # so the module's state_dict keeps the same keys.
        self.fc3 = nn.Linear(4, 3)

        # Converts the output scores to log-probabilities across the classes
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return per-class log-probabilities for a batch.

        Args:
            sent_id: Token ids handed to the encoder.
            mask: Attention mask handed to the encoder.

        Returns:
            Tensor with one row per example and 3 columns (log-probabilities).
        """
        # With return_dict=False the encoder yields a tuple; only its second
        # element is used by the head.
        _, pooled = self.bert(sent_id, attention_mask=mask, return_dict=False)

        hidden = self.dropout(self.relu(self.fc1(pooled)))
        scores = self.fc2(hidden)
        return self.softmax(scores)

In [None]:
# Wrap the pre-trained encoder in the classification architecture and move
# the combined model onto the selected device.
model = BERT_Arch(bert).to(device)

In [None]:
# AdamW optimizer from torch.optim (not the Hugging Face implementation)
from torch.optim import AdamW

# define the optimizer over the model's parameters with a learning rate of 1e-3
optimizer = AdamW(model.parameters(), lr = 1e-3)

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' class weights computed from the training labels
label_classes = np.unique(train_labels)
class_wts = compute_class_weight(class_weight='balanced', classes=label_classes, y=train_labels)

print(class_wts)

[1.20535452 0.56432507 2.51041667]


In [None]:
# Class weights as a float tensor living on the training device
weights = torch.tensor(class_wts, dtype=torch.float).to(device)

# Weighted negative log-likelihood loss
cross_entropy = nn.NLLLoss(weight=weights)

# How many passes over the training data to run
epochs = 10

In [None]:
def train():
  """Run one training pass over `train_dataloader`.

  Works on the module-level `model`, `optimizer`, `cross_entropy` and
  `device`. Prints a progress line every 50 batches and, at the end, the
  average loss together with the stacked predictions.

  Returns:
    (avg_loss, total_preds): the mean of the per-batch losses, and a numpy
    array holding the model outputs of all batches concatenated along axis 0.
  """
  model.train()

  running_loss = 0
  batch_outputs = []
  num_batches = len(train_dataloader)

  for step, batch in enumerate(train_dataloader):

    # progress line every 50 batches (but not before the first one)
    if step != 0 and step % 50 == 0:
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, num_batches))

    # move the three tensors of the batch to the training device
    sent_id, mask, labels = [tensor.to(device) for tensor in batch]

    # reset gradients left over from the previous step
    model.zero_grad()

    # forward pass and loss
    preds = model(sent_id, mask)
    loss = cross_entropy(preds, labels)
    running_loss += loss.item()

    # backward pass; the gradient norm is clipped to 1.0 before the update
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()

    # keep a detached CPU copy of this batch's outputs
    batch_outputs.append(preds.detach().cpu().numpy())

  # mean loss over the batches of this epoch
  avg_loss = running_loss / num_batches

  # stack the per-batch arrays into one (samples, classes) array
  total_preds = np.concatenate(batch_outputs, axis=0)

  print(avg_loss, total_preds)
  return avg_loss, total_preds

In [None]:
# function for evaluating the model
def evaluate():
  """Score the model on `val_dataloader` without updating it.

  Works on the module-level `model`, `cross_entropy` and `device`. The model
  is switched to eval mode and every forward pass runs under
  `torch.no_grad()`. A progress line is printed every 50 batches.

  Returns:
    (avg_loss, total_preds): the mean of the per-batch losses, and a numpy
    array holding the model outputs of all batches concatenated along axis 0.
  """

  print("\nEvaluating...")

  # deactivate dropout layers
  model.eval()

  total_loss = 0

  # empty list to save the model predictions
  total_preds = []

  # iterate over batches
  for step,batch in enumerate(val_dataloader):

    # Progress update every 50 batches. (The previous elapsed-time
    # computation relied on `format_time`, `time` and `t0`, none of which
    # exist in this notebook, so it raised NameError as soon as this branch
    # ran; its result was never used.)
    if step % 50 == 0 and not step == 0:
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

    # push the batch to the evaluation device
    batch = [t.to(device) for t in batch]

    sent_id, mask, labels = batch

    # deactivate autograd
    with torch.no_grad():

      # model predictions
      preds = model(sent_id, mask)

      # compute the validation loss between actual and predicted values
      loss = cross_entropy(preds,labels)

      total_loss = total_loss + loss.item()

      preds = preds.detach().cpu().numpy()

      total_preds.append(preds)

  # compute the validation loss of the epoch
  avg_loss = total_loss / len(val_dataloader)

  # reshape the predictions in form of (number of samples, no. of classes)
  total_preds  = np.concatenate(total_preds, axis=0)

  return avg_loss, total_preds

In [None]:
# Lowest validation loss seen so far; infinity guarantees the first epoch
# is saved.
best_valid_loss = float('inf')

# Loss history, one entry per epoch
train_losses, valid_losses = [], []

for epoch in range(epochs):

    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    # one pass over the training data, then score the validation data
    train_loss, _ = train()
    valid_loss, _ = evaluate()

    # checkpoint the weights whenever the validation loss improves
    is_best = valid_loss < best_valid_loss
    if is_best:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    # record this epoch's losses
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 10
  Batch    50  of    257.
  Batch   100  of    257.
  Batch   150  of    257.
  Batch   200  of    257.
  Batch   250  of    257.
0.9366491700192834 [[-1.1113416  -1.1184449  -1.0668366 ]
 [-1.1367912  -1.0692683  -1.0909613 ]
 [-1.0611637  -1.2017632  -1.0404843 ]
 ...
 [-1.1856863  -1.1629978  -0.9625551 ]
 [-0.35344782 -1.6040735  -2.3365388 ]
 [-0.42011163 -1.2809013  -2.7296872 ]]

Evaluating...

Training Loss: 0.937
Validation Loss: 0.782

 Epoch 2 / 10
  Batch    50  of    257.
  Batch   100  of    257.
  Batch   150  of    257.
  Batch   200  of    257.
  Batch   250  of    257.
0.7826803889024119 [[-1.4143225  -0.68643373 -1.372227  ]
 [-0.8082758  -1.024401   -1.632897  ]
 [-2.5951493  -2.4016607  -0.1805657 ]
 ...
 [-1.587745   -1.20713    -0.7000515 ]
 [-0.27083156 -1.7468596  -2.7656598 ]
 [-0.8931618  -0.7512741  -2.1296782 ]]

Evaluating...

Training Loss: 0.783
Validation Loss: 0.685

 Epoch 3 / 10
  Batch    50  of    257.
  Batch   100  of    257.
  Bat

In [None]:
# Reload the checkpoint written by the training loop (lowest validation loss).
path = 'saved_weights.pt'
# map_location remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU runtime also loads on a CPU-only one.
model.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

In [None]:
# get predictions for test data
# Switch to eval mode explicitly so dropout is off no matter which cell ran
# last (previously this relied on evaluate() having been the most recent call).
model.eval()
with torch.no_grad():
  # Score the test set in chunks of `batch_size` instead of a single forward
  # pass over every example, which keeps device memory use bounded.
  preds = torch.cat([
      model(seq_chunk.to(device), mask_chunk.to(device)).detach().cpu()
      for seq_chunk, mask_chunk in zip(test_seq.split(batch_size), test_mask.split(batch_size))
  ])
  preds = preds.numpy()

In [None]:
# Model performance: take the highest-scoring class of every row as the
# predicted label, then print the per-class precision / recall / F1 report.
preds = preds.argmax(axis=1)
print(classification_report(test_y, preds))


              precision    recall  f1-score   support

           0       0.78      0.82      0.80       604
           1       0.87      0.73      0.79       996
           2       0.52      0.78      0.62       259

    accuracy                           0.76      1859
   macro avg       0.72      0.78      0.74      1859
weighted avg       0.79      0.76      0.77      1859



In [None]:
from sklearn.metrics import f1_score

# Micro-averaged F1 of the predicted labels (preds) against the test labels (test_y)
micro_f1 = f1_score(test_y, preds, average='micro')

print("Micro F1 Score:", micro_f1)

Micro F1 Score: 0.7643894566971491


In [None]:
# Macro-averaged F1 of the predicted labels (preds) against the test labels (test_y)
macro_f1 = f1_score(test_y, preds, average='macro')

# The label previously read "Micro F1 Score", mislabeling the macro result.
print("Macro F1 Score:", macro_f1)

Micro F1 Score: 0.7373121387038225
