**Import libraries and mount gdrive**

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import shutil
torch.__version__

'1.10.0+cu111'

**Check if gpu is available**

In [4]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
torch.cuda.empty_cache()
print(device)

cuda


**Load Datasets**

In [5]:
df_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/proj/Data/sentences/sentence_train.csv')
df_test= pd.read_csv('/content/drive/MyDrive/Colab Notebooks/proj/Data/sentences/sentence_test.csv')

**Preparing Dataset**

In [6]:
targetList= df_train['label'].unique().tolist()
targetList

['vector spaces',
 'pos_def_matrices',
 'eigenvec_val',
 'determinants',
 'orthogonality',
 'linear_prog',
 'gauss elim',
 'computations']

In [7]:
df_test = pd.concat([df_test,pd.get_dummies(df_test['label'])], axis=1)
df_test.drop(columns='label')
print(df_test.shape)
df_test.head()

(12121, 10)


Unnamed: 0,text,label,computations,determinants,eigenvec_val,gauss elim,linear_prog,orthogonality,pos_def_matrices,vector spaces
0,This book begins with the central problem of l...,gauss elim,0,0,0,1,0,0,0,0
1,should work and it does: 4 times (x = −1) plus...,gauss elim,0,0,0,1,0,0,0,0
2,If we stay with determinants (which we don’t p...,gauss elim,0,0,0,1,0,0,0,0
3,"formula to compute the other unknown, x:",gauss elim,0,0,0,1,0,0,0,0
4,The idea of elimination is deceptively simple—...,gauss elim,0,0,0,1,0,0,0,0


In [8]:
df_train = pd.concat([df_train,pd.get_dummies(df_train['label'])], axis=1)
df_train = df_train.drop(columns='label')
print(df_train.shape)
df_train.head()

(9983, 9)


Unnamed: 0,text,computations,determinants,eigenvec_val,gauss elim,linear_prog,orthogonality,pos_def_matrices,vector spaces
0,1 3 3 2,0,0,0,0,0,0,0,1
1,The second eigenvalue λ2(A) = 2 is above the l...,0,0,0,0,0,0,1,0
2,matrix Q−1AQ = QTAQ.,0,0,0,0,0,0,1,0
3,i . Stability depends on the eigenvalues:,0,0,1,0,0,0,0,0
4,Rayleigh quotient is the fundamental frequency...,0,0,0,0,0,0,1,0


In [None]:
pip install transformers

In [10]:
from transformers import BertTokenizer, BertModel

In [None]:
tokenizer= BertTokenizer.from_pretrained("bert-base-uncased")

**Split dataset into 2 halves to make NN computations less heavy**

In [12]:
df_train1=df_train.sample(frac=0.5, random_state= 200).reset_index(drop=True)
page_df= df_train.drop(df_train1.index).reset_index(drop=True)
df_train= df_train1
page_df.head(), df_train.head()

(                                                text  ...  vector spaces
 0  26. If aij is i times j, show that detA = 0. (...  ...              0
 1                   by U cannot destroy the scaling.  ...              0
 2  Hint: Subtracting the last row from each of th...  ...              0
 3  41. A=2∗eye(n)−diag(ones(n−1, 1),1)−diag(ones(...  ...              0
 4  optimal λ and µ. In the exercises, we stay wit...  ...              0
 
 [5 rows x 9 columns],
                                                 text  ...  vector spaces
 0  ak and bk. From this inﬁnite sequence of sines...  ...              0
 1  that a zero can appear in a pivot position, ev...  ...              0
 2              13. Apply the Gram-Schmidt process to  ...              0
 3                                   A changes to ATA  ...              0
 4  The general case is the same. We “solve” ax = ...  ...              0
 
 [5 rows x 9 columns])

**Basic parameters for training a deep learning model**

In [13]:
MAX_LEN= 256
BATCH_SIZE= 16
EPOCHS= 4
LR= 1e-4

**Dataset class to get text encodings**

In [14]:
class myDataset(torch.utils.data.Dataset):
    def __init__(self,df, tokenizer, max_len):
        self.df = df
        self.tokenizer= tokenizer
        self.max_len = max_len
        self.text= df['text']
        self.targets = self.df[targetList].values
    def __len__(self):
        return len(self.text)
    def __getitem__(self,index):
        
        text= str(self.text[index])
        text=  " ".join(text.split())
        
        inputs= self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids= True,
            truncation=True,
            return_attention_mask=True,
            return_tensors= 'pt'
        )
        
        return {
            'input_ids':inputs['input_ids'].flatten(),
            'attention_mask':inputs['attention_mask'].flatten(),
            'token_type_ids':inputs['token_type_ids'].flatten(),
            'targets':torch.FloatTensor(self.targets[index])
        }


**Split current training set into train and validation sets**

In [15]:
train_df= df_train.sample(frac=0.8,random_state=200).reset_index(drop=True)
val_df= df_train.drop(train_df.index).reset_index(drop=True)
train_df.shape, val_df.shape

((3994, 9), (998, 9))

In [16]:
train_dataset=myDataset(train_df,tokenizer, MAX_LEN)
val_dataset = myDataset(val_df,tokenizer,MAX_LEN)

**Prepare DataLoader objects**

In [17]:
trainLoader = torch.utils.data.DataLoader(
    train_dataset,
    shuffle=True,
    batch_size = BATCH_SIZE,
    num_workers= 0 
    )
valLoader = torch.utils.data.DataLoader(
    val_dataset,
    shuffle=False,
    batch_size = BATCH_SIZE,
    num_workers= 0 
    )

**Load and save checkpoints in case system crashes or we need to use model parameters somewhere else**

In [18]:
def load_ckp(ckpt_path,model,optimizer):
    checkpoint= torch.load(ckpt_path)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    valid_loss_min= checkpoint['valid_loss_min']
    return model, optimizer, checkpoint['epoch'],valid_loss_min
def save_ckp(state, is_best, ckpt_path,best_model_path):
    torch.save(state, ckpt_path)
    if is_best:
        best_path = best_model_path
        shutil.copyfile(ckpt_path,best_path)

**BERT CLASS: Describes how model will behave**
_Bert base computations->dropout layer->linear layer_ 

In [None]:
class BERTModel(nn.Module):
    def __init__(self):
        super(BERTModel,self).__init__()
        self.bert= BertModel.from_pretrained("bert-base-uncased", return_dict= True)
        self.dropout = nn.Dropout(0.6) 
        self.linear =nn.Linear(768,8)
    def forward(self, input_ids,attention_mask,token_type_ids):
        output = self.bert(input_ids,attention_mask,token_type_ids)
        outputDropout= self.dropout(output.pooler_output)
        output= self.linear(outputDropout)
        return output
model = BERTModel()
model.to(device)

**Define Loss function**

_Model.parameters() consists of model weights and biases_

In [24]:
def loss_fn(outputs, targets):
  return nn.BCEWithLogitsLoss()(outputs,targets)
def accuracy(output,targets):
  final_output = torch.sigmoid(output).cpu().detach().numpy()
  return np.sum( np.argmax(targets.cpu().numpy(),axis=1) == np.argmax(final_output, axis=1) ) / BATCH_SIZE *100
optimizer= torch.optim.Adam(params=model.parameters(),lr=LR)

**Define train function. Define fine-tuning procedure step by step**

In [25]:
def train_model(epochs, trainLoader,valLoader,model,optimizer,ckpt_path,best_model_path,val_loss_min):
    for epoch in range(1, epochs+1):
        print("epoch[{}]".format(epoch))
        train_loss= 0
        val_loss= 0
        for index, batch in enumerate(trainLoader):
            input_ids= batch['input_ids'].to(device, dtype= torch.long)
            attention_mask = batch['attention_mask'].to(device, dtype= torch.long)
            token_type_ids= batch['token_type_ids'].to(device, dtype= torch.long)
            targets = batch['targets'].to(device, dtype= torch.float)

            output= model(input_ids, attention_mask, token_type_ids)
            
            optimizer.zero_grad()
            loss= loss_fn(output,targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_acc= accuracy(output,targets)
            train_loss=train_loss + ((1/(index+1))*(loss.item()-train_loss))
            print("TRAIN BATCH {}/{}====>train loss[{:.8f}] | train accuracy[{:.8f}%]".format(index+1,len(trainLoader) , train_loss, train_acc ))
            #train_lossList+=[train_loss]
            #train_accList+=[train_acc]
        model.eval() 
        with torch.no_grad():
            for index, batch in enumerate(valLoader):
                print("validation batch index:{}/{}".format(index,len(valLoader)))
                input_ids= batch['input_ids'].to(device, dtype= torch.long)
                attention_mask = batch['attention_mask'].to(device, dtype= torch.long)
                token_type_ids= batch['token_type_ids'].to(device, dtype= torch.long)
                targets = batch['targets'].to(device, dtype= torch.float)
                
                output =model(input_ids,attention_mask,token_type_ids)
                val_acc= accuracy(output,targets)
                loss= loss_fn(output,targets)
                val_loss= (val_loss + (1/(index+1))*(loss.item()-val_loss))
                print("VALIDATION BATCH {}/{}====>val loss[{:.8f}] | val accuracy[{:.8f}%]".format(index+1,len(valLoader) , val_loss, val_acc ))
                #val_lossList+=[val_loss]
                #val_accList+=[val_acc]
        checkpoint ={
            'epoch': epoch+1,
            'valid_loss_min':val_loss,
            'state_dict':model.state_dict(),
            'optimizer':optimizer.state_dict()
        }
        save_ckp(checkpoint,False,ckpt_path,best_model_path)

        if val_loss < val_loss_min:
          print("previous Val_loss_min={:.8f}; new val_loss_min={:.8f}".format(val_loss_min, val_loss))
          save_ckp(checkpoint, True, ckpt_path, best_model_path)
          print("SAVED")
          val_loss_min = val_loss
        print("epoch {} end".format(epoch))
    return model

**Train 1st half of dataset**

In [26]:
ckpt_path = "/content/drive/MyDrive/Colab Notebooks/proj/Data/sentences/curr_path"
best_model_path = "/content/drive/MyDrive/Colab Notebooks/proj/Data/sentences/best.pt"

In [27]:
trained_model=train_model(EPOCHS,trainLoader,valLoader,model, optimizer,ckpt_path,best_model_path,np.Inf)

epoch[1]
TRAIN BATCH 1/250====>train loss[0.74241090] | train accuracy[25.00000000%]
TRAIN BATCH 2/250====>train loss[0.71333539] | train accuracy[12.50000000%]
TRAIN BATCH 3/250====>train loss[0.69315662] | train accuracy[6.25000000%]
TRAIN BATCH 4/250====>train loss[0.66752519] | train accuracy[6.25000000%]
TRAIN BATCH 5/250====>train loss[0.63965445] | train accuracy[12.50000000%]
TRAIN BATCH 6/250====>train loss[0.61854233] | train accuracy[6.25000000%]
TRAIN BATCH 7/250====>train loss[0.59202537] | train accuracy[37.50000000%]
TRAIN BATCH 8/250====>train loss[0.57190254] | train accuracy[18.75000000%]
TRAIN BATCH 9/250====>train loss[0.55311090] | train accuracy[25.00000000%]
TRAIN BATCH 10/250====>train loss[0.53944784] | train accuracy[18.75000000%]
TRAIN BATCH 11/250====>train loss[0.52685070] | train accuracy[12.50000000%]
TRAIN BATCH 12/250====>train loss[0.51544100] | train accuracy[18.75000000%]
TRAIN BATCH 13/250====>train loss[0.50590301] | train accuracy[18.75000000%]
TR

**Train 2nd half of dataset**

In [28]:
train_df2= page_df.sample(frac=0.8,random_state=200).reset_index(drop=True)
val_df2= page_df.drop(train_df2.index).reset_index(drop=True)

In [29]:
train_dataset2=myDataset(train_df2,tokenizer, MAX_LEN)
val_dataset2 = myDataset(val_df2,tokenizer,MAX_LEN)

In [30]:
trainLoader2 = torch.utils.data.DataLoader(
    train_dataset2,
    shuffle=True,
    batch_size = BATCH_SIZE,
    num_workers= 0 
    )
valLoader2 = torch.utils.data.DataLoader(
    val_dataset2,
    shuffle=False,
    batch_size = BATCH_SIZE,
    num_workers= 0 
    )

In [31]:
_,_,_,valid_loss_min= load_ckp(best_model_path,model,optimizer)
print(valid_loss_min)

0.10706023387019598


In [32]:
trained_model=train_model(EPOCHS,trainLoader2,valLoader2,model, optimizer,ckpt_path,best_model_path,valid_loss_min)

epoch[1]
TRAIN BATCH 1/250====>train loss[0.24400586] | train accuracy[68.75000000%]
TRAIN BATCH 2/250====>train loss[0.22997821] | train accuracy[75.00000000%]
TRAIN BATCH 3/250====>train loss[0.20277646] | train accuracy[75.00000000%]
TRAIN BATCH 4/250====>train loss[0.20922163] | train accuracy[62.50000000%]
TRAIN BATCH 5/250====>train loss[0.19725994] | train accuracy[75.00000000%]
TRAIN BATCH 6/250====>train loss[0.18821281] | train accuracy[81.25000000%]
TRAIN BATCH 7/250====>train loss[0.17353845] | train accuracy[87.50000000%]
TRAIN BATCH 8/250====>train loss[0.16241735] | train accuracy[93.75000000%]
TRAIN BATCH 9/250====>train loss[0.16496048] | train accuracy[75.00000000%]
TRAIN BATCH 10/250====>train loss[0.16358655] | train accuracy[75.00000000%]
TRAIN BATCH 11/250====>train loss[0.16651003] | train accuracy[62.50000000%]
TRAIN BATCH 12/250====>train loss[0.17867123] | train accuracy[56.25000000%]
TRAIN BATCH 13/250====>train loss[0.18661779] | train accuracy[62.50000000%]

**Test model**

In [33]:
test_dataset = myDataset(df_test,tokenizer,MAX_LEN)
testLoader = torch.utils.data.DataLoader(
    test_dataset,
    shuffle=False,
    batch_size = BATCH_SIZE,
    num_workers= 0 
    )


In [34]:
#testing
test_accList =[]
test_loss=0
model.eval() 
with torch.no_grad():
  for index, batch in enumerate(testLoader):
    input_ids= batch['input_ids'].to(device, dtype= torch.long)
    attention_mask = batch['attention_mask'].to(device, dtype= torch.long)
    token_type_ids= batch['token_type_ids'].to(device, dtype= torch.long)
    targets = batch['targets'].to(device, dtype= torch.float)
    
    output =model(input_ids,attention_mask,token_type_ids)
    
    loss= loss_fn(output,targets)
    test_acc= accuracy(output,targets)
    test_accList+= [test_acc]
    test_loss = test_loss + ((1 / (index + 1)) * (loss.item() - test_loss))
    print("TEST BATCH {}/{}====>test loss[{:.8f}] | test accuracy[{:.8f}%]".format(index+1,len(testLoader) , test_loss, test_acc ))
    #final_output = torch.sigmoid(output).cpu().detach().numpy()
    #print(np.argmax(targets.cpu().numpy(),axis=1), np.argmax(final_output, axis=1) )
print("AVG Accuracy = ", sum(test_accList)/len(test_accList))

TEST BATCH 1/758====>test loss[0.17626327] | test accuracy[81.25000000%]
TEST BATCH 2/758====>test loss[0.20837490] | test accuracy[75.00000000%]
TEST BATCH 3/758====>test loss[0.18197175] | test accuracy[81.25000000%]
TEST BATCH 4/758====>test loss[0.18138437] | test accuracy[81.25000000%]
TEST BATCH 5/758====>test loss[0.15669663] | test accuracy[93.75000000%]
TEST BATCH 6/758====>test loss[0.16370901] | test accuracy[81.25000000%]
TEST BATCH 7/758====>test loss[0.17030768] | test accuracy[68.75000000%]
TEST BATCH 8/758====>test loss[0.15309876] | test accuracy[93.75000000%]
TEST BATCH 9/758====>test loss[0.13683048] | test accuracy[100.00000000%]
TEST BATCH 10/758====>test loss[0.12447920] | test accuracy[100.00000000%]
TEST BATCH 11/758====>test loss[0.12011157] | test accuracy[93.75000000%]
TEST BATCH 12/758====>test loss[0.12753581] | test accuracy[75.00000000%]
TEST BATCH 13/758====>test loss[0.12085439] | test accuracy[93.75000000%]
TEST BATCH 14/758====>test loss[0.11545673] |