# Fine Tuning Transformer for MultiClass Text Classification

# Importing the libraries needed

In [13]:

import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer

# Setting up the device for GPU usage

In [14]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [15]:
device

'cuda'

In [16]:
import re

## Load and process train dataset

In [17]:
def remove_comments(string):
            pattern = r"(\".*?\"|\'.*?\')|(/\*.*?\*/|//[^\r\n]*$)"
            # first group captures quoted strings (double or single)
            # second group captures comments (//single-line or /* multi-line */)
            regex = re.compile(pattern, re.MULTILINE|re.DOTALL)
            def _replacer(match):
                # if the 2nd group is not None, then we have captured a real comment string.
                if match.group(2) is not None:
                    return "" 
                else: # otherwise, we will return the 1st group
                    return match.group(1) 
            return regex.sub(_replacer, string)

In [21]:
train_set = pd.read_pickle("/home/bombbom/Documents/Multil_Model_Detection/dataset/reentrancy_train.pkl") # train
train_set = pd.DataFrame(train_set).drop(columns=['address','sourcecode_len','bytecode_len', 'dataset'])
train_set['slither'] = train_set.slither.eq('reentrancy').mul(1)
train_set=train_set.rename(columns={"slither":'label'})
train_set.source_code = train_set.source_code.apply(remove_comments)
train_set['label'] = train_set['label']*1



## Detail of train dataset

In [22]:
len(train_set)

51197

## Defining some key variables that will be used later on in the training

In [23]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 1
LEARNING_RATE = 1e-05

## Load custom tokenizer

In [24]:

tokenizer = DistilBertTokenizer.from_pretrained('/home/bombbom/Documents/Multil_Model_Detection/dataset/custom/')

## Custom dataset class
- tokenizer.encode_plus()

In [25]:
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        title = str(self.data.source_code[index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.label[index], dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [26]:
training_set = Triage(train_set, tokenizer, MAX_LEN)


In [27]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)


## Load pre-trained models and Freeze all DistilBERT parameters

In [28]:
distil_model = DistilBertModel.from_pretrained("/home/bombbom/Documents/Multil_Model_Detection/dataset/pre_train/")
for prama in distil_model.parameters():
    prama.requires_grad=False

Some weights of the model checkpoint at /home/bombbom/Documents/Multil_Model_Detection/dataset/pre_train/ were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [29]:
distil_model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(52000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Li

In [30]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = distil_model
        self.lstm1 = torch.nn.LSTM(input_size=768,
                            hidden_size=256,
                            num_layers=2,
                            batch_first=True,
                            bidirectional=True) 
        self.pre_classifier = torch.nn.Linear(256*2, 256)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(256, 1)
         

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        self.lstm1.flatten_parameters()
        output, (hidden, cell) = self.lstm1(hidden_state)
        x = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)

        # pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(x)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [31]:
model = DistillBERTClass()
model.to(device)

DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(i

In [32]:
list(model.parameters())

[Parameter containing:
 tensor([[-0.0211, -0.0252, -0.0472,  ..., -0.0217, -0.0402, -0.0604],
         [-0.0183, -0.0248, -0.0569,  ..., -0.0166, -0.0360, -0.0571],
         [-0.0030, -0.0567, -0.0079,  ..., -0.0394, -0.0853, -0.0348],
         ...,
         [ 0.0791,  0.0050, -0.0314,  ...,  0.0104,  0.0212, -0.0423],
         [ 0.0135,  0.0329, -0.0137,  ..., -0.0062, -0.0112, -0.0236],
         [ 0.0280,  0.0217, -0.0366,  ...,  0.0127,  0.0092, -0.0426]],
        device='cuda:0'),
 Parameter containing:
 tensor([[ 0.0126, -0.0216, -0.0489,  ...,  0.0050,  0.0142,  0.0194],
         [ 0.0049,  0.0040, -0.0257,  ...,  0.0256,  0.0238, -0.0030],
         [-0.0167, -0.0048, -0.0135,  ...,  0.0144,  0.0216, -0.0056],
         ...,
         [ 0.0171,  0.0034, -0.0094,  ...,  0.0029,  0.0004, -0.0264],
         [ 0.0213, -0.0059,  0.0144,  ..., -0.0055, -0.0123, -0.0275],
         [ 0.0026, -0.0229,  0.0054,  ...,  0.0172,  0.0270, -0.0761]],
        device='cuda:0'),
 Parameter containin

## # Specify the optimizer and loss function

In [33]:
# Creating the loss function and optimizer
loss_function = torch.nn.BCELoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [34]:
import evaluate

In [35]:
accuracy_metric = evaluate.load("accuracy")

In [36]:
len(training_loader)

1600

## Defining the training function on the 80% of the dataset for tuning the distilbert model

In [22]:

def train(epoch, name):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    accuracy = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        # print(len(data['ids']))
        # print(len(data))
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float32)
        # print(targets)
        # print(targets.size(0))
        outputs = model(ids, mask)
        loss = loss_function(torch.sigmoid(outputs),targets.view(-1,1))
        tr_loss += loss.item()
        
        pre = torch.round(torch.sigmoid(outputs))
        # print(pre)
        # print(targets.shape)
        # size = targets.shape
        # print(size)
        acc = accuracy_metric.compute(references=targets, predictions=pre.reshape(targets.shape))
        accuracy+=acc['accuracy']
        # print(accuracy)
        # print(acc)
        # print(pre.reshape(targets.shape).shape)
        # n_correct=acc['accuracy']*(len(data['ids']))
        # print(n_correct)
        # break
        # print(_)
        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        # print(nb_tr_examples)
        if _%100==0:
            # loss_step = tr_loss/nb_tr_steps
            # accu_step = (n_correct*100)/nb_tr_examples 
            # print(f"Training Loss per 5000 steps: {loss_step}")
            accu_step = accuracy/nb_tr_steps

            print(f"Training Accuracy per {_} steps: {accu_step}")
        # break
        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()
        # break
    # print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    # epoch_loss = tr_loss/nb_tr_steps
    # epoch_accu = (n_correct*100)/nb_tr_examples
    # print(f"Training Loss Epoch: {epoch_loss}")
    epoch_accu = accuracy/len(training_loader)
    print(f"Training Accuracy Epoch: {epoch_accu}")
    path_name = "Model lstm " + str(name) + ".pth"
    torch.save(obj=model.state_dict(),f=path_name) # only saving the state_dict() only saves the models learned parameters 
    return 

In [23]:
EPOCHS = 5

In [None]:
for epoch in range(EPOCHS):
    train(epoch, epoch)
    

### Continue training

In [50]:
model.load_state_dict(torch.load(f="/home/bombbom/Documents/Multil_Model_Detection/notebook/save/model_lstm_12.pth"))

RuntimeError: Error(s) in loading state_dict for DistillBERTClass:
	Missing key(s) in state_dict: "l1.embeddings.word_embeddings.weight", "l1.embeddings.position_embeddings.weight", "l1.embeddings.LayerNorm.weight", "l1.embeddings.LayerNorm.bias", "l1.transformer.layer.0.attention.q_lin.weight", "l1.transformer.layer.0.attention.q_lin.bias", "l1.transformer.layer.0.attention.k_lin.weight", "l1.transformer.layer.0.attention.k_lin.bias", "l1.transformer.layer.0.attention.v_lin.weight", "l1.transformer.layer.0.attention.v_lin.bias", "l1.transformer.layer.0.attention.out_lin.weight", "l1.transformer.layer.0.attention.out_lin.bias", "l1.transformer.layer.0.sa_layer_norm.weight", "l1.transformer.layer.0.sa_layer_norm.bias", "l1.transformer.layer.0.ffn.lin1.weight", "l1.transformer.layer.0.ffn.lin1.bias", "l1.transformer.layer.0.ffn.lin2.weight", "l1.transformer.layer.0.ffn.lin2.bias", "l1.transformer.layer.0.output_layer_norm.weight", "l1.transformer.layer.0.output_layer_norm.bias", "l1.transformer.layer.1.attention.q_lin.weight", "l1.transformer.layer.1.attention.q_lin.bias", "l1.transformer.layer.1.attention.k_lin.weight", "l1.transformer.layer.1.attention.k_lin.bias", "l1.transformer.layer.1.attention.v_lin.weight", "l1.transformer.layer.1.attention.v_lin.bias", "l1.transformer.layer.1.attention.out_lin.weight", "l1.transformer.layer.1.attention.out_lin.bias", "l1.transformer.layer.1.sa_layer_norm.weight", "l1.transformer.layer.1.sa_layer_norm.bias", "l1.transformer.layer.1.ffn.lin1.weight", "l1.transformer.layer.1.ffn.lin1.bias", "l1.transformer.layer.1.ffn.lin2.weight", "l1.transformer.layer.1.ffn.lin2.bias", "l1.transformer.layer.1.output_layer_norm.weight", "l1.transformer.layer.1.output_layer_norm.bias", "l1.transformer.layer.2.attention.q_lin.weight", "l1.transformer.layer.2.attention.q_lin.bias", "l1.transformer.layer.2.attention.k_lin.weight", "l1.transformer.layer.2.attention.k_lin.bias", "l1.transformer.layer.2.attention.v_lin.weight", "l1.transformer.layer.2.attention.v_lin.bias", "l1.transformer.layer.2.attention.out_lin.weight", "l1.transformer.layer.2.attention.out_lin.bias", "l1.transformer.layer.2.sa_layer_norm.weight", "l1.transformer.layer.2.sa_layer_norm.bias", "l1.transformer.layer.2.ffn.lin1.weight", "l1.transformer.layer.2.ffn.lin1.bias", "l1.transformer.layer.2.ffn.lin2.weight", "l1.transformer.layer.2.ffn.lin2.bias", "l1.transformer.layer.2.output_layer_norm.weight", "l1.transformer.layer.2.output_layer_norm.bias", "l1.transformer.layer.3.attention.q_lin.weight", "l1.transformer.layer.3.attention.q_lin.bias", "l1.transformer.layer.3.attention.k_lin.weight", "l1.transformer.layer.3.attention.k_lin.bias", "l1.transformer.layer.3.attention.v_lin.weight", "l1.transformer.layer.3.attention.v_lin.bias", "l1.transformer.layer.3.attention.out_lin.weight", "l1.transformer.layer.3.attention.out_lin.bias", "l1.transformer.layer.3.sa_layer_norm.weight", "l1.transformer.layer.3.sa_layer_norm.bias", "l1.transformer.layer.3.ffn.lin1.weight", "l1.transformer.layer.3.ffn.lin1.bias", "l1.transformer.layer.3.ffn.lin2.weight", "l1.transformer.layer.3.ffn.lin2.bias", "l1.transformer.layer.3.output_layer_norm.weight", "l1.transformer.layer.3.output_layer_norm.bias", "l1.transformer.layer.4.attention.q_lin.weight", "l1.transformer.layer.4.attention.q_lin.bias", "l1.transformer.layer.4.attention.k_lin.weight", "l1.transformer.layer.4.attention.k_lin.bias", "l1.transformer.layer.4.attention.v_lin.weight", "l1.transformer.layer.4.attention.v_lin.bias", "l1.transformer.layer.4.attention.out_lin.weight", "l1.transformer.layer.4.attention.out_lin.bias", "l1.transformer.layer.4.sa_layer_norm.weight", "l1.transformer.layer.4.sa_layer_norm.bias", "l1.transformer.layer.4.ffn.lin1.weight", "l1.transformer.layer.4.ffn.lin1.bias", "l1.transformer.layer.4.ffn.lin2.weight", "l1.transformer.layer.4.ffn.lin2.bias", "l1.transformer.layer.4.output_layer_norm.weight", "l1.transformer.layer.4.output_layer_norm.bias", "l1.transformer.layer.5.attention.q_lin.weight", "l1.transformer.layer.5.attention.q_lin.bias", "l1.transformer.layer.5.attention.k_lin.weight", "l1.transformer.layer.5.attention.k_lin.bias", "l1.transformer.layer.5.attention.v_lin.weight", "l1.transformer.layer.5.attention.v_lin.bias", "l1.transformer.layer.5.attention.out_lin.weight", "l1.transformer.layer.5.attention.out_lin.bias", "l1.transformer.layer.5.sa_layer_norm.weight", "l1.transformer.layer.5.sa_layer_norm.bias", "l1.transformer.layer.5.ffn.lin1.weight", "l1.transformer.layer.5.ffn.lin1.bias", "l1.transformer.layer.5.ffn.lin2.weight", "l1.transformer.layer.5.ffn.lin2.bias", "l1.transformer.layer.5.output_layer_norm.weight", "l1.transformer.layer.5.output_layer_norm.bias", "lstm1.weight_ih_l0", "lstm1.weight_hh_l0", "lstm1.bias_ih_l0", "lstm1.bias_hh_l0", "lstm1.weight_ih_l0_reverse", "lstm1.weight_hh_l0_reverse", "lstm1.bias_ih_l0_reverse", "lstm1.bias_hh_l0_reverse", "lstm1.weight_ih_l1", "lstm1.weight_hh_l1", "lstm1.bias_ih_l1", "lstm1.bias_hh_l1", "lstm1.weight_ih_l1_reverse", "lstm1.weight_hh_l1_reverse", "lstm1.bias_ih_l1_reverse", "lstm1.bias_hh_l1_reverse", "pre_classifier.weight", "pre_classifier.bias", "classifier.weight", "classifier.bias". 
	Unexpected key(s) in state_dict: "epoch", "model_state_dict", "optimizer_state_dict". 

In [51]:
list(model.parameters())

[Parameter containing:
 tensor([[-0.0211, -0.0252, -0.0472,  ..., -0.0217, -0.0402, -0.0604],
         [-0.0183, -0.0248, -0.0569,  ..., -0.0166, -0.0360, -0.0571],
         [-0.0030, -0.0567, -0.0079,  ..., -0.0394, -0.0853, -0.0348],
         ...,
         [ 0.0791,  0.0050, -0.0314,  ...,  0.0104,  0.0212, -0.0423],
         [ 0.0135,  0.0329, -0.0137,  ..., -0.0062, -0.0112, -0.0236],
         [ 0.0280,  0.0217, -0.0366,  ...,  0.0127,  0.0092, -0.0426]],
        device='cuda:0'),
 Parameter containing:
 tensor([[ 0.0126, -0.0216, -0.0489,  ...,  0.0050,  0.0142,  0.0194],
         [ 0.0049,  0.0040, -0.0257,  ...,  0.0256,  0.0238, -0.0030],
         [-0.0167, -0.0048, -0.0135,  ...,  0.0144,  0.0216, -0.0056],
         ...,
         [ 0.0171,  0.0034, -0.0094,  ...,  0.0029,  0.0004, -0.0264],
         [ 0.0213, -0.0059,  0.0144,  ..., -0.0055, -0.0123, -0.0275],
         [ 0.0026, -0.0229,  0.0054,  ...,  0.0172,  0.0270, -0.0761]],
        device='cuda:0'),
 Parameter containin

In [40]:
model

DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(i

In [37]:
for _,data in enumerate(training_loader, 0):
    print(len(data))
    break

3


In [42]:
def train(epoch, name):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    accuracy = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float32)

        outputs = model(ids, mask)
        loss = loss_function(torch.sigmoid(outputs),targets.view(-1,1))
        tr_loss += loss.item()
        
        pre = torch.round(torch.sigmoid(outputs))

        acc = accuracy_metric.compute(references=targets, predictions=pre.reshape(targets.shape))
        accuracy+=acc['accuracy']
        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
       
        if _%100==0:
            accu_step = accuracy/nb_tr_steps
            print(f"Training Accuracy per {_} steps: {accu_step}")
        # break
        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()
        # break

    epoch_accu = accuracy/len(training_loader)
    print(f"Training Accuracy Epoch: {epoch_accu}")
    path_name = "Model_lstm_" + str(name+5) + ".pth"
    torch.save(obj=model.state_dict(),f=path_name) # only saving the state_dict() only saves the models learned parameters 
    return 

In [43]:
EPOCHS=5

In [44]:
for epoch in range(EPOCHS):
    train(epoch, epoch)

Training Accuracy per 0 steps: 0.84375
Training Accuracy per 100 steps: 0.8586014851485149
Training Accuracy per 200 steps: 0.867070895522388
Training Accuracy per 300 steps: 0.8647217607973422
Training Accuracy per 400 steps: 0.8661938902743143
Training Accuracy per 500 steps: 0.8661427145708582
Training Accuracy per 600 steps: 0.8671485024958403
Training Accuracy per 700 steps: 0.8675552781740371
Training Accuracy per 800 steps: 0.867626404494382
Training Accuracy per 900 steps: 0.8685141509433962
Training Accuracy per 1000 steps: 0.8683191808191808
Training Accuracy per 1100 steps: 0.8680744777475022
Training Accuracy per 1200 steps: 0.8679225645295587
Training Accuracy per 1300 steps: 0.8678180245964643
Training Accuracy per 1400 steps: 0.8677507137758744
Training Accuracy per 1500 steps: 0.8679005662891406
Training Accuracy Epoch: 0.8682926993534482
Training Accuracy per 0 steps: 0.90625
Training Accuracy per 100 steps: 0.8780940594059405
Training Accuracy per 200 steps: 0.8698694

In [48]:
def train(epoch, name):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    accuracy = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float32)

        outputs = model(ids, mask)
        loss = loss_function(torch.sigmoid(outputs),targets.view(-1,1))
        tr_loss += loss.item()
        
        pre = torch.round(torch.sigmoid(outputs))

        acc = accuracy_metric.compute(references=targets, predictions=pre.reshape(targets.shape))
        accuracy+=acc['accuracy']
        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
       
        if _%100==0:
            accu_step = accuracy/nb_tr_steps
            print(f"Training Accuracy per {_} steps: {accu_step}")
        # break
        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()
        # break

    epoch_accu = accuracy/len(training_loader)
    print(f"Training Accuracy Epoch: {epoch_accu}")
    path_name = "./save/model_lstm_" + str(name+10) + ".pth"
    # torch.save(obj=model.state_dict(),f=path_name) # only saving the state_dict() only saves the models learned parameters 
    E = epoch+10
    torch.save({
            'epoch': E,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),  
            }, f=path_name)
    return 

In [47]:
path_name =r"./save/model_lstm_9.pth" 
torch.save(obj=model.state_dict(),f=path_name)

In [49]:
for epoch in range(EPOCHS):
    train(epoch, epoch)

Training Accuracy per 0 steps: 0.78125
Training Accuracy per 100 steps: 0.880259900990099
Training Accuracy per 200 steps: 0.8824626865671642
Training Accuracy per 300 steps: 0.8838247508305648
Training Accuracy per 400 steps: 0.883182668329177
Training Accuracy per 500 steps: 0.8836701596806387
Training Accuracy per 600 steps: 0.882383527454243
Training Accuracy per 700 steps: 0.8815531383737518
Training Accuracy per 800 steps: 0.8820224719101124
Training Accuracy per 900 steps: 0.8815205327413984
Training Accuracy per 1000 steps: 0.8817432567432567
Training Accuracy per 1100 steps: 0.8816133060853769
Training Accuracy per 1200 steps: 0.881791215653622
Training Accuracy per 1300 steps: 0.8807647963105304
Training Accuracy per 1400 steps: 0.8810447894361171
Training Accuracy per 1500 steps: 0.8820369753497668
Training Accuracy Epoch: 0.8821518049568965
Training Accuracy per 0 steps: 0.9375
Training Accuracy per 100 steps: 0.8787128712871287
Training Accuracy per 200 steps: 0.8802860696

In [53]:
PATh = "/home/bombbom/Documents/Multil_Model_Detection/notebook/save_lstm_model/mode.pth"
torch.save({
            'epoch': 10,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),  
            }, PATh)

In [54]:
checkpoint = torch.load("/home/bombbom/Documents/Multil_Model_Detection/notebook/save/model_lstm_14.pth")

In [55]:
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

## Test Hugging face dataaset

In [None]:
# m2 = pd.read_pickle("/home/bombbom/Documents/Multil_Model_Detection/dataset/reentrancy_test.pkl") # val


# m2 = pd.DataFrame(m2).drop(columns=['address','sourcecode_len','bytecode_len', 'dataset'])


# m2['slither'] = m2.slither.eq('reentrancy').mul(1)


# m2=m2.rename(columns={"slither":'label'})

# m2.source_code = m2.source_code.apply(remove_comments)

# m2['label'] = m2['label']*1

# m3 = pd.read_pickle("/home/bombbom/Documents/Multil_Model_Detection/dataset/reentrancy_val.pkl") # train
# m3 = pd.DataFrame(m3).drop(columns=['address','sourcecode_len','bytecode_len', 'dataset'])
# m3['slither'] = m3.slither.eq('reentrancy').mul(1)
# m3=m3.rename(columns={"slither":'label'})
# m3.source_code = m3.source_code.apply(remove_comments)
# m3['label'] = m3['label']*1

In [None]:
testing_set = Triage(m2, tokenizer, MAX_LEN)

In [None]:
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }
testing_loader = DataLoader(testing_set, **test_params)

In [28]:
label =[]
predict = []
with torch.no_grad():
    for _, data in enumerate(testing_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float32)
        outputs = model(ids, mask)
        pre = torch.round(torch.sigmoid(outputs))
        label.extend(targets.cpu().numpy())
        predict.extend(pre.reshape(targets.shape).cpu().numpy())
        # print(label)
        # print(predict)
        
        # break

In [None]:
len(label)

In [15]:
import evaluate

In [31]:
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

In [32]:
clf_metrics.compute(references=label, predictions=predict)

{'accuracy': 0.8775808336579665,
 'f1': 0.8720350198513693,
 'precision': 0.860558569419329,
 'recall': 0.8838217086256707}

# Test Smartbugs Will datasets

In [2]:
import pandas as pd
data = pd.read_pickle("/home/bombbom/Documents/Multil_Model_Detection/dataset/smart_bug_dataset.pkl")

In [3]:
data=data.rename(columns={'findings': 'label'})

In [4]:
data = data[((data['label']=='reentrancy') | (data['label']=='safe'))]

In [5]:
data['label'] = data.label.eq('reentrancy').mul(1)

In [6]:
data = data.reset_index(drop=True)

In [7]:
data

Unnamed: 0,source_code,label
0,pragma solidity ^0.4.23;\n\ncontract SloadTest...,0
1,pragma solidity ^0.4.24;\n\nlibrary SafeMath {...,1
2,pragma solidity ^0.4.18;\n\n\n\n\nlibrary Safe...,0
3,pragma solidity ^0.4.25;\n\n\ncontract TwelveH...,0
4,\npragma solidity ^0.4.7;\ncontract Contest {\...,0
...,...,...
27438,pragma solidity ^0.4.11;\n\n\n\n\ncontract Mul...,0
27439,pragma solidity ^0.4.18;\n\nlibrary SafeMath {...,0
27440,\n\npragma solidity 0.5.2;\n\ncontract Subscry...,0
27441,pragma solidity ^0.4.22;\n\nlibrary SafeMath {...,0


In [8]:
from sklearn.model_selection import train_test_split

In [15]:
train, test = train_test_split(data,test_size=0.2, random_state=42)

In [16]:
test[test['label']==1]

Unnamed: 0,source_code,label
11290,pragma solidity 0.4.19;\n\ncontract MiniMeToke...,1
3525,pragma solidity ^0.4.15;\n\ncontract TokenCont...,1
7201,pragma solidity ^0.4.18;\n\n\nlibrary SafeMath...,1
15828,pragma solidity ^0.4.13;\n\n\nlibrary SafeMath...,1
18523,pragma solidity ^0.4.18;\n\n\nlibrary SafeMath...,1
...,...,...
15497,pragma solidity ^0.4.18;\n\n\nlibrary SafeMath...,1
21505,pragma solidity ^0.4.19;\n\nlibrary SafeMath {...,1
8459,pragma solidity ^0.4.18;\n\n\nlibrary SafeMath...,1
4504,pragma solidity ^0.4.18;\n\npragma solidity ^0...,1


In [17]:
test.reset_index(drop=True)

Unnamed: 0,source_code,label
0,pragma solidity ^0.4.21;\n\nlibrary SafeMath {...,0
1,pragma solidity 0.4.19;\n\ncontract MiniMeToke...,1
2,pragma solidity ^0.4.25;\n\n\ncontract EasyInv...,0
3,pragma solidity ^0.4.25;\n\n\n\ncontract Token...,0
4,pragma solidity ^0.4.15;\n\ncontract TokenCont...,1
...,...,...
5484,pragma solidity ^0.4.18;\n\n\nlibrary SafeMath...,0
5485,pragma solidity ^0.4.11;\n\n\ncontract SafeMat...,0
5486,pragma solidity ^0.4.23;\n\ncontract ERC20Basi...,0
5487,\n\npragma solidity ^0.4.24;\n\n\n\n\n\n\n\n\n...,0


In [22]:
testing_set_wild = Triage(data, tokenizer, MAX_LEN)

In [23]:
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }
testing_loader_wild = DataLoader(testing_set_wild, **test_params)

In [24]:
label =[]
predict = []
with torch.no_grad():
    for _, data in enumerate(testing_loader_wild, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float32)
        outputs = model(ids, mask)
        pre = torch.round(torch.sigmoid(outputs))
        label.extend(targets.cpu().numpy())
        predict.extend(pre.reshape(targets.shape).cpu().numpy())

In [25]:
import evaluate

In [28]:
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

In [29]:
clf_metrics.compute(references=label, predictions=predict)

{'accuracy': 0.854753489050031,
 'f1': 0.7253307607497244,
 'precision': 0.777285482203515,
 'recall': 0.6798863195969513}

# Train with Smartbugs wild 

In [1]:
from sklearn.model_selection import train_test_split

In [31]:
def train(epoch, name):
    tr_loss = 0
    
    nb_tr_steps = 0
    nb_tr_examples = 0
    accuracy = 0
   
    model.train()
    for _,data in enumerate(testing_loader_wild, 0):
        # print(len(data['ids']))
        # print(len(data))
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float32)
        # print(targets)
        # print(targets.size(0))
        outputs = model(ids, mask)
        loss = loss_function(torch.sigmoid(outputs),targets.view(-1,1))
        tr_loss += loss.item()
        
        pre = torch.round(torch.sigmoid(outputs))

        acc = accuracy_metric.compute(references=targets, predictions=pre.reshape(targets.shape))
        accuracy+=acc['accuracy']

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        # print(nb_tr_examples)
        if _%100==0:

            accu_step = accuracy/nb_tr_steps

            print(f"Training Accuracy per {_} steps: {accu_step}")
        # break
        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()
    epoch_accu = accuracy/len(testing_set_wild)
    print(f"Training Accuracy Epoch: {epoch_accu}")
    path_name = "Model_lstm_smart_bug" + str(name+3) + ".pth"
    torch.save(obj=model.state_dict(),f=path_name) # only saving the state_dict() only saves the models learned parameters 
    return 

In [33]:
len(testing_loader_wild)

858

In [34]:
for epoch in range(5):
    train(epoch, epoch)

Training Accuracy per 0 steps: 0.53125
Training Accuracy per 100 steps: 0.7209158415841584
Training Accuracy per 200 steps: 0.7153296019900498
Training Accuracy per 300 steps: 0.7195805647840532
Training Accuracy per 400 steps: 0.746571072319202
Training Accuracy per 500 steps: 0.7621007984031936
Training Accuracy per 600 steps: 0.7724625623960066
Training Accuracy per 700 steps: 0.7800909415121255
Training Accuracy per 800 steps: 0.7862827715355806
Training Accuracy Epoch: 0.024659006131368944
Training Accuracy per 0 steps: 0.75
Training Accuracy per 100 steps: 0.8307549504950495
Training Accuracy per 200 steps: 0.8347325870646766
Training Accuracy per 300 steps: 0.8365863787375415
Training Accuracy per 400 steps: 0.8336190773067331
Training Accuracy per 500 steps: 0.8338947105788423
Training Accuracy per 600 steps: 0.8350665557404326
Training Accuracy per 700 steps: 0.8335413694721826
Training Accuracy per 800 steps: 0.8348548689138576
Training Accuracy Epoch: 0.026149595717055638
Tr