In [23]:
!pip install torchinfo
!pip install datasets
!pip install hexbytes



In [24]:
import torch
import numpy as np

from torch import nn
from torch import optim
from torchinfo import summary
from datasets import load_dataset
from torchvision import transforms
from argparse import ArgumentParser
from torch.utils.data import DataLoader
from torch.nn.functional import normalize
from sklearn.metrics import accuracy_score
from hexbytes import HexBytes

In [25]:
train_ds = load_dataset("mwritescode/slither-audited-smart-contracts", 'big-multilabel', split='train', ignore_verifications=True)
val_ds = load_dataset("mwritescode/slither-audited-smart-contracts", 'big-multilabel', split='validation', ignore_verifications=True)
test_ds = load_dataset("mwritescode/slither-audited-smart-contracts", 'big-multilabel', split='test', ignore_verifications=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [26]:
train_ds = train_ds.filter(lambda elem: elem['bytecode'] != '0x')
val_ds = val_ds.filter(lambda elem: elem['bytecode'] != '0x')
test_ds = test_ds.filter(lambda elem: elem['bytecode'] != '0x')

In [27]:
SAFE_IDX = 4 # the index of safe smart contract

def __get_one_hot_encoded_label(label):
    one_hot = np.zeros(5)
    for elem in label:
        if elem < SAFE_IDX:
            one_hot[elem] = 1
        elif elem > SAFE_IDX:
            one_hot[elem-1] = 1
    return one_hot


def generate_signal_and_label(example):
    code = HexBytes(example['bytecode']) # convert from string to bytes
    image = np.frombuffer(code, dtype=np.uint8)
    example['image'] = image
    example['label'] = __get_one_hot_encoded_label(example['slither'])
    return example

In [28]:
map_func = generate_signal_and_label

train_ds = train_ds.map(map_func, remove_columns=['address', 'source_code', 'bytecode', 'slither'])
val_ds = val_ds.map(map_func, remove_columns=['address', 'source_code', 'bytecode', 'slither'])
test_ds = test_ds.map(map_func, remove_columns=['address', 'source_code', 'bytecode', 'slither'])

In [29]:
max_len = 512
padding_val = 0

def img_label_to_tensor(examples):
  if 'image' in examples.keys():
      examples['image'] = [np.pad(img, pad_width=(0, max_len - len(img)), constant_values=padding_val) if len(img) < max_len else img[:max_len] for img in examples['image']]
      examples['image'] = [torch.tensor(img) for img in examples['image']]
  if 'label' in examples.keys():
      examples['label'] = torch.tensor(examples['label'])
      return examples

In [30]:
train_ds.set_transform(img_label_to_tensor)
val_ds.set_transform(img_label_to_tensor)
test_ds.set_transform(img_label_to_tensor)

In [31]:
model_name = 'lstm'
num_cls = 5

In [32]:
batch_size = 64

loader_train = DataLoader(train_ds,
                    batch_size=batch_size,
                    drop_last=True,
                    shuffle=True)
loader_val = DataLoader(val_ds,
                    batch_size=batch_size,
                    drop_last=True,
                    shuffle=False)
loader_test = DataLoader(test_ds,
                    batch_size=batch_size,
                    shuffle=False)

In [33]:
class LSTMNetwork(nn.Module):
    def __init__(self, num_classes=1, classify=True, vocabulary_size=257):
        super(LSTMNetwork, self).__init__()
        self.layers = 3
        self.hidden_size = 128
        self.classify = classify
        self.embedding = nn.Embedding(num_embeddings=vocabulary_size, embedding_dim=150, padding_idx=256)
        self.dropout1 = nn.Dropout(0.1)
        self.lstm = nn.LSTM(bidirectional=True, input_size=150, hidden_size=self.hidden_size, batch_first=True, num_layers=self.layers)
        self.dense1 = nn.Linear(in_features=256, out_features=512)
        self.dropout2 = nn.Dropout(0.1)
        if classify:
            self.dense2 = nn.Linear(in_features=512, out_features=num_classes)
        self.relu = nn.ReLU()

    def forward(self, inputs):
        lenghts = inputs.shape[1] - (inputs == 256).sum(dim=1).to('cpu')
        out = self.dropout1(self.embedding(inputs))

        out = nn.utils.rnn.pack_padded_sequence(out, lenghts, batch_first=True, enforce_sorted=False)
        _, (h_n, _) = self.lstm(out)
        h_n = h_n.view(self.layers, 2, inputs.shape[0], self.hidden_size)
        last_hidden = h_n[-1]
        last_hidden_fwd = last_hidden[0]
        last_hidden_bwd = last_hidden[1]
        out = torch.cat((last_hidden_fwd, last_hidden_bwd), 1)

        out = self.dropout2(self.relu(self.dense1(out)))
        if self.classify:
            out = self.dense2(out)
        return out

    def get_layer_groups(self):
        linear_layers = [elem[1] for elem in self.dense2.named_parameters()]
        other_layers = [elem[1] for elem in filter(lambda param_tuple: 'dense2' not in param_tuple[0], self.named_parameters())]
        param_groups = {
            'classifier': linear_layers,
            'feature_extractor': other_layers
        }
        return param_groups

In [34]:
model = LSTMNetwork(num_classes=num_cls)
model = model.to('cuda')

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [35]:
epochs = 10
device = 'cuda'

In [22]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score

metrics = {'acc': accuracy_score}

def initialize_logs_dict(loader_train, loader_val):
    logs = {
        'epoch_num': 0,
        'train_batches_per_epoch': len(loader_train),
        'val_batches_per_epoch': len(loader_val) if loader_val is not None else None,
        'train': {'loss': 0.0, 'predictions': [], 'labels': [], 'batch_idx': 0},
        'val': {'loss': 0.0, 'predictions': [], 'labels': [], 'batch_idx': 0},
        'metrics': {'train_' + metric: 0.0 for metric in metrics.keys()} | {'val_' + metric: 0.0 for metric in metrics.keys()}
    }
    return logs

def run_epoch(model, criterion, optimizer, data_loader, device, mode, logs):
    model.train() if mode == 'train' else model.eval()
    total_loss = 0.0
    running_metrics = {metric: 0.0 for metric in metrics.keys()}
    pbar = tqdm(data_loader, desc=f'{mode.capitalize()}ing...')

    for data in pbar:
        images, labels = data['image'].to(device), data['label'].to(device)

        with torch.set_grad_enabled(mode == 'train'):
            outputs = model(images)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            preds = (outputs >= 0.0).float()
            logs[mode]['predictions'] += preds.tolist()
            logs[mode]['labels'] += labels.tolist()
            logs[mode]['loss'] = total_loss / (logs[mode]['batch_idx'] + 1)

            for metric_name, metric_func in metrics.items():
                running_metrics[metric_name] += metric_func(labels.tolist(), preds.tolist())
                logs['metrics'][mode + '_' + metric_name] = running_metrics[metric_name] / (logs[mode]['batch_idx'] + 1)

            logs[mode]['batch_idx'] += 1
            pbar.set_postfix({'loss': logs[mode]['loss'], **{metric_name: logs['metrics'][mode + '_' + metric_name] for metric_name in metrics.keys()}})

            if mode == 'train':
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

def main_training_loop(model, criterion, optimizer, loader_train, loader_val, device, epochs):
    for epoch in range(epochs):
        print(f'Epoch {epoch}:')
        logs = initialize_logs_dict(loader_train, loader_val)
        logs['epoch_num'] = epoch

        run_epoch(model, criterion, optimizer, loader_train, device, 'train', logs)
        run_epoch(model, criterion, optimizer, loader_val, device, 'val', logs)

        print('train_loss: {:.4f} | val_loss: {:.4f} |'.format(logs['train']['loss'], logs['val']['loss']), end=' ')
        print(" | ".join(['{}: {:.4f}'.format(metric_name, metric_val) for metric_name, metric_val in logs['metrics'].items()]), end='\n\n')

    run_epoch(model, criterion, optimizer, loader_test, device, 'val', logs)


main_training_loop(model, criterion, optimizer, loader_train, loader_val, device, epochs)

Epoch 0:


Training...: 100%|██████████| 1240/1240 [06:50<00:00,  3.02it/s, loss=1.87, acc=0.143]
Valing...: 100%|██████████| 169/169 [00:56<00:00,  3.01it/s, loss=1.81, acc=0.171]


train_loss: 1.8750 | val_loss: 1.8144 | train_acc: 0.1426 | val_acc: 0.1710

Epoch 1:


Training...: 100%|██████████| 1240/1240 [06:48<00:00,  3.03it/s, loss=1.77, acc=0.19]
Valing...: 100%|██████████| 169/169 [00:52<00:00,  3.25it/s, loss=1.73, acc=0.21]


train_loss: 1.7696 | val_loss: 1.7346 | train_acc: 0.1895 | val_acc: 0.2099

Epoch 2:


Training...: 100%|██████████| 1240/1240 [06:35<00:00,  3.13it/s, loss=1.71, acc=0.219]
Valing...: 100%|██████████| 169/169 [00:50<00:00,  3.33it/s, loss=1.7, acc=0.237]


train_loss: 1.7133 | val_loss: 1.7004 | train_acc: 0.2191 | val_acc: 0.2367

Epoch 3:


Training...: 100%|██████████| 1240/1240 [06:35<00:00,  3.13it/s, loss=1.67, acc=0.24]
Valing...: 100%|██████████| 169/169 [00:50<00:00,  3.34it/s, loss=1.68, acc=0.259]


train_loss: 1.6692 | val_loss: 1.6837 | train_acc: 0.2397 | val_acc: 0.2594

Epoch 4:


Training...: 100%|██████████| 1240/1240 [06:37<00:00,  3.12it/s, loss=1.63, acc=0.26]
Valing...: 100%|██████████| 169/169 [00:50<00:00,  3.33it/s, loss=1.66, acc=0.279]


train_loss: 1.6315 | val_loss: 1.6600 | train_acc: 0.2596 | val_acc: 0.2786

Epoch 5:


Training...: 100%|██████████| 1240/1240 [06:36<00:00,  3.13it/s, loss=1.59, acc=0.277]
Valing...: 100%|██████████| 169/169 [00:52<00:00,  3.25it/s, loss=1.64, acc=0.275]


train_loss: 1.5944 | val_loss: 1.6443 | train_acc: 0.2767 | val_acc: 0.2751

Epoch 6:


Training...: 100%|██████████| 1240/1240 [06:36<00:00,  3.13it/s, loss=1.56, acc=0.29]
Valing...: 100%|██████████| 169/169 [00:50<00:00,  3.34it/s, loss=1.64, acc=0.28]


train_loss: 1.5591 | val_loss: 1.6412 | train_acc: 0.2901 | val_acc: 0.2804

Epoch 7:


Training...: 100%|██████████| 1240/1240 [06:34<00:00,  3.14it/s, loss=1.52, acc=0.302]
Valing...: 100%|██████████| 169/169 [00:50<00:00,  3.33it/s, loss=1.64, acc=0.292]


train_loss: 1.5233 | val_loss: 1.6354 | train_acc: 0.3021 | val_acc: 0.2915

Epoch 8:


Training...: 100%|██████████| 1240/1240 [06:37<00:00,  3.12it/s, loss=1.49, acc=0.315]
Valing...: 100%|██████████| 169/169 [00:51<00:00,  3.31it/s, loss=1.64, acc=0.29]


train_loss: 1.4924 | val_loss: 1.6425 | train_acc: 0.3153 | val_acc: 0.2898

Epoch 9:


Training...: 100%|██████████| 1240/1240 [06:37<00:00,  3.12it/s, loss=1.46, acc=0.316]
Valing...: 100%|██████████| 169/169 [00:52<00:00,  3.23it/s, loss=1.64, acc=0.295]


train_loss: 1.4619 | val_loss: 1.6390 | train_acc: 0.3157 | val_acc: 0.2953



Valing...: 100%|██████████| 249/249 [01:16<00:00,  3.27it/s, loss=0.98, acc=0.174]


In [36]:
torch.save(model.state_dict(), 'model.pt')