In [1]:
!pip install torchinfo
!pip install datasets
!pip install hexbytes

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0
Collecting datasets
  Downloading datasets-2.16.0-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed da

In [2]:
import torch
import numpy as np
import pandas as pd

from torch import nn
from torch import optim
from torchinfo import summary
from datasets import load_dataset
from torchvision import transforms
from argparse import ArgumentParser
from torch.utils.data import DataLoader
from torch.nn.functional import normalize
from sklearn.metrics import accuracy_score
from hexbytes import HexBytes

In [3]:
# Load the three splits of the 'big-multilabel' configuration.
# - verification_mode='no_checks' replaces the deprecated ignore_verifications=True.
# - trust_remote_code=True is passed explicitly because this dataset is built by a
#   loading script hosted on the Hub; review that script before running if you do not
#   trust its source.
train_ds, val_ds, test_ds = (
    load_dataset(
        "mwritescode/slither-audited-smart-contracts",
        'big-multilabel',
        split=split_name,
        verification_mode='no_checks',
        trust_remote_code=True,
    )
    for split_name in ('train', 'validation', 'test')
)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/8.00k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/19.9k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.12k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/203M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/197M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/193M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/224M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/227M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/232M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/230M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/233M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.04M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.97M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/659k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/79641 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/15972 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10861 [00:00<?, ? examples/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [4]:
# Drop every example whose bytecode is just the bare '0x' prefix (no bytes to model).
train_ds, val_ds, test_ds = (
    split.filter(lambda elem: elem['bytecode'] != '0x')
    for split in (train_ds, val_ds, test_ds)
)

Filter:   0%|          | 0/79641 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10861 [00:00<?, ? examples/s]

Filter:   0%|          | 0/15972 [00:00<?, ? examples/s]

In [5]:
SAFE_IDX = 4 # the index of safe smart contract

def __get_one_hot_encoded_label(label):
    """Turn a list of class indices into a 5-element multi-hot vector.

    The class at ``SAFE_IDX`` gets no slot of its own: it is skipped, and every
    index above it is shifted down by one. An input containing only ``SAFE_IDX``
    (or nothing) therefore encodes to all zeros.

    Args:
        label: iterable of integer class indices.

    Returns:
        ``numpy.ndarray`` of shape ``(5,)`` with 1.0 at each encoded position.
    """
    encoded = np.zeros(5)
    # Skip the safe class, then close the gap it leaves behind.
    positions = [idx if idx < SAFE_IDX else idx - 1 for idx in label if idx != SAFE_IDX]
    for pos in positions:
        encoded[pos] = 1
    return encoded


def generate_signal_and_label(example):
    """Attach the byte sequence and multi-hot label to one dataset example.

    Args:
        example: mapping with a hex-string ``'bytecode'`` field and a ``'slither'``
            field holding the list of class indices.

    Returns:
        The same mapping, with ``'image'`` (uint8 array of the decoded bytecode)
        and ``'label'`` (multi-hot vector) added.
    """
    # Decode the hex string into raw bytes, then view them as one uint8 per byte.
    bytecode_bytes = HexBytes(example['bytecode'])
    example['image'] = np.frombuffer(bytecode_bytes, dtype=np.uint8)

    example['label'] = __get_one_hot_encoded_label(example['slither'])
    return example

In [6]:
map_func = generate_signal_and_label

# Build the 'image'/'label' columns on every split and drop the raw columns they
# were derived from.
train_ds, val_ds, test_ds = (
    split.map(map_func, remove_columns=['address', 'source_code', 'bytecode', 'slither'])
    for split in (train_ds, val_ds, test_ds)
)

Map:   0%|          | 0/79414 [00:00<?, ? examples/s]

Map:   0%|          | 0/10831 [00:00<?, ? examples/s]

Map:   0%|          | 0/15921 [00:00<?, ? examples/s]

In [7]:
max_len = 512
# Bytes occupy 0..255, so the padding token must sit outside that range. 256 is the
# id LSTMNetwork treats as padding (its embedding padding_idx and its length
# computation); padding with 0 would be indistinguishable from a real 0x00 byte.
padding_val = 256


def _pad_or_truncate(img):
    """Return `img` as an int64 array of exactly `max_len` tokens (right-padded or cut)."""
    # int64 so that padding_val (256) fits even when the input arrives as uint8.
    tokens = np.asarray(img, dtype=np.int64)[:max_len]
    return np.pad(tokens, pad_width=(0, max_len - len(tokens)), constant_values=padding_val)


def img_label_to_tensor(examples):
    """Batch transform converting the 'image' and 'label' columns to tensors.

    Args:
        examples: dict of columns for a batch. ``'image'`` (if present) is a list of
            integer sequences; ``'label'`` (if present) is a list of label vectors.

    Returns:
        The same dict. ``'image'`` becomes a list of int64 tensors of length
        ``max_len`` (right-padded with ``padding_val`` or truncated); ``'label'``
        becomes a single tensor. The dict is returned even when one of the two
        columns is absent.
    """
    if 'image' in examples.keys():
        examples['image'] = [torch.tensor(_pad_or_truncate(img)) for img in examples['image']]
    if 'label' in examples.keys():
        examples['label'] = torch.tensor(examples['label'])
    # Must be returned unconditionally: a batch that only requests 'image' still
    # needs its transformed columns back.
    return examples

In [8]:
# Register the tensor conversion as the transform of every split.
for _split in (train_ds, val_ds, test_ds):
    _split.set_transform(img_label_to_tensor)

In [9]:
# Label for the architecture being trained (not referenced by the other visible cells).
model_name = 'lstm'
# Number of model outputs; passed to LSTMNetwork(num_classes=...) when the model is built.
num_cls = 5

In [10]:
batch_size = 32

# Training batches are reshuffled every epoch. The train and validation loaders drop
# the final incomplete batch; the test loader keeps every example.
loader_train = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True)
loader_val = DataLoader(val_ds, batch_size=batch_size, shuffle=False, drop_last=True)
loader_test = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

In [11]:
class LSTMNetwork(nn.Module):
    """Bidirectional 3-layer LSTM over embedded integer tokens.

    Pipeline: embedding (dim 150) -> dropout -> bidirectional LSTM (hidden size 128,
    3 layers) -> concatenation of the top layer's final forward/backward hidden
    states -> Linear(256 -> 512) + ReLU + dropout -> optional Linear(512 -> num_classes).

    Args:
        num_classes: number of outputs of the classification head.
        classify: when False no classification head is created and ``forward``
            returns the 512-dimensional features.
        vocabulary_size: number of rows of the embedding table.
        padding_idx: token id used for padding; its embedding stays zero and it is
            excluded when computing sequence lengths. Must be < ``vocabulary_size``.
    """

    def __init__(self, num_classes=1, classify=True, vocabulary_size=257, padding_idx=256):
        super().__init__()
        self.layers = 3
        self.hidden_size = 128
        self.classify = classify
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(num_embeddings=vocabulary_size, embedding_dim=150, padding_idx=padding_idx)
        self.dropout1 = nn.Dropout(0.1)
        self.lstm = nn.LSTM(bidirectional=True, input_size=150, hidden_size=self.hidden_size, batch_first=True, num_layers=self.layers)
        # Input is the forward and backward final hidden states side by side.
        self.dense1 = nn.Linear(in_features=2 * self.hidden_size, out_features=512)
        self.dropout2 = nn.Dropout(0.1)
        if classify:
            self.dense2 = nn.Linear(in_features=512, out_features=num_classes)
        self.relu = nn.ReLU()

    def forward(self, inputs):
        """Run the network on a batch of token ids.

        Args:
            inputs: integer tensor of shape (batch, seq_len). Padding is assumed to
                be trailing, since lengths are obtained by counting non-padding tokens.

        Returns:
            Tensor of shape (batch, num_classes) with raw logits when ``classify`` is
            True, otherwise the (batch, 512) feature tensor.
        """
        # pack_padded_sequence needs CPU lengths that are all >= 1; an all-padding row
        # is clamped to length 1 (its single padding embedding is the zero vector).
        lengths = (inputs != self.padding_idx).sum(dim=1).clamp(min=1).to('cpu')
        out = self.dropout1(self.embedding(inputs))

        out = nn.utils.rnn.pack_padded_sequence(out, lengths, batch_first=True, enforce_sorted=False)
        _, (h_n, _) = self.lstm(out)
        # h_n arrives as (layers * directions, batch, hidden); split the first axis.
        h_n = h_n.view(self.layers, 2, inputs.shape[0], self.hidden_size)
        last_hidden = h_n[-1]
        last_hidden_fwd = last_hidden[0]
        last_hidden_bwd = last_hidden[1]
        out = torch.cat((last_hidden_fwd, last_hidden_bwd), 1)

        out = self.dropout2(self.relu(self.dense1(out)))
        if self.classify:
            out = self.dense2(out)
        return out

    def get_layer_groups(self):
        """Split the parameters into classifier-head and feature-extractor groups.

        Returns:
            dict with keys ``'classifier'`` (parameters of ``dense2``; empty when the
            model was built without a classification head) and ``'feature_extractor'``
            (every other parameter).
        """
        head = getattr(self, 'dense2', None)
        linear_layers = [] if head is None else [param for _, param in head.named_parameters()]
        other_layers = [param for name, param in self.named_parameters() if 'dense2' not in name]
        param_groups = {
            'classifier': linear_layers,
            'feature_extractor': other_layers
        }
        return param_groups

In [12]:
model = LSTMNetwork(num_classes=num_cls)
# Use the GPU when one is present; fall back to the CPU instead of raising.
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')



# Module types whose weights are decayed / are never decayed.
whitelist_weight_modules = (torch.nn.Linear, torch.nn.Conv1d, torch.nn.Conv2d, nn.LSTM)
blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, nn.Embedding)

def get_weight_decay_params(model):
    """Partition the model's parameters into weight-decayed and non-decayed lists.

    Adapted from the implementation at https://github.com/karpathy/minGPT/blob/3ed14b2cec0dfdad3f4b2831f2b4a86d11aef150/mingpt/model.py#L136

    Rules, applied to each parameter by its own (leaf) name and its owning module:
      * any parameter whose name contains 'bias' is not decayed;
      * 'weight' parameters of ``whitelist_weight_modules`` are decayed;
      * 'weight' parameters of ``blacklist_weight_modules`` are not decayed.

    Args:
        model: the ``nn.Module`` to inspect.

    Returns:
        ``(decay, no_decay)``: two lists of parameters, each ordered by full
        parameter name.

    Raises:
        AssertionError: if a parameter matches no rule or lands in both groups.
    """
    param_dict = {pn: p for pn, p in model.named_parameters()}
    decay = set()
    no_decay = set()
    for module_name, module in model.named_modules():
        # recurse=False: visit each parameter once, through the module that owns it,
        # so the name tests below see the leaf name rather than a dotted path (a
        # submodule called e.g. 'debias' must not make its weights look like biases).
        for param_name, _ in module.named_parameters(recurse=False):
            fpn = '%s.%s' % (module_name, param_name) if module_name else param_name # full param name
            if fpn not in param_dict:
                # shared/tied parameter already listed under another name
                continue

            if 'bias' in param_name:
                # all biases will not be decayed
                no_decay.add(fpn)
            elif 'weight' in param_name and isinstance(module, whitelist_weight_modules):
                # weights of whitelist modules will be weight decayed
                decay.add(fpn)
            elif 'weight' in param_name and isinstance(module, blacklist_weight_modules):
                # weights of blacklist modules will NOT be weight decayed
                no_decay.add(fpn)

    # validate that we considered every parameter
    inter_params = decay & no_decay
    union_params = decay | no_decay
    assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
    assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                % (str(param_dict.keys() - union_params), )

    decay = [param_dict[pn] for pn in sorted(decay)]
    no_decay = [param_dict[pn] for pn in sorted(no_decay)]

    return decay, no_decay

# Two optimizer groups: only the 'decay' parameters receive L2 weight decay.
decay, no_decay = get_weight_decay_params(model)
optim_groups = [
    {'params': decay, 'weight_decay': 0.0001},
    {'params': no_decay, 'weight_decay': 0.0},
]

# SGD with Nesterov momentum over the two groups.
optimizer = optim.SGD(
    optim_groups,
    lr=1e-3,
    momentum=0.9,
    nesterov=True,
)
# Multi-label objective: an independent sigmoid + binary cross-entropy per output logit.
criterion = nn.BCEWithLogitsLoss()

In [13]:
# Number of passes over the training set.
epochs = 20
# Device the batches are moved to during training; falls back to the CPU when no
# CUDA device is available instead of failing on the first `.to(device)`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score

# Registry of metric name -> callable(y_true, y_pred). run_epoch evaluates each entry
# on every batch and keeps a running mean under the keys '<mode>_<name>'.
metrics = {'acc': accuracy_score}

def initialize_logs_dict(loader_train, loader_val):
    """Create an empty bookkeeping dict for one epoch.

    Args:
        loader_train: training loader; only its length is read.
        loader_val: validation loader, or None when there is no validation split.

    Returns:
        dict with the epoch number, the batch counts per phase, one state dict per
        phase ('train' / 'val') holding loss, predictions, labels and batch index,
        and a 'metrics' dict with a zeroed '<phase>_<metric>' entry for every
        registered metric.
    """
    def _empty_phase_state():
        # A fresh dict (and fresh lists) per phase so the phases never share state.
        return {'loss': 0.0, 'predictions': [], 'labels': [], 'batch_idx': 0}

    n_train_batches = len(loader_train)
    n_val_batches = None if loader_val is None else len(loader_val)

    # All train_* entries first, then all val_* entries.
    metric_slots = {
        phase + '_' + metric: 0.0
        for phase in ('train', 'val')
        for metric in metrics.keys()
    }

    return {
        'epoch_num': 0,
        'train_batches_per_epoch': n_train_batches,
        'val_batches_per_epoch': n_val_batches,
        'train': _empty_phase_state(),
        'val': _empty_phase_state(),
        'metrics': metric_slots,
    }

def run_epoch(model, criterion, optimizer, data_loader, device, mode, logs):
    """Run one full pass over `data_loader`, training or evaluating the model.

    Args:
        model: network called on ``data['image']``; its outputs are treated as logits.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer stepped after every batch when ``mode == 'train'``.
        data_loader: iterable of dict batches with 'image' and 'label' tensors.
        device: device the batch tensors are moved to.
        mode: 'train' enables gradients and parameter updates; any other value
            (e.g. 'val') runs the model in eval mode without gradients.
        logs: dict created by ``initialize_logs_dict``. ``logs[mode]`` and the
            ``'<mode>_<metric>'`` entries of ``logs['metrics']`` are updated in place.

    Returns:
        None. Results are reported through `logs` and the progress bar.
    """
    is_training = mode == 'train'
    if is_training:
        model.train()
    else:
        model.eval()

    # Start this pass from a clean slate so the running means below stay correct even
    # when the same logs dict was already used for an earlier pass in this mode.
    mode_logs = logs[mode]
    mode_logs.update({'loss': 0.0, 'predictions': [], 'labels': [], 'batch_idx': 0})

    total_loss = 0.0
    running_metrics = {metric: 0.0 for metric in metrics.keys()}
    pbar = tqdm(data_loader, desc=f'{mode.capitalize()}ing...')

    for num_batches, data in enumerate(pbar, start=1):
        images, labels = data['image'].to(device), data['label'].to(device)

        with torch.set_grad_enabled(is_training):
            outputs = model(images)
            loss = criterion(outputs, labels)

            if is_training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        total_loss += loss.item()

        # A logit >= 0 corresponds to a sigmoid probability >= 0.5.
        batch_preds = (outputs.detach() >= 0.0).float().tolist()
        batch_labels = labels.tolist()
        mode_logs['predictions'] += batch_preds
        mode_logs['labels'] += batch_labels
        mode_logs['loss'] = total_loss / num_batches

        for metric_name, metric_func in metrics.items():
            running_metrics[metric_name] += metric_func(batch_labels, batch_preds)
            # mean of the per-batch metric values seen so far
            logs['metrics'][mode + '_' + metric_name] = running_metrics[metric_name] / num_batches

        mode_logs['batch_idx'] = num_batches
        pbar.set_postfix({'loss': mode_logs['loss'], **{metric_name: logs['metrics'][mode + '_' + metric_name] for metric_name in metrics.keys()}})

def main_training_loop(model, criterion, optimizer, loader_train, loader_val, device, epochs):
    """Train for `epochs` epochs, validating after each and printing a summary line.

    Every epoch starts from a fresh logs dict, runs a 'train' pass over
    `loader_train` followed by a 'val' pass over `loader_val`, and prints the mean
    losses and metrics. The test split is not evaluated here, and nothing is returned.
    """
    phases = (('train', loader_train), ('val', loader_val))

    for epoch in range(epochs):
        print(f'Epoch {epoch}:')
        logs = initialize_logs_dict(loader_train, loader_val)
        logs['epoch_num'] = epoch

        for phase, loader in phases:
            run_epoch(model, criterion, optimizer, loader, device, phase, logs)

        train_loss = logs['train']['loss']
        val_loss = logs['val']['loss']
        print(f'train_loss: {train_loss:.4f} | val_loss: {val_loss:.4f} |', end=' ')

        metric_summary = " | ".join(f'{name}: {value:.4f}' for name, value in logs['metrics'].items())
        print(metric_summary, end='\n\n')


# Train the model for `epochs` epochs, running a validation pass on loader_val after each one.
main_training_loop(model, criterion, optimizer, loader_train, loader_val, device, epochs)

Epoch 0:


Training...: 100%|██████████| 2481/2481 [08:36<00:00,  4.80it/s, loss=0.578, acc=0.332]
Valing...: 100%|██████████| 338/338 [00:57<00:00,  5.85it/s, loss=0.55, acc=0.339]


train_loss: 0.5785 | val_loss: 0.5498 | train_acc: 0.3322 | val_acc: 0.3393

Epoch 1:


Training...: 100%|██████████| 2481/2481 [08:29<00:00,  4.87it/s, loss=0.549, acc=0.34]
Valing...: 100%|██████████| 338/338 [00:59<00:00,  5.69it/s, loss=0.549, acc=0.339]


train_loss: 0.5493 | val_loss: 0.5489 | train_acc: 0.3398 | val_acc: 0.3393

Epoch 2:


Training...: 100%|██████████| 2481/2481 [08:25<00:00,  4.91it/s, loss=0.548, acc=0.34]
Valing...: 100%|██████████| 338/338 [00:57<00:00,  5.87it/s, loss=0.547, acc=0.339]


train_loss: 0.5483 | val_loss: 0.5474 | train_acc: 0.3397 | val_acc: 0.3393

Epoch 3:


Training...:  45%|████▌     | 1119/2481 [03:50<04:27,  5.10it/s, loss=0.546, acc=0.341]

In [None]:
# Persist only the model's state_dict (not the module object) to 'model.pt' in the
# current working directory.
torch.save(model.state_dict(), 'model.pt')