In [7]:
!pip install torchinfo
!pip install datasets
!pip install hexbytes



In [9]:
import torch
import numpy as np

from torch import nn
from torch import optim
from torchinfo import summary
from datasets import load_dataset
from torchvision import transforms
from argparse import ArgumentParser
from torch.utils.data import DataLoader
from torch.nn.functional import normalize
from sklearn.metrics import accuracy_score
from hexbytes import HexBytes

In [10]:
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

In [11]:
train_ds = load_dataset("mwritescode/slither-audited-smart-contracts", 'big-multilabel', split='train', ignore_verifications=True)
val_ds = load_dataset("mwritescode/slither-audited-smart-contracts", 'big-multilabel', split='validation', ignore_verifications=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Generating train split:   0%|          | 0/79641 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/15972 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10861 [00:00<?, ? examples/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [12]:
train_ds[0]['bytecode']

'0x608060405234801561001057600080fd5b5060043610610202576000357c01000000000000000000000000000000000000000000000000000000009004806386d1a69f1161012c578063b414d4b6116100bf578063f2fde38b1161008e578063f2fde38b146104fd578063fca3b5aa14610523578063fedacfa414610549578063ff1970381461055157610202565b8063b414d4b614610473578063d9aa188114610499578063dd62ed3e146104a1578063e724529c146104cf57610202565b806393c32e06116100fb57806393c32e061461041157806395d89b4114610437578063a42d91d81461043f578063a9059cbb1461044757610202565b806386d1a69f146103d157806387829c65146103db5780638da5cb5b146104015780638f32d59b1461040957610202565b8063313ce567116101a45780634e680654116101735780634e6806541461039357806353b841721461039b5780635f6e84ee146103a357806370a08231146103ab57610202565b8063313ce5671461032c57806335e061fc1461034a57806342966c68146103525780634d853ee51461036f57610202565b806318160ddd116101e057806318160ddd146102de57806323b872dd146102e657806326e5be361461031c57806330cf480b1461032457610202565b806306fdde031461020757806307e4498c1

In [13]:
train_ds = train_ds.filter(lambda elem: elem['bytecode'] != '0x')
val_ds = val_ds.filter(lambda elem: elem['bytecode'] != '0x')

Filter:   0%|          | 0/79641 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10861 [00:00<?, ? examples/s]

In [14]:
train_ds[0].keys()

dict_keys(['address', 'source_code', 'bytecode', 'slither'])

In [15]:
SAFE_IDX = 4 # the index of safe smart contract

def __get_one_hot_encoded_label(label):
    one_hot = np.zeros(5)
    for elem in label:
        if elem < SAFE_IDX:
            one_hot[elem] = 1
        elif elem > SAFE_IDX:
            one_hot[elem-1] = 1
    return one_hot


def generate_signal_and_label(example):
    code = HexBytes(example['bytecode']) # convert from string to bytes
    image = np.frombuffer(code, dtype=np.uint8)
    example['image'] = image
    example['label'] = __get_one_hot_encoded_label(example['slither'])
    return example

In [16]:
map_func = generate_signal_and_label

train_ds_mapped = train_ds.map(map_func, remove_columns=['address', 'source_code', 'bytecode', 'slither'])
val_ds_mapped = val_ds.map(map_func, remove_columns=['address', 'source_code', 'bytecode', 'slither'])

Map:   0%|          | 0/79414 [00:00<?, ? examples/s]

Map:   0%|          | 0/10831 [00:00<?, ? examples/s]

In [17]:
train_ds_mapped[0].keys()

dict_keys(['image', 'label'])

In [18]:
padding_val = 0

In [20]:
max_len = 512

def img_label_to_tensor(examples):
  if 'image' in examples.keys():
      examples['image'] = [np.pad(img, pad_width=(0, max_len - len(img)), constant_values=padding_val) if len(img) < max_len else img[:max_len] for img in examples['image']]
      examples['image'] = [torch.tensor(img) for img in examples['image']]
  if 'label' in examples.keys():
      examples['label'] = torch.tensor(examples['label'])
      return examples

In [21]:
train_ds_mapped.set_transform(img_label_to_tensor)
val_ds_mapped.set_transform(img_label_to_tensor)

In [22]:
pos_weights = None

In [23]:
model_name = 'lstm'
num_cls = 5

In [45]:
batch_size = 16

loader_train = DataLoader(train_ds_mapped,
                    batch_size=batch_size,
                    drop_last=True,
                    shuffle=True)
loader_val = DataLoader(val_ds_mapped,
                    batch_size=batch_size,
                    drop_last=True,
                    shuffle=False)

In [25]:
class LSTMNetwork(nn.Module):
    def __init__(self, num_classes=1, classify=True, vocabulary_size=257):
        super(LSTMNetwork, self).__init__()
        self.layers = 3
        self.hidden_size = 128
        self.classify = classify
        self.embedding = nn.Embedding(num_embeddings=vocabulary_size, embedding_dim=150, padding_idx=256)
        self.dropout1 = nn.Dropout(0.1)
        self.lstm = nn.LSTM(bidirectional=True, input_size=150, hidden_size=self.hidden_size, batch_first=True, num_layers=self.layers)
        self.dense1 = nn.Linear(in_features=256, out_features=512)
        self.dropout2 = nn.Dropout(0.1)
        if classify:
            self.dense2 = nn.Linear(in_features=512, out_features=num_classes)
        self.relu = nn.ReLU()

    def forward(self, inputs):
        lenghts = inputs.shape[1] - (inputs == 256).sum(dim=1).to('cpu')
        out = self.dropout1(self.embedding(inputs))

        out = nn.utils.rnn.pack_padded_sequence(out, lenghts, batch_first=True, enforce_sorted=False)
        _, (h_n, _) = self.lstm(out)
        h_n = h_n.view(self.layers, 2, inputs.shape[0], self.hidden_size)
        last_hidden = h_n[-1]
        last_hidden_fwd = last_hidden[0]
        last_hidden_bwd = last_hidden[1]
        out = torch.cat((last_hidden_fwd, last_hidden_bwd), 1)

        out = self.dropout2(self.relu(self.dense1(out)))
        if self.classify:
            out = self.dense2(out)
        return out

    def get_layer_groups(self):
        linear_layers = [elem[1] for elem in self.dense2.named_parameters()]
        other_layers = [elem[1] for elem in filter(lambda param_tuple: 'dense2' not in param_tuple[0], self.named_parameters())]
        param_groups = {
            'classifier': linear_layers,
            'feature_extractor': other_layers
        }
        return param_groups

In [26]:
model = LSTMNetwork(num_classes=num_cls)
model = model.to('cuda')

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(weight=pos_weights)

In [27]:
epochs = 10
device = 'cuda'

In [35]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score


metrics = {'acc': accuracy_score}
def __init_logs_dict():
  logs = {
      'epoch_num': 0,
      'train_batches_per_epoch': len(loader_train),
      'val_batches_per_epoch': len(loader_val) if loader_val is not None else None,
      'train': {},
      'val': {},
      'metrics': {}}
  for step in ['train', 'val']:
      logs[step]['loss'] = 0.0
      logs[step]['predictions'] = []
      logs[step]['labels'] = []
      logs[step]['batch_idx'] = 0
  for metric in metrics.keys():
      logs['metrics']['train_'+metric] = 0.0
      logs['metrics']['val_'+metric] = 0.0
  return logs


def __val_epoch():
  model.eval()
  total_loss = 0.0
  running_metrics = [0 for _ in metrics]
  pbar = tqdm(loader_val, desc='Validation...')
  mode = 'test'
  for data in pbar:
    images = data['image'].to(device)
    labels = data['label'].to(device)

    outputs = model(images)
    loss = criterion(outputs, labels)
    total_loss[mode] += loss.item()
    preds = (outputs >= 0.0).float()
    logs[mode]['predictions'] += preds.tolist()
    logs[mode]['labels'] += labels.tolist()
    logs[mode]['loss'] = total_loss[mode]/(logs[mode]['batch_idx']+1)


    for i, (metric_name, metric_func) in enumerate(metrics.items()):
      running_metrics[i] += metric_func(labels.tolist(), preds.tolist())
      logs['metrics'][mode + '_' + metric_name] = running_metrics[i]/(logs[mode]['batch_idx']+1)
    logs[mode]['batch_idx'] += 1
    pbar.set_postfix({'loss': logs['train']['loss'], **{metric_name: logs['metrics']['train_' + metric_name] for metric_name in metrics.keys()}})


for i in range(epochs):
  print(f'Epoch {i}:')
  logs = __init_logs_dict()
  logs['epoch_num'] = i
  mode = 'train'

  total_loss = {
      'train': 0.0,
      'val': 0.0
  }

  model.train()
  running_metrics = [0 for _ in metrics]
  pbar = tqdm(loader_train, desc='Training...')
  for data in pbar:
    images = data['image'].to(device)
    labels = data['label'].to(device)

    outputs = model(images)
    loss = criterion(outputs, labels)
    total_loss[mode] += loss.item()
    preds = (outputs >= 0.0).float()
    logs[mode]['predictions'] += preds.tolist()
    logs[mode]['labels'] += labels.tolist()
    logs[mode]['loss'] = total_loss[mode]/(logs[mode]['batch_idx']+1)

    for i, (metric_name, metric_func) in enumerate(metrics.items()):
      running_metrics[i] += metric_func(labels.tolist(), preds.tolist())
      logs['metrics'][mode + '_' + metric_name] = running_metrics[i]/(logs[mode]['batch_idx']+1)
    logs[mode]['batch_idx'] += 1
    pbar.set_postfix({'loss': logs['train']['loss'], **{metric_name: logs['metrics']['train_' + metric_name] for metric_name in metrics.keys()}})


    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


  __val_epoch()

  total_loss = {
      'train': 0.0,
      'val': 0.0
  }

  print('train_loss: {:.4f} | val_loss: {:.4f} |'.format(logs['train']['loss'], logs['val']['loss']), end=' ')
  print(" | ".join([ '{}: {:.4f}'.format(metric_name, metric_val) for metric_name, metric_val in logs['metrics'].items()]), end='\n\n')


Epoch 0:


Training...:  11%|█▏        | 566/4963 [01:24<10:54,  6.71it/s, loss=1.63, acc=0.276]


KeyboardInterrupt: ignored

In [40]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score

metrics = {'acc': accuracy_score}

def initialize_logs_dict(loader_train, loader_val):
    logs = {
        'epoch_num': 0,
        'train_batches_per_epoch': len(loader_train),
        'val_batches_per_epoch': len(loader_val) if loader_val is not None else None,
        'train': {'loss': 0.0, 'predictions': [], 'labels': [], 'batch_idx': 0},
        'val': {'loss': 0.0, 'predictions': [], 'labels': [], 'batch_idx': 0},
        'metrics': {'train_acc': 0.0, 'val_acc': 0.0}
    }
    return logs

def validate_epoch(model, criterion, loader_val, device, logs):
    model.eval()
    mode = 'val'
    total_loss = 0.0
    running_metrics = {metric: 0.0 for metric in metrics.keys()}
    pbar = tqdm(loader_val, desc='Validation...')

    for data in pbar:
        images, labels = data['image'].to(device), data['label'].to(device)

        outputs = model(images)
        loss = criterion(outputs, labels)
        total_loss += loss.item()

        preds = (outputs >= 0.0).float()
        logs[mode]['predictions'] += preds.tolist()
        logs[mode]['labels'] += labels.tolist()
        logs[mode]['loss'] = total_loss / (logs[mode]['batch_idx'] + 1)

        for metric_name, metric_func in metrics.items():
            running_metrics[metric_name] += metric_func(labels.tolist(), preds.tolist())
            logs['metrics'][mode + '_' + metric_name] = running_metrics[metric_name] / (logs[mode]['batch_idx'] + 1)

        logs[mode]['batch_idx'] += 1
        pbar.set_postfix({'loss': logs['train']['loss'], **{metric_name: logs['metrics']['train_' + metric_name] for metric_name in metrics.keys()}})

def train_epoch(model, criterion, optimizer, loader_train, device, logs):
    model.train()
    mode = 'train'
    total_loss = 0.0
    running_metrics = {metric: 0.0 for metric in metrics.keys()}
    pbar = tqdm(loader_train, desc='Training...')

    for data in pbar:
        images, labels = data['image'].to(device), data['label'].to(device)

        outputs = model(images)
        loss = criterion(outputs, labels)
        total_loss += loss.item()

        preds = (outputs >= 0.0).float()
        logs[mode]['predictions'] += preds.tolist()
        logs[mode]['labels'] += labels.tolist()
        logs[mode]['loss'] = total_loss / (logs[mode]['batch_idx'] + 1)

        for metric_name, metric_func in metrics.items():
            running_metrics[metric_name] += metric_func(labels.tolist(), preds.tolist())
            logs['metrics'][mode + '_' + metric_name] = running_metrics[metric_name] / (logs[mode]['batch_idx'] + 1)

        logs[mode]['batch_idx'] += 1
        pbar.set_postfix({'loss': logs['train']['loss'], **{metric_name: logs['metrics']['train_' + metric_name] for metric_name in metrics.keys()}})

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

def main_training_loop(model, criterion, optimizer, loader_train, loader_val, device, epochs):
    for epoch in range(epochs):
        print(f'Epoch {epoch}:')
        logs = initialize_logs_dict(loader_train, loader_val)
        logs['epoch_num'] = epoch
        train_epoch(model, criterion, optimizer, loader_train, device, logs)
        validate_epoch(model, criterion, loader_val, device, logs)

        print('train_loss: {:.4f} | val_loss: {:.4f} |'.format(logs['train']['loss'], logs['val']['loss']), end=' ')
        print(" | ".join(['{}: {:.4f}'.format(metric_name, metric_val) for metric_name, metric_val in logs['metrics'].items()]), end='\n\n')

# Example usage:
main_training_loop(model, criterion, optimizer, loader_train, loader_val, device, epochs)


Epoch 0:


Training...:  11%|█         | 545/4963 [01:20<10:53,  6.76it/s, loss=1.64, acc=0.279]


KeyboardInterrupt: ignored

In [None]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score

metrics = {'acc': accuracy_score}

def initialize_logs_dict(loader_train, loader_val):
    logs = {
        'epoch_num': 0,
        'train_batches_per_epoch': len(loader_train),
        'val_batches_per_epoch': len(loader_val) if loader_val is not None else None,
        'train': {'loss': 0.0, 'predictions': [], 'labels': [], 'batch_idx': 0},
        'val': {'loss': 0.0, 'predictions': [], 'labels': [], 'batch_idx': 0},
        'metrics': {'train_' + metric: 0.0 for metric in metrics.keys()} | {'val_' + metric: 0.0 for metric in metrics.keys()}
    }
    return logs

def run_epoch(model, criterion, optimizer, data_loader, device, mode, logs):
    model.train() if mode == 'train' else model.eval()
    total_loss = 0.0
    running_metrics = {metric: 0.0 for metric in metrics.keys()}
    pbar = tqdm(data_loader, desc=f'{mode.capitalize()}ing...')

    for data in pbar:
        images, labels = data['image'].to(device), data['label'].to(device)

        with torch.set_grad_enabled(mode == 'train'):
            outputs = model(images)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            preds = (outputs >= 0.0).float()
            logs[mode]['predictions'] += preds.tolist()
            logs[mode]['labels'] += labels.tolist()
            logs[mode]['loss'] = total_loss / (logs[mode]['batch_idx'] + 1)

            for metric_name, metric_func in metrics.items():
                running_metrics[metric_name] += metric_func(labels.tolist(), preds.tolist())
                logs['metrics'][mode + '_' + metric_name] = running_metrics[metric_name] / (logs[mode]['batch_idx'] + 1)

            logs[mode]['batch_idx'] += 1
            pbar.set_postfix({'loss': logs[mode]['loss'], **{metric_name: logs['metrics'][mode + '_' + metric_name] for metric_name in metrics.keys()}})

            if mode == 'train':
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

def main_training_loop(model, criterion, optimizer, loader_train, loader_val, device, epochs):
    for epoch in range(epochs):
        print(f'Epoch {epoch}:')
        logs = initialize_logs_dict(loader_train, loader_val)
        logs['epoch_num'] = epoch

        run_epoch(model, criterion, optimizer, loader_train, device, 'train', logs)
        run_epoch(model, criterion, optimizer, loader_val, device, 'val', logs)

        print('train_loss: {:.4f} | val_loss: {:.4f} |'.format(logs['train']['loss'], logs['val']['loss']), end=' ')
        print(" | ".join(['{}: {:.4f}'.format(metric_name, metric_val) for metric_name, metric_val in logs['metrics'].items()]), end='\n\n')

main_training_loop(model, criterion, optimizer, loader_train, loader_val, device, epochs)

Epoch 0:


Training...: 100%|██████████| 4963/4963 [12:09<00:00,  6.80it/s, loss=1.58, acc=0.315]
Valing...: 100%|██████████| 676/676 [01:07<00:00, 10.04it/s, loss=1.62, acc=0.315]


train_loss: 1.5829 | val_loss: 1.6186 | train_acc: 0.3148 | val_acc: 0.3145

Epoch 1:


Training...: 100%|██████████| 4963/4963 [12:09<00:00,  6.80it/s, loss=1.55, acc=0.33]
Valing...: 100%|██████████| 676/676 [01:07<00:00, 10.02it/s, loss=1.62, acc=0.328]


train_loss: 1.5459 | val_loss: 1.6169 | train_acc: 0.3298 | val_acc: 0.3282

Epoch 2:


Training...:  53%|█████▎    | 2634/4963 [06:39<05:11,  7.49it/s, loss=1.51, acc=0.346]