In [3]:
!pip install yacs
!pip install torchinfo
!pip install datasets
!pip install hexbytes

Collecting yacs
  Downloading yacs-0.1.8-py3-none-any.whl (14 kB)
Installing collected packages: yacs
Successfully installed yacs-0.1.8
Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0
Collecting datasets
  Downloading datasets-2.16.0-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [3

In [1]:
import torch
import numpy as np
import pandas as pd
import math

from PIL import Image
from torch import nn
from torch import optim
from torchinfo import summary
from datasets import load_dataset
from torchvision import transforms
from argparse import ArgumentParser
from torch.utils.data import DataLoader
from torch.nn.functional import normalize
from sklearn.metrics import accuracy_score
from hexbytes import HexBytes

ModuleNotFoundError: ignored

In [None]:
# Module types whose weights receive weight decay (whitelist) and module types
# whose weights are excluded from it (blacklist).
whitelist_weight_modules = (torch.nn.Linear, torch.nn.Conv1d, torch.nn.Conv2d, nn.LSTM)
blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, nn.Embedding)

def get_weight_decay_params(model):
    """Split the parameters of ``model`` into a decayed and a non-decayed group.

    Adapted from the implementation at https://github.com/karpathy/minGPT/blob/3ed14b2cec0dfdad3f4b2831f2b4a86d11aef150/mingpt/model.py#L136

    Args:
        model: the ``nn.Module`` whose parameters should be grouped.

    Returns:
        A ``(decay, no_decay)`` tuple of parameter lists, each ordered by the
        full parameter name.

    Raises:
        AssertionError: if a parameter ends up in both groups or in neither.
    """
    decay = set()
    no_decay = set()
    for module_name, module in model.named_modules():
        # recurse=False yields only the parameters owned directly by `module`,
        # so `param_name` is a leaf name ('weight', 'bias_ih_l0', ...) and the
        # substring checks below cannot be fooled by the name of a submodule
        # (e.g. a Linear stored under an attribute called `bias_net`).
        for param_name, _ in module.named_parameters(recurse=False):
            fpn = '%s.%s' % (module_name, param_name) if module_name else param_name # full param name

            if 'bias' in param_name:
                # all biases will not be decayed
                no_decay.add(fpn)
            elif 'weight' in param_name and isinstance(module, whitelist_weight_modules):
                # weights of whitelist modules will be weight decayed
                decay.add(fpn)
            elif 'weight' in param_name and isinstance(module, blacklist_weight_modules):
                # weights of blacklist modules will NOT be weight decayed
                no_decay.add(fpn)

    # validate that we considered every parameter
    param_dict = {pn: p for pn, p in model.named_parameters()}
    inter_params = decay & no_decay
    union_params = decay | no_decay
    assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
    assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                % (str(param_dict.keys() - union_params), )

    # Sort by name so the order of each group is deterministic.
    decay = [param_dict[pn] for pn in sorted(decay)]
    no_decay = [param_dict[pn] for pn in sorted(no_decay)]

    return decay, no_decay

In [None]:
# Three-element mean / standard-deviation constants, named after ImageNet.
# Presumably one value per RGB channel, intended for `transforms.Normalize` -- confirm at the use site.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

In [None]:
SAFE_IDX = 4 # the index of safe smart contract

def __get_RGB_image(bytecode):
    """Lay the raw bytes out as a square RGB image.

    Consecutive bytes become the R, G and B values of consecutive pixels,
    filled row by row. The tail is zero-padded so the pixel count is a
    perfect square.

    Args:
        bytecode: a bytes-like object readable through the buffer protocol.

    Returns:
        A PIL image of shape (side, side, 3) built from a uint8 array.
    """
    raw = np.frombuffer(bytecode, dtype=np.uint8)
    n_pixels = int(math.ceil(len(raw) / 3))
    side = int(math.ceil(math.sqrt(n_pixels)))
    # One zero-padding step up to side*side*3 bytes gives the same array as
    # padding to whole pixels first and to a whole square afterwards.
    padded = np.pad(raw, pad_width=(0, side * side * 3 - len(raw)))
    return Image.fromarray(padded.reshape((side, side, 3)))

def __get_one_hot_encoded_label(label):
    """Turn a list of class indices into a 5-element multi-hot vector.

    The class at ``SAFE_IDX`` has no slot of its own: it is skipped, and every
    index above it is shifted down by one.

    Args:
        label: iterable of integer class indices.

    Returns:
        A float numpy array of length 5 with 1 at every encoded class.
    """
    encoded = np.zeros(5)
    for cls_idx in label:
        if cls_idx == SAFE_IDX:
            continue
        slot = cls_idx if cls_idx < SAFE_IDX else cls_idx - 1
        encoded[slot] = 1
    return encoded

def generate_image_and_label(example):
    """Add the derived ``'image'`` and ``'label'`` entries to one dataset row.

    Args:
        example: mapping with a hex-encoded ``'bytecode'`` string and a
            ``'slither'`` list of class indices.

    Returns:
        The same ``example`` object, updated in place.
    """
    raw_bytes = HexBytes(example['bytecode'])
    example['image'] = __get_RGB_image(raw_bytes)
    class_indices = example['slither']
    example['label'] = __get_one_hot_encoded_label(class_indices)
    return example

In [17]:
# `verification_mode='no_checks'` is the replacement for the deprecated
# `ignore_verifications=True`. `trust_remote_code=True` is what the loader's own
# warning asks for: this dataset is built by a loading script from the Hub, so
# only enable it for repositories you trust.
train_ds = load_dataset("mwritescode/slither-audited-smart-contracts", 'big-multilabel', split='train',
                        verification_mode='no_checks', trust_remote_code=True)
val_ds = load_dataset("mwritescode/slither-audited-smart-contracts", 'big-multilabel', split='validation',
                      verification_mode='no_checks', trust_remote_code=True)

# Drop contracts whose bytecode is just the '0x' prefix: there is nothing to render.
train_ds = train_ds.filter(lambda elem: elem['bytecode'] != '0x')
val_ds = val_ds.filter(lambda elem: elem['bytecode'] != '0x')

# Derive the image / label pair, then discard the raw columns it was built from.
map_func = generate_image_and_label
train_ds = train_ds.map(map_func, remove_columns=['address', 'source_code', 'bytecode', 'slither'])
val_ds = val_ds.map(map_func, remove_columns=['address', 'source_code', 'bytecode', 'slither'])

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [18]:
from torch import nn
from torchvision import models

class ResNetModel(nn.Module):
    """Wrapper around a pretrained torchvision ResNet-18.

    With ``classify=True`` the final ``fc`` layer is replaced by a new linear
    head with ``num_classes`` outputs. Otherwise the last child of the network
    is dropped and a ``Flatten`` layer is appended in its place.
    """

    def __init__(self, num_classes=5, classify=True):
        super().__init__()
        backbone = models.resnet18(pretrained=True)

        if classify:
            backbone.fc = nn.Linear(512, num_classes)
            self.resnet = backbone
        else:
            # Keep every child except the last one.
            kept_layers = list(backbone.children())[:-1]
            self.resnet = nn.Sequential(*kept_layers, nn.Flatten())

    def forward(self, inputs):
        """Run ``inputs`` through the wrapped network."""
        return self.resnet(inputs)

    def get_layer_groups(self):
        """Split the parameters by whether their name contains ``'fc'``.

        Returns:
            A dict with the ``'classifier'`` parameters (name contains 'fc')
            and the ``'feature_extractor'`` parameters (all others), both in
            ``named_parameters()`` order.
        """
        classifier_params = []
        extractor_params = []
        for name, param in self.resnet.named_parameters():
            target = classifier_params if 'fc' in name else extractor_params
            target.append(param)
        return {
            'classifier': classifier_params,
            'feature_extractor': extractor_params,
        }

In [8]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

metrics = {'acc': accuracy_score}

def initialize_logs_dict(loader_train, loader_val):
    """Build the empty bookkeeping dict for one epoch.

    Args:
        loader_train: training loader; only its length is read.
        loader_val: validation loader, or None when there is none.

    Returns:
        A dict holding the epoch number, the batch counts, one state dict for
        each of 'train' and 'val' (loss, predictions, labels, batch_idx), and a
        'metrics' dict with a zeroed ``<split>_<metric>`` entry for every key
        of the module-level ``metrics`` mapping.
    """
    def _fresh_split_state():
        # A new dict (and new lists) per split so the two never share state.
        return {'loss': 0.0, 'predictions': [], 'labels': [], 'batch_idx': 0}

    # Derive the train_* keys from `metrics` exactly like the val_* keys, so
    # the two stay in step when more metrics are registered.
    tracked = {split + '_' + metric: 0.0 for split in ('train', 'val') for metric in metrics.keys()}

    logs = {
        'epoch_num': 0,
        'train_batches_per_epoch': len(loader_train),
        'val_batches_per_epoch': len(loader_val) if loader_val is not None else None,
        'train': _fresh_split_state(),
        'val': _fresh_split_state(),
        'metrics': tracked
    }
    return logs

def run_epoch(model, criterion, optimizer, data_loader, device, mode, logs):
    """Run one pass over ``data_loader`` and record running statistics in ``logs``.

    Args:
        model: network to run; put in train mode when ``mode == 'train'``,
            otherwise in eval mode.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer stepped after every batch in train mode; unused otherwise.
        data_loader: iterable of batches with ``'image'`` and ``'label'`` tensors.
        device: device the batch tensors are moved to.
        mode: key of the state dict inside ``logs`` to update (``'train'`` or ``'val'``).
        logs: dict with the layout produced by ``initialize_logs_dict``; updated in place.

    Returns:
        None. Results are written to ``logs[mode]`` and ``logs['metrics']``.
    """
    is_training = mode == 'train'
    if is_training:
        model.train()
    else:
        model.eval()

    total_loss = 0.0
    running_metrics = {metric: 0.0 for metric in metrics.keys()}
    pbar = tqdm(data_loader, desc=f'{mode.capitalize()}ing...')

    # Batches are counted locally for the averages. `logs[mode]['batch_idx']`
    # survives across calls while the sums above restart at zero, so dividing
    # by it gave wrong averages whenever a logs dict was reused.
    for batches_seen, data in enumerate(pbar, start=1):
        images, labels = data['image'].to(device), data['label'].to(device)

        with torch.set_grad_enabled(is_training):
            outputs = model(images)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            # A logit >= 0 corresponds to a sigmoid probability >= 0.5.
            preds = (outputs >= 0.0).float()
            logs[mode]['predictions'] += preds.tolist()
            logs[mode]['labels'] += labels.tolist()
            logs[mode]['loss'] = total_loss / batches_seen

            for metric_name, metric_func in metrics.items():
                running_metrics[metric_name] += metric_func(labels.tolist(), preds.tolist())
                logs['metrics'][mode + '_' + metric_name] = running_metrics[metric_name] / batches_seen

            logs[mode]['batch_idx'] += 1
            pbar.set_postfix({'loss': logs[mode]['loss'], **{metric_name: logs['metrics'][mode + '_' + metric_name] for metric_name in metrics.keys()}})

            if is_training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

def main_training_loop(model, criterion, optimizer, loader_train, loader_val, device, epochs):
    """Alternate one training pass and one validation pass for ``epochs`` epochs.

    A fresh logs dict is created for every epoch. After both passes the losses
    and every entry of ``logs['metrics']`` are printed on one line.
    """
    phases = (('train', loader_train), ('val', loader_val))

    for epoch in range(epochs):
        print(f'Epoch {epoch}:')
        logs = initialize_logs_dict(loader_train, loader_val)
        logs['epoch_num'] = epoch

        for mode, loader in phases:
            run_epoch(model, criterion, optimizer, loader, device, mode, logs)

        loss_summary = 'train_loss: {:.4f} | val_loss: {:.4f} |'.format(logs['train']['loss'], logs['val']['loss'])
        metric_summary = " | ".join('{}: {:.4f}'.format(name, value) for name, value in logs['metrics'].items())
        print(loss_summary, end=' ')
        print(metric_summary, end='\n\n')

In [19]:
from torch.utils.data import DataLoader
from torchvision import transforms

class GetMeanStd:
    """Estimate per-channel mean and standard deviation of an image dataset.

    Inspired by the implementation of https://github.com/Nikronic/CoarseNet/blob/master/utils/preprocess.py#L142-L200

    Note that the constructor calls ``dataset.set_transform`` on the dataset it
    is given, so the caller's dataset object is modified.
    """
    def __init__(self, dataset, batch_size, img_size):
        """
        Args:
            dataset: dataset exposing ``set_transform`` and an ``'image'`` column.
            batch_size: number of images per batch while computing the statistics.
            img_size: target size passed to ``Resize`` and ``CenterCrop``.
        """
        self.img_transform = transforms.Compose([
            transforms.Resize(img_size),
            transforms.CenterCrop(img_size),
            transforms.ToTensor()])

        dataset.set_transform(self.__to_tensor)
        self.data_loader = DataLoader(dataset=dataset,
                            batch_size=batch_size,
                            shuffle=False,
                            num_workers=0,
                            pin_memory=False)

    def __to_tensor(self, examples):
        """Convert every entry of ``examples['image']`` with ``self.img_transform``."""
        examples['image'] = [self.img_transform(elem) for elem in examples['image']]
        return examples

    def __call__(self):
        """Iterate once over the loader and return ``(mean, std)``.

        Both values are averages of per-image, per-channel statistics, so
        ``std`` is the mean of the per-image standard deviations rather than
        the standard deviation over all pixels of the dataset.
        """
        mean = 0.
        std = 0.
        nb_samples = 0.
        # The notebook imports the class with `from tqdm import tqdm`, so it is
        # called directly; `tqdm.tqdm` does not exist on that object.
        for data in tqdm(self.data_loader, desc='Computing stats..'):
            data = data['image']
            batch_samples = data.size(0)
            # Flatten the trailing dimensions so each row holds one channel of one image.
            data = data.view(batch_samples, data.size(1), -1)
            mean += data.mean(2).sum(0)
            std += data.std(2).sum(0)
            nb_samples += batch_samples

        mean /= nb_samples
        std /= nb_samples

        return mean, std

In [20]:
max_len = 512
img_size = 224
# GetMeanStd installs its own transform on train_ds and iterates over it once.
get_stats = GetMeanStd(train_ds, batch_size=16, img_size=img_size)
mean, std = get_stats()
# Same resize / crop / to-tensor steps as GetMeanStd, followed by normalization
# with the statistics that were just computed on the training split.
img_transform = transforms.Compose([
    transforms.Resize(img_size),
    transforms.CenterCrop(img_size),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std),
])

def img_label_to_tensor(examples):
    """Convert the ``'image'`` and ``'label'`` columns of a batch to tensors.

    Args:
        examples: batch mapping; either column may be absent.

    Returns:
        The same ``examples`` mapping, updated in place. It is returned even
        when the batch has no ``'label'`` column.
    """
    if 'image' in examples.keys():
        examples['image'] = [img_transform(elem) for elem in examples['image']]
    if 'label' in examples.keys():
        examples['label'] = torch.tensor(examples['label'])
    return examples

# Register the conversion as the transform of both splits. For train_ds this
# replaces the transform installed by GetMeanStd.
train_ds.set_transform(img_label_to_tensor)
val_ds.set_transform(img_label_to_tensor)

Computing stats..: 100%|██████████| 4964/4964 [03:34<00:00, 23.17it/s]


In [21]:
batch_size = 16
# Both loaders drop the last incomplete batch; only the training loader shuffles.
loader_train = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True)
loader_val = DataLoader(val_ds, batch_size=batch_size, shuffle=False, drop_last=True)

In [22]:
num_cls = 5
model_name = 'resnet'

model = ResNetModel(num_classes=num_cls)
model = model.to('cuda')

TRAIN_FROM_SCRATCH = False
if not TRAIN_FROM_SCRATCH:
    # Fine-tuning: freeze every feature-extractor parameter tensor except the last six.
    param_groups = model.get_layer_groups()
    for param in param_groups['feature_extractor'][:-6]:
        param.requires_grad = False

summary(model)
print(model)

# Weight decay is applied to the first group only.
decay, no_decay = get_weight_decay_params(model)
optim_groups = [
    {'params': decay, 'weight_decay': 0.0001},
    {'params': no_decay, 'weight_decay': 0.0}
]

# The learning rate used to be `TRAIN_FROM_SCRATCH`, i.e. the boolean False,
# which SGD treats as a step size of 0, so no parameter was ever updated.
LEARNING_RATE = 1e-3
optimizer = optim.SGD(
        optim_groups,
        lr=LEARNING_RATE,
        momentum=0.9,
        nesterov=True)

criterion = nn.BCEWithLogitsLoss()



ResNetModel(
  (resnet): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_

In [23]:
# Run the train / validation loop on the GPU for a fixed number of epochs.
device = 'cuda'
epochs= 20
main_training_loop(model, criterion, optimizer, loader_train, loader_val, device, epochs)

Epoch 0:


TypeError: ignored

In [4]:
import torch
import numpy as np
import pandas as pd
import math

from PIL import Image
from torch import nn
from torch import optim
from torchinfo import summary
from datasets import load_dataset
from torchvision import transforms
from argparse import ArgumentParser
from torch.utils.data import DataLoader
from torch.nn.functional import normalize
from sklearn.metrics import accuracy_score
from hexbytes import HexBytes

In [5]:
from yacs.config import CfgNode as ConfigurationNode

# Default configuration tree. Callers take a clone of it and override these
# values by merging a YAML file into the clone.
__C = ConfigurationNode()

# --- Model ---
__C.MODEL = ConfigurationNode()
__C.MODEL.NAME = 'resnet' # add further model settings here as the code evolves
__C.MODEL.N_CLASSES = 5

# --- Dataset and preprocessing ---
__C.DATASET = ConfigurationNode()
__C.DATASET.RGB_IMAGES = True
__C.DATASET.IMG_SHAPE = 224
__C.DATASET.USE_IMAGENET_STATS = True
__C.DATASET.AUGUMENTATION = False
__C.DATASET.BINARY_LABELS = False
__C.DATASET.NORMALIZE = True
__C.DATASET.MAX_SEQ_LEN = 512

__C.DATASET.LOADER = ConfigurationNode()
__C.DATASET.LOADER.BATCH_SIZE = 16

# --- Training ---
__C.TRAINING = ConfigurationNode()
__C.TRAINING.N_EPOCHS = 100
__C.TRAINING.TRAIN_FROM_SCRATCH = False
__C.TRAINING.LAYERS_TO_FINETUNE = 6
__C.TRAINING.LOSS = 'binary_crossentropy'

__C.TRAINING.OPTIMIZER = ConfigurationNode()
__C.TRAINING.OPTIMIZER.NAME = 'sgd'
__C.TRAINING.OPTIMIZER.LR = 1e-3
__C.TRAINING.OPTIMIZER.WEIGHT_DECAY = 0.0001
__C.TRAINING.OPTIMIZER.MOMENTUM = 0.9
__C.TRAINING.OPTIMIZER.USE_WEIGHTS = False

# Early stopping settings.
__C.TRAINING.EARLY_STOPPING = ConfigurationNode()
__C.TRAINING.EARLY_STOPPING.USE = True
__C.TRAINING.EARLY_STOPPING.MONITOR = 'val_acc'
__C.TRAINING.EARLY_STOPPING.DECREASING = False
__C.TRAINING.EARLY_STOPPING.PATIENCE = 10

# Checkpoint settings; the '<config_name>' placeholder is presumably filled in per config file -- confirm.
__C.TRAINING.CHECKPOINTS = ConfigurationNode()
__C.TRAINING.CHECKPOINTS.USE = True
__C.TRAINING.CHECKPOINTS.MONITOR = 'val_acc'
__C.TRAINING.CHECKPOINTS.DECREASING = False
__C.TRAINING.CHECKPOINTS.PATH = 'checkpoints/<config_name>.pkl'

# Logger settings.
__C.TRAINING.LOGGER = ConfigurationNode()
__C.TRAINING.LOGGER.USE = True
__C.TRAINING.LOGGER.RUN_TAG = '<config_name>'

# Additional metrics to track and the averaging modes to compute them with.
__C.TRAINING.TRACK_METRICS = ConfigurationNode()
__C.TRAINING.TRACK_METRICS.USE = True
__C.TRAINING.TRACK_METRICS.NAMES = ('f1', 'precision', 'recall')
__C.TRAINING.TRACK_METRICS.AVERAGE = ['macro', 'micro'] # optionally change the list, e.g. add 'weighted'

def get_cfg_defaults():
    """Return a copy of the default configuration node.

    A clone is handed out so that callers can merge their own YAML settings
    into it without touching the module-level defaults.
    """
    defaults_copy = __C.clone()
    return defaults_copy

In [6]:
SAFE_IDX = 4 # the index of safe smart contract

def __get_RGB_image(bytecode):
    """Lay the raw bytes out as a square RGB image.

    Consecutive bytes become the R, G and B values of consecutive pixels,
    filled row by row. The tail is zero-padded so the pixel count is a
    perfect square.

    Args:
        bytecode: a bytes-like object readable through the buffer protocol.

    Returns:
        A PIL image of shape (side, side, 3) built from a uint8 array.
    """
    raw = np.frombuffer(bytecode, dtype=np.uint8)
    n_pixels = int(math.ceil(len(raw) / 3))
    side = int(math.ceil(math.sqrt(n_pixels)))
    # One zero-padding step up to side*side*3 bytes gives the same array as
    # padding to whole pixels first and to a whole square afterwards.
    padded = np.pad(raw, pad_width=(0, side * side * 3 - len(raw)))
    return Image.fromarray(padded.reshape((side, side, 3)))

def __get_one_hot_encoded_label(label):
    """Turn a list of class indices into a 5-element multi-hot vector.

    The class at ``SAFE_IDX`` has no slot of its own: it is skipped, and every
    index above it is shifted down by one.

    Args:
        label: iterable of integer class indices.

    Returns:
        A float numpy array of length 5 with 1 at every encoded class.
    """
    encoded = np.zeros(5)
    for cls_idx in label:
        if cls_idx == SAFE_IDX:
            continue
        slot = cls_idx if cls_idx < SAFE_IDX else cls_idx - 1
        encoded[slot] = 1
    return encoded

def generate_image_and_label(example):
    """Add the derived ``'image'`` and ``'label'`` entries to one dataset row.

    Args:
        example: mapping with a hex-encoded ``'bytecode'`` string and a
            ``'slither'`` list of class indices.

    Returns:
        The same ``example`` object, updated in place.
    """
    raw_bytes = HexBytes(example['bytecode'])
    example['image'] = __get_RGB_image(raw_bytes)
    class_indices = example['slither']
    example['label'] = __get_one_hot_encoded_label(class_indices)
    return example

In [7]:

from torch.utils.data import DataLoader
from torchvision import transforms

class GetMeanStd:
    """Estimate per-channel mean and standard deviation of an image dataset.

    Inspired by the implementation of https://github.com/Nikronic/CoarseNet/blob/master/utils/preprocess.py#L142-L200

    The constructor installs its own transform on the dataset it receives.
    """
    def __init__(self, dataset, batch_size, img_size):
        preprocessing_steps = [
            transforms.Resize(img_size),
            transforms.CenterCrop(img_size),
            transforms.ToTensor(),
        ]
        self.img_transform = transforms.Compose(preprocessing_steps)

        dataset.set_transform(self.__to_tensor)
        loader_options = dict(batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=0)
        self.data_loader = DataLoader(dataset=dataset, **loader_options)

    def __to_tensor(self, examples):
        """Convert every entry of ``examples['image']`` with ``self.img_transform``."""
        examples['image'] = list(map(self.img_transform, examples['image']))
        return examples

    def __call__(self):
        """Iterate once over the loader and return ``(mean, std)``.

        Both values are averages of per-image, per-channel statistics.
        """
        mean_sum = 0.
        std_sum = 0.
        nb_samples = 0.
        for batch in tqdm(self.data_loader, desc='Computing stats..'):
            images = batch['image']
            n_images, n_channels = images.size(0), images.size(1)
            # One row per (image, channel) pair.
            per_channel = images.view(n_images, n_channels, -1)
            mean_sum += per_channel.mean(2).sum(0)
            std_sum += per_channel.std(2).sum(0)
            nb_samples += n_images

        return mean_sum / nb_samples, std_sum / nb_samples

In [8]:
from torch import nn
from torchvision import models

class ResNetModel(nn.Module):
    """Wrapper around a pretrained torchvision ResNet-18.

    With ``classify=True`` the final ``fc`` layer is replaced by a new linear
    head with ``num_classes`` outputs. Otherwise the last child of the network
    is dropped and a ``Flatten`` layer is appended in its place.
    """

    def __init__(self, num_classes=5, classify=True):
        super().__init__()
        backbone = models.resnet18(pretrained=True)

        if classify:
            backbone.fc = nn.Linear(512, num_classes)
            self.resnet = backbone
        else:
            # Keep every child except the last one.
            kept_layers = list(backbone.children())[:-1]
            self.resnet = nn.Sequential(*kept_layers, nn.Flatten())

    def forward(self, inputs):
        """Run ``inputs`` through the wrapped network."""
        return self.resnet(inputs)

    def get_layer_groups(self):
        """Split the parameters by whether their name contains ``'fc'``.

        Returns:
            A dict with the ``'classifier'`` parameters (name contains 'fc')
            and the ``'feature_extractor'`` parameters (all others), both in
            ``named_parameters()`` order.
        """
        classifier_params = []
        extractor_params = []
        for name, param in self.resnet.named_parameters():
            target = classifier_params if 'fc' in name else extractor_params
            target.append(param)
        return {
            'classifier': classifier_params,
            'feature_extractor': extractor_params,
        }

In [9]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

metrics = {'acc': accuracy_score}

def initialize_logs_dict(loader_train, loader_val):
    """Build the empty bookkeeping dict for one epoch.

    Args:
        loader_train: training loader; only its length is read.
        loader_val: validation loader, or None when there is none.

    Returns:
        A dict holding the epoch number, the batch counts, one state dict for
        each of 'train' and 'val' (loss, predictions, labels, batch_idx), and a
        'metrics' dict with a zeroed ``<split>_<metric>`` entry for every key
        of the module-level ``metrics`` mapping.
    """
    def _fresh_split_state():
        # A new dict (and new lists) per split so the two never share state.
        return {'loss': 0.0, 'predictions': [], 'labels': [], 'batch_idx': 0}

    # Derive the train_* keys from `metrics` exactly like the val_* keys, so
    # the two stay in step when more metrics are registered.
    tracked = {split + '_' + metric: 0.0 for split in ('train', 'val') for metric in metrics.keys()}

    logs = {
        'epoch_num': 0,
        'train_batches_per_epoch': len(loader_train),
        'val_batches_per_epoch': len(loader_val) if loader_val is not None else None,
        'train': _fresh_split_state(),
        'val': _fresh_split_state(),
        'metrics': tracked
    }
    return logs

def run_epoch(model, criterion, optimizer, data_loader, device, mode, logs):
    """Run one pass over ``data_loader`` and record running statistics in ``logs``.

    Args:
        model: network to run; put in train mode when ``mode == 'train'``,
            otherwise in eval mode.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer stepped after every batch in train mode; unused otherwise.
        data_loader: iterable of batches with ``'image'`` and ``'label'`` tensors.
        device: device the batch tensors are moved to.
        mode: key of the state dict inside ``logs`` to update (``'train'`` or ``'val'``).
        logs: dict with the layout produced by ``initialize_logs_dict``; updated in place.

    Returns:
        None. Results are written to ``logs[mode]`` and ``logs['metrics']``.
    """
    is_training = mode == 'train'
    if is_training:
        model.train()
    else:
        model.eval()

    total_loss = 0.0
    running_metrics = {metric: 0.0 for metric in metrics.keys()}
    pbar = tqdm(data_loader, desc=f'{mode.capitalize()}ing...')

    # Batches are counted locally for the averages. `logs[mode]['batch_idx']`
    # survives across calls while the sums above restart at zero, so dividing
    # by it gave wrong averages whenever a logs dict was reused.
    for batches_seen, data in enumerate(pbar, start=1):
        images, labels = data['image'].to(device), data['label'].to(device)

        with torch.set_grad_enabled(is_training):
            outputs = model(images)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            # A logit >= 0 corresponds to a sigmoid probability >= 0.5.
            preds = (outputs >= 0.0).float()
            logs[mode]['predictions'] += preds.tolist()
            logs[mode]['labels'] += labels.tolist()
            logs[mode]['loss'] = total_loss / batches_seen

            for metric_name, metric_func in metrics.items():
                running_metrics[metric_name] += metric_func(labels.tolist(), preds.tolist())
                logs['metrics'][mode + '_' + metric_name] = running_metrics[metric_name] / batches_seen

            logs[mode]['batch_idx'] += 1
            pbar.set_postfix({'loss': logs[mode]['loss'], **{metric_name: logs['metrics'][mode + '_' + metric_name] for metric_name in metrics.keys()}})

            if is_training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

def main_training_loop(model, criterion, optimizer, loader_train, loader_val, device, epochs, loader_test=None):
    """Train and validate for ``epochs`` epochs, then optionally evaluate on a test loader.

    Args:
        model, criterion, optimizer, device: forwarded unchanged to ``run_epoch``.
        loader_train: loader used for the training pass of every epoch.
        loader_val: loader used for the validation pass of every epoch.
        epochs: number of epochs to run.
        loader_test: optional loader for a final evaluation pass. It used to be
            read from an undefined name inside this function, which raised a
            NameError once training had finished; pass it explicitly instead.

    Returns:
        None. Progress is printed.
    """
    for epoch in range(epochs):
        print(f'Epoch {epoch}:')
        logs = initialize_logs_dict(loader_train, loader_val)
        logs['epoch_num'] = epoch

        run_epoch(model, criterion, optimizer, loader_train, device, 'train', logs)
        run_epoch(model, criterion, optimizer, loader_val, device, 'val', logs)

        print('train_loss: {:.4f} | val_loss: {:.4f} |'.format(logs['train']['loss'], logs['val']['loss']), end=' ')
        print(" | ".join(['{}: {:.4f}'.format(metric_name, metric_val) for metric_name, metric_val in logs['metrics'].items()]), end='\n\n')

    if loader_test is not None:
        # Evaluate with a fresh logs dict so the numbers are not mixed with the
        # last validation pass (and so this also works when epochs == 0).
        # 'val' mode is reused because it runs without gradient updates.
        test_logs = initialize_logs_dict(loader_train, loader_test)
        run_epoch(model, criterion, optimizer, loader_test, device, 'val', test_logs)

        print('test_loss: {:.4f} |'.format(test_logs['val']['loss']), end=' ')
        print(" | ".join(['test_{}: {:.4f}'.format(metric_name[len('val_'):], metric_val)
                          for metric_name, metric_val in test_logs['metrics'].items()
                          if metric_name.startswith('val_')]), end='\n\n')


In [None]:
import torch
import numpy as np

from torch import nn
from torch import optim
from torchinfo import summary
from datasets import load_dataset
from torchvision import transforms
from argparse import ArgumentParser
from torch.utils.data import DataLoader
from torch.nn.functional import normalize
from sklearn.metrics import accuracy_score


# Module types whose weights receive weight decay (whitelist) and module types
# whose weights are excluded from it (blacklist).
whitelist_weight_modules = (torch.nn.Linear, torch.nn.Conv1d, torch.nn.Conv2d, nn.LSTM)
blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, nn.Embedding)

def get_weight_decay_params(model):
    """Split the parameters of ``model`` into a decayed and a non-decayed group.

    Adapted from the implementation at https://github.com/karpathy/minGPT/blob/3ed14b2cec0dfdad3f4b2831f2b4a86d11aef150/mingpt/model.py#L136

    Args:
        model: the ``nn.Module`` whose parameters should be grouped.

    Returns:
        A ``(decay, no_decay)`` tuple of parameter lists, each ordered by the
        full parameter name.

    Raises:
        AssertionError: if a parameter ends up in both groups or in neither.
    """
    decay = set()
    no_decay = set()
    for module_name, module in model.named_modules():
        # recurse=False yields only the parameters owned directly by `module`,
        # so `param_name` is a leaf name ('weight', 'bias_ih_l0', ...) and the
        # substring checks below cannot be fooled by the name of a submodule
        # (e.g. a Linear stored under an attribute called `bias_net`).
        for param_name, _ in module.named_parameters(recurse=False):
            fpn = '%s.%s' % (module_name, param_name) if module_name else param_name # full param name

            if 'bias' in param_name:
                # all biases will not be decayed
                no_decay.add(fpn)
            elif 'weight' in param_name and isinstance(module, whitelist_weight_modules):
                # weights of whitelist modules will be weight decayed
                decay.add(fpn)
            elif 'weight' in param_name and isinstance(module, blacklist_weight_modules):
                # weights of blacklist modules will NOT be weight decayed
                no_decay.add(fpn)

    # validate that we considered every parameter
    param_dict = {pn: p for pn, p in model.named_parameters()}
    inter_params = decay & no_decay
    union_params = decay | no_decay
    assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
    assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                % (str(param_dict.keys() - union_params), )

    # Sort by name so the order of each group is deterministic.
    decay = [param_dict[pn] for pn in sorted(decay)]
    no_decay = [param_dict[pn] for pn in sorted(no_decay)]

    return decay, no_decay


def train_pipeline(path):
    """Train the ResNet classifier end to end using the YAML configuration at ``path``.

    Steps: merge the YAML file into the default config, load and filter the
    train / validation splits, turn every contract into an image / label pair,
    compute normalization statistics on the training split, build the model,
    the loaders and the optimizer, and run the training loop.

    Args:
        path: path of the YAML file merged on top of the defaults.

    Raises:
        NotImplementedError: if ``cfg.DATASET.RGB_IMAGES`` is False. Only the
            RGB image pipeline is implemented here; this case used to fail
            later with a NameError on ``img_transform``.
    """
    # Resolve the configuration first so a bad path fails before any download starts.
    cfg = get_cfg_defaults()
    cfg.merge_from_file(path)
    cfg.freeze()

    if not cfg.DATASET.RGB_IMAGES:
        raise NotImplementedError('train_pipeline only supports DATASET.RGB_IMAGES = True')

    # `verification_mode='no_checks'` replaces the deprecated `ignore_verifications=True`;
    # `trust_remote_code=True` is what the loader's warning asks for (the dataset is
    # built by a loading script, so only enable it for repositories you trust).
    dataset_name = "mwritescode/slither-audited-smart-contracts"
    load_kwargs = dict(verification_mode='no_checks', trust_remote_code=True)
    train_ds = load_dataset(dataset_name, 'big-multilabel', split='train', **load_kwargs)
    val_ds = load_dataset(dataset_name, 'big-multilabel', split='validation', **load_kwargs)

    # Drop contracts whose bytecode is just the '0x' prefix.
    train_ds = train_ds.filter(lambda elem: elem['bytecode'] != '0x')
    val_ds = val_ds.filter(lambda elem: elem['bytecode'] != '0x')

    raw_columns = ['address', 'source_code', 'bytecode', 'slither']
    train_ds = train_ds.map(generate_image_and_label, remove_columns=raw_columns)
    val_ds = val_ds.map(generate_image_and_label, remove_columns=raw_columns)

    # Normalization statistics are computed on the training split only.
    img_size = cfg.DATASET.IMG_SHAPE
    get_stats = GetMeanStd(train_ds, batch_size=16, img_size=img_size)
    mean, std = get_stats()

    img_transform = transforms.Compose([
        transforms.Resize(img_size),
        transforms.CenterCrop(img_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ])

    def img_label_to_tensor(examples):
        """Convert the 'image' and 'label' columns of a batch to tensors."""
        if 'image' in examples.keys():
            examples['image'] = [img_transform(elem) for elem in examples['image']]
        if 'label' in examples.keys():
            examples['label'] = torch.tensor(examples['label'])
        return examples

    # For train_ds this replaces the transform installed by GetMeanStd.
    train_ds.set_transform(img_label_to_tensor)
    val_ds.set_transform(img_label_to_tensor)

    num_cls = 5
    # Fall back to the CPU when no GPU is available instead of failing on .to('cuda').
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = ResNetModel(num_classes=num_cls)
    model = model.to(device)
    # train_heper = REGISTRY['inception_train_helper'] if 'inception' in model_name else REGISTRY['default_train_helper']

    if not cfg.TRAINING.TRAIN_FROM_SCRATCH:
        # Fine-tuning: freeze all feature-extractor parameter tensors except the last ones.
        param_groups = model.get_layer_groups()
        for param in param_groups['feature_extractor'][:-cfg.TRAINING.LAYERS_TO_FINETUNE]:
            param.requires_grad = False

    summary(model)

    batch_size = cfg.DATASET.LOADER.BATCH_SIZE

    loader_train = DataLoader(train_ds,
                        batch_size=batch_size,
                        drop_last=True,
                        shuffle=True)
    loader_val = DataLoader(val_ds,
                        batch_size=batch_size,
                        drop_last=True,
                        shuffle=False)

    # trainer = Trainer(model=model, train_dataloader=loader_train, val_dataloader=loader_val, train_helper=train_heper)

    decay, no_decay = get_weight_decay_params(model)
    optim_groups = [
        {'params': decay, 'weight_decay': cfg.TRAINING.OPTIMIZER.WEIGHT_DECAY},
        {'params': no_decay, 'weight_decay': 0.0}
    ]

    optimizer = optim.SGD(
        optim_groups,
        lr=cfg.TRAINING.OPTIMIZER.LR,
        momentum=cfg.TRAINING.OPTIMIZER.MOMENTUM,
        nesterov=True)
    # No per-class positive weights are used.
    criterion = nn.BCEWithLogitsLoss(pos_weight=None)

    # trainer.compile(loss=criterion, optimizer=optimizer, metrics={'acc': accuracy_score})

    # NOTE(review): the epoch count is fixed here; cfg.TRAINING.N_EPOCHS is not consulted -- confirm this is intended.
    epochs = 20
    main_training_loop(model, criterion, optimizer, loader_train, loader_val, device, epochs)

    # callbacks = []
    # if cfg.TRAINING.TRACK_METRICS.USE:
    #     metrics = {}
    #     for avg in cfg.TRAINING.TRACK_METRICS.AVERAGE:
    #         print(avg)
    #         metrics.update({avg + '_' + metric: REGISTRY[metric](average=avg, labels=np.arange(0, num_cls)) for metric in cfg.TRAINING.TRACK_METRICS.NAMES})
    #     callbacks.append(MetricsCallback(metrics=metrics))

    # if cfg.TRAINING.LOGGER.USE:
    #     add_to_logging = [] if not cfg.TRAINING.TRACK_METRICS.USE else metrics.keys()
    #     callbacks.append(TensorBoardLogger(
    #         track_epochwise=['loss', 'acc', *add_to_logging],
    #         run_tag=cfg.TRAINING.LOGGER.RUN_TAG))

    # if cfg.TRAINING.EARLY_STOPPING.USE:
    #     callbacks.append(EarlyStopper(
    #         model=model,
    #         metric_name=cfg.TRAINING.EARLY_STOPPING.MONITOR,
    #         decreasing=cfg.TRAINING.EARLY_STOPPING.DECREASING,
    #         restore_best_weights=True,
    #         patience=cfg.TRAINING.EARLY_STOPPING.PATIENCE))


    # if cfg.TRAINING.CHECKPOINTS.USE:
    #     callbacks.append(CheckpointSaver(
    #         model=model,
    #         optimizer=optimizer,
    #         monitor=cfg.TRAINING.CHECKPOINTS.MONITOR,
    #         decreasing=cfg.TRAINING.CHECKPOINTS.DECREASING,
    #         path=cfg.TRAINING.CHECKPOINTS.PATH))

    # trainer.fit(epochs=cfg.TRAINING.N_EPOCHS, callbacks=callbacks)


train_pipeline('./default.yaml')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/8.00k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/19.9k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.12k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/203M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/197M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/193M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/224M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/227M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/232M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/230M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/233M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.04M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.97M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/659k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/79641 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/15972 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10861 [00:00<?, ? examples/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Filter:   0%|          | 0/79641 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10861 [00:00<?, ? examples/s]

Map:   0%|          | 0/79414 [00:00<?, ? examples/s]

Map:   0%|          | 0/10831 [00:00<?, ? examples/s]

Computing stats..: 100%|██████████| 4964/4964 [03:32<00:00, 23.39it/s]
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 147MB/s]


Epoch 0:


Training...:  24%|██▍       | 1192/4963 [01:06<03:18, 19.00it/s, loss=0.468, acc=0.388]

In [None]:
print(model)

NameError: ignored

In [None]:
# `verification_mode='no_checks'` is the replacement for the deprecated
# `ignore_verifications=True`. `trust_remote_code=True` is what the loader's own
# warning asks for: this dataset is built by a loading script from the Hub, so
# only enable it for repositories you trust.
train_ds = load_dataset("mwritescode/slither-audited-smart-contracts", 'big-multilabel', split='train',
                        verification_mode='no_checks', trust_remote_code=True)
val_ds = load_dataset("mwritescode/slither-audited-smart-contracts", 'big-multilabel', split='validation',
                      verification_mode='no_checks', trust_remote_code=True)
test_ds = load_dataset("mwritescode/slither-audited-smart-contracts", 'big-multilabel', split='test',
                       verification_mode='no_checks', trust_remote_code=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [None]:
# Keep only the rows whose bytecode is more than the bare '0x' prefix.
train_ds, val_ds, test_ds = (
    train_ds.filter(lambda elem: elem['bytecode'] != '0x'),
    val_ds.filter(lambda elem: elem['bytecode'] != '0x'),
    test_ds.filter(lambda elem: elem['bytecode'] != '0x'),
)

In [None]:
SAFE_IDX = 4 # the index of safe smart contract

def __get_RGB_image(bytecode):
    """Lay the raw bytes out as a square RGB image.

    Every three consecutive bytes form one pixel. The byte stream is
    zero-padded at the end so that it fills the smallest square whose pixel
    count can hold it.

    Args:
        bytecode: bytes-like object readable through the buffer protocol.

    Returns:
        A PIL image built from a (side, side, 3) uint8 array.
    """
    raw = np.frombuffer(bytecode, dtype=np.uint8)
    pixel_count = int(math.ceil(len(raw) / 3))
    side = int(math.ceil(math.sqrt(pixel_count)))

    # Zero canvas of the final size; copying the bytes to its front is the same
    # as padding the tail of the stream with zeros.
    canvas = np.zeros(side * side * 3, dtype=np.uint8)
    canvas[:len(raw)] = raw
    return Image.fromarray(canvas.reshape((side, side, 3)))

def __get_one_hot_encoded_label(label):
    """Turn a list of class indices into a 5-slot multi-hot vector.

    The SAFE_IDX class gets no slot: it is skipped, and indices above it are
    shifted down by one so the remaining classes occupy slots 0..4.

    Args:
        label: iterable of integer class indices.

    Returns:
        numpy array of length 5 holding 1 at every encoded class, 0 elsewhere.
    """
    encoded = np.zeros(5)
    for cls_idx in label:
        if cls_idx == SAFE_IDX:
            # the safe class contributes nothing to the vector
            continue
        shift = 1 if cls_idx > SAFE_IDX else 0
        encoded[cls_idx - shift] = 1
    return encoded

def generate_image_and_label(example):
    """Add the 'image' and 'label' fields to one dataset example.

    'image' is built from the hex-decoded 'bytecode' field and 'label' from the
    'slither' field. The example is modified in place and returned.
    """
    raw_bytes = HexBytes(example['bytecode'])
    picture = __get_RGB_image(raw_bytes)
    example['image'] = picture
    target = __get_one_hot_encoded_label(example['slither'])
    example['label'] = target
    return example

In [None]:
map_func = generate_image_and_label

# Raw columns dropped once the image/label fields have been derived from them.
map_kwargs = {'remove_columns': ['address', 'source_code', 'bytecode', 'slither']}

train_ds = train_ds.map(map_func, **map_kwargs)
val_ds = val_ds.map(map_func, **map_kwargs)
test_ds = test_ds.map(map_func, **map_kwargs)

In [None]:
# Leftovers from a sequence-padding variant of this transform; kept defined so that
# any other cell referring to them keeps working, but unused by the image pipeline.
max_len = 512
padding_val = 0
# Per-channel normalisation statistics applied after ToTensor().
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

def img_label_to_tensor(examples):
    """Batch transform that converts images and labels to tensors.

    Args:
        examples: dict mapping column names to lists of values for a batch.
            An 'image' column, when present, is expected to hold PIL images;
            a 'label' column, when present, must be convertible by torch.tensor.

    Returns:
        The same dict, with 'image' replaced by a list of normalised 224x224
        tensors and 'label' replaced by a single tensor. Columns that are
        absent are left untouched, and the dict is always returned.
    """
    if 'image' in examples.keys():
        img_size = 224
        img_transform = transforms.Compose([
            transforms.Resize(img_size),
            transforms.CenterCrop(img_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
        ])
        # The former second pass that np.pad-ed each image up to `max_len` has been
        # removed: on a (3, 224, 224) tensor the pad width broadcasts to every axis
        # and yields a (512, 733, 733) array that the network cannot consume.
        examples['image'] = [img_transform(elem) for elem in examples['image']]
    if 'label' in examples.keys():
        examples['label'] = torch.tensor(examples['label'])
    # Returned unconditionally; previously batches without 'label' produced None.
    return examples

In [None]:
# Register the tensor conversion as the on-access transform of every split.
for _split_ds in (train_ds, val_ds, test_ds):
    _split_ds.set_transform(img_label_to_tensor)

In [None]:
# Name of the backbone in use; not referenced by any of the cells visible here.
model_name = 'resnet'
# Number of output classes handed to ResNetModel when the model is created;
# equals the length of the label vectors built by the label encoder (5).
num_cls = 5

In [None]:
batch_size = 8

# Training: reshuffled, incomplete final batch discarded.
loader_train = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True)
# Validation: fixed order, incomplete final batch discarded.
loader_val = DataLoader(val_ds, batch_size=batch_size, shuffle=False, drop_last=True)
# Test: fixed order, every sample kept.
loader_test = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

In [None]:
from torch import nn
from torchvision import models

class ResNetModel(nn.Module):
    """ResNet-18 wrapper usable as a classifier or as a feature extractor.

    Args:
        num_classes: width of the replacement `fc` layer (used when `classify`).
        classify: when True, the final `fc` layer is swapped for a fresh
            Linear(512, num_classes); when False, the `fc` layer is dropped and
            the remaining layers are followed by a Flatten.
    """

    def __init__(self, num_classes=5, classify=True):
        super().__init__()
        backbone = models.resnet18(pretrained=True)

        if not classify:
            # keep every child except the last one and flatten the result
            trunk = list(backbone.children())[:-1]
            backbone = nn.Sequential(*trunk, nn.Flatten())
        else:
            backbone.fc = nn.Linear(512, num_classes)

        self.resnet = backbone

    def forward(self, inputs):
        """Run the wrapped network on `inputs` and return its output."""
        return self.resnet(inputs)

    def get_layer_groups(self):
        """Split the parameters into 'classifier' and 'feature_extractor' lists.

        A parameter goes to 'classifier' when its name contains 'fc', otherwise
        to 'feature_extractor'.
        """
        head_params, body_params = [], []
        for name, param in self.resnet.named_parameters():
            bucket = head_params if 'fc' in name else body_params
            bucket.append(param)
        return {'classifier': head_params, 'feature_extractor': body_params}

In [None]:
# Build the classifier and move it to the GPU in one step.
model = ResNetModel(num_classes=num_cls).to('cuda')

# Module types whose weights are / are not subject to weight decay.
whitelist_weight_modules = (nn.Linear, nn.Conv1d, nn.Conv2d, nn.LSTM)
blacklist_weight_modules = (nn.LayerNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.Embedding)

def get_weight_decay_params(model):
    """Partition the model's parameters into decayed and non-decayed lists.

    Adapted from the implementation at https://github.com/karpathy/minGPT/blob/3ed14b2cec0dfdad3f4b2831f2b4a86d11aef150/mingpt/model.py#L136

    Rules, applied per (module, parameter) pair:
      * any parameter whose name contains 'bias' is not decayed;
      * a 'weight' parameter of a whitelisted module is decayed;
      * a 'weight' parameter of a blacklisted module is not decayed.

    Returns:
        (decay, no_decay): two lists of parameters, each ordered by full
        parameter name.

    Raises:
        AssertionError: if a parameter lands in both groups or in neither.
    """
    decay_names, no_decay_names = set(), set()

    for mod_prefix, mod in model.named_modules():
        takes_decay = isinstance(mod, whitelist_weight_modules)
        skips_decay = isinstance(mod, blacklist_weight_modules)
        for rel_name, _ in mod.named_parameters():
            # full parameter name as seen from the top-level model
            full_name = f'{mod_prefix}.{rel_name}' if mod_prefix else rel_name

            if 'bias' in rel_name:
                no_decay_names.add(full_name)
            elif 'weight' not in rel_name:
                continue
            elif takes_decay:
                decay_names.add(full_name)
            elif skips_decay:
                no_decay_names.add(full_name)

    # validate that every parameter ended up in exactly one group
    named = dict(model.named_parameters())
    overlap = decay_names & no_decay_names
    unassigned = named.keys() - (decay_names | no_decay_names)
    assert len(overlap) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(overlap), )
    assert len(unassigned) == 0, "parameters %s were not separated into either decay/no_decay set!" % (str(unassigned), )

    decay = [named[name] for name in sorted(decay_names)]
    no_decay = [named[name] for name in sorted(no_decay_names)]
    return decay, no_decay

decay, no_decay = get_weight_decay_params(model)

# Two parameter groups: only the first one is regularised.
optim_groups = [
    dict(params=decay, weight_decay=0.0001),
    dict(params=no_decay, weight_decay=0.0),
]

optimizer = optim.SGD(optim_groups, lr=1e-3, momentum=0.9, nesterov=True)
# Independent sigmoid + binary cross-entropy per output logit.
criterion = nn.BCEWithLogitsLoss()



In [None]:
# Number of train/validation passes performed by main_training_loop.
epochs = 20
# Device that run_epoch moves each batch to.
# NOTE(review): the model itself is moved with a hard-coded 'cuda' in an earlier cell.
device = 'cuda'

In [None]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def _multilabel_metric(score_func):
    """Adapt a scikit-learn precision/recall/F-score function to multilabel targets.

    These scorers default to average='binary', which raises ValueError on
    multilabel-indicator targets such as the (batch, classes) 0/1 lists passed
    by run_epoch. average='samples' scores each sample over its own label set
    and averages the results; zero_division=0 scores an undefined ratio as 0
    without emitting a warning.
    """
    def scorer(y_true, y_pred):
        return score_func(y_true, y_pred, average='samples', zero_division=0)
    return scorer

# Every entry is called as metric(y_true, y_pred) once per batch.
# accuracy_score accepts multilabel targets directly (exact-match accuracy).
metrics = {
    'acc': accuracy_score,
    'f1': _multilabel_metric(f1_score),
    'precision': _multilabel_metric(precision_score),
    'recall': _multilabel_metric(recall_score),
}

def initialize_logs_dict(loader_train, loader_val):
    """Create the blank bookkeeping dict used for one epoch.

    Args:
        loader_train: sized iterable of training batches.
        loader_val: sized iterable of validation batches, or None.

    Returns:
        dict with the epoch number, batches per epoch for each split, a
        per-split state ('loss', 'predictions', 'labels', 'batch_idx') under
        'train' and 'val', and a 'metrics' dict holding 'train_acc' plus one
        'val_<name>' entry per key of the module-level `metrics`.
    """
    def blank_split_state():
        # fresh lists on every call so the splits never share storage
        return {'loss': 0.0, 'predictions': [], 'labels': [], 'batch_idx': 0}

    train_batches = len(loader_train)
    val_batches = None if loader_val is None else len(loader_val)

    tracked = {'train_acc': 0.0}
    for metric in metrics.keys():
        tracked['val_' + metric] = 0.0

    return {
        'epoch_num': 0,
        'train_batches_per_epoch': train_batches,
        'val_batches_per_epoch': val_batches,
        'train': blank_split_state(),
        'val': blank_split_state(),
        'metrics': tracked,
    }

def run_epoch(model, criterion, optimizer, data_loader, device, mode, logs):
    """Run one pass over `data_loader`, updating `logs` in place.

    Args:
        model: network called on each image batch.
        criterion: loss function called as criterion(outputs, labels).
        optimizer: optimizer stepped after every batch when mode == 'train'.
        data_loader: iterable of dicts with 'image' and 'label' tensors.
        device: device the batch tensors are moved to.
        mode: 'train' enables training mode and gradient updates; any other
            value puts the model in eval mode with gradients disabled. Also
            the key of the per-split entry of `logs` that gets updated.
        logs: dict as produced by initialize_logs_dict.

    Predictions are the logits thresholded at 0. The running loss and metric
    averages divide sums local to this call by logs[mode]['batch_idx'] + 1.
    """
    is_training = mode == 'train'
    if is_training:
        model.train()
    else:
        model.eval()

    loss_sum = 0.0
    metric_sums = dict.fromkeys(metrics.keys(), 0.0)
    split_logs = logs[mode]
    progress = tqdm(data_loader, desc=f'{mode.capitalize()}ing...')

    for batch in progress:
        images = batch['image'].to(device)
        labels = batch['label'].to(device)

        with torch.set_grad_enabled(is_training):
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss_sum += loss.item()

            # a logit >= 0 corresponds to a positive prediction
            preds = (outputs >= 0.0).float()
            batch_preds = preds.tolist()
            batch_labels = labels.tolist()
            split_logs['predictions'] += batch_preds
            split_logs['labels'] += batch_labels

            batches_seen = split_logs['batch_idx'] + 1
            split_logs['loss'] = loss_sum / batches_seen

            for name, score in metrics.items():
                metric_sums[name] += score(batch_labels, batch_preds)
                logs['metrics'][mode + '_' + name] = metric_sums[name] / batches_seen

            split_logs['batch_idx'] = batches_seen

            postfix = {'loss': split_logs['loss']}
            for name in metrics.keys():
                postfix[name] = logs['metrics'][mode + '_' + name]
            progress.set_postfix(postfix)

            if is_training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

def main_training_loop(model, criterion, optimizer, loader_train, loader_val, device, epochs, loader_test=None):
    """Train for `epochs` epochs, validating after each, then evaluate on the test set.

    Args:
        model, criterion, optimizer: forwarded unchanged to run_epoch.
        loader_train: batches used for the training pass of every epoch.
        loader_val: batches used for the validation pass of every epoch.
        device: device the batches are moved to.
        epochs: number of train/validation rounds.
        loader_test: optional batches for the final evaluation. When omitted,
            a module-level `loader_test` is used if one exists (the previous
            behaviour); if there is none, the final evaluation is skipped.

    Per-epoch and final results are printed; nothing is returned.
    """
    for epoch in range(epochs):
        print(f'Epoch {epoch}:')
        logs = initialize_logs_dict(loader_train, loader_val)
        logs['epoch_num'] = epoch

        run_epoch(model, criterion, optimizer, loader_train, device, 'train', logs)
        run_epoch(model, criterion, optimizer, loader_val, device, 'val', logs)

        print('train_loss: {:.4f} | val_loss: {:.4f} |'.format(logs['train']['loss'], logs['val']['loss']), end=' ')
        print(" | ".join(['{}: {:.4f}'.format(metric_name, metric_val) for metric_name, metric_val in logs['metrics'].items()]), end='\n\n')

    if loader_test is None:
        # Backward compatibility: earlier callers relied on the notebook-level variable.
        loader_test = globals().get('loader_test')
    if loader_test is None:
        return

    # Fresh logs for the test pass. Reusing the last epoch's dict would divide the
    # test sums by a batch counter that already includes the validation batches and
    # would mix test predictions into the validation ones (and `logs` does not even
    # exist when epochs == 0).
    test_logs = initialize_logs_dict(loader_train, loader_test)
    run_epoch(model, criterion, optimizer, loader_test, device, 'val', test_logs)

    # run_epoch stores evaluation results under the 'val' keys; report them as test results.
    print('test_loss: {:.4f} |'.format(test_logs['val']['loss']), end=' ')
    print(" | ".join(['test_{}: {:.4f}'.format(metric_name, test_logs['metrics']['val_' + metric_name]) for metric_name in metrics.keys()]), end='\n\n')


main_training_loop(model, criterion, optimizer, loader_train, loader_val, device, epochs, loader_test=loader_test)

NameError: ignored

In [None]:
# Save only the model's state_dict (not the module object) to 'model.pt'.
torch.save(model.state_dict(), 'model.pt')