In [1]:
import re
import pandas as pd
import seaborn as sns
from hexbytes import HexBytes
import matplotlib.pyplot as plt
from datasets import load_dataset

train_set = load_dataset("mwritescode/slither-audited-smart-contracts", 'small-multilabel', split='train', verification_mode='no_checks', trust_remote_code=True)
test_set = load_dataset("mwritescode/slither-audited-smart-contracts", 'small-multilabel', split='test', verification_mode='no_checks', trust_remote_code=True)
val_set = load_dataset("mwritescode/slither-audited-smart-contracts", 'small-multilabel', split='validation', verification_mode='no_checks', trust_remote_code=True)

In [2]:
train_df = train_set.to_pandas()
test_df = test_set.to_pandas()
val_df = val_set.to_pandas()
print("Train DataFrame Head:\n", train_df.head(), train_df.describe(include='all'))

Train DataFrame Head:
                                       address  \
0  0x01b23286ff60a543ec29366ae8d6b6274ca20541   
1  0x0cfb151de2c34aceb532f43683e5b7bed62f298f   
2  0x0e68432827674ad048b803d1ee289ae78b3917b9   
3  0x1149d772bce9a636d0d7535ec865f3c6c8ee3b5c   
4  0x11c26446b5ce3b895ef6a9a594cf9df6e8badbd7   

                                         source_code  \
0  pragma solidity 0.4.26;\n\ninterface IERC20 {\...   
1  pragma solidity 0.6.12;\npragma experimental A...   
2  pragma solidity 0.6.12;\npragma experimental A...   
3  pragma solidity 0.6.10;\npragma experimental A...   
4  pragma solidity 0.6.5;\npragma experimental AB...   

                                            bytecode             slither  
0  0x608060405260043610610112576000357c0100000000...                 [6]  
1  0x608060405260043610620002475760003560e01c8063...  [1, 5, 6, 2, 3, 0]  
2  0x6080604052600436106101185760003560e01c80638d...  [1, 5, 6, 2, 3, 0]  
3  0x608060405234801561001057600080fd5b506004

In [3]:
from transformers import AutoModel

sourcecode_train = train_df.source_code.tolist()
sourcecode_test = test_df.source_code.tolist()
sourcecode_val = val_df.source_code.tolist()

model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-small-en', trust_remote_code=True)

In [4]:
len(model.encode(sourcecode_train[0]))
## 512 vectors for our source code embeddings

512

In [5]:
import pandas as pd
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from datasets import load_metric
import os


def compute_embeddings_with_checkpoint(sourcecode, model, checkpoint_file, checkpoint_freq=10, progress_freq=1):
    if os.path.exists(checkpoint_file):
        embeddings = np.load(checkpoint_file)
        print(f"Resuming computation from checkpoint. Loaded {embeddings.shape[0]} embeddings.")
    else:
        embeddings = np.empty((0, model.embedding_size))  
    
    start_index = embeddings.shape[0]

    for i, code in enumerate(sourcecode[start_index:]):
        embedding = model.encode(code)
        embeddings = np.append(embeddings, [embedding], axis=0) 

        if (i + 1) % progress_freq == 0:
            print(f"Embedding {start_index + i + 1} Done")
        
        if (start_index + i + 1) % checkpoint_freq == 0:
            np.save(checkpoint_file, embeddings)
            print(f"Embeddings saved to {checkpoint_file}")

    return embeddings


train_embeddings = compute_embeddings_with_checkpoint(sourcecode_train, model, "source_train_embeddings_checkpoint.npy")
test_embeddings = compute_embeddings_with_checkpoint(sourcecode_test, model, "source_test_embeddings_checkpoint.npy")
val_embeddings = compute_embeddings_with_checkpoint(sourcecode_val, model, "source_val_embeddings_checkpoint.npy")

np.save('source_code_embeddings_train.npy', train_embeddings)
np.save('source_code_embeddings_test.npy', test_embeddings)
np.save('source_code_embeddings_val.npy', val_embeddings)

mlb = MultiLabelBinarizer()
# Fit and transform the labels
train_labels = mlb.fit_transform(train_df['slither'])
test_labels = mlb.fit_transform(test_df['slither'])
val_labels = mlb.fit_transform(val_df['slither'])
train_df['labels'] = list(train_labels)
test_df['labels'] = list(test_labels)
val_df['labels'] = list(val_labels)

class EmbeddingsDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.embeddings[idx], dtype=torch.float),
            'labels': torch.tensor(self.labels.iloc[idx], dtype=torch.float)
        }

print("DONE")

Resuming computation from checkpoint. Loaded 140 embeddings.


KeyboardInterrupt: 

In [19]:
loaded_data = np.load('source_train_embeddings_checkpoint.npy')

10


In [4]:
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.optimizers import Adam

mlb = MultiLabelBinarizer()
train_labels_bin = mlb.fit_transform(train_labels)
test_labels_bin = mlb.transform(test_labels)

def create_model(hidden_layer_sizes, activation):
    model = Sequential()
    model.add(Dense(512, activation=activation, input_shape=(train_embeddings.shape[1],)))
    for size in hidden_layer_sizes:
        model.add(Dense(size, activation=activation))
        model.add(Dropout(0.25))
    model.add(Dense(num_classes, activation='softmax'))
    return model

num_classes = train_labels_bin.shape[1]
input_shape = (train_embeddings.shape[1],)

param_grid = {
    'hidden_layer_sizes': [(256,), (512,), (256, 128), (512, 256)],
    'activation': ['relu', 'sigmoid', 'tanh']
}

model = create_model(input_shape, num_classes, hidden_layer_sizes=(512,), activation='relu')
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

checkpoint = ModelCheckpoint('best_model.h5', monitor='val_accuracy', save_best_only=True, mode='max', verbose=1)
early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
callbacks_list = [checkpoint, early_stopping]

history = model.fit(train_embeddings, train_labels_bin, validation_data=(test_embeddings, test_labels_bin), epochs=20, batch_size=32, callbacks=callbacks_list)

random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=5, cv=3, verbose=2)
random_search.fit(train_embeddings, train_labels_bin)

model.evaluate(test_embeddings, test_labels_bin)

NameError: name 'train_labels' is not defined

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# See how simple model performs
clf = LogisticRegression(max_iter=1000)
clf.fit(train_embeddings, train_labels)

preds = clf.predict(test_embeddings)
print(f"F1 Micro: {f1_score(test_labels, preds, average='micro')}")
print(f"F1 Macro: {f1_score(test_labels, preds, average='macro')}")
