
## Download and Preprocess Dataset
Install and import the necessary Python libraries.

In [1]:
!pip install datasets[vision]
!pip install hexbytes

In [38]:
import re
import pandas as pd
import json
import seaborn as sns
from hexbytes import HexBytes
import matplotlib.pyplot as plt
from datasets import load_dataset
import numpy as np
import os

In [None]:
# The HuggingFace metadata currently lists checksums that do not match two of
# the dataset files, so every split is loaded with `ignore_verifications=True`
# to skip the verification step.
# NOTE(review): newer `datasets` releases are believed to replace this flag with
# `verification_mode` -- confirm against the installed version.
train_set, test_set, val_set = (
    load_dataset(
        "mwritescode/slither-audited-smart-contracts",
        'big-multilabel',
        split=split_name,
        ignore_verifications=True,
    )
    for split_name in ('train', 'test', 'validation')
)

# Extract preferred contracts

Extract the contracts flagged with the *access_control* vulnerability and, separately, those flagged with *unchecked_calls* (label ids `0` and `5` in the code below).

In [2]:
# Convert each loaded split into a pandas DataFrame for the filtering below.
train_set_df, test_set_df, val_set_df = (
    split.to_pandas() for split in (train_set, test_set, val_set)
)

In [33]:
# Label ids looked up in the `slither` column for the two subsets of interest.
ACCESS_CONTROL_LABEL = 0
UNCHECKED_CALLS_LABEL = 5


def _select_by_label(split_df, label):
    """Return the rows of ``split_df`` whose ``slither`` entry contains ``label``.

    The ``source_code`` and ``slither`` columns are dropped from the result,
    so only the remaining columns are carried forward.
    """
    has_label = split_df['slither'].apply(lambda labels: label in labels)
    return split_df[has_label].drop(columns=['source_code', 'slither'])


train_access_control = _select_by_label(train_set_df, ACCESS_CONTROL_LABEL)
train_unchecked_calls = _select_by_label(train_set_df, UNCHECKED_CALLS_LABEL)

test_access_control = _select_by_label(test_set_df, ACCESS_CONTROL_LABEL)
test_unchecked_calls = _select_by_label(test_set_df, UNCHECKED_CALLS_LABEL)

val_access_control = _select_by_label(val_set_df, ACCESS_CONTROL_LABEL)
val_unchecked_calls = _select_by_label(val_set_df, UNCHECKED_CALLS_LABEL)

In [2]:
# One line per split (train, val, test): access_control shape, then unchecked_calls shape.
for access_df, unchecked_df in (
    (train_access_control, train_unchecked_calls),
    (val_access_control, val_unchecked_calls),
    (test_access_control, test_unchecked_calls),
):
    print(access_df.shape, unchecked_df.shape)

# Save to disk

Save each row of each DataFrame as its own JSON file, named after the row's `address` value.

In [39]:
def save_to_json(data, out_dir):
    """Write every row of ``data`` to its own JSON file inside ``out_dir``.

    Each file is named ``<address>.json`` after the row's ``address`` value and
    holds the row as a JSON object keyed by column name. Rows that share an
    address are written to the same path, so the last one wins.

    Args:
        data: DataFrame with an ``address`` column whose row values can be
            serialised by ``json``.
        out_dir: Target directory; created (including parents) when missing.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(out_dir, exist_ok=True)

    for _, row in data.iterrows():
        filename = os.path.join(out_dir, f"{row['address']}.json")
        # Explicit encoding keeps the output independent of the platform locale.
        with open(filename, 'w', encoding='utf-8') as file:
            json.dump(row.to_dict(), file)

In [40]:
# Each filtered subset paired with the directory its rows are written to.
export_plan = [
    (train_access_control, './json/access_control/train'),
    (train_unchecked_calls, './json/unchecked_calls/train'),
    (test_access_control, './json/access_control/test'),
    (test_unchecked_calls, './json/unchecked_calls/test'),
    (val_access_control, './json/access_control/val'),
    (val_unchecked_calls, './json/unchecked_calls/val'),
]

for subset_df, target_dir in export_plan:
    save_to_json(subset_df, target_dir)