# Datasets - SmartBugs Wild

In [9]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [10]:
# Matches either a quoted string literal (group 1) or a comment (group 2).
# The string alternatives are escape-aware: a backslash consumes the character
# after it, so an escaped quote such as \" does not terminate the literal.
_STRING_OR_COMMENT_RE = re.compile(
    r'("(?:\\.|[^"\\])*"'            # group 1: double-quoted string
    r"|'(?:\\.|[^'\\])*')"           # group 1: single-quoted string
    r"|(/\*.*?\*/|//[^\r\n]*$)",     # group 2: /* block */ or // line comment
    re.MULTILINE | re.DOTALL,
)


def remove_comments(string):
    """Strip ``//`` line comments and ``/* ... */`` block comments from source text.

    Quoted string literals are matched first and returned unchanged, so comment
    markers that appear inside a string (e.g. ``"http://..."``) are preserved.
    Backslash escapes inside string literals are honoured, so an escaped quote
    does not end the literal early.

    Args:
        string: Source text to clean.

    Returns:
        The text with every comment replaced by the empty string. The newline
        that ends a ``//`` comment is kept; newlines inside a block comment are
        removed together with the comment, so subsequent lines move up.
    """
    def _replacer(match):
        # Group 2 is set only when a real comment was matched: drop it.
        if match.group(2) is not None:
            return ""
        # Otherwise a string literal was matched: put it back untouched.
        return match.group(1)

    return _STRING_OR_COMMENT_RE.sub(_replacer, string)

In [11]:
# Directory whose entries are read one by one; each file name (minus its
# extension) becomes the row's "address".
# NOTE(review): absolute, machine-specific path — consider making it configurable.
path="/home/bombbom/Documents/smartbugs-wild/contracts/"
data = []
for d in os.listdir(path):
    sub = os.path.join(path, d)
    # The context manager closes each handle right after reading, so handles
    # do not pile up while the directory is iterated.
    with open(sub, 'r') as contract:
        source = contract.read()
    # splitext drops the extension whatever its length, instead of assuming
    # exactly four trailing characters.
    row = {"address": os.path.splitext(d)[0], "source_code": source}
    data.append(row)

In [13]:
# Number of rows collected by the loading loop (one per directory entry read).
len(data)

47398

In [14]:
# Turn the list of {"address", "source_code"} dicts into a DataFrame.
data = pd.DataFrame(data)
# Make sure the output directory exists so the notebook also runs on a fresh
# checkout; exist_ok=True makes this a no-op when it is already there.
os.makedirs("dataset_example", exist_ok=True)
# Persist the raw table so later sections can reload it without re-reading
# every file.
data.to_pickle("dataset_example/data_raw.pkl")

In [15]:
# Drop the in-memory table; the next section reloads it from the pickle.
del data

## Load datasets

In [16]:
# Reload the raw contract table (address, source_code) from its pickle.
# NOTE(review): this reads an absolute, machine-specific path, whereas the cell
# that writes data_raw.pkl uses the relative path "dataset_example/data_raw.pkl"
# — confirm both point at the same file.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
data = pd.read_pickle("/home/bombbom/Documents/NLP_in_Detection_System/dataset_example/data_raw.pkl")

In [17]:
# Display the reloaded table (the rendered output is truncated to the first and last rows).
data

Unnamed: 0,address,source_code
0,0x21e13cb3f3f26f92a62ac7adab4093e8997d1fb1,pragma solidity ^0.4.18;\n\n// ---------------...
1,0x344005c29af957567f0b40950b425ed018b92170,pragma solidity ^0.4.21;\n\n\ncontract DSMath...
2,0x44e320110176c11c93e116f6770f13d96deded43,pragma solidity ^0.4.20;\n\n\ncontract Ownable...
3,0x5ecd84482176db90bb741ddc8c2f9ccc290e29ce,pragma solidity ^0.4.8;\ncontract Token{\n ...
4,0xcaf187eb618d2335b4130d784a697be96f4b07b9,pragma solidity 0.4.15;\n\ncontract RegistryIC...
...,...,...
47393,0x7f650f3b231d3a32c2b0e2940e870acdd4aa9961,pragma solidity ^0.4.24;\n\n\n\n\n/**\n * @tit...
47394,0xc73f2474001ad1d6aed615af53631148cf98de6b,pragma solidity ^0.4.18;\n\n// ---------------...
47395,0xb2105f178abe620ddbb86a70afa94bfa9daa4d01,pragma solidity ^0.4.23;\n\nlibrary SafeMath {...
47396,0xe2d4b960d0c639633582cddef57528461e62083d,pragma solidity ^0.4.23;\n\n\n/// @title Multi...


In [18]:
# Load the per-address analysis results (one column of findings per tool, plus
# the aggregated `categories`, `lines` and `nb_vulnerabilities` columns).
# NOTE(review): absolute, machine-specific path — confirm before running elsewhere.
smartbugs_results = pd.read_csv("/home/bombbom/Documents/NLP_in_Detection_System/dataset_example/smartbugs_wild_full.csv", index_col=None)

In [19]:
# Display the full results table as loaded from the CSV.
smartbugs_results

Unnamed: 0,address,mythril_vulnerabilities,slither_vulnerabilities,oyente_vulnerabilities,osiris_vulnerabilities,smartcheck_vulnerabilities,manticore_vulnerabilities,maian_vulnerabilities,securify_vulnerabilities,honeybadger_vulnerabilities,categories,lines,nb_vulnerabilities
0,0x8d12a197cb00d4747a1fe03395095ce2a5cc6819,[],"['reentrancy-no-eth', 'reentrancy-benign', 'lo...",['Integer Underflow.'],[],"['SOLIDITY_CALL_WITHOUT_DATA', 'SOLIDITY_FUNCT...",[],[],[],[],"['reentrancy', 'unchecked_low_calls', 'arithme...","[32, 229, 38, 136, 268, 45, 51, 254, 56, 28, 2...",15
1,0x2a0c0dbecc7e4d658f48e01e3fa353f44050c208,"['Unchecked CALL return value', 'Message call ...",['reentrancy-benign'],['Integer Overflow.'],[],['SOLIDITY_FUNCTIONS_RETURNS_TYPE_AND_NO_RETURN'],[],[],['DAOConstantGas'],[],"['unchecked_low_calls', 'reentrancy', 'arithme...","[163, 36, 101, 133, 110, 145, 122, 127]",10
2,0x174bfa6600bf90c885c7c01c7031389ed1461ab9,[],[],[],[],[],[],[],[],[],[],[],0
3,0x06012c8cf97bead5deae237070f9587f8e7a266d,"['Unchecked CALL return value', 'Message call ...","['arbitrary-send', 'reentrancy-eth', 'incorrec...","['Integer Underflow.', 'Integer Overflow.', 'C...","['underflow_bugs', 'truncation_bugs', 'callsta...","['SOLIDITY_LOCKED_MONEY', 'SOLIDITY_UNCHECKED_...",[],[],[],[],"['unchecked_low_calls', 'reentrancy', 'access_...","[769, 1414, 1287, 1686, 1175, 1048, 1817, 924,...",82
4,0x86fa049857e0209aa7d9e616f7eb3b3b78ecfdb0,[],[],[],['underflow_bugs'],[],[],[],[],[],['arithmetic'],[115],1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
47446,0x000000eade0fe9269d0412a6055b6f3c5d968488,['Message call to external contract'],[],[],[],[],,,[],,['reentrancy'],"[47, 49, 51, 52, 54, 56]",6
47447,0x000000d4e883e304c7f9574ebeecf238eb55a40f,[],[],['Integer Underflow.'],[],[],,,[],,['arithmetic'],[24],1
47448,0x000000961d1ac83a67d0ce61612b36f18c10c9b7,[],[],['Integer Underflow.'],[],[],,,[],[],['arithmetic'],[26],1
47449,0x00000000e82eb0431756271f0d00cfb143685e7b,[],[],[],[],[],,,['UnrestrictedWrite'],,['access_control'],"[4069, 4029]",2


In [20]:
# Keep only the join key and the two label columns; the per-tool columns are
# not used further down.
smartbugs_results = smartbugs_results.loc[:, ["address", "categories", "lines"]]

In [21]:
# Display the reduced table (address, categories, lines).
smartbugs_results

Unnamed: 0,address,categories,lines
0,0x8d12a197cb00d4747a1fe03395095ce2a5cc6819,"['reentrancy', 'unchecked_low_calls', 'arithme...","[32, 229, 38, 136, 268, 45, 51, 254, 56, 28, 2..."
1,0x2a0c0dbecc7e4d658f48e01e3fa353f44050c208,"['unchecked_low_calls', 'reentrancy', 'arithme...","[163, 36, 101, 133, 110, 145, 122, 127]"
2,0x174bfa6600bf90c885c7c01c7031389ed1461ab9,[],[]
3,0x06012c8cf97bead5deae237070f9587f8e7a266d,"['unchecked_low_calls', 'reentrancy', 'access_...","[769, 1414, 1287, 1686, 1175, 1048, 1817, 924,..."
4,0x86fa049857e0209aa7d9e616f7eb3b3b78ecfdb0,['arithmetic'],[115]
...,...,...,...
47446,0x000000eade0fe9269d0412a6055b6f3c5d968488,['reentrancy'],"[47, 49, 51, 52, 54, 56]"
47447,0x000000d4e883e304c7f9574ebeecf238eb55a40f,['arithmetic'],[24]
47448,0x000000961d1ac83a67d0ce61612b36f18c10c9b7,['arithmetic'],[26]
47449,0x00000000e82eb0431756271f0d00cfb143685e7b,['access_control'],"[4069, 4029]"


In [22]:
# Attach categories/lines to each contract by address. The inner join keeps
# only addresses present in both tables.
data = pd.merge(data, smartbugs_results, on="address", how="inner")

In [75]:
# Display the merged table (source code together with categories and lines).
data

Unnamed: 0,address,source_code,categories,lines
0,0x21e13cb3f3f26f92a62ac7adab4093e8997d1fb1,pragma solidity ^0.4.18;\n\n// ---------------...,"['Other', 'arithmetic', 'arithmetic']","[154, 141, 103]"
1,0x344005c29af957567f0b40950b425ed018b92170,pragma solidity ^0.4.21;\n\n\ncontract DSMath...,"['arithmetic', 'reentrancy', 'arithmetic', 'ar...","[201, 16, 209, 176, 146, 154, 29]"
2,0x44e320110176c11c93e116f6770f13d96deded43,pragma solidity ^0.4.20;\n\n\ncontract Ownable...,['arithmetic'],"[82, 46]"
3,0x5ecd84482176db90bb741ddc8c2f9ccc290e29ce,pragma solidity ^0.4.8;\ncontract Token{\n ...,"['arithmetic', 'unchecked_low_calls', 'arithme...","[99, 38, 78, 48, 49, 60, 93]"
4,0xcaf187eb618d2335b4130d784a697be96f4b07b9,pragma solidity 0.4.15;\n\ncontract RegistryIC...,['Other'],"[80, 28, 63]"
...,...,...,...,...
47326,0x7f650f3b231d3a32c2b0e2940e870acdd4aa9961,pragma solidity ^0.4.24;\n\n\n\n\n/**\n * @tit...,"['reentrancy', 'arithmetic', 'time_manipulatio...","[258, 264, 245, 253, 287]"
47327,0xc73f2474001ad1d6aed615af53631148cf98de6b,pragma solidity ^0.4.18;\n\n// ---------------...,"['Other', 'arithmetic', 'arithmetic']","[200, 218, 128, 102]"
47328,0xb2105f178abe620ddbb86a70afa94bfa9daa4d01,pragma solidity ^0.4.23;\n\nlibrary SafeMath {...,"['arithmetic', 'arithmetic', 'front_running']","[124, 20, 205]"
47329,0xe2d4b960d0c639633582cddef57528461e62083d,pragma solidity ^0.4.23;\n\n\n/// @title Multi...,"['arithmetic', 'Other', 'unchecked_low_calls',...","[140, 142, 23, 282, 410, 157, 287, 296, 302, 3..."


In [24]:
# Persist the labeled table so the analysis section can start from this file.
data.to_pickle("dataset_example/labeled_SBW_datasets.pkl")

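## Exploratory Data Analysis
Each contract is labeled with zero or more of the vulnerability categories below. The list order matches the positions of the 8-element label vector built by `mapping`:
- access_control
- arithmetic
- denial_service
- front_running
- reentrancy
- time_manipulation
- unchecked_low_calls
- Other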

In [97]:
# Reload the labeled table written by the previous section.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
data = pd.read_pickle("dataset_example/labeled_SBW_datasets.pkl")

In [98]:
def mapping(input):
    """Encode a stringified category list as an 8-element 0/1 vector.

    Args:
        input: Text such as ``"['arithmetic', 'Other']"``. The first and last
            characters (the brackets) are dropped, quotes and spaces are
            removed, and the remainder is split on commas.

    Returns:
        A list of eight ints. Position ``i`` is 1 when the i-th category of
        (access_control, arithmetic, denial_service, front_running, reentrancy,
        time_manipulation, unchecked_low_calls, Other) occurs in the input,
        otherwise 0. Repeated categories still yield a single 1.
    """
    ordered_categories = (
        "access_control",
        "arithmetic",
        "denial_service",
        "front_running",
        "reentrancy",
        "time_manipulation",
        "unchecked_low_calls",
        "Other",
    )
    # Drop the surrounding brackets, then the quote and space characters.
    stripped = input[1:-1]
    for unwanted in ("'", " "):
        stripped = stripped.replace(unwanted, "")
    found = stripped.split(",")
    return [1 if name in found else 0 for name in ordered_categories]
     

In [99]:
# Replace each stringified category list with its 8-element 0/1 label vector.
# Not re-runnable: `mapping` calls str.replace on its argument, so a second run
# (when the column already holds lists) raises. Reload the pickle first.
data["categories"] = data["categories"].apply(mapping)

In [100]:
# Display the table after label encoding (categories are now 0/1 vectors).
data

Unnamed: 0,address,source_code,categories,lines
0,0x21e13cb3f3f26f92a62ac7adab4093e8997d1fb1,pragma solidity ^0.4.18;\n\n// ---------------...,"[0, 1, 0, 0, 0, 0, 0, 1]","[154, 141, 103]"
1,0x344005c29af957567f0b40950b425ed018b92170,pragma solidity ^0.4.21;\n\n\ncontract DSMath...,"[0, 1, 0, 0, 1, 0, 0, 0]","[201, 16, 209, 176, 146, 154, 29]"
2,0x44e320110176c11c93e116f6770f13d96deded43,pragma solidity ^0.4.20;\n\n\ncontract Ownable...,"[0, 1, 0, 0, 0, 0, 0, 0]","[82, 46]"
3,0x5ecd84482176db90bb741ddc8c2f9ccc290e29ce,pragma solidity ^0.4.8;\ncontract Token{\n ...,"[0, 1, 0, 0, 0, 0, 1, 0]","[99, 38, 78, 48, 49, 60, 93]"
4,0xcaf187eb618d2335b4130d784a697be96f4b07b9,pragma solidity 0.4.15;\n\ncontract RegistryIC...,"[0, 0, 0, 0, 0, 0, 0, 1]","[80, 28, 63]"
...,...,...,...,...
47326,0x7f650f3b231d3a32c2b0e2940e870acdd4aa9961,pragma solidity ^0.4.24;\n\n\n\n\n/**\n * @tit...,"[0, 1, 0, 1, 1, 1, 1, 0]","[258, 264, 245, 253, 287]"
47327,0xc73f2474001ad1d6aed615af53631148cf98de6b,pragma solidity ^0.4.18;\n\n// ---------------...,"[0, 1, 0, 0, 0, 0, 0, 1]","[200, 218, 128, 102]"
47328,0xb2105f178abe620ddbb86a70afa94bfa9daa4d01,pragma solidity ^0.4.23;\n\nlibrary SafeMath {...,"[0, 1, 0, 1, 0, 0, 0, 0]","[124, 20, 205]"
47329,0xe2d4b960d0c639633582cddef57528461e62083d,pragma solidity ^0.4.23;\n\n\n/// @title Multi...,"[0, 1, 1, 0, 0, 0, 1, 1]","[140, 142, 23, 282, 410, 157, 287, 296, 302, 3..."


In [101]:
# Strip // and /* */ comments from every contract's source text.
# NOTE(review): block comments are removed together with the newlines inside
# them, so if `lines` holds line numbers of the original source they may no
# longer line up with the stripped text — confirm before using them together.
data["source_code"] = data["source_code"].apply(remove_comments)

In [102]:
# Display the table after comment stripping.
data

Unnamed: 0,address,source_code,categories,lines
0,0x21e13cb3f3f26f92a62ac7adab4093e8997d1fb1,pragma solidity ^0.4.18;\n\n\n\n\n\n\n\n\n\n\n...,"[0, 1, 0, 0, 0, 0, 0, 1]","[154, 141, 103]"
1,0x344005c29af957567f0b40950b425ed018b92170,pragma solidity ^0.4.21;\n\n\ncontract DSMath...,"[0, 1, 0, 0, 1, 0, 0, 0]","[201, 16, 209, 176, 146, 154, 29]"
2,0x44e320110176c11c93e116f6770f13d96deded43,pragma solidity ^0.4.20;\n\n\ncontract Ownable...,"[0, 1, 0, 0, 0, 0, 0, 0]","[82, 46]"
3,0x5ecd84482176db90bb741ddc8c2f9ccc290e29ce,pragma solidity ^0.4.8;\ncontract Token{\n ...,"[0, 1, 0, 0, 0, 0, 1, 0]","[99, 38, 78, 48, 49, 60, 93]"
4,0xcaf187eb618d2335b4130d784a697be96f4b07b9,pragma solidity 0.4.15;\n\ncontract RegistryIC...,"[0, 0, 0, 0, 0, 0, 0, 1]","[80, 28, 63]"
...,...,...,...,...
47326,0x7f650f3b231d3a32c2b0e2940e870acdd4aa9961,pragma solidity ^0.4.24;\n\n\n\n\n\ncontract E...,"[0, 1, 0, 1, 1, 1, 1, 0]","[258, 264, 245, 253, 287]"
47327,0xc73f2474001ad1d6aed615af53631148cf98de6b,pragma solidity ^0.4.18;\n\n\n\n\n\n\n\n\n\n\n...,"[0, 1, 0, 0, 0, 0, 0, 1]","[200, 218, 128, 102]"
47328,0xb2105f178abe620ddbb86a70afa94bfa9daa4d01,pragma solidity ^0.4.23;\n\nlibrary SafeMath {...,"[0, 1, 0, 1, 0, 0, 0, 0]","[124, 20, 205]"
47329,0xe2d4b960d0c639633582cddef57528461e62083d,pragma solidity ^0.4.23;\n\n\n\n\ncontract Mul...,"[0, 1, 1, 0, 0, 0, 1, 1]","[140, 142, 23, 282, 410, 157, 287, 296, 302, 3..."
