In [3]:
import json
import numpy as np
from web3 import Web3, HTTPProvider
from pyevmasm import disassemble_hex
from dotenv import load_dotenv
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import pandas as pd
# Pull the settings stored in the local .env file into the process environment,
# then read the node endpoint that the connection cell hands to HTTPProvider.
load_dotenv()
FAST_NODE_URL = os.environ.get('FAST_NODE_URL')

In [4]:
# Read both address lists from disk: the validated contracts come from a JSON
# file, the malicious ones from a NumPy array file.
# NOTE(review): the JSON filename is spelled 'valdiated' - presumably this
# matches the file on disk; verify before renaming anything.
with open('data/valdiated_addresses.json', 'r') as address_file:
    valid_addresses = json.loads(address_file.read())

# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which
# can execute code; acceptable only if this .npy file is a trusted local artifact.
malicious_addresses = np.load('data/malicious_addresses.npy', allow_pickle=True)

In [5]:
# Display the loaded malicious addresses as a sanity check on the file contents.
# NOTE(review): this echoes the whole array; a slice such as [:5] would keep
# the cell output short.
malicious_addresses

array(['0x000014688fd28b29b761cc1ec77b532bc923d400',
       '0x000875cd6125a5c2c5bbb17791e690df9a6d6000',
       '0x026d440e742e2e4e94ce0651ea7215e181652c68',
       '0x0332d00ae8e9baa609edc48844f48fdd94ca9547',
       '0x124a743ef8c391ea2e3cf8271c3050dee045d9b9',
       '0x142eaba1ace3649afbb2eb0fe2133a0caf984e77',
       '0x14f1495e78fdf6733e4f340798a5822daf7ba3af',
       '0x1e891e6c7ea7a7c32d4b9643b90b8a9fa313c77f',
       '0x263855ebfe85a1532ffd3ef21dee111378be8514',
       '0x2ceee24f8d03fc25648c68c8e6569aa0512f6ac3',
       '0x2e46bee733305b85ba05b3fb8ecf322a635a7ba3',
       '0x318d51fa877a79d9696d001d061df1aaf386a1ff',
       '0x34075d8c3a64cb686603b2879e0b19c3253ecb21',
       '0x35e93c60b9f0b9d85933e48434ba9ccd970c7a4e',
       '0x377e8885df58d0a1d03f64c4eb317793559e3f26',
       '0x39ea2b4cbd087f7c2abb5fa2ab2416a24f2b4a9f',
       '0x416bf8ffff8e99408abbfb35536d89cd0adaa764',
       '0x4504bfbf4ae479f179453db2f5b65abb9ccd5502',
       '0x57b818a1070373e21fcedf48d4368e1703c7

In [6]:
# Build the web3 client on top of an HTTP provider pointed at FAST_NODE_URL;
# the trailing expression shows whether the node is reachable.
w3 = Web3(provider=HTTPProvider(endpoint_uri=FAST_NODE_URL))
w3.is_connected()

True

In [7]:
# Work on a subset: the first 2000 validated entries, and every malicious address.
short_valid = valid_addresses[:2000]
short_mal = malicious_addresses

# function to get bytecode and parse to opcodes
def get_opcodes(address):
    """Fetch the code deployed at ``address`` and return its opcode mnemonics.

    Args:
        address: Account address in any form accepted by
            ``w3.to_checksum_address``.

    Returns:
        A single string holding the first whitespace-separated token of every
        line produced by ``disassemble_hex``, joined by single spaces. An
        empty string is returned when the node reports no code at the address.

    Raises:
        Whatever ``w3.to_checksum_address``, ``w3.eth.get_code`` or
        ``disassemble_hex`` raise (invalid address, RPC failure, ...); nothing
        is caught here.
    """
    check_address = w3.to_checksum_address(address)
    # bytes.hex() is never '0x'-prefixed. HexBytes.hex() is prefixed in some
    # hexbytes releases and unprefixed in others, so slicing off two characters
    # unconditionally would drop the first byte of code on the unprefixed ones.
    bytecode = bytes(w3.eth.get_code(check_address)).hex()
    if not bytecode:
        # No code at this address: there is nothing to disassemble.
        return ''
    listing = disassemble_hex(bytecode)
    # Keep only the leading token of each line (the mnemonic), dropping operands.
    return ' '.join(line.split(' ')[0] for line in listing.split('\n'))

# Disassemble every contract in both samples. Entries of short_valid are
# indexed by their 'address' key; entries of short_mal are passed as-is.
valid_opcodes = [get_opcodes(entry['address']) for entry in short_valid]
malicious_opcodes = list(map(get_opcodes, short_mal))

# Class labels, one per opcode string: 0 marks a valid contract, 1 a malicious one.
valid_labels = [0 for _ in valid_opcodes]
malicious_labels = [1 for _ in malicious_opcodes]

# Concatenate both samples, valid first, keeping opcodes and labels aligned.
all_opcodes = [*valid_opcodes, *malicious_opcodes]
all_labels = [*valid_labels, *malicious_labels]

KeyboardInterrupt: 

In [None]:
# Assemble the labelled dataset, discard rows whose opcode string is missing or
# empty, and write the result to CSV.
# NOTE(review): reset_index() without drop=True keeps the pre-filter row numbers
# as an 'index' column, and to_csv() also writes the new index, so the CSV
# carries both - confirm downstream readers expect that layout.
df = pd.DataFrame({'Opcodes': all_opcodes, 'Label': all_labels})
df = df.loc[df['Opcodes'].notnull() & df['Opcodes'].str.len().gt(0)].reset_index()
df.to_csv('./data/small_dataset.csv')