In [15]:
import json
import numpy as np
from web3 import Web3, HTTPProvider
from pyevmasm import disassemble_hex
from dotenv import load_dotenv
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import pandas as pd
# Pull settings from the local .env file into the process environment,
# then read the node endpoint (None when the variable is not defined).
load_dotenv()
FAST_NODE_URL = os.environ.get('FAST_NODE_URL')

In [16]:
# Load addresses from the JSON and NumPy files.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
with open('data/valdiated_addresses.json', 'r', encoding='utf-8') as f:
    valid_addresses = json.load(f)

# NOTE(security): allow_pickle=True lets np.load unpickle arbitrary objects,
# which can execute code embedded in the file. Only load this file from a
# trusted source.
malicious_addresses = np.load('data/malicious_addresses.npy', allow_pickle=True)

In [17]:
# Build an HTTP-backed Web3 client for the configured node URL.
provider = HTTPProvider(FAST_NODE_URL)
w3 = Web3(provider)
# Last expression of the cell, so the connectivity check is shown as output.
w3.is_connected()

True

In [18]:
# Working subsets: every malicious address, and the first 2000 entries of
# the validated list.
short_mal = malicious_addresses
short_valid = valid_addresses[:2000]

# function to get bytecode and parse to opcodes
def get_opcodes(address):
    """Fetch the code stored at ``address`` and return its opcode mnemonics.

    Parameters
    ----------
    address : str
        Account address; it is converted with ``w3.to_checksum_address``
        before the node is queried.

    Returns
    -------
    str
        The first space-separated token of every line returned by
        ``disassemble_hex``, joined with single spaces.

    Notes
    -----
    Uses the module-level ``w3`` client and performs one ``eth.get_code``
    request per call.
    """
    check_address = w3.to_checksum_address(address)
    # Convert through bytes() instead of slicing ``.hex()[2:]``: whether the
    # value returned by get_code renders a leading "0x" in .hex() depends on
    # the installed hexbytes release, and slicing an unprefixed string would
    # silently drop the first byte of the bytecode. bytes.hex() never has a
    # prefix.
    bytecode = bytes(w3.eth.get_code(check_address)).hex()
    listing = disassemble_hex(bytecode)
    # Keep only the leading token of each disassembly line (operands dropped).
    return ' '.join(line.split(' ', 1)[0] for line in listing.split('\n'))

# Disassemble every contract. Validated entries are indexed by 'address';
# malicious entries are passed to get_opcodes directly.
valid_opcodes = [get_opcodes(entry['address']) for entry in short_valid]
malicious_opcodes = [get_opcodes(entry) for entry in short_mal]

# One label per contract: 0 for valid, 1 for malicious.
valid_labels = [0 for _ in valid_opcodes]
malicious_labels = [1 for _ in malicious_opcodes]

# Concatenate both classes, valid first, keeping opcodes and labels aligned.
all_opcodes = [*valid_opcodes, *malicious_opcodes]
all_labels = [*valid_labels, *malicious_labels]

In [29]:
# Assemble the labelled dataset, one row per contract.
df = pd.DataFrame({'Opcodes': all_opcodes, 'Label': all_labels})

# Keep only rows whose opcode string is present and non-empty.
has_opcodes = df['Opcodes'].notnull() & (df['Opcodes'].str.len() > 0)

# reset_index() without drop=True keeps the pre-filter row number as an
# 'index' column, which is written to the CSV along with the new index.
df = df[has_opcodes].reset_index()
df.to_csv('./data/small_dataset.csv')