In [1]:
import csv
import pandas as pd

# Opcode lookup tables
def get_opcodes(opcodes_file):
    """Load the opcode tables from a tab-separated text file.

    Every non-blank line must hold at least three tab-separated fields:
    the opcode byte as hex text, its mnemonic, and an integer length.

    Args:
        opcodes_file: Path of the opcode table file (read as UTF-8).

    Returns:
        A tuple ``(opcodes, opcode_length)`` of two dicts keyed by the
        lower-cased hex opcode: the first maps to the mnemonic, the second
        to the integer length.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-blank line has fewer than three fields.
        ValueError: If the third field is not an integer.
    """
    opcodes = {}
    opcode_length = {}
    # Open the path the caller passed in; it used to be hard-coded to
    # 'opcode.txt', which silently ignored the argument.
    with open(opcodes_file, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # tolerate blank lines such as a trailing newline at EOF
            fields = line.replace('\n', '').split('\t')
            key = fields[0].lower()
            opcodes[key] = fields[1]
            opcode_length[key] = int(fields[2])
    return opcodes, opcode_length


# Collect contract addresses together with their bytecode
def get_address_bytecode(contracts_file):
    """Read a contracts CSV and return its addresses and bytecodes.

    The CSV needs ``address`` and ``bytecode`` columns. Rows whose bytecode
    is exactly ``'0x'`` are skipped; for the remaining rows every ``'0x'``
    occurrence is removed from the bytecode text.

    Args:
        contracts_file: Path of the CSV file.

    Returns:
        A tuple ``(address, bytecodes)`` of two parallel lists.
    """
    with open(contracts_file, 'r') as handle:
        deployed = [
            (record['address'], record['bytecode'].replace('0x', ''))
            for record in csv.DictReader(handle)
            if record['bytecode'] != '0x'
        ]
    address = [contract_address for contract_address, _ in deployed]
    bytecodes = [code for _, code in deployed]
    return address, bytecodes


# Load contract bytecode together with its labels
def get_labels_bytecode(contracts_file):
    """Read a labelled contracts CSV and return de-duplicated bytecodes with labels.

    The bytecode is taken from the second CSV column and the labels from the
    third; the third column must be named ``type`` and hold the text of a
    Python list of label strings. Bytecode is normalised by removing ``'0x'``
    and lower-casing; empty bytecode and bytecode already seen are skipped.
    Dots are removed from every label.

    Args:
        contracts_file: Path of the CSV file.

    Returns:
        A tuple ``(labels, bytecodes)`` of two parallel lists. Each entry of
        ``labels`` is a list of distinct labels in first-seen order.
    """
    labels = []
    bytecodes = []
    seen = set()  # O(1) duplicate check instead of scanning the result list

    labels_bytecode_df = pd.read_csv(contracts_file)
    # SECURITY NOTE: eval() executes whatever expression the CSV cell contains.
    # Only use this on trusted files; ast.literal_eval is the safe choice if
    # the column only ever holds list literals.
    labels_bytecode_df['type'] = labels_bytecode_df['type'].apply(eval)
    for row in labels_bytecode_df.itertuples():
        # row[0] is the index, so row[2] / row[3] are the 2nd / 3rd columns.
        bytecode = row[2].replace('0x', '').lower()
        swc_id = [item.replace('.', '') for item in row[3]]
        if bytecode == '' or bytecode in seen:
            continue
        seen.add(bytecode)
        # dict.fromkeys removes repeats while keeping a reproducible order
        # (a set's order varies between runs because of hash randomisation).
        labels.append(list(dict.fromkeys(swc_id)))
        bytecodes.append(bytecode)

    return labels, bytecodes


# Parsed opcode tables, keyed by table path, so the file is read only once
# per kernel session instead of once per contract. Re-run this cell to drop
# the cache after editing the table file.
_OPCODE_TABLE_CACHE = {}

# (first byte, last byte, table key) for opcode families that are collapsed
# onto a single representative entry.
_MERGED_OPCODE_RANGES = (
    (0x60, 0x7f, '60'),  # PUSH1..PUSH32
    (0x80, 0x8f, '80'),  # DUP1..DUP16
    (0x90, 0x9f, '90'),  # SWAP1..SWAP16
    (0xa0, 0xa4, 'a0'),  # LOG0..LOG4
    (0xf0, 0xf0, 'f0'),  # CREATE
    (0xf5, 0xf5, 'f0'),  # CREATE2
    (0x56, 0x57, '56'),  # JUMP, JUMPI
)


def _load_opcode_tables(opcodes_file):
    """Return the (opcodes, opcode_length) tables, parsing the file on first use only."""
    if opcodes_file not in _OPCODE_TABLE_CACHE:
        _OPCODE_TABLE_CACHE[opcodes_file] = get_opcodes(opcodes_file)
    return _OPCODE_TABLE_CACHE[opcodes_file]


# Convert bytecode into opcodes
def bytecode_to_opcodes(bytecode, opcodes=None, opcode_length=None):
    """Translate a hex bytecode string into a list of opcode mnemonics.

    The bytecode is scanned two hex characters (one byte) at a time. A byte
    found in the opcode table is emitted as a mnemonic, after which the scan
    skips ``opcode_length[byte]`` bytes in total, so operand bytes are not
    decoded. A byte missing from the table is emitted as ``'XX'`` and the
    scan advances by one byte.

    Opcode families are collapsed onto one table entry:
        PUSH1~PUSH32 (0x60-0x7f) => entry for 0x60
        DUP1~DUP16   (0x80-0x8f) => entry for 0x80
        SWAP1~SWAP16 (0x90-0x9f) => entry for 0x90
        LOG0~LOG4    (0xa0-0xa4) => entry for 0xa0
        CREATE, CREATE2 (0xf0, 0xf5) => entry for 0xf0
        JUMP, JUMPI  (0x56, 0x57) => entry for 0x56

    Args:
        bytecode: Hex text without a ``0x`` prefix; upper- and lower-case
            digits are both accepted.
        opcodes: Optional dict mapping lower-case hex opcode to mnemonic.
        opcode_length: Optional dict mapping lower-case hex opcode to the
            number of bytes the instruction occupies. When either table is
            omitted, both are loaded (once) from ``'opcode.txt'``.

    Returns:
        The list of mnemonics, in bytecode order.

    Raises:
        KeyError: If a collapsed family's representative entry is missing
            from the table.
    """
    if opcodes is None or opcode_length is None:
        opcodes, opcode_length = _load_opcode_tables('opcode.txt')

    # Table keys are lower-case, so normalise the input; otherwise bytes such
    # as 'A0' would be reported as unknown.
    bytecode = bytecode.lower()

    opcodes_list = []
    i = 0
    while i < len(bytecode):
        opcode = bytecode[i:i + 2]
        if opcode not in opcodes:
            opcodes_list.append('XX')  # byte not present in the table
            i += 2
            continue

        value = int(opcode, 16)
        key = opcode
        for first, last, representative in _MERGED_OPCODE_RANGES:
            if first <= value <= last:
                key = representative
                break
        opcodes_list.append(opcodes[key])

        # Always advance by at least one byte so a zero or negative length in
        # the table cannot stall the scan.
        i += max(opcode_length[opcode], 1) * 2
    return opcodes_list


## dataset

In [7]:
# Quick check: parse labels.txt into one list of comma-separated names per line
with open('labels.txt', 'r') as label_file:
    data = [raw_line.replace('\n', '').split(',') for raw_line in label_file]
data

[['ARTHM', 'Arithmetic', 'Integer Overflow', 'Integer Underflow'],
 ['CDAV', 'Callstack Depth Attack Vulnerability'],
 ['DOS', 'Denial of Service'],
 ['LE', 'Locked Ether'],
 ['RENT', 'Reentrancy', 'Re-Entrancy Vulnerability'],
 ['TimeM', 'Time Manipulation'],
 ['TimeO',
  'Timestamp Ordering',
  'Transaction-Ordering Dependency',
  'Timestamp Dependency'],
 ['Tx-Origin', 'Authorization through tx.origin'],
 ['UE',
  'Unhandled Exception',
  'Unchecked Call Return Value',
  'Unchecked_External_Call']]

In [None]:
# 整理数据集 train.csv,vul.csv,vuls.csv
# 统一标签类型

def get_labels(labels_file):
    """Load the label groups from a comma-separated text file.

    Each non-blank line becomes one list of names; in this notebook the first
    name of a line is used as the canonical code for the rest.

    Args:
        labels_file: Path of the labels file.

    Returns:
        A list of lists of strings, one inner list per non-blank line.
    """
    # Open the path the caller passed in; it used to be hard-coded to
    # 'labels.txt', which silently ignored the argument.
    with open(labels_file, 'r') as file:
        # Blank lines are skipped: they would otherwise yield a [''] group.
        return [line.replace('\n', '').split(',') for line in file if line.strip()]
    

def _canonical_label(raw_label, labels_names_list):
    """Return the first name of the label group containing ``raw_label`` (dots removed), or None."""
    cleaned = raw_label.replace('.', '')
    for label_names in labels_names_list:
        if cleaned in label_names:
            return label_names[0]
    return None


def _append_labelled_bytecodes(contracts_file, labels_names_list, labels, bytecodes, seen,
                               labels_are_literals, show_first_column=False):
    """Append the new (labels, bytecode) pairs of one CSV file to the shared result lists."""
    labels_bytecode_df = pd.read_csv(contracts_file)
    if labels_are_literals:
        # SECURITY NOTE: eval() executes whatever expression the CSV cell
        # contains. Only use this on trusted files; ast.literal_eval is the
        # safe choice if the column only ever holds list literals.
        labels_bytecode_df['type'] = labels_bytecode_df['type'].apply(eval)
    for row in labels_bytecode_df.itertuples():
        # row[0] is the index, so row[2] / row[3] are the 2nd / 3rd columns.
        bytecode = row[2].replace('0x', '').lower()
        raw_labels = row[3] if labels_are_literals else row[3].split(',')
        swc_ids = []
        for raw_label in raw_labels:
            code = _canonical_label(raw_label, labels_names_list)
            if code is not None:
                swc_ids.append(code)
        # NOTE(review): the original also tested `swc_ids is not None`, which
        # is always true for a list, so rows whose labels match no group are
        # kept with an empty label list. Confirm whether they should be dropped.
        if bytecode != '' and bytecode not in seen:
            seen.add(bytecode)
            # dict.fromkeys removes repeats while keeping a reproducible order
            # (a set's order varies between runs because of hash randomisation).
            labels.append(list(dict.fromkeys(swc_ids)))
            bytecodes.append(bytecode)
        elif show_first_column:
            print('dup data: ',row[1],bytecode[:100],swc_ids)
        else:
            print('dup data: ',bytecode[:100],swc_ids)
    print('dataset size: ',len(labels),len(bytecodes))


# Load contract bytecode together with its labels
def get_labels_bytecode(contracts_file_1,contracts_file_2,contracts_file_3):
    """Merge three labelled contract CSV files into de-duplicated bytecodes and labels.

    In every file the bytecode is read from the second column and the labels
    from the third. Files 1 and 3 must name the label column ``type`` and
    store the text of a Python list in it; file 2 stores a comma-separated
    string. Each raw label (dots removed) is mapped to the first name of the
    group in ``labels.txt`` that contains it; labels found in no group are
    dropped. Bytecode is normalised by removing ``'0x'`` and lower-casing.
    Rows with empty or already-seen bytecode are skipped and printed as
    ``dup data``. A running ``dataset size`` line is printed after each file.

    Args:
        contracts_file_1: CSV path whose ``type`` column holds list literals.
        contracts_file_2: CSV path whose third column holds comma-separated labels.
        contracts_file_3: CSV path whose ``type`` column holds list literals.

    Returns:
        A tuple ``(labels, bytecodes)`` of two parallel lists. Each entry of
        ``labels`` is a list of distinct canonical codes in first-seen order.
    """
    labels = []
    bytecodes = []
    seen = set()  # O(1) duplicate check instead of scanning the result list
    labels_names_list = get_labels('labels.txt')

    _append_labelled_bytecodes(contracts_file_1, labels_names_list, labels, bytecodes, seen,
                               labels_are_literals=True, show_first_column=True)
    _append_labelled_bytecodes(contracts_file_2, labels_names_list, labels, bytecodes, seen,
                               labels_are_literals=False)
    _append_labelled_bytecodes(contracts_file_3, labels_names_list, labels, bytecodes, seen,
                               labels_are_literals=True)

    return labels,bytecodes


# Full dataset
import os

contracts_file_1 = 'train.csv'
contracts_file_2 = 'vul.csv'
contracts_file_3 = 'vuls.csv'
labels,bytecodes = get_labels_bytecode(contracts_file_1,contracts_file_2,contracts_file_3)
print('dataset size: ',len(labels),len(bytecodes))

# Build the DataFrame in a single step; assigning df.loc[i] row by row
# re-allocates the frame on every insert and is very slow for thousands of rows.
columns = ['bytecode','swc_id','opcodes']
df = pd.DataFrame(
    {
        'bytecode': bytecodes,
        'swc_id': labels,
        'opcodes': [' '.join(bytecode_to_opcodes(code)) for code in bytecodes],
    },
    columns=columns,
)


# Write the processed dataset to CSV; to_csv does not create missing
# directories, so make sure the target folder exists first.
os.makedirs('./new-dataset', exist_ok=True)
df.to_csv('./new-dataset/new-dataset.csv', encoding='utf-8')
print('done')

df

In [19]:
# Load the complete dataset written by the previous step
import pandas as pd
cur_df = pd.read_csv('./new-dataset/new-dataset.csv')  # ,delimiter='\t')
# The saved index comes back as an 'Unnamed: 0' column; discard it.
cur_df = cur_df.drop(columns=['Unnamed: 0'])
# The label lists were stored as text; turn them back into Python lists.
cur_df['swc_id'] = cur_df['swc_id'].apply(eval)

# Report any rows whose bytecode is NaN
missing_bytecode = cur_df[cur_df['bytecode'].isna()]
for i, row in missing_bytecode.iterrows():
    print(i,row)

cur_df

Unnamed: 0,bytecode,swc_id,opcodes
0,6080604052600436106100615763ffffffff7c01000000...,"[TimeO, ARTHM]",PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH1...
1,6080604052600436106100775763ffffffff7c01000000...,"[TimeO, ARTHM]",PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH1...
2,60606040526004361061006c5763ffffffff7c01000000...,"[TimeO, ARTHM]",PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH1...
3,60606040526004361061030b576000357c010000000000...,[ARTHM],PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH1...
4,6060604052600436106101275763ffffffff7c01000000...,"[TimeO, ARTHM]",PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH1...
...,...,...,...
10888,6080604052600436106100f05763ffffffff7c01000000...,"[DOS, ARTHM]",PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH1...
10889,6080604052600436106100985763ffffffff7c01000000...,[ARTHM],PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH1...
10890,606060405236156100ce576000357c0100000000000000...,[ARTHM],PUSH1 PUSH1 MSTORE CALLDATASIZE ISZERO PUSH1 J...
10891,6060604052600436106100fc576000357c010000000000...,[ARTHM],PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH1...


In [20]:
# List the distinct label types in order of first appearance
res = cur_df['swc_id']
all_labels = list(
    dict.fromkeys(label for row_labels in res for label in row_labels)
)
all_labels

['TimeO', 'ARTHM', 'RENT', 'CDAV', 'UE', 'LE', 'TimeM', 'DOS']

In [None]:
# Count how often each label occurs.
# explode() gives one row per label, so value_counts() matches labels exactly;
# counting substrings of the stringified lists would over-count as soon as one
# label name is contained in another.
label_counts = cur_df['swc_id'].explode().value_counts()
count_TimeO = label_counts.get('TimeO', 0)
count_ARTHM = label_counts.get('ARTHM', 0)
count_RENT = label_counts.get('RENT', 0)
count_CDAV = label_counts.get('CDAV', 0)
count_UE = label_counts.get('UE', 0)
count_LE = label_counts.get('LE', 0)
count_TimeM = label_counts.get('TimeM', 0)
count_DOS = label_counts.get('DOS', 0)

In [None]:
# Draw bar charts of the label distribution
import matplotlib.pyplot as plt
import pandas as pd

# Count from cur_df (the loaded dataset) with exact label matching.
label_counts = cur_df['swc_id'].explode().value_counts()

# One bar position per label, with the values in the same order as the tick
# labels. The original used 10 positions for 8 values and reversed only the
# values, so the bars could not be drawn and would have been mislabelled.
x = list(range(len(all_labels)))
y = [int(label_counts.get(a, 0)) for a in all_labels]
print(x,y)

plt.figure(figsize=(10,8))
plt.barh(x, y, align='center', color='c', tick_label=all_labels, hatch='', height=0.5)
plt.xlabel("")
plt.ylabel("")

# plt.show displays the figure
plt.show()

# Vertical bar chart with the count written above each bar. A separate name is
# used so the dataset frame `df` is not overwritten.
counts_df = pd.DataFrame({'data': y}, index=all_labels)
ax = counts_df.plot(kind='bar', color='green')

for i, val in enumerate(counts_df['data'].values):
    ax.text(i, val, int(val), horizontalalignment='center', verticalalignment='bottom')

ax.set(title='Bar chart with Values Shown on Top', xlabel='Categories', ylabel='Values')
plt.show()


In [None]:
# Draw a pie chart of the label distribution
kinds = all_labels
colors = ['#e41a1c', '#377eb8', '#4daf4a', '#984ea3','#e41a1c', '#377eb8', '#4daf4a', '#984ea3','#e41a1c', '#377eb8']
# Count from cur_df with exact label matching, in the same order as `kinds`,
# so every slice carries its own label (the original reversed only the values).
label_counts = cur_df['swc_id'].explode().value_counts()
sold_nums = [int(label_counts.get(a, 0)) for a in kinds]
plt.pie(x=sold_nums,
        labels=kinds,
        autopct="%3.1f%%",
        startangle=60,
        colors=colors)
plt.show()

In [None]:
# 列出重复元素
from collections import Counter
 
def find_duplicates(lst):
    """Return the distinct items that occur more than once in ``lst``.

    Args:
        lst: An iterable of hashable items.

    Returns:
        A list of the repeated items, in order of first appearance.
    """
    # Tally every item
    counter = Counter(lst)
    # Keep the items (not their counts) that were seen more than once
    return [item for item, count in counter.items() if count > 1]
 
# Each element of `res` is a list, which Counter cannot hash; compare the
# label combinations as sorted tuples so that element order does not matter.
duplicates = find_duplicates([tuple(sorted(row_labels)) for row_labels in res])
print(duplicates)
print(len(duplicates))

In [None]:
# 读取parquet格式文件
import pyarrow.parquet as pq

parquet_file = pq.ParquetFile('0000.parquet')
data = parquet_file.read().to_pandas()
# Preview only the first two bytecode entries, each followed by blank lines
for line in data['bytecode'].iloc[:2]:
    print(line)
    print('\n')

In [None]:
from google.cloud import bigquery
from google.oauth2 import service_account
bytecode = []  # bytecode values fetched from BigQuery
swc_id = []  # label values fetched from BigQuery


def retrive_dataset(credentials_path='citric-cistern-420507-81f74d5b1bfa.json',
                    project_id='citric-cistern-420507',
                    max_rows=1):
    """Query the BigQuery label table and append rows to the module-level lists.

    Runs ``SELECT bytecode,swc_id`` against ``blockchain.label``, appends the
    first column of each fetched row to ``bytecode`` and the second to
    ``swc_id``, then prints both lists.

    Args:
        credentials_path: Path of the service-account JSON key file.
            NOTE(review): the default keeps the key-file name that used to be
            hard-coded in the body; prefer passing the path in (for example
            from an environment variable) and keep the key file out of
            version control.
        project_id: Google Cloud project that runs the query.
        max_rows: Stop after this many rows. The default of 1 preserves the
            original behaviour, which broke out of the loop after the first
            row. Pass ``None`` to read every row.
    """
    credentials = service_account.Credentials.from_service_account_file(credentials_path)
    client = bigquery.Client(credentials=credentials, project=project_id)

    query_job = client.query("""
        SELECT bytecode,swc_id FROM `blockchain.label` 
        """)
    results = query_job.result()
    fetched = 0
    for row in results:
        bytecode.append(row[0])
        swc_id.append(row[1])
        fetched += 1
        if max_rows is not None and fetched >= max_rows:
            break
    print(bytecode,swc_id)

retrive_dataset()