In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm 
from time import time

In [2]:
# read data.
ponzi_df = pd.read_csv("ponzi_transactions.csv")
legit_df = pd.read_csv("legit_transactions.csv")

In [3]:
print("ponzi:")
print(ponzi_df.isna().sum(), end="\n\n")
print("legit:")
print(legit_df.isna().sum())

ponzi:
Unnamed: 0      0
blockNumber     0
from            0
to              0
gasLimit       13
value           0
dtype: int64

legit:
Unnamed: 0.1          0
Unnamed: 0            0
blockNumber           0
from                  0
to                  120
gasLimit        2930711
value                 0
dtype: int64


In [4]:
# drop nan rows.
ponzi_df = ponzi_df.dropna()
legit_df = legit_df.dropna()

In [5]:
print("ponzi:")
print(ponzi_df.isna().sum(), end="\n\n")
print("legit:")
print(legit_df.isna().sum())

ponzi:
Unnamed: 0     0
blockNumber    0
from           0
to             0
gasLimit       0
value          0
dtype: int64

legit:
Unnamed: 0.1    0
Unnamed: 0      0
blockNumber     0
from            0
to              0
gasLimit        0
value           0
dtype: int64


In [6]:
print("ponzi:")
print(ponzi_df.dtypes, end="\n\n")
print("legit:")
print(legit_df.dtypes)

ponzi:
Unnamed: 0       int64
blockNumber      int64
from            object
to              object
gasLimit       float64
value           object
dtype: object

legit:
Unnamed: 0.1      int64
Unnamed: 0        int64
blockNumber       int64
from             object
to               object
gasLimit        float64
value            object
dtype: object


In [7]:
# covert value to compatiable form. 
ponzi_df["value"] = ponzi_df["value"].apply(lambda v: float(v.split(".")[0]) / 1e18)
ponzi_df["gasLimit"] = ponzi_df["gasLimit"].apply(lambda v: int(v))

legit_df["value"] = legit_df["value"].apply(lambda v: float(v.split(".")[0]) / 1e18)
legit_df["gasLimit"] = legit_df["gasLimit"].apply(lambda v: int(v))

In [8]:
print("ponzi:")
print(ponzi_df.dtypes, end="\n\n")
print("legit:")
print(legit_df.dtypes)

ponzi:
Unnamed: 0       int64
blockNumber      int64
from            object
to              object
gasLimit         int64
value          float64
dtype: object

legit:
Unnamed: 0.1      int64
Unnamed: 0        int64
blockNumber       int64
from             object
to               object
gasLimit          int64
value           float64
dtype: object


In [9]:
def BENFORD(n, B):
    def ben(d):
        if n == 1:
            return 1/np.log(B) * np.log(1 + 1/d)/np.log(10)
        return 1/np.log(B) * np.sum([np.log(1 + 1/(k*B+d))/np.log(10) for k in range(B**(n-2), B**(n-1)-1)])
    return ben

In [10]:
def address_to_index_map(from_sorted, to_sorted):
    addr_idx_map = {}
    
    curr_from = from_sorted[0]
    curr_to = to_sorted[0]
    
    addr_idx_map[curr_from] = {"from_start": 0, "from_end": -1, "to_start": -1, "to_end":-1}
    addr_idx_map[curr_to] = {"from_start": -1, "from_end": -1, "to_start": 0, "to_end":-1}
    
    for i, (f, t) in enumerate(zip(from_sorted[1:], to_sorted[1:])):
        idx = i+1
        if not f in addr_idx_map:
            addr_idx_map[curr_from]["from_end"] = idx
            curr_from = f
            addr_idx_map[curr_from] = {"from_start": idx, "from_end": -1, "to_start": -1, "to_end":-1}
        elif f != curr_from:
            addr_idx_map[curr_from]["from_end"] = idx
            curr_from = f
            addr_idx_map[curr_from]["from_start"] = idx
        
        if not t in addr_idx_map:
            addr_idx_map[curr_to]["to_end"] = idx
            curr_to = t
            addr_idx_map[curr_to] = {"from_start": -1, "from_end": -1, "to_start": idx, "to_end":-1}
        elif t != curr_to:
            addr_idx_map[curr_to]["to_end"] = idx
            curr_to = t
            addr_idx_map[curr_to]["to_start"] = idx
    
    addr_idx_map[curr_from]["from_end"] = len(from_sorted)
    addr_idx_map[curr_to]["to_end"] = len(to_sorted)

    return addr_idx_map

def create_feature_dataframe(data_df, label):
    def benfords(values, idx):
        idx = idx-1
        def pull_val(value):
            i = idx
            val_str = str(value)
            while val_str[i] in ('0','x'):
                i += 1
            i += idx # Offset to the idx-index from beginning
            return val_str[i]

        counts = {i:0 for i in ['1','2','3','4','5','6','7','8','9','a','b','c','d','e','f']}
        total = 0
        for value in values:
            total += 1
            # Catch 0x0...0
            if len(set(value.replace('x',''))) == 1:
                continue
            v = pull_val(value)
            if v == '0':
                continue
            counts[pull_val(value)] += 1
        return [v/total for _,v in counts.items()]

    def chi_square(counted, expected):
        stat = 0
        for count, expectation in zip(counted, expected):
            stat += (count - expectation)**2 / expectation
        return stat

    bens_first = [BENFORD(1, 16)(d) for d in range(1,16)]
    bens_second = [BENFORD(2, 16)(d) for d in range(1,16)]
    from_sorted_df = data_df.sort_values(by=['from'], ignore_index=True)
    to_sorted_df = data_df.sort_values(by=['to'], ignore_index=True)
    
    ai_map = address_to_index_map(from_sorted_df["from"], to_sorted_df["to"])
    addresses = list(ai_map.keys())
    
    feature_dict = {
        "address": addresses,
        "label": np.full(len(addresses), label),
        "total_count": [],
        "in_count": [],
        "in_unique": [],
        "in_value_avg": [],
        "in_value_med": [],
        "in_value_std": [],
        "in_gas_limit_avg": [],
        "in_gas_limit_med": [],
        "in_gas_limit_std": [],
        "from_benfords_first": [],
        "from_benfords_second": [],
        "out_count": [],
        "out_unique": [],
        "out_value_avg": [],
        "out_value_med": [],
        "out_value_std": [],
        "out_gas_limit_avg": [],
        "out_gas_limit_med": [],
        "out_gas_limit_std": [],
        "to_benfords_first": [],
        "to_benfords_second": [],
        "merged_benfords_first": [],
        "merged_benfords_second": [],
    }
    
    for address in tqdm(addresses, mininterval=5):                
        addr_to_df = to_sorted_df[ai_map[address]["to_start"]:ai_map[address]["to_end"]]
        feature_dict["in_count"].append(len(addr_to_df))
        
        if len(addr_to_df) != 0:
            feature_dict["in_unique"].append(len(addr_to_df["from"].unique()))

            addr_to_desc = addr_to_df.describe()

            feature_dict["in_value_avg"].append(addr_to_desc["value"]["mean"])
            feature_dict["in_value_med"].append(addr_to_desc["value"]["50%"])
            feature_dict["in_value_std"].append(addr_to_desc["value"]["std"])

            feature_dict["in_gas_limit_avg"].append(addr_to_desc["gasLimit"]["mean"])
            feature_dict["in_gas_limit_med"].append(addr_to_desc["gasLimit"]["50%"])
            feature_dict["in_gas_limit_std"].append(addr_to_desc["gasLimit"]["std"])
        else:
            feature_dict["in_unique"].append(0)
            feature_dict["in_value_avg"].append(0.0)
            feature_dict["in_value_med"].append(0.0)
            feature_dict["in_value_std"].append(0.0)
            feature_dict["in_gas_limit_avg"].append(0.0)
            feature_dict["in_gas_limit_med"].append(0.0)
            feature_dict["in_gas_limit_std"].append(0.0)
        
        if len(addr_to_df["from"]) == 0:
            feature_dict["from_benfords_first"].append(0.)
            feature_dict["from_benfords_second"].append(0.)
        else:
            from_first_benfords = benfords(addr_to_df["from"], 1)
            from_second_benfords = benfords(addr_to_df["from"], 2)
            from_first_chi_squared = chi_square(from_first_benfords, bens_first)
            from_second_chi_squared = chi_square(from_second_benfords, bens_second)
            feature_dict["from_benfords_first"].append(from_first_chi_squared)
            feature_dict["from_benfords_second"].append(from_second_chi_squared)


        addr_from_df = from_sorted_df[ai_map[address]["from_start"]:ai_map[address]["from_end"]]
        feature_dict["out_count"].append(len(addr_from_df))
        
        if len(addr_from_df) != 0:
            feature_dict["out_unique"].append(len(addr_from_df["to"].unique()))

            addr_from_desc = addr_from_df.describe()

            feature_dict["out_value_avg"].append(addr_from_desc["value"]["mean"])
            feature_dict["out_value_med"].append(addr_from_desc["value"]["50%"])
            feature_dict["out_value_std"].append(addr_from_desc["value"]["std"])

            feature_dict["out_gas_limit_avg"].append(addr_from_desc["gasLimit"]["mean"])
            feature_dict["out_gas_limit_med"].append(addr_from_desc["gasLimit"]["50%"])
            feature_dict["out_gas_limit_std"].append(addr_from_desc["gasLimit"]["std"])
        else:
            feature_dict["out_unique"].append(0)
            feature_dict["out_value_avg"].append(0.0)
            feature_dict["out_value_med"].append(0.0)
            feature_dict["out_value_std"].append(0.0)
            feature_dict["out_gas_limit_avg"].append(0.0)
            feature_dict["out_gas_limit_med"].append(0.0)
            feature_dict["out_gas_limit_std"].append(0.0)
        
        feature_dict["total_count"].append(len(addr_to_df) + len(addr_from_df))
        
        if len(addr_from_df["to"]) == 0:
            feature_dict["to_benfords_first"].append(0.)
            feature_dict["to_benfords_second"].append(0.)
        else:
            from_first_benfords = benfords(addr_from_df["to"], 1)
            from_second_benfords = benfords(addr_from_df["to"], 2)
            from_first_chi_squared = chi_square(from_first_benfords, bens_first)
            from_second_chi_squared = chi_square(from_second_benfords, bens_second)
            feature_dict["to_benfords_first"].append(from_first_chi_squared)
            feature_dict["to_benfords_second"].append(from_second_chi_squared)
        
        if len(addr_from_df["to"]) == 0:
            mbf = feature_dict["from_benfords_first"][-1]
            mbs = feature_dict["from_benfords_second"][-1]
        elif len(addr_to_df["from"]) == 0:
            mbf = feature_dict["to_benfords_first"][-1]
            mbs = feature_dict["to_benfords_second"][-1]
        else:
            merged_addresses = pd.concat([addr_from_df["to"], addr_to_df["from"]])
            merged_benfords_first = benfords(merged_addresses, 1)
            merged_benfords_second = benfords(merged_addresses, 2)
            mbf = chi_square(merged_benfords_first, bens_first)
            mbs = chi_square(merged_benfords_second, bens_second)
        
        feature_dict["merged_benfords_first"].append(mbf)
        feature_dict["merged_benfords_second"].append(mbs)
    
    return pd.DataFrame(feature_dict).fillna(0)

def construct_dataset(pos_df, neg_df):
    pos_feature_df = create_feature_dataframe(pos_df, label=1)
    neg_feature_df = create_feature_dataframe(neg_df, label=-1)
    
    feature_df = pd.concat([pos_feature_df, neg_feature_df], ignore_index=True)
    return feature_df

In [11]:
ponzi_feature_df = create_feature_dataframe(ponzi_df, 1)

100%|█████████████████████████████████████████████████████████████████████████████| 5887/5887 [00:44<00:00, 132.67it/s]


In [12]:
ponzi_feature_df[[col for col in ponzi_feature_df.columns if "benfords" in col]]

Unnamed: 0,from_benfords_first,from_benfords_second,to_benfords_first,to_benfords_second,merged_benfords_first,merged_benfords_second
0,20.625881,31.196614,0.000000,0.000000,20.625881,31.196614
1,20.625881,31.196614,20.625881,31.196614,20.625881,31.196614
2,20.625881,31.196614,20.625881,31.196614,20.625881,31.196614
3,20.625881,31.196614,20.625881,31.196614,20.625881,31.196614
4,20.625881,31.196614,20.625881,31.196614,20.625881,31.196614
...,...,...,...,...,...,...
5882,20.625881,31.196614,20.625881,31.196614,20.625881,31.196614
5883,20.625881,31.196614,20.625881,31.196614,20.625881,31.196614
5884,0.000000,0.000000,97.353844,38.521728,97.353844,38.521728
5885,20.625881,31.196614,20.625881,31.196614,20.625881,31.196614


In [19]:
l_feature_df = create_feature_dataframe(legit_df, 0)

100%|███████████████████████████████████████████████████████████████████████| 935091/935091 [1:31:00<00:00, 171.23it/s]


In [20]:
full_df = pd.concat([ponzi_feature_df, l_feature_df], ignore_index=True)

In [21]:
full_df = full_df.fillna(0.)

In [22]:
full_df.to_csv("transactions_merged_benfords.csv", index=False)

In [23]:
full_df

Unnamed: 0,address,label,total_count,in_count,in_unique,in_value_avg,in_value_med,in_value_std,in_gas_limit_avg,in_gas_limit_med,...,out_value_avg,out_value_med,out_value_std,out_gas_limit_avg,out_gas_limit_med,out_gas_limit_std,to_benfords_first,to_benfords_second,merged_benfords_first,merged_benfords_second
0,0x00083da8b2dc22bbab767842eb413dcd753705d0,1,2,2,1,0.006538,0.006538,0.000000,69780.000000,69780.0,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,20.625881,31.196614
1,0x0015f6538194879dd073c0decd70b181652a99d9,1,3,1,1,0.076803,0.076803,0.000000,200000.000000,200000.0,...,0.000000,0.000000,0.0,200000.0,200000.0,0.000000,20.625881,31.196614,20.625881,31.196614
2,0x0021e99b454ca2bfc343746b59ea100beaabfaa7,1,4,2,1,0.006981,0.006981,0.000000,150000.000000,150000.0,...,0.000000,0.000000,0.0,150000.0,150000.0,70710.678119,20.625881,31.196614,20.625881,31.196614
3,0x002c9146127e26024a60f86ff2d808dcb4329950,1,4,2,1,0.061516,0.061516,0.057997,70000.000000,70000.0,...,0.000000,0.000000,0.0,70000.0,70000.0,0.000000,20.625881,31.196614,20.625881,31.196614
4,0x0039f22efb07a647557c7c5d17854cfd6d489ef3,1,3,1,1,0.016650,0.016650,0.000000,69780.000000,69780.0,...,0.000000,0.000000,0.0,45390.0,45390.0,34492.668786,20.625881,31.196614,20.625881,31.196614
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
940973,0xffffd2ff9b840f6bd74f80df8e532b4d7886ffff,0,5,0,0,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,154775.2,97040.0,96588.122948,20.625881,41.409844,20.625881,41.409844
940974,0xffffdfc7204e27dc7a1c190064db74cfd0dad7a0,0,1,0,0,0.000000,0.000000,0.000000,0.000000,0.0,...,0.274559,0.274559,0.0,21000.0,21000.0,0.000000,27.044225,34.632418,27.044225,34.632418
940975,0xffffeada6c8e9fc94cb94c4d8e48a5339016f5fb,0,1,0,0,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,163703.0,163703.0,0.000000,33.450019,36.232685,33.450019,36.232685
940976,0xfffffffff15abf397da76f1dcc1a1604f45126db,0,9,9,6,0.000000,0.000000,0.000000,50840.666667,48934.0,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,4.335205,9.453430


In [24]:
ponzi_df.gasLimit

0        250000
1        200000
2        100000
3         46028
4         21000
          ...  
45970    300000
45971    300000
45972    300000
45973    300000
45974    300000
Name: gasLimit, Length: 45962, dtype: int64