In [None]:
import json
import os
import time
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from statistics import stdev as std

import pandas as pd
import requests
from tqdm.notebook import tqdm

In [24]:
# --- Notebook configuration -------------------------------------------------

# Size of the thread pool used to fetch wallets, and the pause (in seconds)
# each worker takes after handling one wallet.
MAX_WORKERS = 5
RATE_LIMIT_DELAY = 0.25

# JSON file in which the extracted feature records are persisted between runs.
CACHE_FILE = "features_cache.json"

# Etherscan endpoint and API key. The key is read from the ETHERSCAN_API_KEY
# environment variable; when that is unset or empty the placeholder string
# below is used instead.
# NOTE(review): confirm this is still the endpoint Etherscan serves -- a newer
# versioned API may supersede it.
ETHERSCAN_API = "https://api.etherscan.io/api"
ETHERSCAN_KEY = os.environ.get("ETHERSCAN_API_KEY") or "ISI_API_KEY_KAMU_DI_SINI"

In [25]:
# Resume from a previous run: start with an empty list and replace it with the
# cached feature records when the cache file is present.
features = []
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, "r") as cache_fh:
        features = json.load(cache_fh)

# Already processed: addresses that have a feature record in the cache.
processed_addresses = set(record['address'] for record in features)

In [26]:
def get_transactions(address: str, max_retries: int = 3):
    """Fetch the normal-transaction list of ``address`` from Etherscan.

    Sends a single ``account``/``txlist`` request (ascending order, block
    range 0-99999999) and returns the ``result`` list of the JSON reply.

    An API reply whose ``result`` is not a list is treated as an API-level
    error rather than as "no transactions". If that error text mentions a
    rate limit, the request is retried up to ``max_retries`` times with an
    exponentially growing pause (``RATE_LIMIT_DELAY * 2**attempt`` seconds).
    Any other API error is logged so that it is not silently confused with an
    empty wallet.

    NOTE(review): only one page is requested; presumably the API caps the
    number of rows per call, so very active wallets may be truncated --
    confirm against the Etherscan documentation.

    Args:
        address: Wallet address to query.
        max_retries: How many additional attempts to make after a
            rate-limit reply. ``0`` disables retrying.

    Returns:
        list: Transaction dicts as returned by the API, or ``[]`` when the
        wallet has no transactions or the request ultimately failed (the
        failure reason is printed).
    """
    params = {
        "module": "account",
        "action": "txlist",
        "address": address,
        "startblock": 0,
        "endblock": 99999999,
        "sort": "asc",
        "apikey": ETHERSCAN_KEY
    }

    for attempt in range(max_retries + 1):
        try:
            response = requests.get(ETHERSCAN_API, params=params, timeout=10)
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # Network failure, timeout, or a body that is not valid JSON.
            print(f"Error getting txs for {address}: {e}")
            return []

        if not isinstance(data, dict):
            print(f"Error getting txs for {address}: unexpected response {data!r}")
            return []

        result = data.get("result")
        if isinstance(result, list):
            # A list with status "1" is the transaction list; a list with any
            # other status is the API's way of saying nothing was found.
            return result if data.get("status") == "1" else []

        # Non-list result: an error description, not transaction data.
        if "rate limit" in str(result).lower() and attempt < max_retries:
            time.sleep(RATE_LIMIT_DELAY * (2 ** attempt))
            continue

        print(f"Error getting txs for {address}: {data.get('message')}: {result}")
        return []

    return []

def extract_features(wallet: str):
    """Compute the behavioural feature record for one wallet.

    The wallet's transactions are obtained via ``get_transactions`` and
    summarised into a flat dict:

    * ``chain`` / ``address`` -- constant ``"ETH"`` and the wallet as given.
    * ``tx_total`` -- number of transactions.
    * ``avg_tx_per_day`` -- transactions divided by the number of distinct
      UTC calendar days on which the wallet transacted.
    * ``avg_tx_per_block`` / ``max_tx_per_block`` / ``min_tx_per_block`` --
      statistics over the blocks that contain at least one transaction.
    * ``std_tx_amount`` -- sample standard deviation of the amounts
      (``value / 1e18``); ``0`` when there is only one transaction.
    * ``tx_between_00_04`` -- share of transactions whose UTC hour is 0-3.
    * ``num_unique_counterparties`` -- distinct addresses on the *other* side
      of each transaction (recipient for outgoing, sender for incoming),
      compared case-insensitively and excluding the wallet itself.
    * ``same_day_deposit_withdraw`` -- ``1`` if any UTC day has both an
      incoming and an outgoing transaction, else ``0``.
    * ``most_common_amount`` -- the most frequent amount.

    Timestamps are interpreted in UTC so the day- and hour-based features do
    not depend on the time zone of the machine running the notebook.
    NOTE: records cached by an earlier version of this function (local time,
    recipient-only counterparties) are not comparable with new ones.

    Args:
        wallet: Wallet address.

    Returns:
        dict | None: The feature record, or ``None`` when the wallet has no
        transactions or a transaction could not be parsed (the error is
        printed).
    """
    txs = get_transactions(wallet)
    if not txs:
        return None

    try:
        wallet_lc = wallet.lower()

        # Parse every timestamp exactly once, as an aware UTC datetime.
        stamps = [
            datetime.fromtimestamp(int(tx["timeStamp"]), tz=timezone.utc)
            for tx in txs
        ]
        dates = [stamp.date() for stamp in stamps]
        hours = [stamp.hour for stamp in stamps]
        block_numbers = [int(tx["blockNumber"]) for tx in txs]
        amounts = [int(tx["value"]) / 1e18 for tx in txs]

        counterparties = set()
        date_in = set()
        date_out = set()
        for tx, day in zip(txs, dates):
            # `or ""` tolerates a missing/None address instead of failing the
            # whole wallet.
            sender = (tx["from"] or "").lower()
            recipient = (tx["to"] or "").lower()

            if recipient == wallet_lc:
                date_in.add(day)
            if sender == wallet_lc:
                date_out.add(day)

            # The counterparty is whoever is on the opposite side of the
            # wallet; empty recipients and self-transfers are not counted.
            other_party = recipient if sender == wallet_lc else sender
            if other_party and other_party != wallet_lc:
                counterparties.add(other_party)

        tx_per_block = Counter(block_numbers)
        same_day_in_out = len(date_in & date_out)
        n_tx = len(txs)

        feats = dict(
            chain="ETH",
            address=wallet,
            tx_total=n_tx,
            avg_tx_per_day=round(n_tx / len(set(dates)), 2),
            avg_tx_per_block=round(n_tx / len(tx_per_block), 4),
            max_tx_per_block=max(tx_per_block.values()),
            min_tx_per_block=min(tx_per_block.values()),
            std_tx_amount=round(std(amounts), 6) if len(amounts) > 1 else 0,
            tx_between_00_04=round(sum(1 for h in hours if 0 <= h < 4) / n_tx, 4),
            num_unique_counterparties=len(counterparties),
            same_day_deposit_withdraw=int(same_day_in_out > 0),
            most_common_amount=Counter(amounts).most_common(1)[0][0],
        )

        return feats
    except Exception as e:
        # Best effort: a malformed transaction skips this wallet, not the run.
        print(f"Error extracting features for {wallet}: {e}")
        return None

In [27]:
def process_wallet(wallet):
    """Extract features for ``wallet`` unless it is already cached.

    Wallets present in ``processed_addresses`` are skipped immediately and
    yield ``None``. Otherwise the features are extracted, the worker pauses
    for ``RATE_LIMIT_DELAY`` seconds, and the extraction result (a feature
    dict or ``None``) is returned.

    NOTE(review): the pause is per worker thread, so the combined request
    rate also depends on how many workers run concurrently.
    """
    already_done = wallet in processed_addresses  # already processed
    if not already_done:
        extracted = extract_features(wallet)
        time.sleep(RATE_LIMIT_DELAY)
        return extracted
    return None

In [28]:
# Load the address dataset (path is relative to the notebook's directory).
dataset_path = "../datasets/ETH/gambling_address_dataset.csv"
df = pd.read_csv(dataset_path)

In [29]:
# Non-null value count per column of the freshly loaded dataset.
df.count()

address_hash    90346
address_type    90346
dtype: int64

In [30]:
# Index labels of every row whose address_type equals -1.
is_minus_one = df['address_type'] == -1
index_to_drop = df.loc[is_minus_one].index

In [31]:
# Remove only the FIRST row flagged in index_to_drop (a single label is
# passed to drop); the remaining address_type == -1 rows stay in df here.
if len(index_to_drop) > 0:
    first_label = index_to_drop[0]
    df = df.drop(index=first_label)

In [32]:
# Non-null value count per column after dropping the first flagged row.
df.count()

address_hash    90345
address_type    90345
dtype: int64

In [33]:
# Keep only the rows whose address_type differs from -1.
keep_mask = df['address_type'].ne(-1)
df = df.loc[keep_mask]

In [34]:
# Non-null value count per column once all address_type == -1 rows are removed.
df.count()

address_hash    61427
address_type    61427
dtype: int64

In [35]:
# Split the dataset by label. Despite the names, count_0 / count_1 are
# DataFrames holding the rows with address_type 0 and 1 respectively.
address_type = df['address_type']
count_0 = df.loc[address_type.eq(0)]
count_1 = df.loc[address_type.eq(1)]

In [36]:
# Non-null value count per column for the address_type == 0 rows.
count_0.count()

address_hash    51004
address_type    51004
dtype: int64

In [37]:
# Non-null value count per column for the address_type == 1 rows.
count_1.count()

address_hash    10423
address_type    10423
dtype: int64

In [38]:
# Keep just the first 10000 address_type == 0 rows (in their current order).
count_0 = count_0.iloc[:10000]

In [39]:
# Non-null value count per column after truncating count_0 to 10000 rows.
count_0.count()

address_hash    10000
address_type    10000
dtype: int64

In [None]:
# Wallets still to be fetched. Iterating a DataFrame directly yields its
# column labels, not its rows, so the address column is selected explicitly.
# dict.fromkeys removes duplicate addresses while keeping their order.
pending_wallets = [
    wallet
    for wallet in dict.fromkeys(count_1["address_hash"])
    if wallet not in processed_addresses
]

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Map each future back to its wallet so failures can be attributed.
    futures = {executor.submit(process_wallet, wallet): wallet for wallet in pending_wallets}
    # tqdm renders the progress; no per-item print is needed.
    for future in tqdm(as_completed(futures), total=len(futures)):
        wallet = futures[future]
        try:
            result = future.result()
            if result:
                features.append(result)
                # Remember the wallet so re-running this cell in the same
                # kernel does not fetch and append it a second time.
                processed_addresses.add(wallet)
                # Save the cache immediately so an interrupted run can resume.
                with open(CACHE_FILE, "w") as f:
                    json.dump(features, f)
        except Exception as e:
            print(f"Error in thread for {wallet}: {e}")