# Arbiscan


In [1]:
import os
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings

# Suppress the "empty slice" style warnings raised by numpy and pandas.
# NOTE(review): this ignores every RuntimeWarning for the whole process,
# not only the empty-slice ones.
warnings.filterwarnings("ignore", category=RuntimeWarning)


# ===== Feature extraction for a single address =====
def extract_sybil_features_arbiscan(
    df: pd.DataFrame, cutoff: str = "2024-06-20"
) -> dict:
    """Compute behavioural features for one address from Arbiscan transactions.

    The address is taken from the first row of the ``address`` column and all
    rows of ``df`` are treated as belonging to it. Only transactions whose
    ``timeStamp`` is strictly earlier than ``cutoff`` are used.

    Args:
        df: Transaction table. It must contain every column named in
            ``num_cols`` below plus ``timeStamp``, ``address``, ``from``,
            ``to``, ``tokenSymbol``, ``contractAddress``, ``functionName``,
            ``methodId``, ``input`` and ``tx_type``; a missing column raises
            ``KeyError``. ``timeStamp`` is parsed as seconds since the Unix
            epoch. The caller's frame is not modified.
        cutoff: Exclusive upper bound for ``timeStamp``; anything accepted by
            ``pd.Timestamp``.

    Returns:
        A dict mapping feature name to value, or an empty dict when no
        transaction precedes ``cutoff``.
    """
    df = df.copy()

    # Numeric fields arrive as text; unparsable entries become 0.
    num_cols = [
        "blockNumber",
        "nonce",
        "transactionIndex",
        "value",
        "gas",
        "gasPrice",
        "gasPriceBid",
        "cumulativeGasUsed",
        "gasUsed",
        "confirmations",
        "isError",
    ]
    for col in num_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)

    # Epoch seconds -> datetime, then chronological order.
    df["timeStamp"] = pd.to_numeric(df["timeStamp"], errors="coerce")
    df["timeStamp"] = pd.to_datetime(df["timeStamp"], unit="s", errors="coerce")
    df = df.sort_values("timeStamp").reset_index(drop=True)

    # Date filter. Rows with an unparsable timestamp (NaT) compare False and
    # are dropped too. The explicit copy makes the filtered frame independent
    # so the column assignments below do not write into a slice.
    df = df[df["timeStamp"] < pd.Timestamp(cutoff)].copy()
    if df.empty:
        return {}  # no transactions before the cutoff

    addr = df["address"].iloc[0].lower()
    is_out = df["from"].str.lower() == addr
    is_in = df["to"].str.lower() == addr
    features = {}

    # ===== Basic counters =====
    tx_count = len(df)
    days_active = df["timeStamp"].dt.date.nunique()
    features["tx_count"] = tx_count
    features["active_days"] = days_active
    features["lifetime_days"] = (
        (df["timeStamp"].max() - df["timeStamp"].min()).total_seconds() / 86400
        if tx_count > 1
        else 0.0
    )

    # ===== Intervals between transactions =====
    # The first row has no predecessor (NaN); it counts as "not fast/short"
    # in the two ratio features because NaN < x is False.
    df["time_diff"] = df["timeStamp"].diff().dt.total_seconds()
    time_diff = df["time_diff"]
    features.update(
        {
            "mean_tx_interval": time_diff.mean(),
            "median_tx_interval": time_diff.median(),
            "std_tx_interval": time_diff.std(),
            "max_tx_interval": time_diff.max(),
            "fast_tx_ratio": (time_diff < 1).mean(),
            "short_interval_ratio": (time_diff < 60).mean(),
            "tx_per_day": tx_count / days_active if days_active > 0 else tx_count,
        }
    )

    # ===== Block / nonce gaps =====
    block_diff = df["blockNumber"].diff().fillna(0)
    nonce_diff = df["nonce"].diff().fillna(0)
    features.update(
        {
            "mean_block_gap": block_diff.mean(),
            "std_block_gap": block_diff.std(),
            "mean_nonce_gap": nonce_diff.mean(),
            "max_nonce_gap": nonce_diff.max(),
        }
    )

    # ===== Transfer value statistics =====
    df["value_eth"] = df["value"] / 1e18
    v = df["value_eth"]
    features.update(
        {
            "total_value_eth": v.sum(),
            "mean_value_eth": v.mean(),
            "median_value_eth": v.median(),
            "std_value_eth": v.std(),
            "min_value_eth": v.min(),
            "max_value_eth": v.max(),
            "zero_value_tx_ratio": (v == 0).mean(),
        }
    )

    vc = v.value_counts()
    top_val, top_freq = vc.index[0], vc.iloc[0]
    q1, q3 = v.quantile([0.25, 0.75])
    probs = vc / tx_count
    total_value = v.sum()
    if total_value != 0:
        ranked_sum = (np.arange(1, tx_count + 1) * np.sort(v.to_numpy())).sum()
        gini = 2 * ranked_sum / (tx_count * total_value) - (tx_count + 1) / tx_count
    else:
        # All transfers are zero: the coefficient is undefined (0 / 0).
        gini = np.nan
    features.update(
        {
            "most_common_value": top_val,
            "most_common_value_freq": top_freq,
            "most_common_value_ratio": top_freq / tx_count,
            "value_entropy": -(probs * np.log(probs + 1e-9)).sum(),
            "below_q1_ratio": (v < q1).mean(),
            "above_q3_ratio": (v > q3).mean(),
            "value_skew": v.skew(),
            "value_kurtosis": v.kurtosis(),
            "value_gini": gini,
        }
    )

    # ===== Fund flows =====
    out_vals = df.loc[is_out, "value_eth"]
    in_vals = df.loc[is_in, "value_eth"]
    features.update(
        {
            "outgoing_value_eth": out_vals.sum(),
            "incoming_value_eth": in_vals.sum(),
            "net_value_eth": in_vals.sum() - out_vals.sum(),
        }
    )

    # ===== Gas =====
    df["gas_eth"] = df["gasPrice"] * df["gasUsed"] / 1e18
    g = df["gas_eth"]
    gp = df["gasPrice"]
    # Ratios are only defined for a positive denominator; masking the other
    # rows keeps inf (x / 0) out of the averages.
    bid_ratio = (df["gasPriceBid"] / gp).where(gp > 0)
    gas_used_ratio = (df["gasUsed"] / df["gas"]).where(df["gas"] > 0)
    features.update(
        {
            "total_gas_eth": g.sum(),
            "mean_gas_eth": g.mean(),
            "std_gas_eth": g.std(),
            "mean_gas_price": gp.mean(),
            "std_gas_price": gp.std(),
            "mean_gas_bid_ratio": bid_ratio.mean(),
            "gas_used_ratio_mean": gas_used_ratio.mean(),
        }
    )

    # ===== Errors and directions =====
    errs = df["isError"].astype(int)
    out = df[is_out]
    inc = df[is_in]
    features.update(
        {
            "tx_errors": errs.sum(),
            "error_ratio": errs.mean(),
            "outgoing_tx_count": len(out),
            "incoming_tx_count": len(inc),
            "unique_receivers": out["to"].nunique(),
            "unique_senders": inc["from"].nunique(),
            "out_in_ratio": len(out) / len(inc) if len(inc) > 0 else np.nan,
            "receiver_per_tx": (out["to"].nunique() / len(out))
            if len(out) > 0
            else np.nan,
        }
    )

    # ===== Tokens and contracts =====
    tokens = df[df["tokenSymbol"].notna()]
    unique_contracts = df["contractAddress"].nunique()
    features.update(
        {
            "token_tx_count": len(tokens),
            "token_tx_ratio": len(tokens) / tx_count,
            "unique_tokens": tokens["tokenSymbol"].nunique(),
            "unique_contracts": unique_contracts,
            "distinct_contract_ratio": unique_contracts / tx_count,
            "unique_methods": df["functionName"].nunique(),
            "unique_methodIds": df["methodId"].nunique(),
            "contract_call_ratio": (df["input"].astype(str) != "0x").mean(),
        }
    )

    # ===== Repeated transactions =====
    dup_cols = ["to", "value", "gas", "gasPrice", "functionName"]
    dup_count = df.duplicated(subset=dup_cols).sum()
    features.update(
        {
            "duplicate_tx_count": dup_count,
            "duplicate_tx_ratio": dup_count / tx_count,
        }
    )

    # ===== Time of day and transaction types =====
    hours = df["timeStamp"].dt.hour.dropna()
    hour_probs = hours.value_counts(normalize=True)
    type_probs = df["tx_type"].dropna().value_counts(normalize=True)
    features.update(
        {
            "peak_hour": hour_probs.idxmax() if not hour_probs.empty else np.nan,
            "peak_hour_ratio": hour_probs.max() if not hour_probs.empty else np.nan,
            "hour_entropy": -(hour_probs * np.log(hour_probs + 1e-9)).sum()
            if not hour_probs.empty
            else np.nan,
            "weekend_tx_ratio": (df["timeStamp"].dt.weekday >= 5).mean(),
            "unique_tx_types": df["tx_type"].nunique(),
            "tx_type_entropy": -(type_probs * np.log(type_probs + 1e-9)).sum()
            if not type_probs.empty
            else np.nan,
        }
    )

    # ===== Confirmations =====
    conf = df["confirmations"]
    features.update(
        {
            "confirmations_mean": conf.mean(),
            "confirmations_std": conf.std(),
            "low_conf_ratio": (conf < conf.quantile(0.25)).mean(),
        }
    )

    return features


# ===== Process every .pkl file and aggregate the features, skipping empty results and showing progress =====
def process_pickles(folder_path: str) -> pd.DataFrame:
    """Extract Arbiscan features for every address found in a folder of pickles.

    Every file in ``folder_path`` whose name ends with ``.pkl`` is loaded and
    iterated as a collection of DataFrames. Each frame is split by its
    ``address`` column and every group is passed to
    ``extract_sybil_features_arbiscan``.

    Args:
        folder_path: Directory that contains the ``.pkl`` files.

    Returns:
        A DataFrame with one row per processed address: the extracted
        features plus an ``address`` column. Addresses for which the
        extractor returns an empty dict are skipped.
    """
    features_list = []
    # Sorted so the row order of the result does not depend on the arbitrary
    # order in which the OS lists the directory.
    pkl_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".pkl"))
    for fname in tqdm(pkl_files, desc="Processing files"):
        file_path = os.path.join(folder_path, fname)
        # SECURITY: unpickling can execute arbitrary code; only point this at
        # files produced by a trusted source.
        with open(file_path, "rb") as f:
            list_of_dfs = pickle.load(f)
        for df in tqdm(list_of_dfs, desc=f"File {fname}", leave=False):
            if "address" not in df.columns:
                continue  # nothing to group by
            # One pass over the frame instead of one boolean filter per
            # address; sort=False keeps first-appearance order.
            for addr, subdf in df.groupby("address", sort=False):
                feats = extract_sybil_features_arbiscan(subdf)
                if not feats:  # address has no transactions before the cutoff
                    continue
                feats["address"] = addr
                features_list.append(feats)
    return pd.DataFrame(features_list)


# ===== Example run =====
if __name__ == "__main__":
    # Folder that process_pickles scans for *.pkl files.
    folder = "arbiscan_txns"
    all_features_df = process_pickles(folder)
    # Persist the per-address feature table and report how many rows it has.
    all_features_df.to_pickle("sybil_features_all_addresses.pickle")
    print(f"Extracted features for {len(all_features_df)} addresses.")

Processing files: 100%|██████████| 40/40 [09:46<00:00, 14.66s/it]


Extracted features for 56896 addresses.


# Etherscan

In [1]:
import os
import pickle
import warnings

import numpy as np
import pandas as pd
from tqdm import tqdm

# Suppress the "empty slice" style warnings raised by numpy and pandas.
# NOTE(review): this ignores every RuntimeWarning for the whole process,
# not only the empty-slice ones.
warnings.filterwarnings("ignore", category=RuntimeWarning)


def extract_sybil_features_etherscan(
    df: pd.DataFrame, cutoff: str = "2024-06-20"
) -> dict:
    """Compute behavioural features for one address from Etherscan transactions.

    The address is taken from the first row of the ``address`` column and all
    rows of ``df`` are treated as belonging to it. Only transactions whose
    ``timeStamp`` is strictly earlier than ``cutoff`` are used.

    ``timeStamp`` and ``address`` are required; without either of them an
    empty dict is returned. Every other column is optional:

    * ``blockNumber``, ``nonce``, ``value`` missing -> the derived statistics
      are NaN (sums are 0).
    * ``from`` / ``to`` missing -> no transaction counts as outgoing /
      incoming.
    * gas columns or token columns missing -> the derived features are 0.
    * ``tx_type`` / ``confirmations`` missing -> their features are omitted.

    Args:
        df: Transaction table; ``timeStamp`` is parsed as seconds since the
            Unix epoch. The caller's frame is not modified.
        cutoff: Exclusive upper bound for ``timeStamp``; anything accepted by
            ``pd.Timestamp``.

    Returns:
        A dict mapping feature name to value, or an empty dict when a
        required column is missing or no transaction precedes ``cutoff``.
    """
    cols = set(df.columns)
    if not {"timeStamp", "address"} <= cols:
        return {}
    df = df.copy()

    # ===== Type coercion =====
    # Numeric fields arrive as text; unparsable entries become 0.
    num_cols = [
        "blockNumber",
        "nonce",
        "transactionIndex",
        "value",
        "gas",
        "gasPrice",
        "cumulativeGasUsed",
        "gasUsed",
        "confirmations",
        "isError",
        "txreceipt_status",
        "errCode",
        "tokenDecimal",
    ]
    for col in num_cols:
        if col in cols:
            df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)

    # Epoch seconds -> datetime, then chronological order.
    df["timeStamp"] = pd.to_numeric(df["timeStamp"], errors="coerce")
    df["timeStamp"] = pd.to_datetime(df["timeStamp"], unit="s", errors="coerce")
    df = df.sort_values("timeStamp").reset_index(drop=True)

    # Date filter. Rows with an unparsable timestamp (NaT) compare False and
    # are dropped too. The explicit copy makes the filtered frame independent
    # so the column assignments below do not write into a slice.
    df = df[df["timeStamp"] < pd.Timestamp(cutoff)].copy()
    if df.empty:
        return {}

    addr = df["address"].iloc[0].lower()
    features = {}
    # Stand-in for an absent numeric column.
    all_nan = pd.Series(np.nan, index=df.index, dtype=float)
    no_rows = pd.Series(False, index=df.index)

    # ===== Basic counters =====
    tx_count = len(df)
    days_active = df["timeStamp"].dt.date.nunique()
    lifetime = (
        (df["timeStamp"].max() - df["timeStamp"].min()).total_seconds() / 86400
        if tx_count > 1
        else 0.0
    )
    features.update(
        {"tx_count": tx_count, "active_days": days_active, "lifetime_days": lifetime}
    )

    # ===== Intervals between transactions =====
    # The first row has no predecessor (NaN); it counts as "not fast/short"
    # in the two ratio features because NaN < x is False.
    df["time_diff"] = df["timeStamp"].diff().dt.total_seconds()
    time_diff = df["time_diff"]
    features.update(
        {
            "mean_tx_interval": time_diff.mean(),
            "median_tx_interval": time_diff.median(),
            "std_tx_interval": time_diff.std(),
            "max_tx_interval": time_diff.max(),
            "fast_tx_ratio": (time_diff < 1).mean(),
            "short_interval_ratio": (time_diff < 60).mean(),
            "tx_per_day": tx_count / days_active if days_active else float(tx_count),
        }
    )

    # ===== Block and nonce gaps =====
    block_gap = (
        df["blockNumber"].diff().fillna(0) if "blockNumber" in cols else all_nan
    )
    nonce_gap = df["nonce"].diff().fillna(0) if "nonce" in cols else all_nan
    features.update(
        {
            "mean_block_gap": block_gap.mean(),
            "std_block_gap": block_gap.std(),
            "mean_nonce_gap": nonce_gap.mean(),
            "max_nonce_gap": nonce_gap.max(),
        }
    )

    # ===== ETH values =====
    df["value_eth"] = df["value"] / 1e18 if "value" in cols else all_nan
    v = df["value_eth"]
    features.update(
        {
            "total_value_eth": v.sum(),
            "mean_value_eth": v.mean(),
            "median_value_eth": v.median(),
            "std_value_eth": v.std(),
            "min_value_eth": v.min(),
            "max_value_eth": v.max(),
            "zero_value_tx_ratio": (v == 0).mean(),
        }
    )

    vc = v.value_counts()
    if not vc.empty:
        top_val, top_freq = vc.index[0], vc.iloc[0]
        probs = vc / tx_count
        q1, q3 = v.quantile([0.25, 0.75])
        total_value = v.sum()
        if total_value != 0:
            ranked_sum = (
                np.arange(1, tx_count + 1) * np.sort(v.to_numpy())
            ).sum()
            gini = (
                2 * ranked_sum / (tx_count * total_value)
                - (tx_count + 1) / tx_count
            )
        else:
            # All transfers are zero: the coefficient is undefined (0 / 0).
            gini = np.nan
        features.update(
            {
                "most_common_value": top_val,
                "most_common_value_freq": top_freq,
                "most_common_value_ratio": top_freq / tx_count,
                "value_entropy": -(probs * np.log(probs + 1e-9)).sum(),
                "below_q1_ratio": (v < q1).mean(),
                "above_q3_ratio": (v > q3).mean(),
                "value_skew": v.skew(),
                "value_kurtosis": v.kurtosis(),
                "value_gini": gini,
            }
        )
    else:
        # No usable values (the "value" column is absent).
        features.update(
            dict.fromkeys(
                [
                    "most_common_value",
                    "most_common_value_freq",
                    "most_common_value_ratio",
                    "value_entropy",
                    "below_q1_ratio",
                    "above_q3_ratio",
                    "value_skew",
                    "value_kurtosis",
                    "value_gini",
                ],
                np.nan,
            )
        )

    # ===== ETH flows =====
    is_out = df["from"].str.lower() == addr if "from" in cols else no_rows
    is_in = df["to"].str.lower() == addr if "to" in cols else no_rows
    out_vals = df.loc[is_out, "value_eth"]
    in_vals = df.loc[is_in, "value_eth"]
    features.update(
        {
            "outgoing_value_eth": out_vals.sum(),
            "incoming_value_eth": in_vals.sum(),
            "net_value_eth": in_vals.sum() - out_vals.sum(),
        }
    )

    # ===== Gas =====
    if {"gasPrice", "gasUsed"} <= cols:
        df["gas_eth"] = df["gasPrice"] * df["gasUsed"] / 1e18
    else:
        df["gas_eth"] = 0.0
    g = df["gas_eth"]
    gp = df["gasPrice"] if "gasPrice" in cols else pd.Series(dtype=float)
    if {"gasUsed", "gas"} <= cols:
        # Only defined for a positive gas limit; masking the other rows keeps
        # inf (x / 0) out of the average.
        gas_used_ratio_mean = (df["gasUsed"] / df["gas"]).where(df["gas"] > 0).mean()
    else:
        gas_used_ratio_mean = 0.0
    features.update(
        {
            "total_gas_eth": g.sum(),
            "mean_gas_eth": g.mean(),
            "std_gas_eth": g.std(),
            "mean_gas_price": gp.mean() if not gp.empty else 0.0,
            "std_gas_price": gp.std() if not gp.empty else 0.0,
            "gas_used_ratio_mean": gas_used_ratio_mean,
        }
    )

    # ===== Errors and directions =====
    errs = (
        df["isError"].astype(int) if "isError" in cols else pd.Series(0, index=df.index)
    )
    features.update({"tx_errors": errs.sum(), "error_ratio": errs.mean()})
    out = df[is_out]
    inc = df[is_in]
    unique_receivers = out["to"].nunique() if "to" in cols else 0
    unique_senders = inc["from"].nunique() if "from" in cols else 0
    features.update(
        {
            "outgoing_tx_count": len(out),
            "incoming_tx_count": len(inc),
            "unique_receivers": unique_receivers,
            "unique_senders": unique_senders,
            "out_in_ratio": len(out) / len(inc) if len(inc) else np.nan,
            "receiver_per_tx": unique_receivers / len(out) if len(out) else np.nan,
        }
    )

    # ===== Token transactions =====
    token_features = dict.fromkeys(
        [
            "token_tx_count",
            "token_tx_ratio",
            "total_token_value",
            "mean_token_value",
            "median_token_value",
            "std_token_value",
            "unique_tokens",
            "unique_contracts",
        ],
        0,
    )
    if {"tokenSymbol", "tokenDecimal", "value"} <= cols:
        tokens = df[df["tokenSymbol"].notna() & (df["tokenDecimal"] > 0)]
        if not tokens.empty:
            # Float base on purpose: an integer 10 ** decimals wraps around
            # int64 once decimals exceeds 18.
            tv = tokens["value"] / (10.0 ** tokens["tokenDecimal"])
            token_features = {
                "token_tx_count": len(tokens),
                "token_tx_ratio": len(tokens) / tx_count,
                "total_token_value": tv.sum(),
                "mean_token_value": tv.mean(),
                "median_token_value": tv.median(),
                "std_token_value": tv.std(),
                "unique_tokens": tokens["tokenSymbol"].nunique(),
                "unique_contracts": tokens["contractAddress"].nunique()
                if "contractAddress" in cols
                else 0,
            }
    features.update(token_features)

    # ===== Repeated transactions =====
    dup_cols = [
        c for c in ["to", "value", "gas", "gasPrice", "functionName"] if c in cols
    ]
    dup_count = df.duplicated(subset=dup_cols).sum() if dup_cols else 0
    features.update(
        {
            "duplicate_tx_count": dup_count,
            "duplicate_tx_ratio": dup_count / tx_count,
        }
    )

    # ===== Time of day and transaction types =====
    hours = df["timeStamp"].dt.hour.dropna()
    hour_probs = hours.value_counts(normalize=True)
    features["peak_hour"] = hour_probs.idxmax() if not hour_probs.empty else np.nan
    features["peak_hour_ratio"] = hour_probs.max() if not hour_probs.empty else np.nan
    features["hour_entropy"] = (
        -(hour_probs * np.log(hour_probs + 1e-9)).sum()
        if not hour_probs.empty
        else np.nan
    )
    features["weekend_tx_ratio"] = (df["timeStamp"].dt.weekday >= 5).mean()
    if "tx_type" in cols:
        type_probs = df["tx_type"].dropna().value_counts(normalize=True)
        features["unique_tx_types"] = df["tx_type"].nunique()
        features["tx_type_entropy"] = (
            -(type_probs * np.log(type_probs + 1e-9)).sum()
            if not type_probs.empty
            else np.nan
        )

    # ===== Confirmations =====
    if "confirmations" in cols:
        conf = df["confirmations"]
        features.update(
            {
                "confirmations_mean": conf.mean(),
                "confirmations_std": conf.std(),
                "low_conf_ratio": (conf < conf.quantile(0.25)).mean(),
            }
        )

    return features


def process_pickles(folder_path: str) -> pd.DataFrame:
    """Extract Etherscan features for every address found in a folder of pickles.

    Every file in ``folder_path`` whose name ends with ``.pkl`` is loaded and
    iterated as a collection of DataFrames. Frames without an ``address``
    column are skipped; the others are split by ``address`` and every group is
    passed to ``extract_sybil_features_etherscan``.

    Args:
        folder_path: Directory that contains the ``.pkl`` files.

    Returns:
        A DataFrame with one row per processed address: the extracted
        features plus an ``address`` column. Addresses for which the
        extractor returns an empty dict are skipped.
    """
    features_list = []
    # Sorted so the row order of the result does not depend on the arbitrary
    # order in which the OS lists the directory.
    pkl_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".pkl"))
    for fname in tqdm(pkl_files, desc="Processing files"):
        file_path = os.path.join(folder_path, fname)
        # SECURITY: unpickling can execute arbitrary code; only point this at
        # files produced by a trusted source.
        with open(file_path, "rb") as f:
            list_of_dfs = pickle.load(f)

        for df in tqdm(list_of_dfs, desc=f"File {fname}", leave=False):
            if "address" not in df.columns:
                continue
            # One pass over the frame instead of one boolean filter per
            # address; sort=False keeps first-appearance order.
            for addr, subdf in df.groupby("address", sort=False):
                feats = extract_sybil_features_etherscan(subdf)
                if feats:
                    feats["address"] = addr
                    features_list.append(feats)

    return pd.DataFrame(features_list)


# ===== Example run =====
if __name__ == "__main__":
    # Folder that process_pickles scans for *.pkl files.
    folder = "etherscan_txns"
    all_features_df = process_pickles(folder)
    # Persist the per-address feature table and report how many rows it has.
    all_features_df.to_pickle("etherscan_sybil_features_all_addresses.pickle")
    print(f"Extracted features for {len(all_features_df)} addresses.")

Processing files: 100%|██████████| 25/25 [06:01<00:00, 14.45s/it]


Extracted features for 19740 addresses.
