In [1]:
import datetime

import numpy as np
import pandas as pd

from src.data_handler import CoinDataStore
from src.simple_wash_detector_utils import detect_wash_trades_nearest

In [2]:
store = CoinDataStore("AVAX", engine="fastparquet")
df_matched = store.load_all()
df_matched

Unnamed: 0,price,size,time,seller,buyer
0,19.521999,213.199997,2025-03-22 10:50:22.930512310,8,658
1,19.521999,1.000000,2025-03-22 10:50:23.731759790,19,658
2,19.525999,11.820000,2025-03-22 10:50:43.490273671,161,1260
3,19.525999,39.150002,2025-03-22 10:50:43.490273671,161,19500
4,19.525000,8.800000,2025-03-22 10:50:43.490273671,161,240
...,...,...,...,...,...
1478242,24.966999,1.210000,2025-07-27 08:44:33.762000000,8,1527
1478243,24.971001,308.359985,2025-07-27 08:44:59.437000000,161,5135
1478244,24.971001,203.639999,2025-07-27 08:44:59.437000000,892,5135
1478245,24.971001,11.940000,2025-07-27 08:45:07.130000000,19,170117


In [3]:
df_matched = df_matched.sort_values('time')
df_matched['time'] = pd.to_datetime(df_matched['time'])

In [4]:
sells = df_matched[["seller", "time", "price", "size"]].rename(columns={"seller": "wallet_id"}).copy()
sells["is_ask"] = True

buys = df_matched[["buyer", "time", "price", "size"]].rename(columns={"buyer": "wallet_id"}).copy()
buys["is_ask"] = False

df_all = pd.concat([sells, buys], ignore_index=True)

df_all["notional"] = df_all["price"] * df_all["size"]
df_all = df_all.groupby(["wallet_id", "time", "is_ask"]).agg(size=("size", "sum"), notional=("notional", "sum")).reset_index()
df_all["price"] = (df_all["notional"] / df_all["size"]).astype("float32")
df_all = df_all.drop("notional", axis=1)
df_all

Unnamed: 0,wallet_id,time,is_ask,size,price
0,1,2025-06-30 19:16:16.060,False,47.049999,18.014999
1,1,2025-06-30 19:16:46.011,False,47.049999,18.025000
2,1,2025-06-30 19:17:17.005,False,47.059998,18.030884
3,1,2025-06-30 19:17:46.024,False,47.049999,18.018999
4,1,2025-06-30 19:18:16.077,False,47.059998,18.011999
...,...,...,...,...,...
2265245,272404,2025-06-23 16:40:53.219,True,959.900024,16.377071
2265246,272404,2025-06-23 16:45:39.769,False,959.900024,16.470161
2265247,274311,2025-06-15 15:19:18.194,True,2.710000,19.183001
2265248,274311,2025-06-15 17:27:03.505,False,2.710000,19.049000


In [21]:
def inter_direction_times(group):
    directions = group['is_ask'].values
    times = group['time'].values
    change_idx = np.where(directions[1:] != directions[:-1])[0]

    if len(change_idx) == 0:
        return pd.Series({
            'inter_dir_q25': np.nan,
            'inter_dir_q75': np.nan,
            'inter_dir_gaps': []  # store gaps for later burstiness
        })

    gaps = (times[change_idx + 1] - times[change_idx]) / np.timedelta64(1, 's')

    q25, q75 = np.percentile(gaps, [25, 75]) if len(gaps) > 0 else (np.nan, np.nan)
    return pd.Series({
        'inter_dir_q25': q25,
        'inter_dir_q75': q75,
        'inter_dir_gaps': [gaps]  # keep gaps as a list
    })

inter_dir_features = (
    df_all.groupby('wallet_id')
    .apply(inter_direction_times)
    .reset_index()
)

  .apply(inter_direction_times)


In [23]:
def burstiness_ci(gaps, n_boot=200):
    # Flatten list of arrays to one vector
    if not isinstance(gaps, (list, np.ndarray)) or len(gaps) == 0:
        return pd.Series({'burstiness': np.nan, 'ci_low': np.nan, 'ci_high': np.nan})
    x = np.array(gaps).flatten()
    x = x[~np.isnan(x)]
    if len(x) < 3:
        return pd.Series({'burstiness': np.nan, 'ci_low': np.nan, 'ci_high': np.nan})

    mean, std = np.mean(x), np.std(x)
    if mean == 0:
        return pd.Series({'burstiness': np.nan, 'ci_low': np.nan, 'ci_high': np.nan})

    burst = std / mean
    boots = []
    for _ in range(n_boot):
        sample = np.random.choice(x, len(x), replace=True)
        boots.append(np.std(sample) / np.mean(sample))

    return pd.Series({
        'burstiness': burst,
        'ci_low': np.percentile(boots, 2.5),
        'ci_high': np.percentile(boots, 97.5),
        'ci_diff': np.percentile(boots, 97.5) - np.percentile(boots, 2.5)
    })

burst_features = inter_dir_features[['wallet_id', 'inter_dir_gaps']].copy()
burst_features = burst_features.join(
    burst_features['inter_dir_gaps'].apply(burstiness_ci)
)
burst_features = burst_features.drop(columns=['inter_dir_gaps'])

In [30]:
from scipy.stats import entropy
df_all['hour'] = df_all['time'].dt.hour

def hourly_entropy(x):
    counts = x.value_counts(normalize=True)
    return entropy(counts, base=np.e)

hourly_entropy_feature = (
    df_all.groupby('wallet_id')['hour']
    .apply(hourly_entropy)
    .rename('hourly_activity_entropy')
)


In [32]:
import pandas as pd
import numpy as np

# assume df_all already has columns: wallet_id, time, size, price
df_all['hour_bin'] = df_all['time'].dt.floor('H')

# --- 1. Compute per-wallet per-hour metrics ---
agg_hour = (
    df_all.groupby(['wallet_id', 'hour_bin'])
    .agg(volume=('size', 'sum'),
         price_mean=('price', 'mean'),
         price_std=('price', 'std'))
    .reset_index()
)

# --- 2. Compute total market volume per hour ---
total_volume = (
    df_all.groupby('hour_bin')['size']
    .sum()
    .rename('total_volume')
    .reset_index()
)

# --- 3. Merge total market volume into wallet-hour data ---
agg_hour = agg_hour.merge(total_volume, on='hour_bin', how='left')

# --- 4. Define correlation metrics per wallet ---
def corr_metrics(x):
    # Skip wallets with too few points
    if x['volume'].nunique() < 2:
        return pd.Series({
            'corr_vol_price_change': np.nan,
            'corr_vol_price_var': np.nan,
            'corr_vol_total': np.nan
        })
    return pd.Series({
        'corr_vol_price_change': x['volume'].corr(x['price_mean']),
        'corr_vol_price_var': x['volume'].corr(x['price_std']),
        'corr_vol_total': x['volume'].corr(x['total_volume'])
    })

# --- 5. Apply per wallet ---
corr_features = (
    agg_hour.groupby('wallet_id')
    .apply(corr_metrics)
    .reset_index()
)

corr_features.head()

  df_all['hour_bin'] = df_all['time'].dt.floor('H')
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]
  .apply(corr_metrics)


Unnamed: 0,wallet_id,corr_vol_price_change,corr_vol_price_var,corr_vol_total
0,1,0.265787,0.974336,0.999349
1,3,0.004572,0.373802,0.570454
2,5,-0.048225,0.088345,0.446239
3,6,-0.099909,0.161523,0.418365
4,7,-0.055087,0.079607,0.476944


In [33]:
temporal_features = (
    inter_dir_features[["wallet_id", "inter_dir_q25", "inter_dir_q75"]]
    .merge(burst_features, on='wallet_id', how='outer')
    .merge(hourly_entropy_feature, on='wallet_id', how='outer')
    .merge(corr_features, on='wallet_id', how='outer')
)

In [35]:
temporal_features = temporal_features.merge(
    df_all.groupby("wallet_id")["size"].sum().reset_index().rename({"size": "volume"}, axis=1),
    on="wallet_id",
    how="inner",
).merge(
    df_all.groupby("wallet_id")["size"].count().reset_index().rename({"size": "count"}, axis=1),
    on="wallet_id",
    how="inner",
)

In [37]:
temporal_features

Unnamed: 0,wallet_id,inter_dir_q25,inter_dir_q75,burstiness,ci_low,ci_high,ci_diff,hourly_activity_entropy,corr_vol_price_change,corr_vol_price_var,corr_vol_total,volume,count
0,1,20645.635000,20645.635000,,,,,0.769738,0.265787,0.974336,0.999349,4800.000000,69
1,3,11.334127,923.333500,4.073870,2.599605,4.947999,2.348395,3.005802,0.004572,0.373802,0.570454,711997.875000,3937
2,5,2082.892340,17220.403000,8.781987,1.450178,9.492506,8.042327,3.111425,-0.048225,0.088345,0.446239,101469.781250,2156
3,6,2217.443355,14453.644126,2.196128,1.565141,2.693937,1.128796,3.115243,-0.099909,0.161523,0.418365,160148.187500,3275
4,7,785.130000,6752.205922,2.611751,1.747361,3.328240,1.580878,3.132419,-0.055087,0.079607,0.476944,216830.234375,5523
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10630,271895,5752.241000,44209.640000,1.152154,0.000000,1.200914,1.200914,1.583258,-0.385314,1.000000,0.867330,637.720032,12
10631,272337,75406.695000,75406.695000,,,,,0.693147,1.000000,,-1.000000,119.320007,2
10632,272404,286.550000,286.550000,,,,,0.000000,,,,1919.800049,2
10633,274311,7665.311000,7665.311000,,,,,0.693147,,,,5.420000,2


In [41]:
size_var_feature = df_all.groupby("wallet_id")["size"].var().reset_index()
size_var_feature

Unnamed: 0,wallet_id,size
0,1,81397.765625
1,3,74534.617188
2,5,2694.651123
3,6,3188.360840
4,7,2668.510742
...,...,...
10630,271895,354.489594
10631,272337,0.000000
10632,272404,0.000000
10633,274311,0.000000


## Entropy of trading volume across counterparties

In [46]:
import pandas as pd
import numpy as np

def compute_entropy(df_matched):
    # Ensure we only use relevant columns
    df = df_matched[['seller', 'buyer', 'size']].copy()

    # --- SELLER-SIDE ENTROPY ---
    seller_grouped = df.groupby(['seller', 'buyer'], as_index=False)['size'].sum()

    def entropy(series):
        p = series / series.sum()
        return -(p * np.log(p)).sum()

    seller_entropy = (
        seller_grouped.groupby('seller')['size']
        .apply(entropy)
        .reset_index(name='seller_entropy')
    )

    # --- BUYER-SIDE ENTROPY ---
    buyer_grouped = df.groupby(['buyer', 'seller'], as_index=False)['size'].sum()

    buyer_entropy = (
        buyer_grouped.groupby('buyer')['size']
        .apply(entropy)
        .reset_index(name='buyer_entropy')
    )

    # --- Combine both sides ---
    entropy_features = (
        pd.merge(seller_entropy, buyer_entropy, left_on='seller', right_on='buyer', how='outer')
        .rename(columns={'seller': 'wallet_id'})
    )

    entropy_features['wallet_id'] = entropy_features['wallet_id'].fillna(entropy_features['buyer'])
    entropy_features = entropy_features[['wallet_id', 'seller_entropy', 'buyer_entropy']]

    # Fill NaN for wallets that appear only on one side
    entropy_features = entropy_features.fillna(0)

    # Optionally: total entropy (combined effect)
    entropy_features['total_entropy'] = entropy_features['seller_entropy'] + entropy_features['buyer_entropy']

    return entropy_features


In [52]:
entropy_df = compute_entropy(df_matched)
entropy_df.head()

Unnamed: 0,wallet_id,seller_entropy,buyer_entropy,total_entropy
0,1.0,1.953551,2.678925,4.632475
1,3.0,4.319869,4.034076,8.353945
2,5.0,3.905228,3.976908,7.882136
3,6.0,3.53274,3.549585,7.082325
4,7.0,3.924761,3.937603,7.862364


In [53]:
entropy_df = entropy_df.merge(
    df_all.groupby("wallet_id")["size"].sum().reset_index().rename({"size": "volume"}, axis=1),
    on="wallet_id",
    how="inner",
).merge(
    df_all.groupby("wallet_id")["size"].count().reset_index().rename({"size": "count"}, axis=1),
    on="wallet_id",
    how="inner",
)

In [64]:
entropy_df[(entropy_df["count"] > 100) & (entropy_df["buyer_entropy"] < 1.5)].sort_values("buyer_entropy")

Unnamed: 0,wallet_id,seller_entropy,buyer_entropy,total_entropy,volume,count
2711,19569.0,2.511521,0.0,2.511521,8715.919922,517
2777,19893.0,2.370097,0.0,2.370097,950.820007,113
3789,27508.0,2.312943,-0.0,2.312943,376.160004,117
7143,56099.0,2.060997,0.0,2.060997,5063.0,102
9501,168874.0,2.824775,0.0,2.824775,16221.400391,4056
6546,50851.0,2.632891,0.298086,2.930977,4290.839844,185
4182,30431.0,2.064757,0.631364,2.696121,1193.400024,103
3585,25815.0,2.049526,0.650901,2.700428,916.339966,124
567,2542.0,2.530237,0.681055,3.211292,250.0,124
4989,36895.0,2.900711,0.688633,3.589343,13303.549805,197


## Reciprocity counterparty ratio

In [19]:
import pandas as pd
import numpy as np

def compute_reciprocity_features_hourly(df_matched):
    df = df_matched[['seller', 'buyer', 'size', 'time']].copy()
    df['time'] = pd.to_datetime(df['time'])

    # Step 1: create 1-hour time window
    df['hour_window'] = df['time'].dt.floor('1H')

    # Step 2: aggregate volume per direction & hour
    flow = (
        df.groupby(['hour_window', 'seller', 'buyer'], as_index=False)['size']
        .sum()
        .rename(columns={'size': 'V_1to2'})
    )

    # Step 3: prepare reverse direction (buyer→seller, same hour)
    reverse_flow = flow.rename(columns={'seller': 'buyer', 'buyer': 'seller', 'V_1to2': 'V_2to1'})

    # Step 4: merge both directions for same time window
    merged = pd.merge(flow, reverse_flow, on=['hour_window', 'seller', 'buyer'], how='outer').fillna(0)

    # Step 5: compute ratio and features
    merged['R'] = merged[['V_1to2', 'V_2to1']].min(axis=1) / merged[['V_1to2', 'V_2to1']].max(axis=1)
    merged['R'] = merged['R'].fillna(0)

    merged['R_prime'] = merged['R'] * np.log(merged['V_1to2'] + merged['V_2to1'] + 1)
    merged['D'] = (merged['V_1to2'] - merged['V_2to1']) / (merged['V_1to2'] + merged['V_2to1'])
    merged['D'] = merged['D'].fillna(0)

    return merged[['hour_window', 'seller', 'buyer', 'V_1to2', 'V_2to1', 'R', 'R_prime', 'D']]

In [20]:
def wallet_reciprocal_intensity(hourly_df):
    df = hourly_df.copy()
    df['total_vol'] = df['V_1to2'] + df['V_2to1']
    # contribution of this pair-hour to reciprocity
    df['weighted_R'] = df['R_prime'] * df['total_vol']

    # sum over both roles
    sellers = (
        df.groupby('seller', as_index=False)[['weighted_R','total_vol']]
          .sum().rename(columns={'seller':'wallet'})
    )
    buyers = (
        df.groupby('buyer', as_index=False)[['weighted_R','total_vol']]
          .sum().rename(columns={'buyer':'wallet'})
    )

    combined = pd.concat([sellers, buyers]).groupby('wallet', as_index=False).sum()
    combined['RVI'] = combined['weighted_R'] / combined['total_vol']
    return combined[['wallet','RVI']]


In [21]:
def wallet_directional_asymmetry(hourly_df):
    df = hourly_df.copy()
    df['total_vol'] = df['V_1to2'] + df['V_2to1']
    df['weighted_absD'] = np.abs(df['D']) * df['total_vol']

    sellers = (
        df.groupby('seller', as_index=False)[['weighted_absD','total_vol']]
          .sum().rename(columns={'seller':'wallet'})
    )
    buyers = (
        df.groupby('buyer', as_index=False)[['weighted_absD','total_vol']]
          .sum().rename(columns={'buyer':'wallet'})
    )

    combined = pd.concat([sellers, buyers]).groupby('wallet', as_index=False).sum()
    combined['DA'] = 1 - (combined['weighted_absD'] / combined['total_vol'])
    return combined[['wallet','DA']]


In [27]:
hourly_df = compute_reciprocity_features_hourly(df_matched)
rvi = wallet_reciprocal_intensity(hourly_df)
da  = wallet_directional_asymmetry(hourly_df)
wallet_features = pd.merge(rvi, da, on='wallet', how='outer')

  df['hour_window'] = df['time'].dt.floor('1H')


In [35]:
wallet_features = wallet_features.rename({"wallet": "wallet_id"}, axis=1).merge(
    df_all.groupby("wallet_id")["size"].sum().reset_index().rename({"size": "volume"}, axis=1),
    on="wallet_id",
    how="inner",
).merge(
    df_all.groupby("wallet_id")["size"].count().reset_index().rename({"size": "count"}, axis=1),
    on="wallet_id",
    how="inner",
)

In [44]:
wallet_features[(wallet_features["count"] > 100) & (wallet_features["volume"] > 100000)].sort_values("DA").head(50)

Unnamed: 0,wallet_id,RVI,DA,volume,count
38,82,0.0,0.0,163561.4,12813
33,64,0.0,0.0,300585.6,6780
970,5742,0.0,0.0,116914.8,495
1205,7559,0.0,0.0,639877.4,5918
1825,12367,0.0,0.0,382824.7,583
2088,14468,0.0,0.0,301350.5,31148
1409,9356,0.0,0.0,136451.3,2052
221,922,0.0,0.0,108093.7,207
916,5447,0.0,0.0,679592.2,244
3073,21464,0.0,0.0,229632.4,1199
