In [2]:
import pandas as pd
import numpy as np

def detect_twap_trades(df, size_tol=0.05, timediff_tol=0.20, min_trades=3):
    df = df.copy()
    df['time'] = pd.to_datetime(df['time'])

    # Sort by wallet → side → time
    df = df.sort_values(['wallet_id', 'is_ask', 'time'])

    twap_indices = set()

    for (wallet, side), group in df.groupby(['wallet_id', 'is_ask']):
        if len(group) < min_trades:
            continue

        g = group.copy()
        g['time_diff'] = g['time'].diff().dt.total_seconds()
        g['size_ratio'] = g['size'] / g['size'].shift(1)

        # Conditions
        cond_size = g['size_ratio'].between(1 - size_tol, 1 + size_tol)

        median_dt = g['time_diff'].median()
        if np.isnan(median_dt) or median_dt == 0:
            continue

        cond_time = g['time_diff'].between(
            median_dt * (1 - timediff_tol),
            median_dt * (1 + timediff_tol)
        )

        twap_mask = (cond_size & cond_time).fillna(False)

        # Detect consecutive runs
        run_start = None

        for idx, val in twap_mask.items():   # ← FIXED HERE
            if val and run_start is None:
                run_start = idx
            if (not val) and run_start is not None:
                run_end = idx
                run = twap_mask.loc[run_start:run_end]
                if run.sum() >= min_trades:
                    twap_indices.update(run.index)
                run_start = None

        # If run goes until the end
        if run_start is not None:
            run = twap_mask.loc[run_start:]
            if run.sum() >= min_trades:
                twap_indices.update(run.index)

    return df.loc[df.index.isin(twap_indices)].copy()


In [3]:
from src.data_handler import CoinDataStore

store = CoinDataStore("AVAX", engine="fastparquet")
df_all_matched = store.load_all()
df_all_matched

Unnamed: 0,price,size,time,seller,buyer
0,19.521999,213.199997,2025-03-22 10:50:22.930512310,8,658
1,19.521999,1.000000,2025-03-22 10:50:23.731759790,19,658
2,19.525999,11.820000,2025-03-22 10:50:43.490273671,161,1260
3,19.525999,39.150002,2025-03-22 10:50:43.490273671,161,19500
4,19.525000,8.800000,2025-03-22 10:50:43.490273671,161,240
...,...,...,...,...,...
1478242,24.966999,1.210000,2025-07-27 08:44:33.762000000,8,1527
1478243,24.971001,308.359985,2025-07-27 08:44:59.437000000,161,5135
1478244,24.971001,203.639999,2025-07-27 08:44:59.437000000,892,5135
1478245,24.971001,11.940000,2025-07-27 08:45:07.130000000,19,170117


In [4]:
df_all_matched["time"] = df_all_matched["time"].dt.floor("ms")

sells = df_all_matched[["seller", "time", "price", "size"]].rename(columns={"seller": "wallet_id"}).copy()
sells["is_ask"] = True

buys = df_all_matched[["buyer", "time", "price", "size"]].rename(columns={"buyer": "wallet_id"}).copy()
buys["is_ask"] = False

df_all = pd.concat([sells, buys], ignore_index=True)

df_all["notional"] = df_all["price"] * df_all["size"]
df_all = df_all.groupby(["wallet_id", "time", "is_ask"]).agg(size=("size", "sum"), notional=("notional", "sum")).reset_index()
df_all["price"] = (df_all["notional"] / df_all["size"]).astype("float32")
df_all = df_all.drop("notional", axis=1)
df_all

Unnamed: 0,wallet_id,time,is_ask,size,price
0,1,2025-06-30 19:16:16.060,False,47.049999,18.014999
1,1,2025-06-30 19:16:46.011,False,47.049999,18.025000
2,1,2025-06-30 19:17:17.005,False,47.059998,18.030884
3,1,2025-06-30 19:17:46.024,False,47.049999,18.018999
4,1,2025-06-30 19:18:16.077,False,47.059998,18.011999
...,...,...,...,...,...
2265245,272404,2025-06-23 16:40:53.219,True,959.900024,16.377071
2265246,272404,2025-06-23 16:45:39.769,False,959.900024,16.470161
2265247,274311,2025-06-15 15:19:18.194,True,2.710000,19.183001
2265248,274311,2025-06-15 17:27:03.505,False,2.710000,19.049000


In [5]:
df_twap = detect_twap_trades(df_all)
df_twap["twap"] = True

print(df_twap.head())
print("Detected TWAP trades:", len(df_twap))

   wallet_id                    time  is_ask       size      price  twap
1          1 2025-06-30 19:16:46.011   False  47.049999  18.025000  True
2          1 2025-06-30 19:17:17.005   False  47.059998  18.030884  True
3          1 2025-06-30 19:17:46.024   False  47.049999  18.018999  True
4          1 2025-06-30 19:18:16.077   False  47.059998  18.011999  True
5          1 2025-06-30 19:18:47.056   False  47.049999  18.016001  True
Detected TWAP trades: 129666


In [6]:
129659 / 2265250

0.057238273921200754

In [7]:
df_matched_without_twap = pd.merge(
    df_all_matched,
    df_twap,
    on="time",
    how="outer"
)
df_matched_without_twap = df_matched_without_twap[df_matched_without_twap["twap"].isna()]
df_matched_without_twap = df_matched_without_twap.drop(["wallet_id", "is_ask", "size_y", "price_y", "twap"], axis=1).rename({"price_x": "price", "size_x": "size"}, axis=1)
df_matched_without_twap

Unnamed: 0,price,size,time,seller,buyer
0,19.521999,213.199997,2025-03-22 10:50:22.930,8,658
1,19.521999,1.000000,2025-03-22 10:50:23.731,19,658
2,19.525999,11.820000,2025-03-22 10:50:43.490,161,1260
3,19.525999,39.150002,2025-03-22 10:50:43.490,161,19500
4,19.525000,8.800000,2025-03-22 10:50:43.490,161,240
...,...,...,...,...,...
1480939,24.966999,1.210000,2025-07-27 08:44:33.762,8,1527
1480940,24.971001,308.359985,2025-07-27 08:44:59.437,161,5135
1480941,24.971001,203.639999,2025-07-27 08:44:59.437,892,5135
1480942,24.971001,11.940000,2025-07-27 08:45:07.130,19,170117


### Seller Buyer randomization for TWAP

In [8]:
from src.simple_wash_detector_utils import detect_wash_trades_local

time_diff_s = 1 * 60 # 1 minute difference
price_diff_pct = 0.01
size_diff_pct = 0.01

df_detected = detect_wash_trades_local(df_matched_without_twap, time_diff_s, price_diff_pct, size_diff_pct, None)
df_detected

Unnamed: 0,wallet_id,day,open_time,close_time,duration_s,open_side,close_side,open_price,close_price,price_change_pct,open_size,close_size,size_change_pct,pairing_direction,pair_id
0,8,2025-03-22,2025-03-22 13:48:26.900,2025-03-22 13:49:26.300,59.40,buy,sell,19.362000,19.357000,0.000258,0.790000,0.790000,0.000000,buy_to_sell,0
1,8,2025-03-22,2025-03-22 20:10:04.500,2025-03-22 20:10:51.100,46.60,sell,buy,19.652000,19.658001,0.000305,6.960000,6.950000,0.001437,sell_to_buy,1
2,8,2025-03-22,2025-03-22 21:06:57.250,2025-03-22 21:06:57.350,0.10,buy,sell,19.582001,19.582001,0.000000,9.430000,9.410000,0.002121,buy_to_sell,2
3,19,2025-03-22,2025-03-22 21:37:53.000,2025-03-22 21:37:53.000,0.00,buy,sell,19.514997,19.514000,0.000051,57.939999,57.939999,0.000000,buy_to_sell,3
4,88,2025-03-22,2025-03-22 15:08:08.100,2025-03-22 15:08:20.850,12.75,sell,buy,19.379000,19.379999,0.000052,5.160000,5.160000,0.000000,sell_to_buy,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19389,15857,2025-07-27,2025-07-27 00:00:24.500,2025-07-27 00:00:31.900,7.40,buy,sell,24.979000,24.961002,0.000721,0.690000,0.690000,0.000000,buy_to_sell,19389
19390,15857,2025-07-27,2025-07-27 00:07:37.850,2025-07-27 00:08:31.300,53.45,buy,sell,24.947998,24.927002,0.000842,0.690000,0.690000,0.000000,buy_to_sell,19390
19391,22514,2025-07-27,2025-07-27 06:52:33.400,2025-07-27 06:53:23.600,50.20,sell,buy,25.155001,25.174999,0.000795,0.400000,0.400000,0.000000,sell_to_buy,19391
19392,22514,2025-07-27,2025-07-27 06:59:24.400,2025-07-27 07:00:06.550,42.15,sell,buy,25.200003,25.194000,0.000238,0.400000,0.400000,0.000000,sell_to_buy,19392


In [9]:
h0_volume = (df_detected["open_size"] + df_detected["close_size"]).sum()
h0_count = df_detected["open_size"].shape[0]
print(h0_volume, "\n", h0_count)

2059179.8 
 19394


In [10]:
n_sim_seller_buyer = []

def block_and_side_permutation_optimized(df):
    rng = np.random.default_rng()
    df_perm = df.copy()

    def permute_column(x):
        return rng.permutation(x.values)

    df_perm["wallet_id"] = df_perm.groupby(["side"], sort=False)["wallet_id"].transform(permute_column)

    out = df_perm.sort_values("time")
    return out

In [11]:
print(f"number\t - volume share\t - count share")
for i in range(2):
    sim_time_detected = detect_wash_trades_local(df_matched_without_twap, time_diff_s, price_diff_pct, size_diff_pct, block_and_side_permutation_optimized)

    sim_time_detected_share_volume = (sim_time_detected["open_size"] + sim_time_detected["close_size"]).sum() / h0_volume
    sim_time_detected_share_count = sim_time_detected["open_size"].shape[0] / h0_count
    n_sim_seller_buyer.append(sim_time_detected)
    print(f"{i}\t\t - {sim_time_detected_share_volume:.{3}f}\t\t - {sim_time_detected_share_count:.{3}f}")

number	 - volume share	 - count share
0		 - 0.885		 - 1.736
1		 - 0.892		 - 1.735


### Size permutations

In [12]:
n_sim_perm_size = []

def size_permutation_optimized(df):
    rng = np.random.default_rng()
    df_perm = df.copy()

    df_perm["wallet_id"] = rng.permutation(df_perm["wallet_id"].values)

    out = df_perm.sort_values("time")
    return out

In [13]:
print(f"number\t - volume share\t - count share")
for i in range(2):
    sim_time_detected = detect_wash_trades_local(df_matched_without_twap, time_diff_s, price_diff_pct, size_diff_pct, size_permutation_optimized)

    sim_time_detected_share_volume = (sim_time_detected["open_size"] + sim_time_detected["close_size"]).sum() / h0_volume
    sim_time_detected_share_count = sim_time_detected["open_size"].shape[0] / h0_count
    n_sim_perm_size.append(sim_time_detected)
    print(f"{i}\t\t - {sim_time_detected_share_volume:.{3}f}\t\t - {sim_time_detected_share_count:.{3}f}")

number	 - volume share	 - count share
0		 - 1.026		 - 2.086
1		 - 0.990		 - 2.059


# Size > 2

In [14]:
df_small_sizes = df_all_matched.groupby(["time"])["size"].sum().reset_index()
df_small_sizes = df_small_sizes[df_small_sizes["size"] < 2]
df_small_sizes

Unnamed: 0,time,size
1,2025-03-22 10:50:23.731,1.00
6,2025-03-22 10:51:26.271,0.79
7,2025-03-22 10:52:22.010,1.86
8,2025-03-22 10:52:30.055,0.91
9,2025-03-22 10:52:39.523,0.96
...,...,...
905727,2025-07-27 08:42:15.276,1.01
905731,2025-07-27 08:43:18.559,1.00
905732,2025-07-27 08:44:11.704,0.67
905733,2025-07-27 08:44:33.762,1.21


In [15]:
df_all_matched_big_sizes = pd.merge(
    df_all_matched,
    df_small_sizes,
    on="time",
    how="outer"
)
df_all_matched_big_sizes = df_all_matched_big_sizes[df_all_matched_big_sizes["size_y"].isna()]
df_all_matched_big_sizes = df_all_matched_big_sizes.drop(["size_y"], axis=1).rename({"size_x": "size"}, axis=1)
df_all_matched_big_sizes

Unnamed: 0,price,size,time,seller,buyer
0,19.521999,213.199997,2025-03-22 10:50:22.930,8,658
2,19.525999,11.820000,2025-03-22 10:50:43.490,161,1260
3,19.525999,39.150002,2025-03-22 10:50:43.490,161,19500
4,19.525000,8.800000,2025-03-22 10:50:43.490,161,240
5,19.527000,62.029999,2025-03-22 10:50:44.892,161,8
...,...,...,...,...,...
1478238,24.947001,23.629999,2025-07-27 08:42:56.533,892,170117
1478239,24.945000,2.070000,2025-07-27 08:43:07.141,55206,1527
1478243,24.971001,308.359985,2025-07-27 08:44:59.437,161,5135
1478244,24.971001,203.639999,2025-07-27 08:44:59.437,892,5135


In [16]:
from src.simple_wash_detector_utils import detect_wash_trades_local

time_diff_s = 1 * 60
price_diff_pct = 0.01
size_diff_pct = 0.01

df_detected = detect_wash_trades_local(df_all_matched_big_sizes, time_diff_s, price_diff_pct, size_diff_pct, None)
df_detected

Unnamed: 0,wallet_id,day,open_time,close_time,duration_s,open_side,close_side,open_price,close_price,price_change_pct,open_size,close_size,size_change_pct,pairing_direction,pair_id
0,8,2025-03-22,2025-03-22 20:10:04.500,2025-03-22 20:10:51.100,46.60,sell,buy,19.652000,19.658001,3.053388e-04,6.960000,6.950000,0.001437,sell_to_buy,0
1,8,2025-03-22,2025-03-22 21:06:57.250,2025-03-22 21:06:57.350,0.10,buy,sell,19.582001,19.582001,0.000000e+00,9.430000,9.410000,0.002121,buy_to_sell,1
2,19,2025-03-22,2025-03-22 21:37:53.000,2025-03-22 21:37:53.000,0.00,buy,sell,19.514997,19.514000,5.111675e-05,57.939999,57.939999,0.000000,buy_to_sell,2
3,88,2025-03-22,2025-03-22 13:55:41.650,2025-03-22 13:56:02.100,20.45,sell,buy,19.405001,19.408001,1.546127e-04,3.940000,3.940000,0.000000,sell_to_buy,3
4,88,2025-03-22,2025-03-22 15:08:08.100,2025-03-22 15:08:20.850,12.75,sell,buy,19.379000,19.379999,5.157390e-05,5.160000,5.160000,0.000000,sell_to_buy,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15602,1527,2025-07-27,2025-07-27 01:07:18.100,2025-07-27 01:08:05.050,46.95,buy,sell,25.037998,25.038000,7.617816e-08,10.700000,10.740000,0.003738,buy_to_sell,15602
15603,5165,2025-07-27,2025-07-27 00:49:41.100,2025-07-27 00:50:33.700,52.60,buy,sell,24.882000,24.908001,1.044971e-03,20.059999,20.100000,0.001994,buy_to_sell,15603
15604,5165,2025-07-27,2025-07-27 05:06:07.850,2025-07-27 05:06:32.650,24.80,sell,buy,25.040001,25.024000,6.390075e-04,19.990000,19.969999,0.001001,sell_to_buy,15604
15605,7409,2025-07-27,2025-07-27 02:04:26.300,2025-07-27 02:04:45.000,18.70,buy,sell,24.916000,24.912001,1.605278e-04,2.930000,2.930000,0.000000,buy_to_sell,15605


In [17]:
h0_volume = (df_detected["open_size"] + df_detected["close_size"]).sum()
h0_count = df_detected["open_size"].shape[0]
print(h0_volume, "\n", h0_count)

2199741.0 
 15607


In [18]:
n_sim_seller_buyer = []

def block_and_side_permutation_optimized(df):
    rng = np.random.default_rng()
    df_perm = df.copy()

    def permute_column(x):
        return rng.permutation(x.values)

    df_perm["wallet_id"] = df_perm.groupby(["side"], sort=False)["wallet_id"].transform(permute_column)

    out = df_perm.sort_values("time")
    return out

In [19]:
print(f"number\t - volume share\t - count share")
for i in range(2):
    sim_time_detected = detect_wash_trades_local(df_all_matched_big_sizes, time_diff_s, price_diff_pct, size_diff_pct, block_and_side_permutation_optimized)

    sim_time_detected_share_volume = (sim_time_detected["open_size"] + sim_time_detected["close_size"]).sum() / h0_volume
    sim_time_detected_share_count = sim_time_detected["open_size"].shape[0] / h0_count
    n_sim_seller_buyer.append(sim_time_detected)
    print(f"{i}\t\t - {sim_time_detected_share_volume:.{3}f}\t\t - {sim_time_detected_share_count:.{3}f}")

number	 - volume share	 - count share
0		 - 0.800		 - 1.703
1		 - 0.809		 - 1.697


# Size > 2 AND no TWAP

In [109]:
df_matched_big_sizes_no_twap = pd.merge(
    df_matched_without_twap,
    df_small_sizes,
    on="time",
    how="outer"
)
df_matched_big_sizes_no_twap = df_matched_big_sizes_no_twap[df_matched_big_sizes_no_twap["size_y"].isna()]
df_matched_big_sizes_no_twap = df_matched_big_sizes_no_twap.drop(["size_y"], axis=1).rename({"size_x": "size"}, axis=1)
df_matched_big_sizes_no_twap

Unnamed: 0,price,size,time,seller,buyer
0,19.521999,213.199997,2025-03-22 10:50:22.930,8.0,658.0
2,19.525999,11.820000,2025-03-22 10:50:43.490,161.0,1260.0
3,19.525999,39.150002,2025-03-22 10:50:43.490,161.0,19500.0
4,19.525000,8.800000,2025-03-22 10:50:43.490,161.0,240.0
5,19.527000,62.029999,2025-03-22 10:50:44.892,161.0,8.0
...,...,...,...,...,...
1377870,24.947001,23.629999,2025-07-27 08:42:56.533,892.0,170117.0
1377871,24.945000,2.070000,2025-07-27 08:43:07.141,55206.0,1527.0
1377875,24.971001,308.359985,2025-07-27 08:44:59.437,161.0,5135.0
1377876,24.971001,203.639999,2025-07-27 08:44:59.437,892.0,5135.0


In [110]:
from src.simple_wash_detector_utils import detect_wash_trades_local

time_diff_s = 1 * 60
price_diff_pct = 0.01
size_diff_pct = 0.01

df_detected = detect_wash_trades_local(df_matched_big_sizes_no_twap, time_diff_s, price_diff_pct, size_diff_pct, None)
df_detected

Unnamed: 0,wallet_id,day,open_time,close_time,duration_s,open_side,close_side,open_price,close_price,price_change_pct,open_size,close_size,size_change_pct,pairing_direction,pair_id
0,8,2025-03-22,2025-03-22 20:10:04.500,2025-03-22 20:10:51.100,46.60,sell,buy,19.652000,19.658001,3.053388e-04,6.960000,6.950000,0.001437,sell_to_buy,0
1,8,2025-03-22,2025-03-22 21:06:57.250,2025-03-22 21:06:57.350,0.10,buy,sell,19.582001,19.582001,0.000000e+00,9.430000,9.410000,0.002121,buy_to_sell,1
2,19,2025-03-22,2025-03-22 21:37:53.000,2025-03-22 21:37:53.000,0.00,buy,sell,19.514997,19.514000,5.111675e-05,57.939999,57.939999,0.000000,buy_to_sell,2
3,88,2025-03-22,2025-03-22 15:08:08.100,2025-03-22 15:08:20.850,12.75,sell,buy,19.379000,19.379999,5.157390e-05,5.160000,5.160000,0.000000,sell_to_buy,3
4,88,2025-03-22,2025-03-22 20:08:40.500,2025-03-22 20:09:36.050,55.55,buy,sell,19.653000,19.645000,4.070330e-04,5.090000,5.090000,0.000000,buy_to_sell,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13880,1527,2025-07-27,2025-07-27 01:07:18.100,2025-07-27 01:08:05.050,46.95,buy,sell,25.037998,25.038000,7.617816e-08,10.700000,10.740000,0.003738,buy_to_sell,13880
13881,5165,2025-07-27,2025-07-27 00:49:41.100,2025-07-27 00:50:33.700,52.60,buy,sell,24.882000,24.908001,1.044971e-03,20.059999,20.100000,0.001994,buy_to_sell,13881
13882,5165,2025-07-27,2025-07-27 05:06:07.850,2025-07-27 05:06:32.650,24.80,sell,buy,25.040001,25.024000,6.390075e-04,19.990000,19.969999,0.001001,sell_to_buy,13882
13883,7409,2025-07-27,2025-07-27 02:04:26.300,2025-07-27 02:04:45.000,18.70,buy,sell,24.916000,24.912001,1.605278e-04,2.930000,2.930000,0.000000,buy_to_sell,13883


In [111]:
h0_volume = (df_detected["open_size"] + df_detected["close_size"]).sum()
h0_count = df_detected["open_size"].shape[0]
print(h0_volume, "\n", h0_count)

2082289.6 
 13885


In [112]:
n_sim_seller_buyer = []

def block_and_side_permutation_optimized(df):
    rng = np.random.default_rng()
    df_perm = df.copy()

    def permute_column(x):
        return rng.permutation(x.values)

    df_perm["wallet_id"] = df_perm.groupby(["side"], sort=False)["wallet_id"].transform(permute_column)

    out = df_perm.sort_values("time")
    return out

In [113]:
print(f"number\t - volume share\t - count share")
for i in range(2):
    sim_time_detected = detect_wash_trades_local(df_matched_big_sizes_no_twap, time_diff_s, price_diff_pct, size_diff_pct, block_and_side_permutation_optimized)

    sim_time_detected_share_volume = (sim_time_detected["open_size"] + sim_time_detected["close_size"]).sum() / h0_volume
    sim_time_detected_share_count = sim_time_detected["open_size"].shape[0] / h0_count
    n_sim_seller_buyer.append(sim_time_detected)
    print(f"{i}\t\t - {sim_time_detected_share_volume:.{3}f}\t\t - {sim_time_detected_share_count:.{3}f}")

number	 - volume share	 - count share
0		 - 0.790		 - 1.652
1		 - 0.793		 - 1.635
