In [5]:
# pip install networkx pandas numpy (if not already available)
from __future__ import annotations

from collections import Counter
from typing import Dict, Iterable, Tuple, Set, FrozenSet, List

import pandas as pd
import numpy as np
import networkx as nx


from src.data_handler import CoinDataStore


In [30]:
from dataclasses import dataclass
from collections import Counter
from typing import Dict, Iterable, List, Tuple, Set, FrozenSet, Optional

import numpy as np
import pandas as pd
import networkx as nx


# =========================
# 0) SAFETY / DTYPE HELPERS
# =========================

def _ensure_schema(df: pd.DataFrame) -> pd.DataFrame:
    """
    Make a safe copy with the expected dtypes:
      price: float64 (ok if originally float32)
      size:  float64
      time:  datetime64[ns]
      seller, buyer: int (Python ints)
    """
    df = df.copy()
    df["price"]  = df["price"].astype("float64")
    df["size"]   = df["size"].astype("float64")
    df["time"]   = pd.to_datetime(df["time"], utc=False)
    df["seller"] = df["seller"].astype("uint64").astype(int)
    df["buyer"]  = df["buyer"].astype("uint64").astype(int)
    return df


# ========================
# 1) ALGORITHM 1 (SCC peel)
# ========================

def build_weighted_digraph_from_trades(df: pd.DataFrame) -> nx.DiGraph:
    """
    One edge per (seller -> buyer) with 'weight' = number of rows (trades).
    This collapses partial fills into edge weight (count of parallel trades).
    """
    df = _ensure_schema(df)

    edge_counts = (
        df.groupby(["seller", "buyer"], as_index=False)
          .size()
          .rename(columns={"size": "weight"})
    )

    G = nx.DiGraph()
    for row in edge_counts.itertuples(index=False):
        G.add_edge(int(row.seller), int(row.buyer), weight=int(row.weight))
    return G


def iterative_scc_peeling(
    G_in: nx.DiGraph,
    *,
    min_scc_size: int = 2,
    count_singletons_with_selfloop: bool = True,
) -> Counter[Frozenset[int]]:
    """
    Iteratively:
      - find SCCs (record their vertex sets)
      - decrement every edge weight by 1; remove zeros
    Stop when no edges remain. Return a Counter of {frozenset(addresses): occurrences}.
    """
    G = G_in.copy(as_view=False)

    # normalize weights
    for _, _, data in G.edges(data=True):
        w = int(data.get("weight", 1))
        if w <= 0:
            raise ValueError("Edge weight must be positive.")
        data["weight"] = w

    scc_counter: Counter[Frozenset[int]] = Counter()

    def should_count_scc(scc: Set[int]) -> bool:
        if len(scc) >= max(2, min_scc_size):
            return True
        if len(scc) == 1 and count_singletons_with_selfloop:
            n = next(iter(scc))
            return G.has_edge(n, n)
        return False

    while G.number_of_edges() > 0:
        # 1) SCCs on current (unweighted) structure
        sccs: List[Set[int]] = list(nx.strongly_connected_components(G))
        # 2) record vertex sets
        for scc in sccs:
            if should_count_scc(scc):
                scc_counter[frozenset(scc)] += 1
        # 3) peel one layer
        to_remove = []
        for u, v, data in G.edges(data=True):
            data["weight"] -= 1
            if data["weight"] <= 0:
                to_remove.append((u, v))
        G.remove_edges_from(to_remove)

    return scc_counter


def algorithm1_for_token_df(
    df_token: pd.DataFrame,
    *,
    min_scc_size: int = 2,
    count_singletons_with_selfloop: bool = True,
    candidate_threshold: int = 100,
) -> Tuple[Counter[Frozenset[int]], List[Tuple[Frozenset[int], int]]]:
    """
    Run Algorithm 1 on a single-token DataFrame.
    Returns:
      - scc_counter: frozenset(addresses) -> occurrence count
      - candidates:  list[(frozenset(addresses), count)] with count >= candidate_threshold (sorted desc)
    """
    G = build_weighted_digraph_from_trades(df_token)
    scc_counter = iterative_scc_peeling(
        G,
        min_scc_size=min_scc_size,
        count_singletons_with_selfloop=count_singletons_with_selfloop,
    )
    candidates = [(S, c) for S, c in scc_counter.items() if c >= candidate_threshold]
    candidates.sort(key=lambda x: x[1], reverse=True)
    return scc_counter, candidates


# ==========================
# 2) POSITION SUMS (HELPERS)
# ==========================

@dataclass
class PositionResult:
    positions: pd.Series          # index: account id, values: net token position
    v_mean: float                 # mean trade size in the set
    v_tol: float                  # v = m * v_mean
    max_abs_pos: float            # max_i |p_i|
    is_wash: bool                 # True if max_abs_pos <= v_tol
    n_trades: int


def compute_positions(df_window: pd.DataFrame) -> pd.Series:
    """
    Per-account token positions for a set of trades:
      buyer +size, seller -size.
    """
    if df_window.empty:
        return pd.Series(dtype="float64")

    df_window = _ensure_schema(df_window)

    pos_buy  = df_window.groupby("buyer")["size"].sum()
    pos_sell = (-df_window["size"]).groupby(df_window["seller"]).sum()

    positions = pos_buy.add(pos_sell, fill_value=0.0).astype("float64")
    return positions


def wash_test(df_window: pd.DataFrame, m: float = 0.01) -> PositionResult:
    """
    Return whether the set of trades is a 'wash result':
      max_i |position_i| <= m * mean(size)
    """
    n = len(df_window)
    if n == 0:
        return PositionResult(
            positions=pd.Series(dtype="float64"),
            v_mean=0.0, v_tol=0.0, max_abs_pos=np.inf, is_wash=False, n_trades=0
        )
    if n == 1:
        # Need >=2 trades for a meaningful loop
        positions = compute_positions(df_window)
        v_mean = float(df_window["size"].mean())
        v_tol = m * v_mean
        max_abs_pos = float(np.abs(positions).max()) if len(positions) else 0.0
        return PositionResult(positions, v_mean, v_tol, max_abs_pos, False, n)

    v_mean = float(df_window["size"].mean())
    v_tol = m * v_mean
    positions = compute_positions(df_window)
    max_abs_pos = float(np.abs(positions).max()) if len(positions) else 0.0
    is_wash = (max_abs_pos <= v_tol)
    return PositionResult(positions, v_mean, v_tol, max_abs_pos, is_wash, n)


# =====================================
# 3) ALGORITHM 2 (WINDOWED PREFIX MATCH)
# =====================================

def algorithm2_volume_matching(
    df_token: pd.DataFrame,
    candidate_wallets: Iterable[int],
    *,
    m: float = 0.01,
    windows: Tuple[str, ...] = ("1H", "1D", "7D"),
    min_trades_in_set: int = 2,
    week_floor: Optional[str] = None,   # e.g. "W-MON"
    id_prefix: Optional[str] = None,
) -> pd.DataFrame:
    """
    Label concrete wash trades among the candidate wallet set S within 1H -> 1D -> 7D tumbling windows.
    - Trades labeled in a smaller window are not reconsidered in larger windows.
    - Inside a window, scan prefixes (longest first), accept those that pass wash_test, and continue.
    Returns a copy of df_token with label columns filled.
    """
    df = _ensure_schema(df_token)

    # Only trades inside the candidate set
    S: Set[int] = set(int(x) for x in candidate_wallets)
    df_S = df.loc[df["seller"].isin(S) & df["buyer"].isin(S)].sort_values("time").copy()

    # Prepare label columns for the whole df (others remain False/NaN)
    df["is_wash"] = False
    for col, dtype in {
        "wash_id": "object",
        "wash_window": "object",
        "wash_window_start": "datetime64[ns]",
        "wash_window_end":   "datetime64[ns]",
        "wash_n_trades": "float64",
        "wash_m": "float64",
        "wash_v_mean": "float64",
        "wash_v_tol": "float64",
        "wash_max_abs_pos": "float64",
    }.items():
        df[col] = pd.Series(index=df.index, dtype=dtype)

    if df_S.empty:
        return df

    labeled_idx: Set[int] = set()
    wash_counter = 0

    def label_trades(
        idxs: List[int],
        window_tag: str,
        w_start: pd.Timestamp,
        w_end: pd.Timestamp,
        res: PositionResult,
    ):
        nonlocal wash_counter
        wash_counter += 1
        wid = f"{id_prefix or ''}{wash_counter}" if id_prefix else str(wash_counter)

        df.loc[idxs, "is_wash"] = True
        df.loc[idxs, "wash_id"] = wid
        df.loc[idxs, "wash_window"] = window_tag
        df.loc[idxs, "wash_window_start"] = w_start
        df.loc[idxs, "wash_window_end"] = w_end
        df.loc[idxs, "wash_n_trades"] = res.n_trades
        df.loc[idxs, "wash_m"] = m
        df.loc[idxs, "wash_v_mean"] = res.v_mean
        df.loc[idxs, "wash_v_tol"] = res.v_tol
        df.loc[idxs, "wash_max_abs_pos"] = res.max_abs_pos
        labeled_idx.update(idxs)

    for win in windows:
        print(f"Window {win} is in process")
        # build tumbling window key
        if week_floor and (win.upper().startswith("W") or win.upper() in {"1W", "7D"}):
            wkey = df_S["time"].dt.to_period(week_floor).dt.start_time
        else:
            wkey = df_S["time"].dt.floor(win)

        df_S = df_S.assign(_wkey=wkey)

        for w, g in df_S.groupby("_wkey", sort=True):
            # consider only unlabeled rows (by original indices in df)
            idxs_all = [int(i) for i in g.index if i not in labeled_idx]
            if len(idxs_all) < min_trades_in_set:
                continue

            # We can find multiple disjoint wash prefixes in the same window
            remaining = idxs_all.copy()
            while len(remaining) >= min_trades_in_set:
                found_any = False

                # Try longest prefix first, then shorten
                for k in range(len(remaining), min_trades_in_set - 1, -1):
                    prefix_idxs = remaining[:k]
                    df_prefix = df.loc[prefix_idxs, ["price","size","time","seller","buyer"]].sort_values("time")
                    res = wash_test(df_prefix, m=m)
                    if res.is_wash:
                        w_start = pd.to_datetime(w)
                        # rough end = start + window; special-case 7D/1W
                        if isinstance(win, str) and win[0].isdigit():
                            w_end = w_start + pd.to_timedelta(win)
                        elif win.upper() in {"1W", "7D"}:
                            w_end = w_start + pd.Timedelta(days=7)
                        else:
                            w_end = w_start + pd.Timedelta(days=7)

                        label_trades(prefix_idxs, win, w_start, w_end, res)
                        remaining = remaining[k:]  # continue after the labeled block
                        found_any = True
                        break

                if not found_any:
                    break

        # cleanup helper
        df_S = df_S.drop(columns=["_wkey"])

    return df


# ======================
# 4) PIPELINE RUNNER(S)
# ======================

def run_full_pipeline_for_token(
    df_token: pd.DataFrame,
    *,
    # Alg. 1 knobs
    min_scc_size: int = 2,
    count_singletons_with_selfloop: bool = True,
    candidate_threshold: int = 100,
    # Alg. 2 knobs
    m: float = 0.01,
    windows: Tuple[str, ...] = ("1H", "1D", "7D"),
    min_trades_in_set: int = 2,
    week_floor: Optional[str] = None,
    # labeling id prefix (e.g., token name for readability)
    id_prefix: Optional[str] = None,
) -> Tuple[pd.DataFrame, List[Tuple[Frozenset[int], int]]]:
    """
    End-to-end for ONE TOKEN:
      - Algorithm 1 -> candidate SCC sets
      - Algorithm 2 -> label concrete wash trades within each candidate
    Returns:
      - labeled DataFrame (with columns is_wash, wash_id, diagnostics, ...)
      - the list of candidates (S, count) from Algorithm 1
    """
    df_token = _ensure_schema(df_token)

    # --- Algorithm 1
    scc_counter, candidates = algorithm1_for_token_df(
        df_token,
        min_scc_size=min_scc_size,
        count_singletons_with_selfloop=count_singletons_with_selfloop,
        candidate_threshold=candidate_threshold,
    )

    # --- Algorithm 2 (apply per candidate, never double-label)
    # start with unlabeled df; for each candidate, label its trades
    labeled_df = df_token.copy()
    # initialize label columns so .loc works even if no matches
    for col in ["is_wash","wash_id","wash_window","wash_window_start","wash_window_end",
                "wash_n_trades","wash_m","wash_v_mean","wash_v_tol","wash_max_abs_pos"]:
        if col not in labeled_df:
            labeled_df[col] = np.nan if col not in {"is_wash"} else False

    for S, _count in candidates:
        # run Algorithm 2 for this candidate set S
        df_S_labeled = algorithm2_volume_matching(
            labeled_df,
            candidate_wallets=S,
            m=m,
            windows=windows,
            min_trades_in_set=min_trades_in_set,
            week_floor=week_floor,
            id_prefix=id_prefix,
        )
        # merge new labels (logical OR for is_wash; overwrite diagnostics where newly labeled)
        newly = df_S_labeled["is_wash"] & ~labeled_df["is_wash"]
        for col in ["is_wash","wash_id","wash_window","wash_window_start","wash_window_end",
                    "wash_n_trades","wash_m","wash_v_mean","wash_v_tol","wash_max_abs_pos"]:
            labeled_df.loc[newly, col] = df_S_labeled.loc[newly, col]

    return labeled_df, candidates


def summarize_wash_stats(df_labeled: pd.DataFrame) -> pd.DataFrame:
    """
    Small helper to get a quick summary for a token:
      - number of wash sets
      - share of wash trades (count-based)
      - share of wash volume (token-size-based)
    """
    df = _ensure_schema(df_labeled)
    total_trades = len(df)
    total_vol = float(df["size"].sum())
    wash_mask = df["is_wash"].fillna(False).astype(bool)
    wash_trades = int(wash_mask.sum())
    wash_vol = float(df.loc[wash_mask, "size"].sum())

    # count distinct wash episodes
    n_sets = df.loc[wash_mask, "wash_id"].nunique()

    return pd.DataFrame(
        {
            "n_trades_total": [total_trades],
            "n_trades_wash": [wash_trades],
            "wash_trade_share_pct": [100.0 * wash_trades / total_trades if total_trades else 0.0],
            "volume_total": [total_vol],
            "volume_wash": [wash_vol],
            "wash_volume_share_pct": [100.0 * wash_vol / total_vol if total_vol else 0.0],
            "n_wash_sets": [int(n_sets)],
        }
    )


In [7]:
store = CoinDataStore("AVAX")
df_avax = store.load_days(store.list_days()[-10:])
df_avax

Unnamed: 0,price,size,time,seller,buyer
0,23.941000,6.890000,2025-07-18 00:00:06.890,86,19
1,23.990999,12.930000,2025-07-18 00:00:08.507,10169,480
2,24.000000,42.169998,2025-07-18 00:00:09.188,22,3
3,24.004000,16.770000,2025-07-18 00:00:09.188,8,3
4,23.990999,42.099998,2025-07-18 00:00:10.919,22,10200
...,...,...,...,...,...
147426,24.966999,1.210000,2025-07-27 08:44:33.762,8,1527
147427,24.971001,308.359985,2025-07-27 08:44:59.437,161,5135
147428,24.971001,203.639999,2025-07-27 08:44:59.437,892,5135
147429,24.971001,11.940000,2025-07-27 08:45:07.130,19,170117


In [8]:
# store = CoinDataStore("AVAX")
# df_avax = store.load_all()
# df_avax

In [9]:
df_avax.to_csv("AVAX_small.csv", index=False)

In [14]:
import pandas as pd

records = [
    ["A", "B"],
    ["B", "A"],
    ["A", "B"],
    ["B", "A"],
    ["A", "B"],
    ["B", "A"],
    ["A", "B"],
    ["B", "A"],
    ["A", "B"],
    ["B", "A"],

    ["B", "C"],
    ["C", "A"],

    ["D", "E"],
    ["E", "D"],
]

df_test = pd.DataFrame(records, columns=["seller", "buyer"])
df_test = df_test.reset_index().rename(columns={"index": "time"})
df_test["price"] = 10
df_test["size"] = 10
df_test

Unnamed: 0,time,seller,buyer,price,size
0,0,0,1,10,10
1,1,1,0,10,10
2,2,0,1,10,10
3,3,1,0,10,10
4,4,0,1,10,10
5,5,1,0,10,10
6,6,0,1,10,10
7,7,1,0,10,10
8,8,0,1,10,10
9,9,1,0,10,10


In [15]:
df_avax = df_test

In [24]:
#presettings
df_token = df_avax.copy()
min_scc_size: int = 2
count_singletons_with_selfloop = True
candidate_threshold = 30

In [25]:
# algorithm 1
df_token = _ensure_schema(df_token)

# --- Algorithm 1
scc_counter, candidates = algorithm1_for_token_df(
    df_token,
    min_scc_size=min_scc_size,
    count_singletons_with_selfloop=count_singletons_with_selfloop,
    candidate_threshold=candidate_threshold,
)

In [26]:
m=0.01                  # 1% tolerance for position sums
windows=("1h","1D","7D")     # hour -> day -> week passes
id_prefix="AVAX-"
min_trades_in_set: int = 2
week_floor=None

In [27]:
candidates

[(frozenset({0, 1}), 4)]

In [28]:
# --- Algorithm 2 (apply per candidate, never double-label)
# start with unlabeled df; for each candidate, label its trades
labeled_df = df_token.copy()
# initialize label columns so .loc works even if no matches
for col in ["is_wash","wash_id","wash_window","wash_window_start","wash_window_end",
            "wash_n_trades","wash_m","wash_v_mean","wash_v_tol","wash_max_abs_pos"]:
    if col not in labeled_df:
        labeled_df[col] = np.nan if col not in {"is_wash"} else False

for i, (S, _count) in enumerate(candidates):
    print(f"Processing {i+1} out of {len(candidates)} candidates")
    # run Algorithm 2 for this candidate set S
    df_S_labeled = algorithm2_volume_matching(
        labeled_df,
        candidate_wallets=S,
        m=m,
        windows=windows,
        min_trades_in_set=min_trades_in_set,
        week_floor=week_floor,
        id_prefix=id_prefix,
    )

    # merge new labels (logical OR for is_wash; overwrite diagnostics where newly labeled)
    newly = df_S_labeled["is_wash"] & ~labeled_df["is_wash"]
    for col in ["is_wash","wash_id","wash_window","wash_window_start","wash_window_end",
                "wash_n_trades","wash_m","wash_v_mean","wash_v_tol","wash_max_abs_pos"]:
        labeled_df.loc[newly, col] = df_S_labeled.loc[newly, col]

Processing 1 out of 1 candidates
Window 1h is in process
Window 1D is in process
Window 7D is in process


 'AVAX-1' 'AVAX-1']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  labeled_df.loc[newly, col] = df_S_labeled.loc[newly, col]
  labeled_df.loc[newly, col] = df_S_labeled.loc[newly, col]
['1970-01-01 00:00:00', '1970-01-01 00:00:00', '1970-01-01 00:00:00',
 '1970-01-01 00:00:00', '1970-01-01 00:00:00', '1970-01-01 00:00:00',
 '1970-01-01 00:00:00', '1970-01-01 00:00:00', '1970-01-01 00:00:00',
 '1970-01-01 00:00:00']
Length: 10, dtype: datetime64[ns]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  labeled_df.loc[newly, col] = df_S_labeled.loc[newly, col]
['1970-01-01 01:00:00', '1970-01-01 01:00:00', '1970-01-01 01:00:00',
 '1970-01-01 01:00:00', '1970-01-01 01:00:00', '1970-01-01 01:00:00',
 '1970-01-01 01:00:00', '1970-01-01 01:00:00', '1970-01-01 01:00:00',
 '1970-01-01 01:00:00']
Length: 10, dtype: datetime64[ns]' has dtype incompatible with float64, please explicitly cast to a compatible dtype f

In [29]:
summary = summarize_wash_stats(labeled_df)
print(candidates[:5])  # top candidate SCCs from Algorithm 1
print(summary)         # quick overview for the token
# labeled_df has per-trade flags/diagnostics: is_wash, wash_id, v_mean, v_tol, ...

[(frozenset({0, 1}), 4)]
   n_trades_total  n_trades_wash  wash_trade_share_pct  volume_total  \
0              14             10             71.428571         140.0   

   volume_wash  wash_volume_share_pct  n_wash_sets  
0        100.0              71.428571            1  


In [5]:
a = 1.1
import sys
sys.getsizeof(float(a))

24

In [10]:
# prepare_postmerge_csv.py
import pandas as pd
import numpy as np
import hashlib
from pathlib import Path

def make_txhash(row: pd.Series) -> str:
    # stable pseudo-hash (not a blockchain txid): hex string
    s = f"{int(row['timestamp'])}|{row['seller']}|{row['buyer']}|{row['size']:.12g}"
    return "tx-" + hashlib.sha1(s.encode("utf-8")).hexdigest()[:16]

def prepare_postmerge_csv(
    df: pd.DataFrame,
    out_csv: str,
    token_id: str,
    token_usd_const: float,
    eth_usd_const: float | None = None,
    price_is_token_in_eth: bool | None = None,
    ether_address: str = "0x0000000000000000000000000000000000000000",
) -> None:
    """
    Build the 'post-merge' table the R pipeline expects, using constant prices.

    Inputs
    ------
    df: DataFrame with columns:
        - price : float64   (optional; used only if price_is_token_in_eth=True)
        - size  : float64   (token amount)
        - time  : datetime64[ns] (tz-aware or tz-naive in UTC)
        - seller: int/str
        - buyer : int/str
    token_id:     identifier for the (non-ETH) token (symbol or address)
    token_usd_const: constant USD price for that token (applied to all rows)
    eth_usd_const:   optional constant ETH/USD (only needed if you also want ETH amounts)
    price_is_token_in_eth:
        - If True: use df['price'] as token-in-ETH.
        - If False/None: ignore df['price'] for ETH math; use eth_usd_const if provided.

    Output CSV columns (matches R 'merge_*' output shape):
        date, cut, blockNumber, timestamp, transactionHash,
        eth_buyer, eth_seller, ether, token,
        trade_amount_eth, trade_amount_dollar, trade_amount_token, token_price_in_eth
    """
    df = df.copy()

    # --- time handling (to UTC seconds) ---
    # ensure datetime dtype
    df["time"] = pd.to_datetime(df["time"], errors="coerce")
    if df["time"].dt.tz is None:
        # assume already UTC if tz-naive
        df["time"] = df["time"].dt.tz_localize("UTC")
    else:
        df["time"] = df["time"].dt.tz_convert("UTC")
    # integer seconds
    df["timestamp"] = (df["time"].view("int64") // 10**9).astype("int64")

    # start-of-day (UTC) seconds, used by the R code as 'cut'
    df["cut"] = (df["timestamp"] // 86_400) * 86_400

    # simple monotone blockNumber (only used for ordering in one place)
    df["blockNumber"] = df["timestamp"].astype("int64")

    # transactionHash: make a deterministic synthetic id per row
    df["transactionHash"] = df.apply(make_txhash, axis=1)

    # addresses as strings (R handles strings better than large ints)
    df["eth_buyer"] = df["seller"].astype(str)   # note: we set like this so that when ether=FALSE the R code flips back
    df["eth_seller"] = df["buyer"].astype(str)

    # constant token id & ether address
    df["token"] = str(token_id)
    df["ether"] = ether_address

    # trade amounts
    df["trade_amount_token"] = df["size"].astype(float)

    # token price in ETH (constant or from df['price'])
    token_price_in_eth = np.nan
    if price_is_token_in_eth is True and "price" in df.columns:
        # take from the dataframe (row-specific). If you want a constant, overwrite below.
        df["token_price_in_eth"] = df["price"].astype(float)
    elif (eth_usd_const is not None) and (token_usd_const is not None):
        # constant derived from USDs
        token_price_in_eth = float(token_usd_const) / float(eth_usd_const)
        df["token_price_in_eth"] = token_price_in_eth
    else:
        df["token_price_in_eth"] = np.nan  # optional field

    # trade_amount_eth (only useful if you plan to run ether=TRUE in R; harmless otherwise)
    if "token_price_in_eth" in df.columns and df["token_price_in_eth"].notna().any():
        df["trade_amount_eth"] = df["trade_amount_token"] * df["token_price_in_eth"]
    else:
        df["trade_amount_eth"] = 0.0

    # trade_amount_dollar using constant token USD price (your chosen quick method)
    df["trade_amount_dollar"] = df["trade_amount_token"] * float(token_usd_const)

    # nice 'date' column for summaries (UTC date)
    df["date"] = pd.to_datetime(df["cut"], unit="s", utc=True).dt.date

    # final column order (mirrors R)
    cols = [
        "date",
        "cut",
        "blockNumber",
        "timestamp",
        "transactionHash",
        "eth_buyer",
        "eth_seller",
        "ether",
        "token",
        "trade_amount_eth",
        "trade_amount_dollar",
        "trade_amount_token",
        "token_price_in_eth",
    ]
    out = df[cols].sort_values(["token", "blockNumber", "timestamp"]).reset_index(drop=True)

    # write CSV
    Path(out_csv).parent.mkdir(parents=True, exist_ok=True)
    out.to_csv(out_csv, index=False)
    print(f"Saved prepared post-merge trades to: {out_csv}")




prepare_postmerge_csv(
    df_avax,
    out_csv="prepared_trades.csv",
    token_id="AVAX",
    token_usd_const=30,     # constant token USD price (your quick method)
    eth_usd_const=3500.0,     # optional: to also compute token_price_in_eth & trade_amount_eth
    price_is_token_in_eth=False,  # set True only if you want to take per-row df['price'] as token-in-ETH
)


  df["timestamp"] = (df["time"].view("int64") // 10**9).astype("int64")


Saved prepared post-merge trades to: prepared_trades.csv


In [16]:
a = 1
import sys
import numpy as np

sys.getsizeof(np.int32(a))

28

In [11]:
a = 256
b = 256
a is b  # True (interned small int)

True

In [12]:
id(a)

9675024

In [13]:

a = 257
b = 257
a is b  # False (different objects)


False

In [14]:
id(a)

136198989375216