### ETH Whale Activity ML Pipeline

- Setup & Configuration

In [1]:
import os
import time
import pickle
import requests
import warnings
import json
from datetime import datetime, timedelta

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, confusion_matrix, make_scorer
)

try:
    from imblearn.over_sampling import SMOTE
    from imblearn.pipeline import Pipeline as ImbPipeline
    HAS_IMBLEARN = True
except ImportError:
    HAS_IMBLEARN = False
    print("‚ö†Ô∏è imbalanced-learn not installed")

try:
    from xgboost import XGBClassifier
    HAS_XGBOOST = True
except ImportError:
    HAS_XGBOOST = False
    print("‚ö†Ô∏è XGBoost not installed")

from dotenv import load_dotenv

warnings.filterwarnings('ignore')

‚ö†Ô∏è imbalanced-learn not installed


- Loading and Configuring Environmental Varriables

In [2]:
load_dotenv()
DUNE_API_KEY = os.getenv("DUNE_WHALES_API")
COINGECKO_API_KEY = os.getenv("COINGECKO_API_KEY")

PRICE_CACHE_DIR = "data/price_cache"
os.makedirs(PRICE_CACHE_DIR, exist_ok=True)

# Configuration
QUERIES = {
    "whales": ("6395391", "dune_whales_cache.json", "whale_ml_ready.csv"),
    "market_intent": ("6385600", "dune_intent_cache.json", "market_intent_ml_ready.csv")
}
REQUEST_DELAY = 0.5

- Data Collection - Fetch Whale Data from Dune

In [17]:
def fetch_dune_incremental(query_id, cache_file):
    
    headers = {"x-dune-api-key": DUNE_API_KEY}
    today = pd.Timestamp(datetime.utcnow().date())
    
    # Load cache
    df_cached, fetch_start = pd.DataFrame(), None
    if os.path.exists(cache_file):
        cache = json.load(open(cache_file))
        df_cached = pd.DataFrame(cache["data"])
        df_cached["block_date"] = pd.to_datetime(df_cached["block_date"])
        fetch_start = pd.to_datetime(cache["last_block_date"]) + timedelta(days=1)
        
        if fetch_start >= today:
            print(f"‚úÖ {cache_file} up-to-date")
            return df_cached
        print(f"üîÑ Fetching {fetch_start.date()} ‚Üí {today.date()}")
    else:
        print(f"üÜï Full fetch for {cache_file}")
    
    # Execute query
    payload = {"query_parameters": [{"name": "start_date", "type": "date", "value": fetch_start.strftime("%Y-%m-%d")}]} if fetch_start is not None else {}
    exec_id = requests.post(f"https://api.dune.com/api/v1/query/{query_id}/execute", 
                           headers=headers, json=payload).json()["execution_id"]
    
    # Poll until complete
    while True:
        status = requests.get(f"https://api.dune.com/api/v1/execution/{exec_id}/status", 
                            headers=headers).json()["state"]
        if status == "QUERY_STATE_COMPLETED": break
        if status == "QUERY_STATE_FAILED": raise RuntimeError("Query failed")
        time.sleep(10)
    
    # Get results
    df_new = pd.DataFrame(requests.get(f"https://api.dune.com/api/v1/execution/{exec_id}/results", 
                                       headers=headers).json()["result"]["rows"])
    
    if df_new.empty:
        return df_cached
    
    df_new["block_date"] = pd.to_datetime(df_new["block_date"])
    df_all = pd.concat([df_cached, df_new[df_new["block_date"] < today]], 
                      ignore_index=True).drop_duplicates("block_date", keep="last").sort_values("block_date")
    
    # Save cache
    json.dump({"last_block_date": df_all["block_date"].max().strftime("%Y-%m-%d"),
               "data": json.loads(df_all.to_json(orient="records", date_format="iso"))}, 
              open(cache_file, "w"))
    
    print(f"‚úÖ {cache_file} updated ({len(df_all)} rows)")
    return df_all

In [18]:
# Load Dune Data (Whales & Market Intent)

# %%
print("="*60)
print("LOADING DUNE DATA")
print("="*60)

datasets = {}
for name, (qid, cache, output) in QUERIES.items():
    print(f"\n{name.upper()}")
    print("-"*40)
    datasets[name] = fetch_dune_incremental(qid, cache)
    datasets[name].to_csv(output, index=False)
    print(f"üíæ Saved to {output}")
    time.sleep(0.5)

df_whales = datasets["whales"]
df_market_intent = datasets["market_intent"]

print(f"\n‚úÖ Dune Data Loaded:")
print(f"   ‚Ä¢ Whales: {len(df_whales)} rows")
print(f"   ‚Ä¢ Market Intent: {len(df_market_intent)} rows")

# Display sample
print(f"\nüìä Whales Sample:")
display(df_whales.head(3))

LOADING DUNE DATA

WHALES
----------------------------------------
‚úÖ dune_whales_cache.json up-to-date
üíæ Saved to whale_ml_ready.csv



MARKET_INTENT
----------------------------------------
‚úÖ dune_intent_cache.json up-to-date
üíæ Saved to market_intent_ml_ready.csv

‚úÖ Dune Data Loaded:
   ‚Ä¢ Whales: 1095 rows
   ‚Ä¢ Market Intent: 1095 rows

üìä Whales Sample:


Unnamed: 0,block_date,deposit_tx_count,deposit_withdrawal_ratio,exchange_volume_ratio,mega_whale_ratio,mega_whale_tx_count,mega_whale_volume_eth,net_flow_ma7,non_exchange_ratio,non_exchange_tx_count,non_exchange_volume_eth,std_whale_tx_size_eth,whale_exchange_deposits_eth,whale_exchange_withdrawals_eth,whale_net_exchange_flow_eth,whale_tx_count,whale_volume_eth,withdrawal_tx_count
0,2022-12-24,11,1.5554,0.2899,0.9475,36,108354.7154,-7203.919,0.7101,24,81206.9408,3279.562931,20175.0838,12971.1648,-7203.919,42,114353.1894,7
1,2022-12-25,10,4.4874,0.6327,1.0,36,147540.1401,-33266.8544,0.3673,18,54184.9701,8106.49399,76342.4799,17012.69,-59329.7899,36,147540.1401,8
2,2022-12-26,8,0.189,0.3896,0.9751,50,196152.9223,-4359.6484,0.6104,38,122784.2329,7771.982946,12456.6605,65911.4244,53454.7638,55,201152.3179,9


- Coingecko Price Function

In [19]:
def fetch_coingecko_price_chunked(cg_id, start_date, end_date, api_key, chunk_days=30):
    """
    Fetch historical price data from CoinGecko in chunks.
    
    Using 30-day chunks for reliability with Pro API.
    """
    start_dt = datetime.strptime(start_date, "%Y-%m-%d")
    end_dt = datetime.strptime(end_date, "%Y-%m-%d")
    
    # Use Pro API URL if API key is provided
    base_url = "https://pro-api.coingecko.com/api/v3" if api_key else "https://api.coingecko.com/api/v3"
    
    all_data = []
    current_start = start_dt
    
    while current_start < end_dt:
        current_end = min(current_start + timedelta(days=chunk_days), end_dt)
        
        from_ts = int(current_start.timestamp())
        to_ts = int(current_end.timestamp())
        
        url = f"{base_url}/coins/{cg_id}/market_chart/range"
        params = {"vs_currency": "usd", "from": from_ts, "to": to_ts}
        headers = {"x-cg-pro-api-key": api_key} if api_key else {}
        
        max_retries = 3
        for attempt in range(max_retries):
            try:
                response = requests.get(url, params=params, headers=headers)
                response.raise_for_status()
                
                data = response.json()
                prices = data.get("prices", [])
                
                if prices:
                    all_data.extend(prices)
                
                print(f"   üì• {current_start.date()} ‚Üí {current_end.date()} ({len(prices)} points)")
                
                # Rate limiting
                time.sleep(0.3)
                break  # Success, exit retry loop
                
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 429:
                    wait_time = 60 * (attempt + 1)
                    print(f"   ‚ö†Ô∏è  Rate limited. Waiting {wait_time}s... (attempt {attempt+1}/{max_retries})")
                    time.sleep(wait_time)
                    continue
                else:
                    # Print detailed error info
                    print(f"   ‚ùå Error: {e.response.status_code} - {e.response.text}")
                    print(f"   URL: {url}")
                    print(f"   Params: {params}")
                    
                    if attempt < max_retries - 1:
                        print(f"   Retrying in 5s... (attempt {attempt+1}/{max_retries})")
                        time.sleep(5)
                    else:
                        raise
        
        current_start = current_end + timedelta(days=1)
    
    if not all_data:
        return pd.DataFrame(columns=["date", "price"])
    
    df = pd.DataFrame(all_data, columns=["timestamp", "price"])
    df["date"] = pd.to_datetime(df["timestamp"], unit="ms").dt.date
    df = df.groupby("date")["price"].mean().reset_index()
    df["date"] = pd.to_datetime(df["date"])
    
    symbol = cg_id.upper()
    print(f"üìà {symbol}: {len(df)} days | ${df['price'].min():.0f} - ${df['price'].max():.0f}")
    
    return df


In [20]:
def load_cached_price(symbol):
    """Load price data from cache."""
    path = f"{PRICE_CACHE_DIR}/{symbol}.csv"
    if os.path.exists(path):
        df = pd.read_csv(path)
        df["date"] = pd.to_datetime(df["date"])
        return df
    return None


def save_price_cache(symbol, df):
    """Save price data to cache."""
    path = f"{PRICE_CACHE_DIR}/{symbol}.csv"
    df = df.copy()
    df["date"] = pd.to_datetime(df["date"])
    df.sort_values("date").drop_duplicates("date").to_csv(path, index=False)


In [22]:
def get_price_data_incremental(symbol, cg_id, start_date, end_date):
    
    cached = load_cached_price(symbol)
    
    start_date = pd.Timestamp(start_date)
    end_date = pd.Timestamp(end_date)
    today = pd.Timestamp(datetime.utcnow().date())
    yesterday = today - timedelta(days=1)
    
    if cached is None:
        print(f"üì¶ No cache for {symbol.upper()}. Fetching full range...")
        fetch_start = start_date
        base_df = pd.DataFrame()
    else:
        last_cached_date = pd.Timestamp(cached["date"].max())
        
        # Check if we have data through yesterday
        if last_cached_date >= yesterday:
            print(f"‚úÖ {symbol.upper()} cache up-to-date (through {last_cached_date.date()})")
            return cached
        
        fetch_start = last_cached_date + timedelta(days=1)
        base_df = cached
        print(f"üîÑ Updating {symbol.upper()}: {fetch_start.date()} ‚Üí {yesterday.date()}")
    
    # Cap end_date at yesterday (we predict next day, so only need up to yesterday)
    end_date = min(end_date, yesterday)
    
    # Fetch historical data with chunking
    if fetch_start <= end_date:
        new_data = fetch_coingecko_price_chunked(
            cg_id,
            fetch_start.strftime("%Y-%m-%d"),
            end_date.strftime("%Y-%m-%d"),
            COINGECKO_API_KEY
        )
        
        df_new = new_data.rename(columns={"price": f"{symbol}_price"})
        df_all = pd.concat([base_df, df_new], ignore_index=True)
        save_price_cache(symbol, df_all)
    else:
        df_all = base_df
    
    return df_all

In [23]:
whales_min = df_whales['block_date'].min()
whales_max = df_whales['block_date'].max()
intent_min = df_market_intent['block_date'].min()
intent_max = df_market_intent['block_date'].max()

# Add 100-day buffer for technical indicators
min_date = min(whales_min, intent_min) - timedelta(days=100)
max_date = max(whales_max, intent_max)

print(f"\nüìÖ Date range: {min_date.date()} ‚Üí {max_date.date()}")
print(f"   (Includes 100-day buffer for indicators)")

# Fetch ETH prices
print(f"\n{'='*40}")
print("ETHEREUM (ETH)")
print(f"{'='*40}")
df_eth = get_price_data_incremental("eth", "ethereum", min_date, max_date)
print(f"üíæ ETH prices: {len(df_eth)} days")

# Fetch BTC prices
print(f"\n{'='*40}")
print("BITCOIN (BTC)")
print(f"{'='*40}")
df_btc = get_price_data_incremental("btc", "bitcoin", min_date, max_date)
print(f"üíæ BTC prices: {len(df_btc)} days")



üìÖ Date range: 2022-09-15 ‚Üí 2025-12-22
   (Includes 100-day buffer for indicators)

ETHEREUM (ETH)
üîÑ Updating ETH: 2025-12-22 ‚Üí 2025-12-22
üíæ ETH prices: 1156 days

BITCOIN (BTC)
üîÑ Updating BTC: 2025-12-22 ‚Üí 2025-12-22
üíæ BTC prices: 1156 days


In [24]:
print(f"\nüìä Loaded Datasets:")
print(f"   ‚Ä¢ Whales Features:     {len(df_whales):>5} rows")
print(f"   ‚Ä¢ Market Intent:       {len(df_market_intent):>5} rows")
print(f"   ‚Ä¢ ETH Prices:          {len(df_eth):>5} rows")
print(f"   ‚Ä¢ BTC Prices:          {len(df_btc):>5} rows")

print(f"\nüìÖ Date Ranges:")
print(f"   ‚Ä¢ Whales:        {df_whales['block_date'].min().date()} ‚Üí {df_whales['block_date'].max().date()}")
print(f"   ‚Ä¢ Market Intent: {df_market_intent['block_date'].min().date()} ‚Üí {df_market_intent['block_date'].max().date()}")
print(f"   ‚Ä¢ ETH:           {df_eth['date'].min().date()} ‚Üí {df_eth['date'].max().date()}")
print(f"   ‚Ä¢ BTC:           {df_btc['date'].min().date()} ‚Üí {df_btc['date'].max().date()}")

print("\n‚úÖ All data ready for feature engineering and modeling!")

# Display samples
print(f"\n{'='*60}")
print("ETH PRICE SAMPLE")
print(f"{'='*60}")
display(df_eth.tail(5))

print(f"\n{'='*60}")
print("BTC PRICE SAMPLE")
print(f"{'='*60}")
display(df_btc.tail(5))


üìä Loaded Datasets:
   ‚Ä¢ Whales Features:      1095 rows
   ‚Ä¢ Market Intent:        1095 rows
   ‚Ä¢ ETH Prices:           1156 rows
   ‚Ä¢ BTC Prices:           1156 rows

üìÖ Date Ranges:
   ‚Ä¢ Whales:        2022-12-24 ‚Üí 2025-12-22
   ‚Ä¢ Market Intent: 2022-12-24 ‚Üí 2025-12-22
   ‚Ä¢ ETH:           2022-09-15 ‚Üí 2025-12-21
   ‚Ä¢ BTC:           2022-09-15 ‚Üí 2025-12-21

‚úÖ All data ready for feature engineering and modeling!

ETH PRICE SAMPLE


Unnamed: 0,date,eth_price
1151,2025-12-17,2906.573711
1152,2025-12-18,2853.067689
1153,2025-12-19,2935.880471
1154,2025-12-20,2979.478368
1155,2025-12-21,2980.317996



BTC PRICE SAMPLE


Unnamed: 0,date,btc_price
1151,2025-12-17,86887.683596
1152,2025-12-18,86679.145619
1153,2025-12-19,87362.632014
1154,2025-12-20,88174.588995
1155,2025-12-21,88260.199788


In [6]:
# %% [markdown]
# # Data Loading Pipeline - Dune + CoinGecko

# %% Setup
import os, json, time, requests, pandas as pd
from datetime import datetime, timedelta
from dotenv import load_dotenv

load_dotenv()
DUNE_API_KEY = os.getenv("DUNE_WHALES_API")
COINGECKO_API_KEY = os.getenv("COINGECKO_API_KEY")
os.makedirs("data/price_cache", exist_ok=True)

QUERIES = {
    "whales": ("6395391", "dune_whales_cache.json", "whale_ml_ready.csv"),
    "market_intent": ("6385600", "dune_intent_cache.json", "market_intent_ml_ready.csv")
}

# %% Dune Functions
def fetch_dune(qid, cache):
    headers = {"x-dune-api-key": DUNE_API_KEY}
    today = pd.Timestamp.now(tz='UTC').normalize()
    
    # Load cache
    if os.path.exists(cache):
        c = json.load(open(cache))
        df_cached = pd.DataFrame(c["data"])
        df_cached["block_date"] = pd.to_datetime(df_cached["block_date"], utc=True)
        last_date = pd.to_datetime(c["last_block_date"], utc=True)
        
        # If cache has yesterday's data, it's current enough (today's data doesn't exist yet)
        if last_date >= today - timedelta(1):
            print(f"‚úÖ {cache} current ({last_date.date()})")
            return df_cached
        
        print(f"üîÑ {cache}: fetching latest")
    else:
        df_cached = pd.DataFrame()
        print(f"üÜï {cache}: full fetch")
    
    # Execute query
    resp = requests.post(f"https://api.dune.com/api/v1/query/{qid}/execute", headers=headers).json()
    if "execution_id" not in resp:
        raise RuntimeError(f"Dune API error: {resp}")
    eid = resp["execution_id"]
    
    # Poll
    while True:
        s = requests.get(f"https://api.dune.com/api/v1/execution/{eid}/status", headers=headers).json()["state"]
        if s == "QUERY_STATE_COMPLETED": break
        if s == "QUERY_STATE_FAILED": raise RuntimeError("Query failed")
        time.sleep(10)
    
    # Get results & merge with cache
    df_new = pd.DataFrame(requests.get(f"https://api.dune.com/api/v1/execution/{eid}/results", headers=headers).json()["result"]["rows"])
    if df_new.empty: return df_cached
    
    df_new["block_date"] = pd.to_datetime(df_new["block_date"], utc=True)
    df = pd.concat([df_cached, df_new[df_new["block_date"] < today]]).drop_duplicates("block_date", keep="last").sort_values("block_date").reset_index(drop=True)
    
    # Save cache
    json.dump({"last_block_date": df["block_date"].max().strftime("%Y-%m-%d"), 
               "data": json.loads(df.to_json(orient="records", date_format="iso"))}, open(cache, "w"))
    print(f"‚úÖ {cache}: {len(df)} rows (added {len(df_new)} new)")
    return df

# %% Load Dune Data
print("="*60, "\nDUNE DATA\n", "="*60)
datasets = {}
for name, (qid, cache, output) in QUERIES.items():
    datasets[name] = fetch_dune(qid, cache)
    datasets[name].to_csv(output, index=False)
    time.sleep(0.5)

df_whales, df_market_intent = datasets["whales"], datasets["market_intent"]
print(f"\n‚úÖ Whales: {len(df_whales)} | Intent: {len(df_market_intent)}")


# =========================================================
# Utilities
# =========================================================

def to_utc(ts):
    """
    Normalize any date-like input to UTC Timestamp.
    Handles tz-naive, tz-aware, date, datetime safely.
    """
    ts = pd.Timestamp(ts)
    if ts.tzinfo is None:
        return ts.tz_localize("UTC")
    return ts.tz_convert("UTC")


# =========================================================
# CoinGecko chunked fetch (NO SKIPPED DAYS)
# =========================================================

def fetch_cg_chunked(cg_id, start, end, key=None, days=30):
    """
    Fetch DAILY UTC prices from CoinGecko.
    No skipped days. No today.
    """
    url_base = "https://pro-api.coingecko.com/api/v3" if key else "https://api.coingecko.com/api/v3"
    headers = {"x-cg-pro-api-key": key} if key else {}

    start_dt = to_utc(start)
    end_dt   = to_utc(end) + pd.Timedelta(days=1)  # ‚úÖ INCLUSIVE FIX

    all_prices = []
    curr = start_dt

    while curr < end_dt:
        next_dt = min(curr + pd.Timedelta(days=days), end_dt)

        params = {
            "vs_currency": "usd",
            "from": int(curr.timestamp()),
            "to": int(next_dt.timestamp())
        }

        for attempt in range(3):
            try:
                r = requests.get(
                    f"{url_base}/coins/{cg_id}/market_chart/range",
                    params=params,
                    headers=headers,
                    timeout=30
                )
                r.raise_for_status()

                prices = r.json().get("prices", [])
                all_prices.extend(prices)

                print(f"üì• {cg_id}: {curr.date()} ‚Üí {next_dt.date()} ({len(prices)} pts)")
                time.sleep(0.3)
                break

            except Exception as e:
                if attempt == 2:
                    raise
                print(f"‚ö†Ô∏è Retry {attempt + 1}/3 ({e})")
                time.sleep(5)

        # ‚úÖ CRITICAL FIX: NO +1 DAY SKIP
        curr = next_dt

    if not all_prices:
        return pd.DataFrame(columns=["date", "price"])

    # -----------------------------------------------------
    # Build DAILY UTC prices
    # -----------------------------------------------------
    df = pd.DataFrame(all_prices, columns=["timestamp", "price"])
    df["date"] = pd.to_datetime(df["timestamp"], unit="ms", utc=True).dt.floor("D")

    df = (
        df.groupby("date", as_index=False)["price"]
          .mean()
          .sort_values("date")
    )

    # -----------------------------------------------------
    # Enforce full daily range (no silent gaps)
    # -----------------------------------------------------
    full_range = pd.date_range(
        start=df["date"].min(),
        end=df["date"].max(),
        freq="D",
        tz="UTC"
    )

    df = (
        df.set_index("date")
          .reindex(full_range)
          .rename_axis("date")
          .reset_index()
    )

    return df


# =========================================================
# Cached price loader (STRICT: EXCLUDES TODAY)
# =========================================================

def get_price(sym, cg_id, start, end, key=None):
    """
    Load DAILY prices through YESTERDAY ONLY.
    Today is always excluded.
    """
    cache = f"data/price_cache/{sym}.csv"

    today_utc = pd.Timestamp.utcnow().floor("D")
    yesterday = today_utc - pd.Timedelta(days=1)

    start = to_utc(start)
    end   = min(to_utc(end), yesterday)

    if start > end:
        return pd.DataFrame(columns=["date", f"{sym}_price"])

    # -----------------------------------------------------
    # Cache exists
    # -----------------------------------------------------
    if os.path.exists(cache):
        df = pd.read_csv(cache, parse_dates=["date"])
        df["date"] = df["date"].apply(to_utc)

        last_cached = df["date"].max()

        if last_cached >= end:
            print(f"‚úÖ {sym.upper()} cache current ({last_cached.date()})")
            return df

        fetch_start = last_cached + pd.Timedelta(days=1)
        print(f"üîÑ {sym.upper()}: fetching {fetch_start.date()} ‚Üí {end.date()}")

        new = fetch_cg_chunked(cg_id, fetch_start, end, key)

        if not new.empty:
            new = new.rename(columns={"price": f"{sym}_price"})
            df = (
                pd.concat([df, new])
                  .drop_duplicates("date", keep="last")
                  .sort_values("date")
                  .reset_index(drop=True)
            )

    # -----------------------------------------------------
    # No cache
    # -----------------------------------------------------
    else:
        print(f"üì¶ {sym.upper()}: full fetch {start.date()} ‚Üí {end.date()}")
        df = fetch_cg_chunked(cg_id, start, end, key)

        if df.empty:
            return df

        df = df.rename(columns={"price": f"{sym}_price"})

    df.to_csv(cache, index=False)
    print(f"‚úÖ {sym.upper()} saved (through {df['date'].max().date()})")
    return df


# =========================================================
# Example usage
# =========================================================

if __name__ == "__main__":

    min_date = min(
        df_whales["block_date"].min(),
        df_market_intent["block_date"].min()
    ) - pd.Timedelta(days=100)

    max_date = max(
        df_whales["block_date"].max(),
        df_market_intent["block_date"].max()
    )

    print(f"\nüìÖ Range: {min_date.date()} ‚Üí {max_date.date()} (today excluded)\n")

    df_btc = get_price("btc", "bitcoin", min_date, max_date, COINGECKO_API_KEY)
    df_eth = get_price("eth", "ethereum", min_date, max_date, COINGECKO_API_KEY)

    print("\n‚úÖ FINAL CHECK")
    print(f"BTC last date: {df_btc['date'].max().date()}")
    print(f"ETH last date: {df_eth['date'].max().date()}")


DUNE DATA
‚úÖ dune_whales_cache.json current (2025-12-23)


‚úÖ dune_intent_cache.json current (2025-12-23)

‚úÖ Whales: 1096 | Intent: 1096

üìÖ Range: 2022-09-15 ‚Üí 2025-12-23 (today excluded)

üîÑ BTC: fetching 2025-12-23 ‚Üí 2025-12-23
üì• bitcoin: 2025-12-23 ‚Üí 2025-12-24 (230 pts)
‚úÖ BTC saved (through 2025-12-23)
üîÑ ETH: fetching 2025-12-23 ‚Üí 2025-12-23
üì• ethereum: 2025-12-23 ‚Üí 2025-12-24 (229 pts)
‚úÖ ETH saved (through 2025-12-23)

‚úÖ FINAL CHECK
BTC last date: 2025-12-23
ETH last date: 2025-12-23


In [8]:
pd.read_csv('/workspaces/Whale-Movement-Based-Price-Direction-Generator-V2/WhalesIntent/Intent/market_intent_ml_ready.csv')

Unnamed: 0,block_date,block_fullness_delta_1d,eth_burned_delta_1d,eth_burned_zscore_90d,exchange_flow_share,median_gas_delta_1d,median_gas_delta_7d,net_exchange_flow_ratio,smart_contract_ratio_delta_1d,tx_per_active_delta_1d,tx_per_active_zscore_90d,whale_exchange_asymmetry,whale_exchange_flow_ratio,whale_tx_zscore_90d,whale_volume_ratio,whale_volume_ratio_delta_1d,whale_volume_ratio_delta_3d
0,2022-12-24 00:00:00+00:00,,,0.0000,0.214255,,,-0.046363,,,0.0000,-0.220737,-0.041071,0.0000,0.787476,,
1,2022-12-25 00:00:00+00:00,,,0.0000,0.465847,,,-0.067329,,,0.0000,-0.145227,-0.064300,0.0000,0.828582,,
2,2022-12-26 00:00:00+00:00,-0.001519,80.6159,0.7071,0.267112,0.4270,,0.059721,0.016022,0.1271,0.7071,0.274544,0.065950,0.6679,0.802762,-0.025820,
3,2022-12-27 00:00:00+00:00,-0.000582,358.3551,1.1374,0.232362,3.0477,,-0.070792,-0.001002,0.0820,0.9210,-0.312269,-0.063489,0.3739,0.798157,-0.004605,
4,2022-12-28 00:00:00+00:00,-0.000340,120.1106,1.0666,0.241721,0.6660,,-0.081152,0.014817,0.0344,0.9105,-0.351510,-0.077259,0.9019,0.835947,0.037790,0.007365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1091,2025-12-19 00:00:00+00:00,-0.000178,12.4760,-0.2551,0.408663,0.0546,0.0003,-0.124156,-0.002820,-0.0150,-0.1670,-0.302347,-0.122617,0.4078,0.954868,0.014108,0.006050
1092,2025-12-20 00:00:00+00:00,0.001126,-22.1130,-0.3346,0.203059,-0.1093,-0.0346,-0.051241,0.013444,-0.1327,-1.2810,-0.238883,-0.046222,-1.8533,0.851792,-0.103076,-0.107979
1093,2025-12-21 00:00:00+00:00,0.000199,0.4850,-0.3298,0.354921,0.0041,-0.0320,0.055381,0.057867,0.3857,1.9035,0.171106,0.059485,-2.3759,0.889958,0.038166,-0.050803
1094,2025-12-22 00:00:00+00:00,-0.001665,1.3817,-0.3228,0.301284,0.0113,-0.1056,-0.066520,-0.066439,-0.4239,-1.5519,-0.217053,-0.064478,0.6751,0.936138,0.046180,-0.018730


In [9]:
pd.read_csv('/workspaces/Whale-Movement-Based-Price-Direction-Generator-V2/WhalesIntent/Intent/whale_ml_ready.csv')

Unnamed: 0,block_date,deposit_tx_count,deposit_withdrawal_ratio,exchange_volume_ratio,mega_whale_ratio,mega_whale_tx_count,mega_whale_volume_eth,net_flow_ma7,non_exchange_ratio,non_exchange_tx_count,non_exchange_volume_eth,std_whale_tx_size_eth,whale_exchange_deposits_eth,whale_exchange_withdrawals_eth,whale_net_exchange_flow_eth,whale_tx_count,whale_volume_eth,withdrawal_tx_count
0,2022-12-24 00:00:00+00:00,11,1.5554,0.2899,0.9475,36,1.083547e+05,-7203.9190,0.7101,24,81206.9408,3279.562931,20175.0838,12971.1648,-7203.9190,42,1.143532e+05,7
1,2022-12-25 00:00:00+00:00,10,4.4874,0.6327,1.0000,36,1.475401e+05,-59329.7899,0.3673,18,54184.9701,8106.493990,76342.4799,17012.6900,-59329.7899,36,1.475401e+05,8
2,2022-12-26 00:00:00+00:00,8,0.1890,0.3896,0.9751,50,1.961529e+05,-2937.5130,0.6104,38,122784.2329,7771.982946,12456.6605,65911.4244,53454.7638,55,2.011523e+05,9
3,2022-12-27 00:00:00+00:00,19,2.0556,0.2713,1.0000,73,2.210209e+05,-8864.0317,0.7287,44,161051.4598,3093.841876,40343.2672,19626.1980,-20717.0692,73,2.210209e+05,10
4,2022-12-28 00:00:00+00:00,14,5.0230,0.2440,1.0000,67,3.378297e+05,-20410.3467,0.7560,49,255413.2942,6826.816768,68732.8362,13683.5447,-55049.2915,67,3.378297e+05,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1091,2025-12-19 00:00:00+00:00,26,2.6251,0.4494,1.0000,75,1.103908e+06,-90928.2289,0.5506,44,607761.6724,15188.722973,359281.6150,136864.9578,-222416.6572,75,1.103908e+06,5
1092,2025-12-20 00:00:00+00:00,9,11.0293,0.2642,1.0000,40,1.632716e+05,-92508.1458,0.7358,29,120134.5612,3108.900831,39550.9925,3586.0000,-35964.9925,40,1.632716e+05,2
1093,2025-12-21 00:00:00+00:00,8,0.8793,0.5248,1.0000,32,1.928101e+05,-101623.7755,0.4752,19,91619.3550,4433.349204,47347.1628,53843.5845,6496.4217,32,1.928101e+05,5
1094,2025-12-22 00:00:00+00:00,14,2.7336,0.3557,1.0000,70,6.105957e+05,-96583.9450,0.6443,51,393401.9015,5992.049099,159021.1937,58172.5992,-100848.5945,70,6.105957e+05,5


In [53]:
pd.read_csv('/workspaces/Whale-Movement-Based-Price-Direction-Generator-V2/WhalesIntent/Intent/data/price_cache/btc.csv')

Unnamed: 0,date,btc_price
0,2022-09-15,20009.052983
1,2022-09-16,19713.657885
2,2022-09-17,19944.803793
3,2022-09-18,19896.269820
4,2022-09-19,19028.641104
...,...,...
1153,2025-12-19,87362.632014
1154,2025-12-20,88174.588995
1155,2025-12-21,88260.199788
1156,2025-12-22,89102.172598


In [52]:
pd.read_csv('/workspaces/Whale-Movement-Based-Price-Direction-Generator-V2/WhalesIntent/Intent/data/price_cache/eth.csv')

Unnamed: 0,date,eth_price
0,2022-09-15,1566.672324
1,2022-09-16,1456.046704
2,2022-09-17,1443.421822
3,2022-09-18,1424.248562
4,2022-09-19,1330.210600
...,...,...
1153,2025-12-19,2935.880471
1154,2025-12-20,2979.478368
1155,2025-12-21,2980.317996
1156,2025-12-22,3025.604874


- Feature Engineering from version 1

In [None]:
def add_price_features(df, price_col, prefix):
    """Add price-based ML features"""
    df = df.sort_values('block_date').reset_index(drop=True)
    
    # Returns
    df[f'{prefix}_daily_return'] = df[price_col].pct_change()
    df[f'{prefix}_log_return'] = np.log(df[price_col] / df[price_col].shift(1))
    
    # Moving averages
    df[f'{prefix}_ma7'] = df[price_col].rolling(7, min_periods=1).mean()
    df[f'{prefix}_ma30'] = df[price_col].rolling(30, min_periods=1).mean()
    
    # Momentum
    df[f'{prefix}_vs_ma7'] = df[price_col] / df[f'{prefix}_ma7']
    df[f'{prefix}_vs_ma30'] = df[price_col] / df[f'{prefix}_ma30']
    
    # Volatility
    df[f'{prefix}_vol7'] = df[f'{prefix}_daily_return'].rolling(7, min_periods=1).std()
    df[f'{prefix}_vol30'] = df[f'{prefix}_daily_return'].rolling(30, min_periods=1).std()
    
    # Returns
    df[f'{prefix}_ret7d'] = df[price_col].pct_change(7)
    df[f'{prefix}_ret30d'] = df[price_col].pct_change(30)
    
    # RSI
    returns = df[f'{prefix}_daily_return']
    gains = returns.where(returns > 0, 0).rolling(14, min_periods=1).mean()
    losses = -returns.where(returns < 0, 0).rolling(14, min_periods=1).mean()
    rs = gains / (losses + 1e-10)
    df[f'{prefix}_rsi'] = 100 - (100 / (1 + rs))
    
    # Lags
    for lag in [1, 3, 7]:
        df[f'{prefix}_ret_lag{lag}'] = df[f'{prefix}_daily_return'].shift(lag)
    
    return df


def add_correlation_features(df):
    """Add ETH-BTC correlation features"""
    df['eth_btc_ratio'] = df['eth_price'] / df['btc_price']
    df['eth_btc_ratio_ma7'] = df['eth_btc_ratio'].rolling(7, min_periods=1).mean()
    df['eth_btc_corr_30d'] = df['eth_daily_return'].rolling(30, min_periods=20).corr(df['btc_daily_return'])
    df['eth_outperformance'] = df['eth_daily_return'] - df['btc_daily_return']
    return df

def create_target(df):
    """Create target: next day price direction"""
    df['next_day_return'] = df['eth_price'].pct_change().shift(-1)
    df['next_day_price_direction'] = (df['next_day_return'] > 0).astype(int)
    return df

# %%
print("\n‚öôÔ∏è Engineering features...")

df_merged = add_price_features(df_merged, 'eth_price', 'eth')
df_merged = add_price_features(df_merged, 'btc_price', 'btc')
df_merged = add_correlation_features(df_merged)
df_merged = create_target(df_merged)

print(f"‚úÖ Features created: {len(df_merged.columns)} total columns")

# Drop rows with NaN target
df_final = df_merged.dropna(subset=['next_day_price_direction']).copy()

print(f"‚úÖ Final dataset: {len(df_final)} rows")
print(f"   Dropped {len(df_merged) - len(df_final)} rows (NaN target)")

# Save
df_final.to_csv(OUTPUT_FILE, index=False)
print(f"üíæ Saved: {OUTPUT_FILE}")



‚öôÔ∏è Engineering features...
‚úÖ Features created: 65 total columns
‚úÖ Final dataset: 1095 rows
   Dropped 0 rows (NaN target)
üíæ Saved: whale_prices_ml_ready.csv
