### ETH Whale Activity ML Pipeline

- Setup & Configuration

In [None]:
import os
import time
import pickle
import requests
import warnings
from datetime import datetime, timedelta

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, confusion_matrix, make_scorer
)

# Optional dependency: imbalanced-learn (SMOTE oversampling and its pipeline).
# HAS_IMBLEARN records whether the import succeeded so later code can branch on it.
try:
    from imblearn.over_sampling import SMOTE
    from imblearn.pipeline import Pipeline as ImbPipeline
    HAS_IMBLEARN = True
except ImportError:
    HAS_IMBLEARN = False
    print("‚ö†Ô∏è imbalanced-learn not installed")

# Optional dependency: XGBoost classifier.
# HAS_XGBOOST records whether the import succeeded so later code can branch on it.
try:
    from xgboost import XGBClassifier
    HAS_XGBOOST = True
except ImportError:
    HAS_XGBOOST = False
    print("‚ö†Ô∏è XGBoost not installed")

from dotenv import load_dotenv

# NOTE(review): this silences *every* warning for the whole session, including
# deprecation and convergence warnings — confirm that is intended.
warnings.filterwarnings('ignore')

- Loading and Configuring Environment Variables

In [None]:
# Read credentials from the process environment; load_dotenv() is called first
# so that values from a .env file (if present) are available to os.environ.
load_dotenv()
DUNE_API_KEY = os.environ.get("DUNE_WHALES_API")
COINGECKO_API_KEY = os.environ.get("COINGECKO_API_KEY")

# Both keys are needed by the fetch steps below, so stop right away if either
# one is missing or empty.
if not (DUNE_API_KEY and COINGECKO_API_KEY):
    raise ValueError("‚ùå Missing API keys in .env file")

# Pipeline settings
QUERY_ID = "6184996"                            # Dune query to execute
REQUEST_DELAY = 0.5                             # seconds to pause between API calls
OUTPUT_FILE = 'whale_prices_ml_ready.csv'       # dataset output path
MODEL_FILE = 'models/eth_price_predictor.pkl'   # model output path

print(" Configuration loaded")

- Data Collection - Fetch Whale Data from Dune

In [None]:
def fetch_dune_data(query_id, api_key, poll_interval=10, max_wait=1800,
                    request_timeout=30):
    """Execute a Dune query, wait for it to finish, and return its rows.

    The query is started through the Dune REST API, its status is polled until
    it completes, and the result rows are loaded into a DataFrame. Rows whose
    ``block_date`` is today or later are dropped.

    Args:
        query_id: Dune query identifier to execute.
        api_key: Dune API key, sent in the ``x-dune-api-key`` header.
        poll_interval: Seconds to sleep between status checks.
        max_wait: Maximum number of seconds to wait for completion, or
            ``None`` to wait indefinitely.
        request_timeout: Timeout in seconds applied to each HTTP request.

    Returns:
        pandas.DataFrame with the query's rows; ``block_date`` is converted to
        ``datetime.date`` values.

    Raises:
        ValueError: If the execute response has no ``execution_id`` or the
            finished execution returned no rows.
        RuntimeError: If the execution ends in a failed/cancelled/expired state.
        TimeoutError: If the execution does not complete within ``max_wait``.
        requests.RequestException: On network errors or request timeouts.
    """
    # Function-scope import: the file header only imports datetime/timedelta.
    from datetime import timezone

    print("="*70)
    print(" FETCHING WHALE DATA FROM DUNE ".center(70))
    print("="*70)

    headers = {"x-dune-api-key": api_key}

    # Execute query
    execute_url = f"https://api.dune.com/api/v1/query/{query_id}/execute"
    execute_response = requests.post(execute_url, headers=headers,
                                     timeout=request_timeout)
    execute_data = execute_response.json()

    execution_id = execute_data.get("execution_id")
    if not execution_id:
        raise ValueError(f"‚ùå No execution_id: {execute_data}")

    print(f"üöÄ Execution ID: {execution_id}")

    # Poll for completion
    status_url = f"https://api.dune.com/api/v1/execution/{execution_id}/status"
    results_url = f"https://api.dune.com/api/v1/execution/{execution_id}/results"

    # States after which the execution will never complete. Both spellings of
    # "cancelled" are listed — NOTE(review): confirm the exact state names
    # against the Dune API reference.
    dead_states = {
        "QUERY_STATE_FAILED",
        "QUERY_STATE_CANCELLED",
        "QUERY_STATE_CANCELED",
        "QUERY_STATE_EXPIRED",
    }
    deadline = None if max_wait is None else time.monotonic() + max_wait

    while True:
        status = requests.get(status_url, headers=headers,
                              timeout=request_timeout).json()
        state = status.get("state")
        print(f"State: {state}")

        if state == "QUERY_STATE_COMPLETED":
            break
        if state in dead_states:
            raise RuntimeError(f"‚ùå Query failed: {status}")
        # Bound the wait so an unexpected or missing state cannot spin forever.
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Dune execution {execution_id} did not complete within "
                f"{max_wait} seconds (last state: {state})"
            )
        time.sleep(poll_interval)

    # Fetch results
    results = requests.get(results_url, headers=headers,
                           timeout=request_timeout).json()
    rows = (results.get("result") or {}).get("rows")
    if not rows:
        # Without rows there is no 'block_date' column to work with; fail with
        # an explicit message instead of an opaque KeyError further down.
        raise ValueError(f"No rows returned for Dune execution {execution_id}")

    df = pd.DataFrame(rows)
    df['block_date'] = pd.to_datetime(df['block_date']).dt.date

    # Exclude today. The cutoff is taken in UTC so that a machine whose local
    # clock is ahead of UTC does not let the still-incomplete current day
    # through — assumes block_date is a UTC calendar date; confirm against the
    # Dune query.
    today = datetime.now(timezone.utc).date()
    df = df[df['block_date'] < today]

    print(f"‚úÖ Retrieved {len(df)} rows")
    print(f"   Date range: {df['block_date'].min()} ‚Üí {df['block_date'].max()}")
    return df

df_whales = fetch_dune_data(QUERY_ID, DUNE_API_KEY)

# %%
# Quick look at the whale dataset: shape, columns, and how many days had no
# exchange deposits or withdrawals at all.
print(f"\nüìä Whale Data Overview:")
print(f"   Shape: {df_whales.shape}")
print(f"   Columns: {list(df_whales.columns)}")

# Check for rows with zero exchange activity
zero_exchange = df_whales[
    (df_whales['whale_exchange_deposits_weth'] == 0) &
    (df_whales['whale_exchange_withdrawals_weth'] == 0)
]
# Guard the percentage: df_whales can be empty (e.g. every row was dropped by
# the "exclude today" filter), and dividing by zero would abort the cell.
zero_exchange_pct = len(zero_exchange) / len(df_whales) * 100 if len(df_whales) else 0.0
print(f"\n   Rows with ZERO exchange activity: {len(zero_exchange)} ({zero_exchange_pct:.1f}%)")
print(f"   ‚Üí These represent whale activity OUTSIDE exchanges (OTC, DeFi, cold storage)")
if len(zero_exchange) > 0:
    print(f"   ‚Üí Avg non-exchange volume on these days: {zero_exchange['non_exchange_volume_weth'].mean():.1f} WETH")

print(f"\n   First 3 rows:")
df_whales.head(3)


- Function for Price Data - Fetch ETH & BTC Prices

In [None]:

def fetch_coingecko_price(coin_id, from_date, to_date, api_key, timeout=30):
    """Fetch one USD price per day for a coin from the CoinGecko Pro API.

    Calls the ``market_chart/range`` endpoint for the given window and keeps,
    for each calendar date, the price with the latest timestamp.

    Args:
        coin_id: CoinGecko coin identifier (e.g. ``'ethereum'``).
        from_date: Start of the window; anything ``pd.Timestamp`` accepts.
        to_date: End of the window; anything ``pd.Timestamp`` accepts.
        api_key: CoinGecko Pro API key, sent in ``x-cg-pro-api-key``.
        timeout: Timeout in seconds for the HTTP request.

    Returns:
        DataFrame with columns ``date`` (``datetime.date``) and ``price``.
        If the API returns no price points, an empty frame with those two
        columns. On a request or parsing error the error is printed and an
        empty, column-less DataFrame is returned (best effort, no exception).
    """
    print(f"\nüìà Fetching {coin_id.upper()} prices: {from_date} ‚Üí {to_date}")

    from_ts = int(pd.Timestamp(from_date).timestamp())
    to_ts = int(pd.Timestamp(to_date).timestamp())

    url = f"https://pro-api.coingecko.com/api/v3/coins/{coin_id}/market_chart/range"
    headers = {'accept': 'application/json', 'x-cg-pro-api-key': api_key}
    params = {'vs_currency': 'usd', 'from': from_ts, 'to': to_ts}

    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        response.raise_for_status()
        prices = response.json()['prices']

        if not prices:
            print("   ‚ö†Ô∏è No price data returned")
            return pd.DataFrame(columns=['date', 'price'])

        df = pd.DataFrame({'timestamp': [p[0] for p in prices], 'price': [p[1] for p in prices]})
        df['date'] = pd.to_datetime(df['timestamp'], unit='ms').dt.date
        # Sort first so 'last' is the latest price of each day regardless of
        # the order in which the API returned the points.
        df = df.sort_values('timestamp').groupby('date', as_index=False).agg({'price': 'last'})
    except (requests.RequestException, KeyError, IndexError, TypeError, ValueError) as e:
        # Network, HTTP-status, and malformed-payload problems stay best-effort;
        # anything else (a genuine programming error) is allowed to surface.
        print(f"   ‚ùå Error: {e}")
        return pd.DataFrame()

    print(f"   ‚úÖ {len(df)} days | ${df['price'].min():.0f} - ${df['price'].max():.0f}")
    return df

-  Fetching BTC and ETH Prices

In [None]:

# Price window: begin 100 days before the first whale observation (buffer for
# moving averages) and end on the last whale observation.
min_date = pd.to_datetime(df_whales['block_date'].min()) - timedelta(days=100)
max_date = pd.to_datetime(df_whales['block_date'].max())

print(f" Price fetch range: {min_date.date()} ‚Üí {max_date.date()}")

# Both coins use the same window, so format the bounds a single time.
_range_start = min_date.strftime('%Y-%m-%d')
_range_end = max_date.strftime('%Y-%m-%d')

# ETH prices, with the price column renamed for the later merge
df_eth = fetch_coingecko_price(
    'ethereum', _range_start, _range_end, COINGECKO_API_KEY
).rename(columns={'price': 'eth_price'})

# Pause between the two API calls
time.sleep(REQUEST_DELAY)

# BTC prices, same treatment
df_btc = fetch_coingecko_price(
    'bitcoin', _range_start, _range_end, COINGECKO_API_KEY
).rename(columns={'price': 'btc_price'})

print(f"\n Price data ready")
