In [18]:
import pandas as pd


In [19]:
# Load the six hourly ETH CSV exports, in the same order as before.
# A later cell reads a 'datetime' column from every one of these frames.
df1, df2, df3, df4, df5, df6 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Data/eth_active_addresses_hr.csv",
        "../Data/eth_ohlcv_fees_tx_size_fail_rate_hr.csv",
        "../Data/eth_staking_inflow_hr.csv",
        "../Data/eth_exchange_deposite_withdraw_tx_hr.csv",
        "../Data/eth_exchange_netflow_usd_hr.csv",
        "../Data/eth_RV_hr.csv",
    )
)

In [20]:
# Parse each frame's 'datetime' column and strip any timezone information so
# that all six frames share a comparable, tz-naive join key.
for df in (df1, df2, df3, df4, df5, df6):
    df['datetime'] = pd.to_datetime(df['datetime']).dt.tz_localize(None)

# Left-join everything onto df2 by 'datetime', then clean up in one chain:
#   * order rows chronologically,
#   * exclude the row stamped 2025-10-13 00:00:00,
#   * treat missing staking inflow as 0,
#   * discard the 'realised_variance' column.
merged = (
    df2
    .merge(df1, on="datetime", how="left")
    .merge(df3, on="datetime", how="left")
    .merge(df4, on="datetime", how="left")
    .merge(df5, on="datetime", how="left")
    .merge(df6, on="datetime", how="left")
    .sort_values("datetime")
    .loc[lambda frame: frame["datetime"] != "2025-10-13 00:00:00"]
    .assign(staking_inflow=lambda frame: frame['staking_inflow'].fillna(0))
    .drop(columns=['realised_variance'])
)

print(merged.shape)
print(merged.head())

(8208, 18)
             datetime  onchain_volume_usd  transaction_count  avg_gas_fee_usd  \
0 2024-11-05 00:00:00        9.682857e+07              44719         1.817381   
1 2024-11-05 01:00:00        9.693977e+07              44513         1.823433   
2 2024-11-05 02:00:00        1.304534e+08              45224         1.783517   
3 2024-11-05 03:00:00        9.107739e+07              42689         1.623861   
4 2024-11-05 04:00:00        9.376725e+07              43063         1.346194   

   avg_priority_fee_usd  fail_rate_percent     open      low     high  \
0              0.738825                2.4  2398.91  2398.91  2409.56   
1              0.819646                2.3  2402.45  2384.78  2410.94   
2              0.724244                2.0  2411.90  2402.15  2413.11   
3              0.716063                1.4  2405.17  2403.79  2417.69   
4              0.550501                1.4  2418.50  2418.50  2431.30   

     close  hourly_return  active_sending_addresses  \
0  2405.

In [21]:
# Add lagged / rolling realised-volatility features and the prediction target.
# The keyword arguments are evaluated in order, so the rolling means can read
# the 'RV_MA_1hr' column created just before them.
#   RV_MA_1hr  : realised_volatility of the previous row (a one-row lag)
#   RV_MA_3hr  : mean of RV_MA_1hr over a 3-row window
#   RV_MA_12hr : mean of RV_MA_1hr over a 12-row window
#   vol_future : realised_volatility of the next row
# NOTE(review): the shifts are positional, so "hr" assumes one row per hour
# with no gaps — confirm against the source data.
merged = merged.assign(
    RV_MA_1hr=lambda frame: frame['realised_volatility'].shift(1),
    RV_MA_3hr=lambda frame: frame['RV_MA_1hr'].rolling(window=3).mean(),
    RV_MA_12hr=lambda frame: frame['RV_MA_1hr'].rolling(window=12).mean(),
    vol_future=lambda frame: frame["realised_volatility"].shift(-1),
)


In [22]:
# Display the column labels of the merged frame, in order.
list(merged.columns)

['datetime',
 'onchain_volume_usd',
 'transaction_count',
 'avg_gas_fee_usd',
 'avg_priority_fee_usd',
 'fail_rate_percent',
 'open',
 'low',
 'high',
 'close',
 'hourly_return',
 'active_sending_addresses',
 'active_receiving_addresses',
 'staking_inflow',
 'exchange_depositing_count',
 'exchange_withdrawing_count',
 'exchange_netflow_usd',
 'realised_volatility',
 'RV_MA_1hr',
 'RV_MA_3hr',
 'RV_MA_12hr',
 'vol_future']

In [23]:
import numpy as np
from scipy.stats import skew
# Restrict the analysis to columns with a numeric dtype.
numeric_cols = merged.select_dtypes(include=[np.number]).columns

# Skewness of each numeric column (scipy.stats.skew), with missing values
# removed from the column before the statistic is computed.
skew_vals = pd.Series(
    {name: skew(merged[name].dropna()) for name in numeric_cols}
)

# Two-column table (feature, skew), most positively skewed first.
skew_df = (
    skew_vals
    .rename_axis("feature")
    .reset_index(name="skew")
    .sort_values(by="skew", ascending=False)
)

print(skew_df)


                       feature       skew
12              staking_inflow  22.338105
3         avg_priority_fee_usd  20.877469
2              avg_gas_fee_usd  12.340594
17                   RV_MA_1hr   6.919486
16         realised_volatility   6.918769
20                  vol_future   6.918402
18                   RV_MA_3hr   4.654184
13   exchange_depositing_count   4.440219
4            fail_rate_percent   3.887409
0           onchain_volume_usd   2.935166
19                  RV_MA_12hr   2.495189
15        exchange_netflow_usd   1.041357
10    active_sending_addresses   0.789280
14  exchange_withdrawing_count   0.770224
11  active_receiving_addresses   0.604319
1            transaction_count   0.477345
6                          low   0.118388
5                         open   0.115386
8                        close   0.114921
7                         high   0.112566
9                hourly_return  -0.235443


In [24]:
# Columns to compress with log(1 + x).
# NOTE(review): log1p is only defined for values greater than -1, so these
# columns are presumably non-negative — confirm.
log_features = [
    "onchain_volume_usd",
    "avg_gas_fee_usd",
    "avg_priority_fee_usd",
    "staking_inflow",
    "exchange_depositing_count"
]

# NOTE: the transformed values replace the original columns of `merged`, so
# running this cell again on the same frame applies the transform twice.
merged = merged.assign(
    **{name: np.log1p(merged[name]) for name in log_features},
    # Signed log for the netflow: sign(x) * log(1 + |x|), which keeps the
    # direction of the flow while compressing its magnitude.
    exchange_netflow_usd=lambda frame: (
        np.sign(frame["exchange_netflow_usd"])
        * np.log1p(frame["exchange_netflow_usd"].abs())
    ),
)

In [25]:
# Write the final frame to CSV; index=False leaves the row index out of
# the file.
merged.to_csv(path_or_buf="../Data/dune_eth_hour.csv", index=False)