# Data Cleaning

In [88]:
import pandas as pd
import numpy as np
from scipy.stats import skew
from sklearn.preprocessing import PowerTransformer


In [89]:
# Single place to point the notebook at the folder holding the raw input CSVs.
DATA_DIR = "/Users/hushiqi/Desktop/Data"

df1 = pd.read_csv(f"{DATA_DIR}/btc_exchange_netflow_usd.csv")
df2 = pd.read_csv(f"{DATA_DIR}/btc_active_addresses.csv")
# avg_tx_size_usd and fee_pressure_usd are discarded right after loading.
df3 = pd.read_csv(f"{DATA_DIR}/btc_ohlcv_tx_size_fee_pressure.csv").drop(
    columns=['avg_tx_size_usd', 'fee_pressure_usd']
)
# total_reward is discarded right after loading.
df4 = pd.read_csv(f"{DATA_DIR}/reward.csv").drop(columns=['total_reward'])
df5 = pd.read_csv(f"{DATA_DIR}/Exchange_to_WalletWallet_to_Exchange.csv")


In [91]:
# Show the first rows of every raw frame, in load order, as a quick sanity check.
for raw_frame in (df1, df2, df3, df4, df5):
    print(raw_frame.head())


                      datetime  exchange_netflow_usd
0  2024-11-05 00:06:00.000 UTC          4.227801e+05
1  2024-11-05 00:18:00.000 UTC          3.890704e+06
2  2024-11-05 00:57:00.000 UTC         -1.487192e+07
3  2024-11-05 00:58:00.000 UTC         -5.606578e+07
4  2024-11-05 01:04:00.000 UTC          1.416559e+06
                      datetime  active_sending_addresses  \
0  2024-11-05 00:06:00.000 UTC                      4430   
1  2024-11-05 00:18:00.000 UTC                      5728   
2  2024-11-05 00:57:00.000 UTC                      4466   
3  2024-11-05 00:58:00.000 UTC                      5356   
4  2024-11-05 01:04:00.000 UTC                      5177   

   active_receiving_addresses  
0                        8288  
1                        7632  
2                        9706  
3                        6027  
4                        7503  
                      datetime  onchain_volume_usd      open       low  \
0  2024-11-05 00:00:00.000 UTC                 0.0  678

In [92]:
def to_minute_index(df, time_col, tz='UTC', how='mean'):
    """Collapse a timestamped frame onto a one-row-per-minute DatetimeIndex.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied, never modified in place.
    time_col : str
        Name of the column holding the timestamps. Values are parsed as UTC;
        values that cannot be parsed become NaT and their rows are dropped.
    tz : str, default 'UTC'
        Time zone the returned index is expressed in.
    how : {'mean', 'sum', 'last', 'first'}, default 'mean'
        How rows falling into the same minute are combined. 'mean' and 'sum'
        keep numeric columns only.

    Returns
    -------
    pandas.DataFrame
        One row per distinct minute present in the input, sorted by time,
        indexed by a tz-aware DatetimeIndex, without the `time_col` column.

    Raises
    ------
    ValueError
        If `how` is not one of the supported names.
    """
    reducers = {
        'mean': lambda grouped: grouped.mean(numeric_only=True),
        'sum': lambda grouped: grouped.sum(numeric_only=True),
        'last': lambda grouped: grouped.last(),
        'first': lambda grouped: grouped.first(),
    }

    t = pd.to_datetime(df[time_col], utc=True, errors='coerce')
    if how not in reducers:
        raise ValueError("how must be one of: mean, sum, last, first")

    df = df.copy()
    df.index = t

    if time_col in df.columns:
        df = df.drop(columns=[time_col])

    # errors='coerce' turns bad timestamps into NaT. groupby would discard
    # those rows anyway; dropping them here makes that explicit.
    df = df[df.index.notna()]

    # 'min' is the supported minute alias; the old 'T' alias is deprecated.
    df.index = df.index.floor('min')

    df = reducers[how](df.groupby(df.index))

    # The index is UTC-aware after parsing; the naive branch is purely defensive.
    if df.index.tz is None:
        df.index = df.index.tz_localize(tz)
    else:
        df.index = df.index.tz_convert(tz)

    return df

# Put every raw frame on a UTC minute index (rows sharing a minute are averaged).
d1 = to_minute_index(df1, 'datetime')
d2 = to_minute_index(df2, 'datetime')
d3 = to_minute_index(df3, 'datetime')
d4 = to_minute_index(df4, 'minute')
d5 = to_minute_index(df5, 'minute')

# Harmonise column names across sources.
d1 = d1.rename(columns={'exchange_netflow_usd': 'btc_exchange_netflow_usd'})
d4 = d4.rename(columns={'mint_reward': 'mint_reward_usd', 'total_fee': 'total_fee_usd'})

# Full minute grid for the study window; minutes a source does not cover become NaN.
start = pd.Timestamp('2024-11-05 00:00:00', tz='UTC')
end   = pd.Timestamp('2025-10-12 23:59:00', tz='UTC')
# 'min' is the supported minute alias; the old 'T' alias is deprecated.
master_index = pd.date_range(start, end, freq='min', tz='UTC')

frames = [d.reindex(master_index) for d in [d1, d2, d3, d4, d5]]



  df.index = df.index.floor('T')
  df.index = df.index.floor('T')
  df.index = df.index.floor('T')
  df.index = df.index.floor('T')
  df.index = df.index.floor('T')
  master_index = pd.date_range(start, end, freq='T', tz='UTC')


# Volatility

In [93]:
# Join the aligned per-source frames side by side into one minute-level table.
df = pd.concat(frames, axis=1)

print(df.shape, df.index[0], df.index[-1])
print(df.head(10))

# Squared log return of `close` against the previous row's close.
minute_close = df['close']
minute_log_ret = np.log(minute_close / minute_close.shift(1))
df['log_ret_sq'] = minute_log_ret ** 2


(492480, 13) 2024-11-05 00:00:00+00:00 2025-10-12 23:59:00+00:00
                           btc_exchange_netflow_usd  active_sending_addresses  \
2024-11-05 00:00:00+00:00                       NaN                       NaN   
2024-11-05 00:01:00+00:00                       NaN                       NaN   
2024-11-05 00:02:00+00:00                       NaN                       NaN   
2024-11-05 00:03:00+00:00                       NaN                       NaN   
2024-11-05 00:04:00+00:00                       NaN                       NaN   
2024-11-05 00:05:00+00:00                       NaN                       NaN   
2024-11-05 00:06:00+00:00             422780.100706                    4430.0   
2024-11-05 00:07:00+00:00                       NaN                       NaN   
2024-11-05 00:08:00+00:00                       NaN                       NaN   
2024-11-05 00:09:00+00:00                       NaN                       NaN   

                           active_receiving

In [94]:
# Persist the minute-level table; the DatetimeIndex is written as column 'minute_utc'.
df.to_csv("/Users/hushiqi/Desktop/Data/dune_btc.csv", index_label='minute_utc')


In [95]:
# Confirm the minute table is indexed by a DatetimeIndex and peek at its first entries.
minute_idx = df.index
print(type(minute_idx), minute_idx[:3])


<class 'pandas.core.indexes.datetimes.DatetimeIndex'> DatetimeIndex(['2024-11-05 00:00:00+00:00', '2024-11-05 00:01:00+00:00',
               '2024-11-05 00:02:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='min')


# Hourly Aggregation


In [96]:
# Reload the minute-level table once, restoring 'minute_utc' as a parsed datetime index.
df_minute = pd.read_csv(
    "/Users/hushiqi/Desktop/Data/dune_btc.csv",
    parse_dates=['minute_utc'],
    index_col='minute_utc'
)
# Inspect the frame that was just loaded, not the earlier in-memory `df`.
print(type(df_minute.index), df_minute.index[:3])
print(df_minute.columns.tolist())

# Fix the misspelled source column name ('..._used' -> '..._usd').
df_minute = df_minute.rename(columns={
    "exchange_to_wallet_used": "exchange_to_wallet_usd"
})

# Per-column reduction used when collapsing minutes into hours:
# flows and counts are summed, OHLC prices follow first/min/max/last, and
# the squared log returns are summed.
agg_rules = {
    'btc_exchange_netflow_usd': 'sum',
    'active_sending_addresses': 'sum',
    'active_receiving_addresses': 'sum',
    'onchain_volume_usd': 'sum',
    'open': 'first',
    'low': 'min',
    'high': 'max',
    'close': 'last',
    'mint_reward_usd': 'sum',
    'total_fee_usd': 'sum',
    'transaction_count': 'sum',
    'exchange_to_wallet_usd': 'sum',
    'wallet_to_exchange_usd': 'sum',
    'log_ret_sq': 'sum',
}
# Ignore rules for columns that are absent from the loaded file.
agg_rules = {k: v for k, v in agg_rules.items() if k in df_minute.columns}

# 'h' is the supported hour alias; the old 'H' alias is deprecated.
df_hour = df_minute.resample('h').agg(agg_rules).reset_index()
print(df_hour.head())


<class 'pandas.core.indexes.datetimes.DatetimeIndex'> DatetimeIndex(['2024-11-05 00:00:00+00:00', '2024-11-05 00:01:00+00:00',
               '2024-11-05 00:02:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='min')
['btc_exchange_netflow_usd', 'active_sending_addresses', 'active_receiving_addresses', 'onchain_volume_usd', 'open', 'low', 'high', 'close', 'mint_reward_usd', 'total_fee_usd', 'transaction_count', 'exchange_to_wallet_used', 'wallet_to_exchange_usd', 'log_ret_sq']
                 minute_utc  btc_exchange_netflow_usd  \
0 2024-11-05 00:00:00+00:00             -6.662422e+07   
1 2024-11-05 01:00:00+00:00              1.639840e+06   
2 2024-11-05 02:00:00+00:00             -8.418988e+06   
3 2024-11-05 03:00:00+00:00              9.149680e+06   
4 2024-11-05 04:00:00+00:00             -7.860233e+07   

   active_sending_addresses  active_receiving_addresses  onchain_volume_usd  \
0                   19980.0                     31653.0           34.327889   
1      

  df_hour = df_minute.resample('H').agg(agg_rules).reset_index()


In [97]:
# Hourly realized volatility: square root of the hour's summed squared log returns.
df_hour['realized_volatility'] = df_hour['log_ret_sq'] ** 0.5
df_hour = df_hour.drop(columns='log_ret_sq')
df_hour = df_hour.rename(columns={'minute_utc': 'hour_utc'})

# Index the table by hour, in chronological order, so shift/rolling work in time order.
df_hour['hour_utc'] = pd.to_datetime(df_hour['hour_utc'], utc=True, errors='coerce')
df_hour = df_hour.sort_values('hour_utc').set_index('hour_utc')

# Lagged volatility features: every window is built on the series shifted by
# one hour, so a row's features never include that row's own volatility.
df_hour['RV_MA_1hr'] = df_hour['realized_volatility'].shift(1)
df_hour['RV_MA_3hr'] = df_hour['RV_MA_1hr'].rolling(window=3).mean()
df_hour['RV_MA_12hr'] = df_hour['RV_MA_1hr'].rolling(window=12).mean()

# Simple return relative to the PREVIOUS close: (P_t - P_{t-1}) / P_{t-1}.
# The earlier version divided by the current close, which is not a return.
prev_close = df_hour['close'].shift(1)
df_hour['hourly_return'] = (df_hour['close'] - prev_close) / prev_close

# Next hour's realized volatility.
df_hour["vol_future"] = df_hour["realized_volatility"].shift(-1)

print(df_hour[['realized_volatility','RV_MA_3hr','RV_MA_12hr','hourly_return','vol_future']].head(15))



                           realized_volatility  RV_MA_3hr  RV_MA_12hr  \
hour_utc                                                                
2024-11-05 00:00:00+00:00             0.007705        NaN         NaN   
2024-11-05 01:00:00+00:00             0.008782        NaN         NaN   
2024-11-05 02:00:00+00:00             0.005309        NaN         NaN   
2024-11-05 03:00:00+00:00             0.006122   0.007265         NaN   
2024-11-05 04:00:00+00:00             0.006630   0.006737         NaN   
2024-11-05 05:00:00+00:00             0.007759   0.006020         NaN   
2024-11-05 06:00:00+00:00             0.008698   0.006837         NaN   
2024-11-05 07:00:00+00:00             0.010528   0.007695         NaN   
2024-11-05 08:00:00+00:00             0.010221   0.008995         NaN   
2024-11-05 09:00:00+00:00             0.010653   0.009816         NaN   
2024-11-05 10:00:00+00:00             0.007199   0.010467         NaN   
2024-11-05 11:00:00+00:00             0.009065   0.

In [None]:
numeric_cols = ['btc_exchange_netflow_usd', 'active_sending_addresses', 'active_receiving_addresses', 'onchain_volume_usd', 'open', 'low', 'high', 'close','mint_reward_usd','total_fee_usd','transaction_count','wallet_to_exchange_usd','exchange_to_wallet_usd']


def skew_table(frame, cols):
    """Return a feature/skew table for `cols`, most positively skewed first.

    Missing values are ignored column by column before the skewness is computed.
    """
    skew_vals = frame[cols].apply(lambda x: skew(x.dropna()))
    table = pd.DataFrame({"feature": skew_vals.index, "skew": skew_vals.values})
    return table.sort_values(by="skew", ascending=False)


# Skewness before any transformation.
skew_df = skew_table(df_hour, numeric_cols)
print(skew_df)

skew_col = ['total_fee_usd','exchange_to_wallet_usd','wallet_to_exchange_usd','onchain_volume_usd','transaction_count']

# The transformation below overwrites df_hour in place. Running it a second
# time would log-transform already transformed values, so refuse to do that.
if any(f"{col}_flag" in df_hour.columns for col in skew_col):
    raise RuntimeError(
        "df_hour has already been skew-corrected; rebuild it from the hourly "
        "aggregation cells before running this cell again."
    )

for col in skew_col:
    # Remember which rows had a strictly positive value before transforming.
    df_hour[f"{col}_flag"] = (df_hour[col] > 0).astype(int)

    # log(1 + x) for positive values, 0 otherwise (missing values also become 0).
    # Clipping first means the log is never evaluated on a negative number.
    df_hour[col] = np.log1p(df_hour[col].clip(lower=0)).fillna(0)

# NOTE(review): the transformer is fitted on the full sample. If these features
# later feed a model with a train/test split, fit it on the training window only.
pt = PowerTransformer(method='yeo-johnson')
df_hour[skew_col] = pt.fit_transform(df_hour[skew_col])

# Skewness after the log + Yeo-Johnson transformation.
skew_df = skew_table(df_hour, numeric_cols)
print(skew_df)


                       feature      skew
9                total_fee_usd  9.370760
12      exchange_to_wallet_usd  7.569025
11      wallet_to_exchange_usd  7.295002
3           onchain_volume_usd  2.661770
10           transaction_count  1.170797
1     active_sending_addresses  0.968151
2   active_receiving_addresses  0.586893
8              mint_reward_usd  0.578934
0     btc_exchange_netflow_usd -0.079576
5                          low -0.196818
7                        close -0.197719
4                         open -0.199159
6                         high -0.204823
                       feature      skew
1     active_sending_addresses  0.968151
2   active_receiving_addresses  0.586893
8              mint_reward_usd  0.578934
9                total_fee_usd  0.424899
12      exchange_to_wallet_usd  0.090903
3           onchain_volume_usd  0.056495
10           transaction_count  0.029060
11      wallet_to_exchange_usd -0.042511
0     btc_exchange_netflow_usd -0.079576
5               

In [99]:
# Persist the hourly feature table; the index is written as column 'hour_utc'.
df_hour.to_csv("/Users/hushiqi/Desktop/Data/dune_btc_hour.csv", index_label='hour_utc')