In [20]:
import pandas as pd
import re
import numpy as np
import os
import multiprocessing as mp
import logging
import matplotlib.pyplot as plt 
pd.set_option('display.max_columns', 700)  # Show up to 700 columns before truncating
pd.set_option('display.max_rows', None)  # Show all rows (no row truncation)
pd.set_option('display.float_format', '{:.5f}'.format)  # Render floats with 5 decimal places

In [21]:
# Load the processed dataset. Columns visible in the output below:
# coin, date, RET, close, high, low, open, volume.
# NOTE(review): the "Unnamed: 0" column in the output looks like a row index that an
# earlier save wrote into the CSV - confirm, and consider index_col=0 so it does not
# travel into the feature file as if it were a feature.
df = pd.read_csv("Data/DF_Combined_2025_1h_Processed.csv")
df.head()

Unnamed: 0,coin,date,RET,close,high,low,open,volume
0,AAVEUSDT,2024-12-31 23:00:00,-0.00287,308.7,311.66,307.85,309.59,16569.7
1,AAVEUSDT,2025-01-01 00:00:00,0.01591,313.61,313.72,308.72,308.73,27473.9
2,AAVEUSDT,2025-01-01 01:00:00,-0.00077,313.37,314.28,311.37,313.61,23172.2
3,AAVEUSDT,2025-01-01 02:00:00,0.00377,314.55,316.92,313.21,313.37,26176.5
4,AAVEUSDT,2025-01-01 03:00:00,-0.00741,312.22,314.58,311.81,314.53,13005.1


In [22]:
# --- Mid-to-Long Term Momentum Setup ---
# For every (short, long) window pair this cell adds a column
#     Mom_S{short}_L{long} = (SMA_short - LMA_long) / LMA_long
# where both moving averages are rolling means of `close`, computed per coin.
# Window lengths are counted in rows; the labels below assume one row per hour
# (the input file is named *_1h_*).

# Short moving-average windows
SHORT_WINDOWS = [10, 24, 72]         # 10h, 1D, 3D

# Long moving-average windows
LONG_WINDOWS = [168, 336, 720]       # 1W, 2W, 1M (30 days)

# Rolling windows follow row order, so order every coin's rows by time first.
# `date` is still text here; 'YYYY-MM-DD HH:MM:SS' strings sort chronologically.
df = df.sort_values(by=['coin', 'date'])

close_by_coin = df.groupby('coin')['close']

# Every rolling mean is shared by several pairs, so compute each one once and keep
# it as a standalone Series. Nothing temporary is written into `df`, so an existing
# column that happened to be called SMA_* / LMA_* can no longer be overwritten and dropped.
# min_periods == window: a value is NaN until the coin has a full window of history.
sma_by_window = {
    w: close_by_coin.transform(lambda x, w=w: x.rolling(window=w, min_periods=w).mean())
    for w in SHORT_WINDOWS
}
lma_by_window = {
    w: close_by_coin.transform(lambda x, w=w: x.rolling(window=w, min_periods=w).mean())
    for w in LONG_WINDOWS
}

# --- Loop Through All Valid Combinations ---
for short_w in SHORT_WINDOWS:
    for long_w in LONG_WINDOWS:
        # Only pairs where the "short" window really is the shorter one are meaningful.
        if short_w >= long_w:
            continue
        lma = lma_by_window[long_w]
        # Momentum = (SMA_short - LMA_long) / LMA_long
        df[f'Mom_S{short_w}_L{long_w}'] = (sma_by_window[short_w] - lma) / lma

print("Calculated Momentum Features:")
print([col for col in df.columns if col.startswith('Mom_S')])
df.head()


Calculated Momentum Features:
['Mom_S10_L168', 'Mom_S10_L336', 'Mom_S10_L720', 'Mom_S24_L168', 'Mom_S24_L336', 'Mom_S24_L720', 'Mom_S72_L168', 'Mom_S72_L336', 'Mom_S72_L720']


Unnamed: 0,coin,date,RET,close,high,low,open,volume,Mom_S10_L168,Mom_S10_L336,Mom_S10_L720,Mom_S24_L168,Mom_S24_L336,Mom_S24_L720,Mom_S72_L168,Mom_S72_L336,Mom_S72_L720
0,AAVEUSDT,2024-12-31 23:00:00,-0.00287,308.7,311.66,307.85,309.59,16569.7,,,,,,,,,
1,AAVEUSDT,2025-01-01 00:00:00,0.01591,313.61,313.72,308.72,308.73,27473.9,,,,,,,,,
2,AAVEUSDT,2025-01-01 01:00:00,-0.00077,313.37,314.28,311.37,313.61,23172.2,,,,,,,,,
3,AAVEUSDT,2025-01-01 02:00:00,0.00377,314.55,316.92,313.21,313.37,26176.5,,,,,,,,,
4,AAVEUSDT,2025-01-01 03:00:00,-0.00741,312.22,314.58,311.81,314.53,13005.1,,,,,,,,,


In [23]:
def add_rsi_features(df: pd.DataFrame, windows=(7, 14, 28, 50, 75, 100)) -> pd.DataFrame:
    """
    Add one ``RSI_{window}`` column per entry of ``windows``, computed per coin.

    The RSI here is the simple-moving-average variant: average gain and average
    loss are plain rolling means of the positive / negative parts of the
    row-to-row change in ``close`` (no exponential/Wilder smoothing).

    Parameters
    ----------
    df : DataFrame with at least the columns ``coin``, ``date`` and ``close``.
        It is not modified; a sorted copy is returned.
    windows : iterable of int
        Rolling window lengths, in rows.

    Returns
    -------
    DataFrame
        Copy of ``df`` sorted by ``['coin', 'date']`` (original index labels are
        kept) with the new ``RSI_{window}`` columns appended in ``windows`` order.
        ``date`` is converted to datetime if it was not already (unparseable
        values become NaT). Each RSI is NaN until ``window`` price changes are
        available for the coin.

    Results are written back by row position, so the index may contain duplicate
    labels (e.g. per-coin frames concatenated without ``ignore_index``).
    """
    windows = tuple(windows)  # iterated more than once below
    out = df.copy()

    # ensure proper ordering within each coin
    # (is_datetime64_any_dtype also recognises tz-aware datetime columns)
    if not pd.api.types.is_datetime64_any_dtype(out['date']):
        out['date'] = pd.to_datetime(out['date'], errors='coerce')
    out.sort_values(['coin', 'date'], inplace=True)

    # Work on a 0..n-1 indexed view of `close` so that every group's index IS its
    # row position in `out`; label-based assignment would mis-select rows when
    # index labels repeat across coins.
    close = out['close'].reset_index(drop=True)
    rsi_by_window = {w: np.full(len(out), np.nan) for w in windows}

    for _, coin_close in close.groupby(out['coin'].to_numpy()):
        rows = coin_close.index.to_numpy()

        # Gains and losses do not depend on the window: compute them once per coin.
        delta = coin_close.diff()
        gain = delta.clip(lower=0.0)
        loss = (-delta).clip(lower=0.0)

        for w in windows:
            avg_gain = gain.rolling(w, min_periods=w).mean()
            avg_loss = loss.rolling(w, min_periods=w).mean()
            # The small epsilon avoids division by zero when there were no losses.
            rs = avg_gain / (avg_loss + 1e-12)
            rsi_by_window[w][rows] = (100 - (100 / (1 + rs))).to_numpy()

    for w in windows:
        out[f'RSI_{w}'] = rsi_by_window[w]

    return out


# RSI look-back windows in rows: 28, 48, 72, 120, 168, 336 and 720
# (the last six are 2, 3, 5, 7, 14 and 30 days if each row is one hour -
# the input file is named *_1h_*).
df = add_rsi_features(df, windows=(28, 24*2, 24*3, 24*5, 24*7, 24*14, 24*30))
df.head()


Unnamed: 0,coin,date,RET,close,high,low,open,volume,Mom_S10_L168,Mom_S10_L336,Mom_S10_L720,Mom_S24_L168,Mom_S24_L336,Mom_S24_L720,Mom_S72_L168,Mom_S72_L336,Mom_S72_L720,RSI_28,RSI_48,RSI_72,RSI_120,RSI_168,RSI_336,RSI_720
0,AAVEUSDT,2024-12-31 23:00:00,-0.00287,308.7,311.66,307.85,309.59,16569.7,,,,,,,,,,,,,,,,
1,AAVEUSDT,2025-01-01 00:00:00,0.01591,313.61,313.72,308.72,308.73,27473.9,,,,,,,,,,,,,,,,
2,AAVEUSDT,2025-01-01 01:00:00,-0.00077,313.37,314.28,311.37,313.61,23172.2,,,,,,,,,,,,,,,,
3,AAVEUSDT,2025-01-01 02:00:00,0.00377,314.55,316.92,313.21,313.37,26176.5,,,,,,,,,,,,,,,,
4,AAVEUSDT,2025-01-01 03:00:00,-0.00741,312.22,314.58,311.81,314.53,13005.1,,,,,,,,,,,,,,,,


In [24]:
# Volume features, computed independently for every coin.
volume_by_coin = df.groupby('coin')['volume']

# Z-score of volume against the coin's own mean and standard deviation.
# NOTE(review): mean/std are taken over the coin's ENTIRE history, so every row's
# value depends on later rows. If these columns feed a forecasting model that is
# look-ahead - consider a rolling or expanding z-score instead.
df['volume_z'] = volume_by_coin.transform(
    lambda x: (x - x.mean()) / x.std()
)

# Row-over-row relative change in volume. When the previous volume is 0,
# pct_change yields +/-inf; inf is not removed by the later dropna() and would end
# up in the saved feature file, so it is mapped to NaN here.
df['volume_change'] = volume_by_coin.pct_change().replace([np.inf, -np.inf], np.nan)
df.head()

Unnamed: 0,coin,date,RET,close,high,low,open,volume,Mom_S10_L168,Mom_S10_L336,Mom_S10_L720,Mom_S24_L168,Mom_S24_L336,Mom_S24_L720,Mom_S72_L168,Mom_S72_L336,Mom_S72_L720,RSI_28,RSI_48,RSI_72,RSI_120,RSI_168,RSI_336,RSI_720,volume_z,volume_change
0,AAVEUSDT,2024-12-31 23:00:00,-0.00287,308.7,311.66,307.85,309.59,16569.7,,,,,,,,,,,,,,,,,-0.73621,
1,AAVEUSDT,2025-01-01 00:00:00,0.01591,313.61,313.72,308.72,308.73,27473.9,,,,,,,,,,,,,,,,,-0.44968,0.65808
2,AAVEUSDT,2025-01-01 01:00:00,-0.00077,313.37,314.28,311.37,313.61,23172.2,,,,,,,,,,,,,,,,,-0.56272,-0.15657
3,AAVEUSDT,2025-01-01 02:00:00,0.00377,314.55,316.92,313.21,313.37,26176.5,,,,,,,,,,,,,,,,,-0.48377,0.12965
4,AAVEUSDT,2025-01-01 03:00:00,-0.00741,312.22,314.58,311.81,314.53,13005.1,,,,,,,,,,,,,,,,,-0.82987,-0.50318


In [25]:
def add_mfi_features(df: pd.DataFrame, windows=(14, 28, 50, 75, 100)) -> pd.DataFrame:
    """
    Add one ``MFI_{window}`` (Money Flow Index) column per entry of ``windows``,
    computed per coin.

    For each row the typical price is ``(high + low + close) / 3`` and the money
    flow is ``typical price * volume``. A row's flow counts as "up" when its
    typical price is above the coin's previous row, "down" when below, and as
    neither when equal or when there is no previous row. Then

        MFI_w = 100 * up_sum_w / (up_sum_w + down_sum_w)

    with both sums taken over the last ``w`` rows of the coin.

    Parameters
    ----------
    df : DataFrame with at least ``coin``, ``date``, ``high``, ``low``, ``close``
        and ``volume``. It is not modified.
    windows : iterable of int
        Rolling window lengths, in rows.

    Returns
    -------
    DataFrame
        Copy of ``df`` ordered by ``['coin', 'date']`` with a fresh 0..n-1 index
        and the ``MFI_{window}`` columns appended in ``windows`` order. All input
        columns, including ``coin``, keep their position. ``date`` is converted
        to datetime if it was not already (unparseable values become NaT). Rows
        whose ``coin`` is missing belong to no series and are dropped. Each MFI
        is NaN for a coin's first ``window - 1`` rows.
    """
    windows = tuple(windows)
    out = df.copy()
    # is_datetime64_any_dtype also recognises tz-aware datetime columns
    if not pd.api.types.is_datetime64_any_dtype(out['date']):
        out['date'] = pd.to_datetime(out['date'], errors='coerce')
    out = (
        out.dropna(subset=['coin'])
        .sort_values(['coin', 'date'])
        .reset_index(drop=True)
    )

    coin = out['coin']
    tp = (out['high'] + out['low'] + out['close']) / 3.0
    mf = tp * out['volume']

    # Previous typical price within the same coin; NaN on each coin's first row,
    # which makes both comparisons below False (flow counted as 0 on that row).
    prev_tp = tp.groupby(coin).shift(1)
    up_flow = mf.where(tp > prev_tp, 0.0)
    down_flow = mf.where(tp < prev_tp, 0.0)

    # Group-wise transforms keep every original column in place, so there is no
    # need for groupby.apply on the whole frame (whose handling of the grouping
    # column is deprecated) nor to re-attach `coin` afterwards.
    for w in windows:
        up_sum = up_flow.groupby(coin).transform(
            lambda s: s.rolling(w, min_periods=w).sum()
        )
        dn_sum = down_flow.groupby(coin).transform(
            lambda s: s.rolling(w, min_periods=w).sum()
        )
        # The small epsilon avoids 0/0 when a window has no flow in either direction.
        out[f"MFI_{w}"] = 100.0 * (up_sum / (up_sum + dn_sum + 1e-12))

    return out


# Add MFI columns for look-back windows of 24, 72, 120, 168, 336 and 720 rows
# (1, 3, 5, 7, 14 and 30 days if each row is one hour - the input file is named *_1h_*).
df = add_mfi_features(df, windows=(24, 24*3, 24*5, 24*7, 24*14, 24*30))
df.head()


  out = out.groupby('coin', group_keys=False).apply(_compute).reset_index(drop=True)


Unnamed: 0,coin,date,RET,close,high,low,open,volume,Mom_S10_L168,Mom_S10_L336,Mom_S10_L720,Mom_S24_L168,Mom_S24_L336,Mom_S24_L720,Mom_S72_L168,Mom_S72_L336,Mom_S72_L720,RSI_28,RSI_48,RSI_72,RSI_120,RSI_168,RSI_336,RSI_720,volume_z,volume_change,MFI_24,MFI_72,MFI_120,MFI_168,MFI_336,MFI_720
0,AAVEUSDT,2024-12-31 23:00:00,-0.00287,308.7,311.66,307.85,309.59,16569.7,,,,,,,,,,,,,,,,,-0.73621,,,,,,,
1,AAVEUSDT,2025-01-01 00:00:00,0.01591,313.61,313.72,308.72,308.73,27473.9,,,,,,,,,,,,,,,,,-0.44968,0.65808,,,,,,
2,AAVEUSDT,2025-01-01 01:00:00,-0.00077,313.37,314.28,311.37,313.61,23172.2,,,,,,,,,,,,,,,,,-0.56272,-0.15657,,,,,,
3,AAVEUSDT,2025-01-01 02:00:00,0.00377,314.55,316.92,313.21,313.37,26176.5,,,,,,,,,,,,,,,,,-0.48377,0.12965,,,,,,
4,AAVEUSDT,2025-01-01 03:00:00,-0.00741,312.22,314.58,311.81,314.53,13005.1,,,,,,,,,,,,,,,,,-0.82987,-0.50318,,,,,,


In [26]:
# Look-ahead horizons, in rows (hours if each row is one hour - the input file is named *_1h_*)
HORIZONS = [24, 24*3, 24*5, 24*7, 24*14, 24*30]


def calculate_future_cumulative_return(group, hours):
    """
    Simple cumulative return over the NEXT ``hours`` rows of a single coin.

    ``group`` is that coin's ``RET`` Series, or a DataFrame holding a ``RET``
    column. For row t the result is

        (1 + R_{t+1}) * (1 + R_{t+2}) * ... * (1 + R_{t+hours}) - 1

    i.e. the row's own return R_t is not included. The last ``hours`` rows have
    no complete future window and are NaN.
    """
    returns = group['RET'] if isinstance(group, pd.DataFrame) else group
    # Product of gross returns over the trailing window that ENDS at each row ...
    gross_return = (1 + returns).rolling(window=hours).apply(np.prod, raw=True)
    # ... moved `hours` rows earlier, so row t sees the window covering t+1 .. t+hours.
    return gross_return.shift(-hours) - 1


# 1. Create the future return columns, named 'Future_RET_{h}H'.
# transform() hands each coin's RET series to the helper and always returns one
# Series aligned to df's rows, so this does not depend on how groupby.apply
# chooses to stack per-group results.
for h in HORIZONS:
    df[f'Future_RET_{h}H'] = df.groupby('coin')['RET'].transform(
        lambda r: calculate_future_cumulative_return(r, h)
    )


# Keep only fully populated rows. dropna() without `subset` removes a row if ANY
# column is NaN, so this drops both the tail of each coin (no complete future
# window for the longest horizon) and the warm-up rows at the start where the
# longest rolling features are still NaN.
df = df.dropna()
df.head()

Unnamed: 0,coin,date,RET,close,high,low,open,volume,Mom_S10_L168,Mom_S10_L336,Mom_S10_L720,Mom_S24_L168,Mom_S24_L336,Mom_S24_L720,Mom_S72_L168,Mom_S72_L336,Mom_S72_L720,RSI_28,RSI_48,RSI_72,RSI_120,RSI_168,RSI_336,RSI_720,volume_z,volume_change,MFI_24,MFI_72,MFI_120,MFI_168,MFI_336,MFI_720,Future_RET_24H,Future_RET_72H,Future_RET_120H,Future_RET_168H,Future_RET_336H,Future_RET_720H
720,AAVEUSDT,2025-01-30 23:00:00,-0.00574,315.36,319.48,315.13,317.18,15502.9,0.00683,-0.02506,-0.00173,-0.01778,-0.04889,-0.02613,-0.0487,-0.07883,-0.05679,66.82228,65.17831,53.43246,48.11259,47.28295,50.19439,50.15723,-0.76424,0.62635,69.78539,54.98108,50.69735,48.52804,52.35545,51.24792,0.05467,-0.18119,-0.13753,-0.23478,-0.18747,-0.38185
721,AAVEUSDT,2025-01-31 00:00:00,0.00999,318.51,318.63,313.26,315.36,25200.6,0.00734,-0.02484,-0.00147,-0.01409,-0.04558,-0.02271,-0.04771,-0.07813,-0.05604,67.66552,66.19758,55.76252,48.40233,48.47991,50.25958,50.11578,-0.50942,0.62554,74.56711,56.08476,50.76109,49.07325,52.37237,51.24502,0.04,-0.21889,-0.15943,-0.24351,-0.20134,-0.38956
722,AAVEUSDT,2025-01-31 01:00:00,0.00546,320.25,320.41,316.68,318.5,20953.7,0.00894,-0.02347,-0.00012,-0.01052,-0.0423,-0.0194,-0.0466,-0.07722,-0.05516,67.62997,65.50172,56.84044,48.9077,48.6525,49.89999,50.16245,-0.62101,-0.16852,74.18358,57.14448,51.20302,49.47103,52.29083,51.24223,0.02948,-0.32053,-0.16465,-0.23663,-0.20322,-0.395
723,AAVEUSDT,2025-01-31 02:00:00,-0.00987,317.09,320.75,315.63,320.25,18808.8,0.00923,-0.0234,-8e-05,-0.00757,-0.03965,-0.01673,-0.04568,-0.07653,-0.05448,68.63245,64.00636,55.13464,48.42787,48.47556,49.88299,50.05992,-0.67737,-0.10236,71.33125,56.35625,51.2237,49.53734,51.98588,51.17069,0.04084,-0.31496,-0.15122,-0.22221,-0.19118,-0.38888
724,AAVEUSDT,2025-01-31 03:00:00,-0.00369,315.92,317.6,314.0,317.07,22673.2,0.0089,-0.02405,-0.00078,-0.00513,-0.03762,-0.01467,-0.04475,-0.07595,-0.05392,70.60174,61.78172,54.31518,47.60089,47.62576,49.91208,50.08733,-0.57583,0.20546,67.75794,55.45892,50.65738,48.89632,51.98633,51.15471,0.03191,-0.27314,-0.15824,-0.22123,-0.17688,-0.38532


In [27]:
# Direction labels: one BIN_<name> column for every Future_RET_* column,
# +1 where the future return is strictly positive, -1 otherwise (zero counts as -1).
future_return_columns = tuple(
    filter(lambda name: name.startswith("Future_RET_"), df.columns)
)
for col in future_return_columns:
    is_gain = df[col].gt(0)
    df[f"BIN_{col}"] = np.where(is_gain, 1, -1)
df.head()

Unnamed: 0,coin,date,RET,close,high,low,open,volume,Mom_S10_L168,Mom_S10_L336,Mom_S10_L720,Mom_S24_L168,Mom_S24_L336,Mom_S24_L720,Mom_S72_L168,Mom_S72_L336,Mom_S72_L720,RSI_28,RSI_48,RSI_72,RSI_120,RSI_168,RSI_336,RSI_720,volume_z,volume_change,MFI_24,MFI_72,MFI_120,MFI_168,MFI_336,MFI_720,Future_RET_24H,Future_RET_72H,Future_RET_120H,Future_RET_168H,Future_RET_336H,Future_RET_720H,BIN_Future_RET_24H,BIN_Future_RET_72H,BIN_Future_RET_120H,BIN_Future_RET_168H,BIN_Future_RET_336H,BIN_Future_RET_720H
720,AAVEUSDT,2025-01-30 23:00:00,-0.00574,315.36,319.48,315.13,317.18,15502.9,0.00683,-0.02506,-0.00173,-0.01778,-0.04889,-0.02613,-0.0487,-0.07883,-0.05679,66.82228,65.17831,53.43246,48.11259,47.28295,50.19439,50.15723,-0.76424,0.62635,69.78539,54.98108,50.69735,48.52804,52.35545,51.24792,0.05467,-0.18119,-0.13753,-0.23478,-0.18747,-0.38185,1,-1,-1,-1,-1,-1
721,AAVEUSDT,2025-01-31 00:00:00,0.00999,318.51,318.63,313.26,315.36,25200.6,0.00734,-0.02484,-0.00147,-0.01409,-0.04558,-0.02271,-0.04771,-0.07813,-0.05604,67.66552,66.19758,55.76252,48.40233,48.47991,50.25958,50.11578,-0.50942,0.62554,74.56711,56.08476,50.76109,49.07325,52.37237,51.24502,0.04,-0.21889,-0.15943,-0.24351,-0.20134,-0.38956,1,-1,-1,-1,-1,-1
722,AAVEUSDT,2025-01-31 01:00:00,0.00546,320.25,320.41,316.68,318.5,20953.7,0.00894,-0.02347,-0.00012,-0.01052,-0.0423,-0.0194,-0.0466,-0.07722,-0.05516,67.62997,65.50172,56.84044,48.9077,48.6525,49.89999,50.16245,-0.62101,-0.16852,74.18358,57.14448,51.20302,49.47103,52.29083,51.24223,0.02948,-0.32053,-0.16465,-0.23663,-0.20322,-0.395,1,-1,-1,-1,-1,-1
723,AAVEUSDT,2025-01-31 02:00:00,-0.00987,317.09,320.75,315.63,320.25,18808.8,0.00923,-0.0234,-8e-05,-0.00757,-0.03965,-0.01673,-0.04568,-0.07653,-0.05448,68.63245,64.00636,55.13464,48.42787,48.47556,49.88299,50.05992,-0.67737,-0.10236,71.33125,56.35625,51.2237,49.53734,51.98588,51.17069,0.04084,-0.31496,-0.15122,-0.22221,-0.19118,-0.38888,1,-1,-1,-1,-1,-1
724,AAVEUSDT,2025-01-31 03:00:00,-0.00369,315.92,317.6,314.0,317.07,22673.2,0.0089,-0.02405,-0.00078,-0.00513,-0.03762,-0.01467,-0.04475,-0.07595,-0.05392,70.60174,61.78172,54.31518,47.60089,47.62576,49.91208,50.08733,-0.57583,0.20546,67.75794,55.45892,50.65738,48.89632,51.98633,51.15471,0.03191,-0.27314,-0.15824,-0.22123,-0.17688,-0.38532,1,-1,-1,-1,-1,-1


In [28]:
# Persist the feature/target frame; index=False keeps the row index out of the file.
df.to_csv("Data/DF_Combined_2025_1h_Featured.csv", index=False)