In [52]:
import pandas as pd
import numpy as np


In [53]:
spot = pd.read_csv(
    "data/processed/ema_regime_strategy_results.csv",
    parse_dates=['date'],
    index_col='date'
)


In [54]:
spot.columns


Index(['open', 'high', 'low', 'close', 'volume', 'log_return', 'regime',
       'signal', 'strategy_return', 'cum_market', 'cum_strategy', 'ema_5',
       'ema_15', 'ema_signal', 'trade_signal', 'position'],
      dtype='object')

In [55]:
# EMA-only position for ML dataset
spot['ml_position'] = 0

for i in range(1, len(spot)):
    if spot['ema_signal'].iloc[i] == 1:
        spot.iloc[i, spot.columns.get_loc('ml_position')] = 1
    elif spot['ema_signal'].iloc[i] == -1:
        spot.iloc[i, spot.columns.get_loc('ml_position')] = -1
    else:
        spot.iloc[i, spot.columns.get_loc('ml_position')] = spot['ml_position'].iloc[i-1]


In [56]:
spot['ml_position'].value_counts()


ml_position
-1    9345
 1    9330
 0       1
Name: count, dtype: int64

In [57]:
spot['position'].value_counts()


position
 0    9342
-1    9330
 1       4
Name: count, dtype: int64

In [58]:
# EMA-only position for ML dataset
spot['ml_position'] = 0

for i in range(1, len(spot)):
    if spot['ema_signal'].iloc[i] == 1:
        spot.iloc[i, spot.columns.get_loc('ml_position')] = 1
    elif spot['ema_signal'].iloc[i] == -1:
        spot.iloc[i, spot.columns.get_loc('ml_position')] = -1
    else:
        spot.iloc[i, spot.columns.get_loc('ml_position')] = spot['ml_position'].iloc[i-1]


In [59]:
spot['ml_position'].value_counts()


ml_position
-1    9345
 1    9330
 0       1
Name: count, dtype: int64

In [60]:
ml_trades = spot.copy()


ml_trades = ml_trades[ml_trades['ema_signal'] != 0].copy()


In [61]:
ml_trades.shape


(1302, 17)

In [62]:
horizon = 10

ml_trades['future_return'] = (
    spot['close']
    .shift(-horizon)
    .loc[ml_trades.index] /
    spot['close'].loc[ml_trades.index] - 1
)


In [63]:
ml_trades['trade_return'] = (
    ml_trades['ema_signal'] * ml_trades['future_return']
)


In [64]:
ml_trades['target'] = (ml_trades['trade_return'] > 0).astype(int)


In [65]:
ml_trades['target'].value_counts()


target
0    697
1    605
Name: count, dtype: int64

In [66]:
features = [
    'ema_5',
    'ema_15',
    'log_return',
    'regime'
]

X = ml_trades[features].copy()
y = ml_trades['target']


In [67]:
X.isna().sum(), y.isna().sum()


(ema_5         0
 ema_15        0
 log_return    0
 regime        0
 dtype: int64,
 np.int64(0))

In [68]:
# Minute of the trading day
X['minute'] = (
    ml_trades.index.hour * 60 +
    ml_trades.index.minute
)

# Day of week (0=Mon, 4=Fri)
X['day_of_week'] = ml_trades.index.dayofweek


In [69]:
X[['minute','day_of_week']].describe()

#“Day-of-week was included as a categorical time feature.
#A small number of observations fell outside regular weekdays
#due to data-source artifacts, which were retained without leakage.”


Unnamed: 0,minute,day_of_week
count,1302.0,1302.0
mean,739.170507,1.997696
std,112.064356,1.406309
min,555.0,0.0
25%,640.0,1.0
50%,745.0,2.0
75%,835.0,3.0
max,925.0,5.0


In [70]:
X.isna().sum()


ema_5          0
ema_15         0
log_return     0
regime         0
minute         0
day_of_week    0
dtype: int64

In [71]:
# Lagged returns
X['log_return_lag1'] = ml_trades['log_return'].shift(1)
X['log_return_lag2'] = ml_trades['log_return'].shift(2)

# EMA spread (trend strength)
X['ema_spread'] = ml_trades['ema_5'] - ml_trades['ema_15']
X['ema_spread_lag1'] = X['ema_spread'].shift(1)


In [72]:
X.isna().sum()


ema_5              0
ema_15             0
log_return         0
regime             0
minute             0
day_of_week        0
log_return_lag1    1
log_return_lag2    2
ema_spread         0
ema_spread_lag1    1
dtype: int64

In [73]:
valid_idx = X.dropna().index
X = X.loc[valid_idx]
y = y.loc[valid_idx]


In [74]:
X.isna().sum(), y.isna().sum()


(ema_5              0
 ema_15             0
 log_return         0
 regime             0
 minute             0
 day_of_week        0
 log_return_lag1    0
 log_return_lag2    0
 ema_spread         0
 ema_spread_lag1    0
 dtype: int64,
 np.int64(0))

In [75]:

X['ema_distance'] = (ml_trades['ema_5'] - ml_trades['ema_15']).abs()


In [76]:
X['recent_volatility'] = (
    ml_trades['log_return']
    .rolling(10)
    .std()
)


In [77]:
X.isna().sum()


ema_5                0
ema_15               0
log_return           0
regime               0
minute               0
day_of_week          0
log_return_lag1      0
log_return_lag2      0
ema_spread           0
ema_spread_lag1      0
ema_distance         0
recent_volatility    7
dtype: int64

In [78]:
valid_idx = X.dropna().index
X = X.loc[valid_idx]
y = y.loc[valid_idx]


In [79]:
X.shape, y.shape



((1293, 12), (1293,))

In [80]:
split_idx = int(len(X) * 0.7)

X_train = X.iloc[:split_idx]
y_train = y.iloc[:split_idx]

X_test = X.iloc[split_idx:]
y_test = y.iloc[split_idx:]


In [81]:
X_train.shape, X_test.shape


((905, 12), (388, 12))

In [82]:
from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(n_splits=5)


In [83]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    random_state=42
)


In [84]:
xgb_model.fit(X_train, y_train)


In [85]:
from sklearn.metrics import classification_report, roc_auc_score

y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

print("ROC AUC:", roc_auc_score(y_test, y_pred_proba))
print(classification_report(y_test, y_pred_proba > 0.5))


ROC AUC: 0.5417920383437624
              precision    recall  f1-score   support

           0       0.56      0.62      0.59       203
           1       0.53      0.47      0.50       185

    accuracy                           0.55       388
   macro avg       0.54      0.54      0.54       388
weighted avg       0.54      0.55      0.54       388



In [86]:
import pandas as pd

feature_importance = pd.Series(
    xgb_model.feature_importances_,
    index=X_train.columns
).sort_values(ascending=False)

feature_importance.head(10)


log_return           0.091884
ema_spread_lag1      0.090183
recent_volatility    0.087284
ema_distance         0.086255
log_return_lag2      0.085822
ema_spread           0.085035
regime               0.084862
minute               0.081093
ema_5                0.080124
log_return_lag1      0.080033
dtype: float32

In [87]:
# Create empty column
ml_trades['ml_confidence'] = np.nan

# Assign only for rows that exist in X
ml_trades.loc[X.index, 'ml_confidence'] = xgb_model.predict_proba(X)[:, 1]


In [88]:
ml_trades['ml_confidence'].isna().sum()


np.int64(9)

In [89]:
ml_trades = ml_trades.dropna(subset=['ml_confidence'])


In [90]:
ml_trades['ml_take_trade'] = ml_trades['ml_confidence'] > 0.5
ml_trades['ml_take_trade'].value_counts()


ml_take_trade
False    730
True     563
Name: count, dtype: int64

In [91]:
ml_trades['ml_take_trade'].value_counts()

ml_take_trade
False    730
True     563
Name: count, dtype: int64

In [92]:
spot['ml_position'] = 0
current_pos = 0

for i in range(1, len(spot)):

    idx = spot.index[i]

    # Entry logic
    if idx in ml_trades.index:
        if ml_trades.loc[idx, 'ml_take_trade']:
            current_pos = spot.loc[idx, 'position']

    # Exit logic (base strategy exits)
    if spot.loc[idx, 'position'] == 0:
        current_pos = 0

    spot.iloc[i, spot.columns.get_loc('ml_position')] = current_pos


In [93]:
spot['ml_position'].value_counts()


ml_position
 0    13011
-1     5665
Name: count, dtype: int64

In [94]:
spot['ml_strategy_return'] = (
    spot['ml_position'].shift(1) * spot['log_return']
)
spot['ml_strategy_return'].fillna(0, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  spot['ml_strategy_return'].fillna(0, inplace=True)


In [95]:
def performance_metrics(returns):
    cum_return = (1 + returns).cumprod()
    total_return = cum_return.iloc[-1] - 1

    sharpe = returns.mean() / returns.std() * np.sqrt(252 * 75)

    drawdown = cum_return / cum_return.cummax() - 1
    max_dd = drawdown.min()

    win_rate = (returns > 0).mean()

    return {
        'Total Return': total_return,
        'Sharpe': sharpe,
        'Max Drawdown': max_dd,
        'Win Rate': win_rate
    }

baseline_metrics = performance_metrics(spot['strategy_return'])
ml_metrics = performance_metrics(spot['ml_strategy_return'])

baseline_metrics, ml_metrics


({'Total Return': np.float64(2.011689715082043),
  'Sharpe': np.float64(11.537049636079674),
  'Max Drawdown': -0.011475594851090953,
  'Win Rate': np.float64(0.2755943456843007)},
 {'Total Return': np.float64(0.2976401310000669),
  'Sharpe': np.float64(4.0262070724785515),
  'Max Drawdown': -0.021248046142044674,
  'Win Rate': np.float64(0.15886699507389163)})

In [96]:
baseline_metrics = performance_metrics(spot['strategy_return'])
ml_metrics = performance_metrics(spot['ml_strategy_return'])

baseline_metrics, ml_metrics


({'Total Return': np.float64(2.011689715082043),
  'Sharpe': np.float64(11.537049636079674),
  'Max Drawdown': -0.011475594851090953,
  'Win Rate': np.float64(0.2755943456843007)},
 {'Total Return': np.float64(0.2976401310000669),
  'Sharpe': np.float64(4.0262070724785515),
  'Max Drawdown': -0.021248046142044674,
  'Win Rate': np.float64(0.15886699507389163)})

In [97]:
spot['ml_size'] = 0.0

spot.loc[spot.index.isin(ml_trades.index), 'ml_size'] = (
    ml_trades['ml_confidence']
)

spot['ml_size'] = spot['ml_size'].clip(0.4, 1.0)

spot['ml_sized_position'] = spot['position'] * spot['ml_size']


In [98]:
spot['ml_sized_return'] = (
    spot['ml_sized_position'].shift(1) * spot['log_return']
)


In [99]:
spot['ml_sized_return'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  spot['ml_sized_return'].fillna(0, inplace=True)


In [100]:
ml_sized_metrics = performance_metrics(spot['ml_sized_return'])

baseline_metrics, ml_metrics, ml_sized_metrics


({'Total Return': np.float64(2.011689715082043),
  'Sharpe': np.float64(11.537049636079674),
  'Max Drawdown': -0.011475594851090953,
  'Win Rate': np.float64(0.2755943456843007)},
 {'Total Return': np.float64(0.2976401310000669),
  'Sharpe': np.float64(4.0262070724785515),
  'Max Drawdown': -0.021248046142044674,
  'Win Rate': np.float64(0.15886699507389163)},
 {'Total Return': np.float64(0.003053244568302249),
  'Sharpe': np.float64(0.10370691631116731),
  'Max Drawdown': -0.033457706134308096,
  'Win Rate': np.float64(0.24791175840651103)})

In [101]:
ml_trades.to_csv("data/ml_trades.csv")


In [102]:
spot.to_csv("data/spot_with_trades.csv")
