In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler
import os
import joblib

%matplotlib inline

In [None]:
# Configuration
N_STEPS_IN = 30  
N_STEPS_OUT = 7 

def create_multistep_sequences(data, target, n_steps_in, n_steps_out):
    X, y = [], []
    if len(data) <= (n_steps_in + n_steps_out):
        return np.array([]), np.array([])
        
    for i in range(len(data) - n_steps_in - n_steps_out + 1):
        end_ix = i + n_steps_in
        out_end_ix = end_ix + n_steps_out
        seq_x = data[i:end_ix, :]
        seq_y = target[end_ix:out_end_ix, 0]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)

In [None]:
# Load Data
data_path = 'training_data.csv'
if not os.path.exists(data_path):
    print("Error: training_data.csv not found.")
    raise SystemExit('Stopping execution')

print("Loading data...")
df = pd.read_csv(data_path)
df['datetime'] = pd.to_datetime(df['datetime'])
df = df.sort_values('datetime')
df = df.set_index('datetime')

# Resample to 4-Hour (4H)
print("Resampling to 4-Hour (4H)...")
df_4h = df.resample('4H').agg({
    'price': 'mean',
    'volume': 'sum', 
    'Sentiment Score': 'mean'
}).dropna()
print(f"4H Data Shape: {df_4h.shape}")

Loading data...
Resampling to 4-Hour (4H)...
4H Data Shape: (615, 3)


  df_4h = df.resample('4H').agg({


In [None]:
# Regime Detection (ONLINE / NO LEAKAGE)
print("Calculating Rolling Volatility (Regime Proxy)...")
df_4h['return'] = df_4h['price'].pct_change().fillna(0)
df_4h['volatility'] = df_4h['return'].rolling(window=20).std().fillna(0)

print("Volatility Feature Created.")

Calculating Rolling Volatility (Regime Proxy)...
Volatility Feature Created.


In [None]:
# Feature Engineering
# Features: Return, Volume, Sentiment, Volatility
feature_cols = ['return', 'volume', 'Sentiment Score', 'volatility']
target_col = 'return'

print(f"Selected Features: {feature_cols}")

split_idx = int(len(df_4h) * 0.8)
train_df = df_4h.iloc[:split_idx]
test_df = df_4h.iloc[split_idx:]

# Scalers (Updated to RobustScaler for Financial Data to handle outliers)
print("Scaling data with RobustScaler...")
f_scaler = RobustScaler()
t_scaler = RobustScaler()

# Fit on TRAIN only
X_train_s = f_scaler.fit_transform(train_df[feature_cols])
# Transform TEST
X_test_s = f_scaler.transform(test_df[feature_cols])

# Fit Target on TRAIN only
y_train_s = t_scaler.fit_transform(train_df[[target_col]])
y_test_s = t_scaler.transform(test_df[[target_col]])

print(f"Generating sequences: Input={N_STEPS_IN}, Output={N_STEPS_OUT}")

X_train, y_train = create_multistep_sequences(X_train_s, y_train_s, N_STEPS_IN, N_STEPS_OUT)

# Combined for test extraction (sliding window over boundary)
full_inputs = np.vstack([X_train_s, X_test_s])
full_targets = np.vstack([y_train_s, y_test_s])
X_all, y_all = create_multistep_sequences(full_inputs, full_targets, N_STEPS_IN, N_STEPS_OUT)

# Extract Test portion
X_test = X_all[len(X_train):]
y_test = y_all[len(y_train):]

print(f"Train shapes: X={X_train.shape}, y={y_train.shape}")
print(f"Test shapes: X={X_test.shape}, y={y_test.shape}")

# Save
dates = df_4h.index.astype(str).tolist()

np.savez('daily_data.npz', 
            X_train=X_train, y_train=y_train,
            X_test=X_test, y_test=y_test,
            dates=dates,
            n_steps_in=N_STEPS_IN, n_steps_out=N_STEPS_OUT)

joblib.dump(f_scaler, 'feature_scaler.pkl')
joblib.dump(t_scaler, 'target_scaler_daily.pkl')
print("Saved daily_data.npz (Fixed Version: No Leakage + RobustScaler)")

Selected Features: ['return', 'volume', 'Sentiment Score', 'volatility']
Scaling data with RobustScaler...
Generating sequences: Input=30, Output=7
Train shapes: X=(456, 30, 4), y=(456, 7)
Test shapes: X=(123, 30, 4), y=(123, 7)
Saved daily_data.npz (Fixed Version: No Leakage + RobustScaler)
