In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import joblib
import os

In [42]:
# Coin identifier; it selects the processed CSV read here and names the output
# folders written further down.
COIN = "binancecoin"

# Parse `open_time` as datetimes while reading, use it as the index, and sort
# ascending so that the positional (iloc) split below follows time order.
df = (
    pd.read_csv(f"../data/processed/{COIN}_processed.csv", parse_dates=["open_time"])
    .set_index("open_time")
    .sort_index()
)

# Train/Val/Test split

In [43]:
# Fraction of rows (taken from the start of the frame) used for training.
train_ratio = 0.7
# Fraction of rows immediately after the training rows used for validation;
# every row after that goes to the test split.
val_ratio = 0.15

In [44]:
# Positional cut points: [0, train_end) -> train, [train_end, val_end) -> val,
# [val_end, n) -> test. Both sizes are truncated with int(), so any rounding
# remainder ends up in the test split.
n = len(df)
train_end = int(train_ratio * n)
val_end = int(val_ratio * n) + train_end

# .copy() gives each split its own frame so later column assignments do not
# write back into `df`.
df_train, df_val, df_test = (
    df.iloc[start:stop].copy()
    for start, stop in ((None, train_end), (train_end, val_end), (val_end, None))
)

print("Train:", df_train.shape, "Val:", df_val.shape, "Test:", df_test.shape)

Train: (30680, 12) Val: (6574, 12) Test: (6576, 12)


# Scaling

In [45]:
# Columns fed to the model as inputs, in the order they appear along the last
# axis of the saved X arrays. "close" is not listed here; it is scaled
# separately into the "close_scaled" target column.
feature_cols = [
    # price / volume columns
    "open",
    "high",
    "low",
    "volume",
    # return and volatility columns
    "return_1h",
    "volatility_24h",
    # moving-average columns
    "ma_24",
    "ma_168",
    "ma_ratio",
    # volume change and missing-data indicator
    "vol_change",
    "missing_flag",
]

In [46]:
# Feature scaler: min/max are learned from EVERY row of `df` (train + val +
# test), which is the stated intent here so that the scaler's range also covers
# the most recent prices.
# NOTE(review): because the range includes validation/test rows, the scaled
# training data carries information from later periods, so held-out metrics may
# look better than they would with a train-only fit. Confirm this trade-off is
# acceptable.
feature_scaler = MinMaxScaler().fit(df[feature_cols])

# Apply the same fitted scaler to each split, overwriting the raw feature
# columns in place.
for _split in (df_train, df_val, df_test):
    _split[feature_cols] = feature_scaler.transform(_split[feature_cols])

In [47]:
# Target scaler: a separate MinMaxScaler for "close" only, so predictions can
# be mapped back to prices without involving the feature columns. Like the
# feature scaler, it is fitted on the full dataset rather than on train alone.
# NOTE(review): this means the target range is informed by val/test prices;
# confirm that is intended for evaluation as well as for live use.
price_scaler = MinMaxScaler().fit(df[["close"]])

# Add the scaled target as a new column on each split; the raw "close" column
# is left untouched.
for _split in (df_train, df_val, df_test):
    _split["close_scaled"] = price_scaler.transform(_split[["close"]])

In [48]:
# Persist both fitted scalers in a per-coin directory (created if missing).
SAVE_DIR = f"../data/scaled/{COIN}"
os.makedirs(SAVE_DIR, exist_ok=True)

for _stem, _scaler in (
    ("feature_scaler", feature_scaler),
    ("price_scaler", price_scaler),
):
    joblib.dump(_scaler, f"{SAVE_DIR}/{_stem}.pkl")

# Report where the files went and the raw close-price range the target scaler
# maps onto [0, 1].
print("Scalers saved to:", SAVE_DIR)
print(f"Price range: ${price_scaler.data_min_[0]:,.2f} - ${price_scaler.data_max_[0]:,.2f}")

Scalers saved to: ../data/scaled/binancecoin
Price range: $26.83 - $1,368.76


In [49]:
# Forecast offsets (in rows) for which a separate set of sequences is built.
HORIZONS = [1, 24]
# Number of consecutive rows in each input window (the "past 48h" of features).
SEQ_LEN = 48

In [50]:
def create_sequence(df, horizon, seq_len=None, features=None):
    """Build sliding-window inputs (X) and look-ahead targets (y) from one split.

    For every anchor row ``i`` in ``range(seq_len, len(df) - horizon)``:

    * ``X`` receives rows ``i - seq_len`` .. ``i - 1`` of the feature columns;
    * ``y`` receives ``close_scaled`` at row ``i + horizon``.

    NOTE(review): with this indexing the target sits ``horizon + 1`` rows after
    the last row of its window (row ``i`` itself is in neither X nor y). The
    indexing is kept unchanged so existing saved datasets stay comparable;
    confirm that this offset is the intended meaning of "h hours ahead".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and a ``close_scaled`` column. Rows
        are consumed in their existing order.
    horizon : int
        Non-negative row offset of the target relative to the anchor row.
    seq_len : int, optional
        Window length in rows. Defaults to the module-level ``SEQ_LEN``.
    features : list of str, optional
        Feature column names. Defaults to the module-level ``feature_cols``.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, seq_len, n_features)
    y : numpy.ndarray, shape (n_samples,)
        ``n_samples`` is ``max(len(df) - seq_len - horizon, 0)``. When the
        frame is too short for a single sample, correctly shaped empty arrays
        are returned (rather than two shape-(0,) arrays).

    Raises
    ------
    ValueError
        If ``seq_len`` is smaller than 1 or ``horizon`` is negative.
    """
    # Resolve defaults lazily so the notebook-level constants are only needed
    # when the caller does not supply explicit values.
    if seq_len is None:
        seq_len = SEQ_LEN
    if features is None:
        features = feature_cols

    if seq_len < 1:
        raise ValueError(f"seq_len must be >= 1, got {seq_len}")
    if horizon < 0:
        raise ValueError(f"horizon must be >= 0, got {horizon}")

    data_X = df[features].values          # (N, n_features)
    data_y = df["close_scaled"].values    # (N,)

    n_samples = len(df) - seq_len - horizon
    if n_samples <= 0:
        # Keep the rank consistent so downstream code can rely on X.shape[1:].
        empty_X = np.empty((0, seq_len, data_X.shape[1]), dtype=data_X.dtype)
        empty_y = np.empty((0,), dtype=data_y.dtype)
        return empty_X, empty_y

    # Row k of `window_rows` lists the seq_len consecutive row numbers of
    # window k (k .. k + seq_len - 1); indexing with it gathers all windows in
    # one step and yields a fresh (n_samples, seq_len, n_features) array.
    window_rows = np.arange(n_samples)[:, None] + np.arange(seq_len)[None, :]
    X = data_X[window_rows]

    # Window k has anchor i = k + seq_len, so its target is row
    # k + seq_len + horizon; over all k that is simply the tail of data_y.
    y = data_y[seq_len + horizon:].copy()

    return X, y

In [51]:
# Build and persist the train/val/test sequences for every forecast horizon.
for h in HORIZONS:
    # Windows are cut inside each split separately, so no sample mixes rows
    # from two different splits.
    X_tr, y_tr = create_sequence(df_train, h)
    X_v,  y_v  = create_sequence(df_val, h)
    X_te, y_te = create_sequence(df_test, h)

    # One folder per coin and horizon, e.g. ../data/sequences/<coin>/24h/
    PATH = f"../data/sequences/{COIN}/{h}h/"
    os.makedirs(PATH, exist_ok=True)

    # File stem -> array; each is written as <stem>.npy inside PATH.
    for _stem, _arr in (
        ("X_train", X_tr),
        ("y_train", y_tr),
        ("X_val", X_v),
        ("y_val", y_v),
        ("X_test", X_te),
        ("y_test", y_te),
    ):
        np.save(PATH + _stem + ".npy", _arr)

    # The arrow and emoji below were stored as mis-decoded UTF-8 bytes; they
    # are restored to the intended characters.
    print(f"{h}h → Train:{X_tr.shape}, Val:{X_v.shape}, Test:{X_te.shape}")

print("\n🎉 DONE! All sequences and scalers saved successfully.")

1h â†’ Train:(30631, 48, 11), Val:(6525, 48, 11), Test:(6527, 48, 11)
24h â†’ Train:(30608, 48, 11), Val:(6502, 48, 11), Test:(6504, 48, 11)

ðŸŽ‰ DONE! All sequences and scalers saved successfully.
