In [1]:
%pip install optuna



In [2]:
import gc, json, pickle, warnings, optuna, joblib
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from google.colab import drive
from joblib import Parallel, delayed
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import silhouette_score
from sklearn.model_selection import TimeSeriesSplit
from collections import defaultdict
from scipy.stats import pearsonr, zscore
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram

In [3]:
# Mount Google Drive so the /content/drive/... paths used by later cells are reachable.
drive.mount('/content/drive')
# Silence FutureWarning messages that originate from sklearn.utils.deprecation.
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn.utils.deprecation")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
class DataLoader:
  '''
    Reads one project file located at `data_path + file_name`.

    The two strings are concatenated verbatim: no path separator is inserted,
    so `data_path` must already end the way the caller wants.
  '''

  # Quantity / volume columns that are split off from the remaining features.
  _QTY_VOL_COLUMNS = ('bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume')

  def __init__(self, data_path, file_name):
    self.data_path = data_path
    self.file_name = file_name

  def _full_path(self):
    '''Return the concatenated location of the target file.'''
    return self.data_path + self.file_name

  def load(self):
    '''
      Read the parquet file into `self.df` and drop unusable columns.

      A column is dropped when it holds at least one -inf value or when every
      value in it equals 0. Returns `self` so that calls can be chained.
    '''
    raw = pd.read_parquet(self._full_path())
    has_neg_inf = (raw == -np.inf).any()
    all_zero = (raw == 0).all()
    self.df = raw.loc[:, ~(has_neg_inf | all_zero)]
    print('Data loaded')
    return self

  def train_split(self):
    '''
      Split the loaded frame into three parts.

      Returns (X_, qty_vol, label): every column except the label and the
      quantity/volume columns, the quantity/volume columns, and the label.
      Requires `load()` to have been called first.
    '''
    qty_vol_names = list(self._QTY_VOL_COLUMNS)
    label = self.df['label']
    qty_vol = self.df[qty_vol_names]
    X_ = self.df.drop(columns=['label'] + qty_vol_names)
    return X_, qty_vol, label

  def test_split(self):
    '''
      Split the loaded frame into (X_, qty_vol) without touching a label column.
      Requires `load()` to have been called first.
    '''
    qty_vol_names = list(self._QTY_VOL_COLUMNS)
    qty_vol = self.df[qty_vol_names]
    X_ = self.df.drop(columns=qty_vol_names)
    return X_, qty_vol

  def load_features_json_list(self):
    '''Parse the target file as JSON and return the decoded object.'''
    with open(self._full_path(), 'r') as handle:
      return json.load(handle)

  def load_clusters_pkl(self):
    '''Unpickle the target file and return the stored object.'''
    # pickle.load can execute arbitrary code: only point this at trusted files.
    with open(self._full_path(), 'rb') as handle:
      return pickle.load(handle)

In [5]:
class FeatureGenerator:
  '''
    Derives and standardizes features from the frame held in `self.df`.

    `epsilon` is added to denominators to avoid division by zero.
  '''

  def __init__(self, df, epsilon=1e-6):
    self.df = df
    self.epsilon = epsilon

  def develop_features_from_qty_volume(self):
    '''
      Derive three features from the bid_qty, ask_qty, buy_qty, sell_qty and
      volume columns:
        imbalance      - (bid_qty - ask_qty) / (bid_qty + ask_qty + epsilon)
        buy_sell_ratio - log1p(buy_qty / (sell_qty + epsilon))
        volume_z       - z-score of volume computed over the whole column
      Returns a DataFrame that shares the index of `self.df`.
    '''
    data, eps = self.df, self.epsilon
    bid, ask = data['bid_qty'], data['ask_qty']
    derived = {
        'imbalance': (bid - ask) / (bid + ask + eps),
        'buy_sell_ratio': np.log1p(data['buy_qty'] / (data['sell_qty'] + eps)),
        'volume_z': zscore(data['volume']),
    }
    return pd.DataFrame(derived, index=data.index)

  def standardize_columns(self, columns):
    '''
      Standardize the given columns within the frame.

      Returns the scaled columns with the (unscaled) 'label' column appended,
      so `self.df` must contain a 'label' column.
    '''
    scaled_values = StandardScaler().fit_transform(self.df[columns])
    scaled_frame = pd.DataFrame(scaled_values, columns=columns, index=self.df.index)
    return pd.concat([scaled_frame, self.df['label']], axis=1)

  def standardize_df(self):
    '''Standardize every column of the frame; column names and index are kept.'''
    scaled_values = StandardScaler().fit_transform(self.df)
    return pd.DataFrame(scaled_values, columns=self.df.columns, index=self.df.index)

In [6]:
class LagCreator:
  '''
    Appends lagged (shifted) copies of every column of a frame.

    lag_periods: lags 1..lag_periods are created for each column; 0 means
    "no lags". A negative value raises ValueError.
  '''

  def __init__(self, lag_periods = 5):
    if lag_periods < 0:
      raise ValueError(f"lag_periods must be >= 0, got {lag_periods}")
    self.lag_periods = lag_periods

  def lag_features(self, X):
    '''
      Return X extended with `<col>_lag<k>` columns (k = 1..lag_periods).

      New columns are ordered by source column, then by lag. Every row that
      contains a NaN is dropped, which removes at least the first
      `lag_periods` rows. When there is nothing to lag (no columns or
      lag_periods == 0) the result is simply X without its NaN rows.
    '''
    lagged_columns = [
        X[col].shift(lag).rename(f"{col}_lag{lag}")
        for col in X.columns
        for lag in range(1, self.lag_periods + 1)
    ]

    # pd.concat raises "No objects to concatenate" on an empty list, so the
    # no-lag case is handled explicitly instead of crashing.
    if not lagged_columns:
      return X.dropna()

    lagged = pd.concat([X] + lagged_columns, axis=1)
    return lagged.dropna()

  def match_index_lags(self, X_lagged, Y):
    '''
      Align a target with a lagged frame.

      Y is a Series with a single column; the rows whose labels appear in
      X_lagged.index are returned, in that order.
    '''
    y_lagged = Y.loc[X_lagged.index]

    return y_lagged


In [7]:
class FeatureLagSelector:
    '''Builds a frame of raw and lagged columns from a list of feature names.'''

    def __init__(self, feature_list):
        '''
        Initialize the selector with a list of desired features.
        Features can be raw (e.g., 'X264') or lagged (e.g., 'X264_lag2')
        '''
        self.feature_list = feature_list
        self.selected_features = {}

    def _parse_feature(self, feature_name):
        '''
        Parse a feature name into (base column, lag).

        Only a trailing '_lag<digits>' counts as a lag suffix, and the split
        happens on the LAST '_lag', so base names that themselves contain
        '_lag' (e.g. 'bid_lag_ratio_lag3') are parsed correctly. Names without
        such a suffix (e.g. 'X264', 'time_lagged') are returned unchanged with
        lag None.
        '''
        base, separator, lag = feature_name.rpartition('_lag')
        if separator and lag.isdecimal():
            return base, int(lag)
        return feature_name, None

    def _transform(self, df):
        '''
        Build a new DataFrame with the selected and lagged features.

        Features whose base column is missing from df are reported with a
        printed message and skipped. Lagged features are produced with
        `shift(lag)`, so their first `lag` rows are NaN.
        '''
        self.selected_features.clear()
        for feature in self.feature_list:
            base, lag = self._parse_feature(feature)
            if base not in df.columns:
                print(f"Base column '{base}' not found in DataFrame")
                continue
            column = df[base]
            self.selected_features[feature] = column if lag is None else column.shift(lag)

        return pd.DataFrame(self.selected_features)

    def fit_transform(self, df):
        '''Return the selected/lagged feature frame for df (nothing is fitted).'''
        return self._transform(df)


In [8]:
class ModelPipelineOptimizerNN:
  '''
    Tunes, fits and saves a one-hidden-layer Keras regressor.

    The inputs are preprocessed per feature cluster (StandardScaler, plus PCA
    for clusters with more than one feature); hyperparameters are searched
    with Optuna using the mean Pearson correlation over a 5-fold
    TimeSeriesSplit as the objective.
  '''

  def __init__(self, y, df, features2, clusters_components, model_save_location, model_name):
    '''
      y: target Series, indexed like df.
      df: frame from which the features named in clusters_components[0] are
          taken; names may be raw ('X264') or lagged ('X264_lag2').
      features2: frame of extra features; lags 1..3 of each column are added.
      clusters_components: pair (feature_clusters, pca_components) - a list of
          feature-name lists and the matching list of PCA component counts.
      model_save_location, model_name: concatenated verbatim (no separator is
          inserted) to form the output file path.
    '''
    self.df = df
    self.features2 = features2
    self.y = y
    self.clusters_components = clusters_components
    self.data_path = model_save_location
    self.model_name = model_name

    self.best_features = [f for group in self.clusters_components[0] for f in group]
    self.X = FeatureLagSelector(self.best_features).fit_transform(self.df).dropna()
    self.features2 = LagCreator(3).lag_features(self.features2)

    # The two frames may have lost a different number of leading rows (their
    # maximum lags can differ). The default outer join would bring those rows
    # back filled with NaN, so keep only the rows present in both frames.
    self.X = pd.concat([self.X, self.features2], axis=1, join='inner')

    self.y = self.y.loc[self.X.index]

    # The source frames are no longer needed once self.X is assembled.
    del self.df, self.features2
    gc.collect()

  def create_column_transformer(self):
    '''
      Build an unfitted ColumnTransformer with one pipeline per feature cluster.

      Single-feature clusters are only standardized; all other clusters are
      standardized and then reduced with PCA to the configured number of
      components. Columns that belong to no cluster are passed through.
    '''
    feature_clusters, pca_components = self.clusters_components
    transformers = []

    for i, (features, n_comp) in enumerate(zip(feature_clusters, pca_components)):
      steps = [('scaler', StandardScaler())]
      if len(features) != 1:
        steps.append(('pca', PCA(n_components=n_comp)))
      transformers.append((f'cluster_{i}', Pipeline(steps), features))

    return ColumnTransformer(transformers, remainder='passthrough')

  def build_model(self, input_dim, trial):
    '''
      Build and compile the network for the hyperparameters of `trial`.

      Architecture: Dense(units1, relu) -> Dropout(dropout1) -> Dense(1, linear),
      compiled with MSE loss and Adam(lr). Search ranges: units1 in [128, 384],
      dropout1 in [0.2, 0.4], lr log-uniform in [0.002, 0.008].
      `trial` only needs to answer suggest_int / suggest_float; optimize() also
      passes the study's best (already finished) trial here.
    '''
    model = Sequential()
    model.add(Input(shape=(input_dim,)))
    model.add(Dense(trial.suggest_int('units1', 128, 384), activation='relu'))
    model.add(Dropout(trial.suggest_float('dropout1', 0.2, 0.4)))

    model.add(Dense(1, activation='linear'))
    model.compile(
        loss='mse',
        optimizer=Adam(
            learning_rate=trial.suggest_float('lr', 0.002, 0.008, log=True)
        )
    )
    return model


  def objective(self, trial):
    '''
      Optuna objective: mean out-of-fold Pearson correlation.

      For each of the 5 TimeSeriesSplit folds the column transformer is refit
      on the training part only, a fresh model is trained, and the correlation
      between validation targets and predictions is recorded.
    '''
    col_transformer = self.create_column_transformer()
    tscv = TimeSeriesSplit(n_splits=5)
    scores = []

    for train_idx, val_idx in tscv.split(self.X):
      X_train, X_val = self.X.iloc[train_idx], self.X.iloc[val_idx]
      y_train, y_val = self.y.iloc[train_idx], self.y.iloc[val_idx]

      Xt_train = col_transformer.fit_transform(X_train)
      Xt_val = col_transformer.transform(X_val)

      model = self.build_model(Xt_train.shape[1], trial)
      # Repeated suggest_* calls with the same name inside one trial return the
      # value chosen the first time, so every fold trains with identical settings.
      model.fit(
          Xt_train,
          y_train,
          epochs=trial.suggest_int("epochs", 50, 100),
          batch_size=trial.suggest_categorical("batch_size", [64, 128, 256]),
          verbose=0)

      # verbose=0 keeps per-fold progress bars out of the notebook output.
      preds = model.predict(Xt_val, verbose=0).flatten()
      scores.append(pearsonr(y_val, preds)[0])

    return np.mean(scores)

  def optimize(self, n_trials=50, n_jobs=-1):
    '''
      Run the hyperparameter search, refit on all data and save the result.

      n_trials: number of Optuna trials.
      n_jobs: parallelism passed to study.optimize (-1 = one job per CPU);
              use 1 to run the trials sequentially.

      Prints the best trial, shows the optimization-history and
      parameter-importance plots, refits the column transformer and the best
      network on the full data, and dumps the pair
      (col_transformer, final_model) to `data_path + model_name` with joblib.
      Returns the fitted Keras model only.
    '''
    self.study = optuna.create_study(direction="maximize")
    self.study.optimize(self.objective, n_trials=n_trials, n_jobs=n_jobs)

    print("\nBest trial:")
    print(f"  Pearson correlation: {self.study.best_value:.4f}")
    print("  Hyperparameters:")
    for key, val in self.study.best_params.items():
        print(f"    {key}: {val}")

    # Visualize optimization
    optuna.visualization.plot_optimization_history(self.study).show()
    optuna.visualization.plot_param_importances(self.study).show()

    # Recreate PCA + scaler transformer
    col_transformer = self.create_column_transformer()
    Xt = col_transformer.fit_transform(self.X)

    # Build final pipeline with best NN params
    best_trial = self.study.best_trial
    final_model = self.build_model(Xt.shape[1], best_trial)
    final_model.fit(Xt, self.y, epochs=best_trial.params['epochs'], batch_size=best_trial.params['batch_size'], verbose=0)

    joblib.dump((col_transformer, final_model), self.data_path + self.model_name)
    print(f"\nModel saved to: {self.data_path + self.model_name}")

    return final_model

In [9]:
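# Feature groups
# Group 1 - X1, X2, ... (every column other than label, the *_qty columns and volume)
# Group 2 - bid_qty, ask_qty, buy_qty, sell_qty and volume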

In [10]:
# Filepaths
# dataPathLoad ends with '/', dataPathSave does not: file names appended to
# dataPathSave by plain string concatenation therefore get a 'v7' prefix
# (e.g. '.../data/v7PCA_components7.json') instead of landing in a 'v7/' folder.
# NOTE(review): confirm that prefix is intended before adding a trailing slash.
dataPathLoad = '/content/drive/MyDrive/Colab Notebooks/DRW/data/'
dataPathSave = '/content/drive/MyDrive/Colab Notebooks/DRW/data/v7'
fileNameTrain = 'train.parquet'

In [11]:
# Load train data and remove -inf columns and columns full of 0
loader = DataLoader(dataPathLoad, fileNameTrain)
# features_group1: every column except label / *_qty / volume;
# features_group2: the five quantity/volume columns; label: the target column.
features_group1, features_group2, label = loader.load().train_split()
# Drop the loader together with its reference to the full frame.
del loader
# As the last expression of the cell, gc.collect()'s return value is displayed.
gc.collect()

Data loaded


22

In [12]:
# Build the right features2
# NOTE: features_group2 is overwritten with the derived frame (imbalance,
# buy_sell_ratio, volume_z), so re-running this cell without re-running the
# load cell fails because the quantity/volume columns are no longer present.
features_group2 = FeatureGenerator(features_group2).develop_features_from_qty_volume()


In [13]:
# Load the groups
# groups_components is later unpacked as (feature_clusters, pca_components)
# by ModelPipelineOptimizerNN.create_column_transformer.
with open(dataPathSave + 'PCA_components7.json', 'r') as f:
    groups_components = json.load(f)

In [None]:
# Optimize and save model
model_name = 'model7_5splits_NN_2.pkl'
# Argument order: target, feature frame, extra features, cluster spec,
# save-path prefix, output file name.
optimizer = ModelPipelineOptimizerNN(label, features_group1, features_group2, groups_components, dataPathSave, model_name)
# optimize() returns the fitted Keras model only; the (column transformer, model)
# pair is what gets written to disk.
final_pipeline = optimizer.optimize()

[I 2025-06-09 13:29:58,374] A new study created in memory with name: no-name-43678368-1373-49df-ac34-dabc7d5bea3b
