In [1]:
%pip install optuna kneed

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting kneed
  Downloading kneed-0.8.5-py3-none-any.whl.metadata (5.5 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kneed-0.8.5-py3-none-any.whl (10 kB)
Downloading alembic-1.16.1-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, kneed, alembic, optuna
Successfully installed alembic-1.16.1 colorlog-6.9.0 kneed-0.8.5 optuna-4.3.0


In [2]:
import gc, json, pickle
import optuna
import joblib
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from kneed import KneeLocator
from xgboost import XGBRegressor
from google.colab import drive
from joblib import Parallel, delayed
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.model_selection import TimeSeriesSplit
from collections import defaultdict
from scipy.stats import pearsonr, zscore
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram

In [3]:
# Mount Google Drive (Colab only) so the dataset paths under /content/drive are reachable.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
class DataLoader:
  '''
  Reads a parquet dataset (or a JSON feature list) located at
  `data_path + file_name` and splits it into feature / quantity / label parts.
  '''

  # Quantity and volume columns that are separated from the remaining features.
  _QTY_VOL_COLUMNS = ['bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume']

  def __init__(self, data_path, file_name):
    self.data_path = data_path
    self.file_name = file_name

  def _full_path(self):
    '''Plain string concatenation: `data_path` must already end with a separator.'''
    return self.data_path + self.file_name

  def load(self):
    '''
    Read the parquet file into `self.df`, dropping every column that contains
    at least one -inf value or whose values are all 0.
    Returns self so that calls can be chained.
    '''
    raw = pd.read_parquet(self._full_path())
    has_neg_inf = (raw == -np.inf).any()
    is_all_zero = (raw == 0).all()
    self.df = raw.loc[:, ~(has_neg_inf | is_all_zero)]
    print('Data loaded')
    return self

  def train_split(self):
    '''Return (features without label/qty/volume, qty/volume frame, label series).'''
    quantity_cols = list(self._QTY_VOL_COLUMNS)
    target = self.df['label']
    qty_vol = self.df[quantity_cols]
    X_ = self.df.drop(columns=['label'] + quantity_cols)
    return X_, qty_vol, target

  def test_split(self):
    '''Return (features without qty/volume, qty/volume frame); no label is removed.'''
    quantity_cols = list(self._QTY_VOL_COLUMNS)
    qty_vol = self.df[quantity_cols]
    X_ = self.df.drop(columns=quantity_cols)
    return X_, qty_vol

  def load_features_json_list(self):
    '''Parse the file at `data_path + file_name` as JSON and return the result.'''
    with open(self._full_path(), 'r') as handle:
      return json.load(handle)

In [5]:
class FeatureGenerator:
  '''Derives and standardizes features from the DataFrame passed at construction.'''

  def __init__(self, df, epsilon=1e-6):
    self.df = df
    self.epsilon = epsilon

  def develop_features_from_qty_volume(self):
    '''
      Build three features from the bid_qty, ask_qty, buy_qty, sell_qty and
      volume columns:
        imbalance      - (bid - ask) / (bid + ask + epsilon)
        buy_sell_ratio - log1p(buy / (sell + epsilon))
        volume_z       - z-score of volume
      The result shares the index of the source DataFrame.
    '''
    frame, eps = self.df, self.epsilon
    bid, ask = frame['bid_qty'], frame['ask_qty']
    engineered = {
        'imbalance': (bid - ask) / (bid + ask + eps),
        'buy_sell_ratio': np.log1p(frame['buy_qty'] / (frame['sell_qty'] + eps)),
        'volume_z': zscore(frame['volume']),
    }
    return pd.DataFrame(engineered, index=frame.index)

  def standardize_columns(self, columns):
    '''Standardize the given columns and append the untouched 'label' column.'''
    scaled = StandardScaler().fit_transform(self.df[columns])
    scaled_frame = pd.DataFrame(scaled, columns=columns, index=self.df.index)
    return pd.concat([scaled_frame, self.df['label']], axis=1)

  def standardize_df(self):
    '''Standardize every column of the DataFrame, keeping column names and index.'''
    scaled = StandardScaler().fit_transform(self.df)
    return pd.DataFrame(scaled, columns=self.df.columns, index=self.df.index)

In [6]:
class CorrelationAnalyzer:
  '''
  Windowed Pearson-correlation analysis between features and a label, plus
  helpers that score and select features from the resulting table.

  feature / label are only used by compute_correlations_in_window; the scoring
  and selection helpers work purely on the DataFrames passed to them.
  '''

  def __init__(self, feature, label):
    self.feature = feature
    self.label = label

  def _correlation(self, feature, label):
    '''
    Calculate the Pearson correlation and p-value
    for a single feature and Y. Returns (r, p).
    '''
    r, p = pearsonr(feature, label)
    return r, p

  def compute_correlations_in_window(self, window='2D'):
    '''
    Calculate the Pearson correlation and p-value between every feature column
    and Y inside each time window.

    Assumes self.feature (DataFrame) and self.label (Series) share the same
    datetime index, so both groupbys produce the same windows - TODO confirm.

    Returns a DataFrame with columns
    period_start, feature, correlation, p_value.
    '''
    grouped_feature = self.feature.groupby(pd.Grouper(freq=window))
    grouped_label = self.label.groupby(pd.Grouper(freq=window))
    results = []
    # Each groupby yields (window_start, data) pairs; unpack both pairs explicitly.
    for (period_start, window_features), (_, window_label) in zip(grouped_feature, grouped_label):
      # pearsonr needs at least two observations; skip windows that are too small.
      if len(window_features) < 2:
        continue

      names = list(window_features.columns)
      # One task per column. Parallel returns results in submission order,
      # so they can be zipped back onto the column names.
      output = Parallel(n_jobs=-1)(
          delayed(self._correlation)(window_features[name], window_label)
          for name in names
          )

      for name, (r, p) in zip(names, output):
        results.append({
            'period_start': period_start,
            'feature': name,
            'correlation': r,
            'p_value': p
            })

    return pd.DataFrame(
        results, columns=['period_start', 'feature', 'correlation', 'p_value'])

  def score_features(self, df_correlations, epsilon=1e-6):
    '''
    Score features based on their correlation with Y as a:
      base_score:
        |correlation_mean| / (correlation_std + epsilon)

      adjusted_score:
        base_score * (1 / (p_value_mean + epsilon))

    epsilon guards both denominators, so a zero std or a zero mean p-value
    yields a large finite score instead of inf.
    Returns the per-feature summary (index: feature) with both score columns.
    '''
    summary = df_correlations.groupby('feature').agg({
        'correlation': ['mean', 'std'],
        'p_value': 'mean'
        })

    summary['base_score'] = (
        abs(summary[('correlation', 'mean')]) / (summary[('correlation', 'std')]
                                                 + epsilon)
    )

    # epsilon belongs inside the denominator; `1 / p + epsilon` does not
    # protect against p == 0.
    summary['adjusted_score'] = (summary['base_score'] *
     (1 / (summary[('p_value', 'mean')] + epsilon)))

    return summary

  def number_of_features_to_select(self, summary, score='adjusted_score'):
    '''
    Pick the number of features at the elbow of the normalized cumulative score curve.

    Features are sorted by `score` (descending). The elbow is the point with the
    largest perpendicular distance to the straight line joining the first and
    last points of the curve.

    Returns (optimal_n_features, selected_features, df_sorted).
    Raises ValueError when `summary` has no rows.
    '''
    df_sorted = summary.sort_values(by=score, ascending=False).reset_index()
    df_sorted = df_sorted.rename(columns={"index": "feature"})
    if df_sorted.empty:
      raise ValueError("summary is empty: no features to select from")

    df_sorted["cumulative_score"] = df_sorted[score].cumsum()
    df_sorted["normalized_score"] = df_sorted["cumulative_score"] / df_sorted["cumulative_score"].iloc[-1]
    df_sorted["feature_index"] = np.arange(1, len(df_sorted) + 1)

    curve = df_sorted["normalized_score"].to_numpy(dtype=float)
    points = np.column_stack([np.arange(len(curve), dtype=float), curve])
    first_point = points[0]
    line_vec = points[-1] - first_point
    line_len = np.linalg.norm(line_vec)

    if line_len == 0:
      # Single feature: the line has no length, so every distance is 0.
      distances = np.zeros(len(points))
    else:
      line_vec_norm = line_vec / line_len
      offsets = points - first_point
      # Subtract each offset's projection onto the line; what is left is the
      # perpendicular component, whose length is the distance to the line.
      projections = np.outer(offsets @ line_vec_norm, line_vec_norm)
      distances = np.linalg.norm(offsets - projections, axis=1)

    elbow_index = int(np.argmax(distances))
    optimal_n_features = elbow_index + 1
    selected_features = df_sorted["feature"].iloc[:optimal_n_features].tolist()

    return optimal_n_features, selected_features, df_sorted

  def select_top_features(self, summary, top_k=55, score='adjusted_score'):
    '''Return the index labels of the `top_k` rows with the highest `score`.'''
    sorted_summary = summary.sort_values(score, ascending=False)
    top_k_df = sorted_summary.head(top_k)
    return top_k_df.index.tolist()

In [7]:
class LagCreator:
  '''Appends lagged (shifted) copies of every column of a DataFrame.'''

  def __init__(self, lag_periods = 5):
    self.lag_periods = lag_periods

  def lag_features(self, X):
    '''
    Return X with, for each column, lags 1..lag_periods appended as
    "<column>_lag<k>". Rows containing any NaN (including those introduced
    by shifting) are dropped.
    '''
    def _shifted(name, step):
      column = X[name].shift(step)
      column.name = f"{name}_lag{step}"
      return column

    steps = range(1, self.lag_periods + 1)
    shifted_columns = [_shifted(name, step) for name in X.columns for step in steps]

    lag_block = pd.concat(shifted_columns, axis=1)
    return pd.concat([X, lag_block], axis=1).dropna()

  def match_index_lags(self, X_lagged, Y):
    '''Select the entries of the series Y whose labels appear in X_lagged's index.'''
    return Y.loc[X_lagged.index]


In [14]:
class CorrelationBatches:
  '''
  Computes per-window Pearson correlations between the target and a single
  feature plus its lagged versions.

  df   - DataFrame holding the feature columns
  lags - number of lagged versions to evaluate per feature
  Y    - target series, looked up by index label
  freq - window size passed to pd.Grouper (e.g. '1D')
  '''

  def __init__(self, df, lags, Y, freq):
    self.Y_column = Y
    self.df = df
    self.lags = lags
    self.freq = freq

  def _lag_column(self, column_df, feature_name, l):
    '''
    Shift the given column by ONE step, drop the resulting NaN rows and name
    the result "<feature_name>_lag<l>".
    `l` only affects the name: feature_correlation calls this repeatedly on
    the previous result, so the shifts accumulate to a lag of `l`.
    '''
    lagged_col = column_df.shift(1).dropna()
    lagged_col.name = f"{feature_name}_lag{l}"
    return lagged_col

  def _standardize_col(self, column_df):
    '''Standardize a feature column, keeping its index and name.'''
    scaler = StandardScaler()
    scaled_col = scaler.fit_transform(column_df.values.reshape(-1, 1))
    scaled_col = pd.Series(scaled_col.flatten(), index=column_df.index, name=column_df.name)
    return scaled_col

  def _col_label_corr(self, period_start, column_df):
    '''
    Compute the Pearson correlation between one window of a feature column
    and the target values at the same index labels.
    Returns (column name, period_start, correlation, p_value).
    Windows with fewer than two observations (for example an empty bin)
    return NaN for both statistics instead of raising.
    '''
    if len(column_df) < 2:
      return column_df.name, period_start, np.nan, np.nan
    y = self.Y_column.loc[column_df.index]
    column_df = self._standardize_col(column_df)
    corr, p = pearsonr(column_df, y)
    return column_df.name, period_start, corr, p

  def _windowed_correlations(self, series):
    '''Run _col_label_corr on every `freq` window of the series, in parallel.'''
    grouped = series.groupby(pd.Grouper(freq=self.freq))
    return Parallel(n_jobs=-1)(
        delayed(self._col_label_corr)(period_start, df_g) for period_start, df_g in grouped
        )

  def feature_correlation(self, feature):
    '''
    Compute correlations of a feature and its lagged versions with the target column.
    For the raw feature and for each lag 1..lags, the column is split into
    windows of size `freq`; each window is standardized and its Pearson
    correlation and p-value against the target are calculated.
    Returns:
      list: lags + 1 lists (raw feature first, then lag 1, lag 2, ...).
        Each inner list holds one tuple per window:
        (feature name incl. lag suffix, period_start, correlation, p_value).
    '''
    feature_df = self.df[feature]
    ft_name = feature_df.name

    feature_correlations = [self._windowed_correlations(feature_df)]

    for lag in range(1, self.lags + 1):
      # Shifts the already lagged column by one more step (cumulative lag).
      feature_df = self._lag_column(feature_df, ft_name, lag)
      feature_correlations.append(self._windowed_correlations(feature_df))

    return feature_correlations

In [9]:
class FeatureLagSelector:
    '''Builds a DataFrame containing only the requested raw and lagged features.'''

    def __init__(self, feature_list):
        '''
        Initialize the selector with a list of desired features.
        Features can be raw (e.g., 'X264') or lagged (e.g., 'X264_lag2')
        '''
        self.feature_list = feature_list
        self.selected_features = {}

    def _parse_feature(self, feature_name):
        '''
        Parse feature name into base column and lag value (if present).

        Only a trailing "_lag<digits>" counts as a lag suffix. Names that contain
        "_lag" elsewhere (e.g. 'a_lag_lag2' -> base 'a_lag', lag 2) or that end in
        a non-numeric suffix (e.g. 'time_lagged') are handled without raising;
        the latter is returned as a raw feature with lag None.
        '''
        base, separator, suffix = feature_name.rpartition('_lag')
        if separator and suffix.isdecimal():
            return base, int(suffix)
        return feature_name, None

    def _transform(self, df):
        '''
        Build a new DataFrame with the selected and lagged features.
        Lagged columns are produced with shift(lag), so their first `lag` rows
        are NaN; rows are not dropped. Features whose base column is missing
        from df are reported and skipped.
        '''
        self.selected_features.clear()
        for feature in self.feature_list:
            base, lag = self._parse_feature(feature)
            if base not in df.columns:
                print(f"Base column '{base}' not found in DataFrame")
                continue
            column = df[base]
            self.selected_features[feature] = column if lag is None else column.shift(lag)

        return pd.DataFrame(self.selected_features)

    def fit_transform(self, df):
        '''Public entry point; nothing is fitted, it simply delegates to _transform.'''
        return self._transform(df)


In [10]:
class PCAProcessor:
  '''
  Runs one PCA per feature group, choosing the number of components for each
  group with a penalized explained-variance score, and stores the choice as JSON.
  '''

  def __init__(self, X_selected, groups, path_to_save_results, file_name):
    self.X_selected = X_selected
    self.groups = groups
    self.path_to_save_results = path_to_save_results
    self.file_name = file_name

  def _pca_transform(self, g, n_components):
    '''Fit a PCA with `n_components` on the columns in g and return the projected data.'''
    return PCA(n_components=n_components).fit_transform(self.X_selected[g])

  def _pca_explained_variance(self, g):
    '''Fit a full PCA on the columns in g and return its explained variance ratios.'''
    full_pca = PCA()
    full_pca.fit(self.X_selected[g])
    return full_pca.explained_variance_ratio_

  def _penalized_score(self, g, alpha = 0.005):
    '''
    For k = 1..n score "variance explained by the first k components - alpha * k"
    and return (best k, g). The first maximum wins on ties.
    '''
    ratios = self._pca_explained_variance(g)
    penalized = [
        ratios[:k].sum() - alpha * k
        for k in range(1, len(ratios) + 1)
    ]
    return np.argmax(penalized) + 1, g

  def _best_number_of_components(self, alpha = 0.005):
    '''Finds the best number of components for each group utilizing the penalized score'''
    scored = Parallel(n_jobs=-1)(
        delayed(self._penalized_score)(g, alpha) for g in self.groups
    )

    self.n_components = [n for n, _ in scored]
    self.groups_features = [g for _, g in scored]
    return self.n_components, self.groups_features

  def transform_data(self, penalizing_term_alpha = 0.005):
    '''
    Select the component count per group, project each group with PCA and
    return all components side by side; columns are named G<group>P<component>
    (both 1-based). The chosen component counts and the groups are also
    written as JSON to path_to_save_results + file_name.
    '''
    self.n_components, self.groups_features = self._best_number_of_components(penalizing_term_alpha)

    pca_frames = []
    for group_no, (features, k) in enumerate(zip(self.groups_features, self.n_components), start=1):
      projected = self._pca_transform(features, k)
      component_names = [f"G{group_no}P{c}" for c in range(1, k + 1)]
      pca_frames.append(
          pd.DataFrame(projected, columns=component_names, index=self.X_selected.index))

    # Save the data to JSON (numpy integers are not JSON serializable, hence int()).
    self.n_components = [int(x) for x in self.n_components]
    best_PCA = [self.n_components, self.groups_features]
    with open(self.path_to_save_results + self.file_name, "w") as f:
        json.dump(best_PCA, f, indent=2)

    return pd.concat(pca_frames, axis=1)

In [44]:
class ClusterAnalyzer:
  def __init__(self, X_selected):
    self.X_selected = X_selected

  def _correlation_matrix(self):
    '''Function to calculate correlation matrix'''
    corr_matrix = self.X_selected.corr().abs()
    return corr_matrix

  def _distance_matrix(self):
    '''Returns square matrix'''
    return (1 - self._correlation_matrix()).values

  def _hierarchical_clustering(self, method='average'):
    distance_matrix = self._distance_matrix()
    condensed_dist = squareform(distance_matrix, checks=True)
    Z = linkage(condensed_dist, method=method)
    return Z

  def _silhouette_score_cluster(self, labels, distance_matrix, metric='precomputed'):
    score = silhouette_score(distance_matrix, labels, metric=metric)
    return score

  def optimal_number_of_clusters(self, max_k = 15, min_k=2):
    '''
    max_k - maaximum number of clusters to create
    min_k - minimum number of clusters to create
    '''
    Z = self._hierarchical_clustering()
    distance_matrix = self._distance_matrix()

    best_score = -1
    self.best_k = None
    self.results = {}
    for k in range(min_k, max_k):
      labels = fcluster(Z, k, criterion='maxclust')
      score = self._silhouette_score_cluster(labels, distance_matrix)
      print(f"Number of clusters: {k}, Silhouette score: {score}")
      if score > best_score:
        self.best_k, best_score = k, score

      self.results[k] = [score, labels]

    return self.best_k, best_score, self.results

  def groups_split(self):
    # Create groups of feature names where each group is a cluster
    best_clusters = self.results[self.best_k][1]
    labels_columns = self.X_selected.columns.tolist()

    n_clusters = max(best_clusters)

    clustered_features = [[] for _ in range(n_clusters)]

    for c, e in zip(best_clusters, labels_columns):
        clustered_features[c - 1].append(e)

    return clustered_features

  def save_results(self, dataPath, fileName):
    '''Save clustering results using simple string concatenation.'''
    full_path = dataPath + fileName

    with open(full_path, 'wb') as f:
        pickle.dump({
            'best_k': self.best_k,
            'results': self.results,
            'columns': self.X_selected.columns.tolist()
        }, f)

    print(f"Clustering results saved to {full_path}")


  # def cluster_features(self, X_selected, n_clusters = 7, criterion = 'maxclust'):
  #   Z = self._hierarchical_clustering(X_selected)
  #   cluster_ids = fcluster(Z, n_clusters, criterion=criterion)
  #   feature_groups = defaultdict(list)
  #   for feature, cluster_id in zip(X_selected.columns, cluster_ids):
  #     feature_groups[cluster_id].append(feature)

  #   groups = list(feature_groups.values())
  #   return groups, Z

  def visualize_clusters(self):
    Z = self._hierarchical_clustering()
    plt.figure(figsize=(12, 6))
    dendrogram(Z, labels=self.X_selected.columns.tolist(), leaf_rotation=90)
    plt.title("Dendrogram of Feature Clustering")
    plt.tight_layout()
    plt.show()

In [11]:
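# Feature groups (as returned by DataLoader.train_split)
# Group 1 - all remaining feature columns (X1, X2, ...)
# Group 2 - quantity and volume columns (bid_qty, ask_qty, buy_qty, sell_qty, volume)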

In [12]:
# Filepaths
# Paths are joined to file names by plain string concatenation throughout this notebook.
dataPathLoad = '/content/drive/MyDrive/Colab Notebooks/DRW/data/'
# NOTE(review): no trailing '/' here, so outputs are written as '.../data/v7<file name>'
# (a file-name prefix). Confirm whether 'v7' was meant to be a directory instead.
dataPathSave = '/content/drive/MyDrive/Colab Notebooks/DRW/data/v7'
fileNameTrain = 'train.parquet'

In [13]:
# Load train data and remove -inf columns and columns full of 0
loader = DataLoader(dataPathLoad, fileNameTrain)
# features_group1: every column except label and the qty/volume columns
# features_group2: bid_qty, ask_qty, buy_qty, sell_qty, volume
features_group1, features_group2, label = loader.load().train_split()
# Drop the loader (it holds the full DataFrame in loader.df) and collect garbage.
del loader
gc.collect()

Data loaded


17

In [15]:
# Calculate correlations in batches
results = {}
columns = features_group1.columns
# Built once outside the loop: the constructor only stores its arguments and
# feature_correlation() does not modify the instance.
# Arguments: 3 lags per feature, correlations computed per '1D' window.
batchesCorrelation = CorrelationBatches(features_group1, 3, label, '1D')
for count, col in enumerate(columns, start=1):
  # out holds one list of per-window results for the raw feature and for each lag
  out = batchesCorrelation.feature_correlation(col)

  results[col] = out
  print(f'Processed {count} out of {len(columns)}')

Processed 1 out of 863
Processed 2 out of 863
Processed 3 out of 863
Processed 4 out of 863
Processed 5 out of 863
Processed 6 out of 863
Processed 7 out of 863
Processed 8 out of 863
Processed 9 out of 863
Processed 10 out of 863
Processed 11 out of 863
Processed 12 out of 863
Processed 13 out of 863
Processed 14 out of 863
Processed 15 out of 863
Processed 16 out of 863
Processed 17 out of 863
Processed 18 out of 863
Processed 19 out of 863
Processed 20 out of 863
Processed 21 out of 863
Processed 22 out of 863
Processed 23 out of 863
Processed 24 out of 863
Processed 25 out of 863
Processed 26 out of 863
Processed 27 out of 863
Processed 28 out of 863
Processed 29 out of 863
Processed 30 out of 863
Processed 31 out of 863
Processed 32 out of 863
Processed 33 out of 863
Processed 34 out of 863
Processed 35 out of 863
Processed 36 out of 863
Processed 37 out of 863
Processed 38 out of 863
Processed 39 out of 863
Processed 40 out of 863
Processed 41 out of 863
Processed 42 out of 863
P

In [17]:
# Build pandas df from the results
dict_results = {
    'feature': [],
    'period_start': [],
    'correlation': [],
    'p_value': []
}
for key in results:
  # results[key] holds one list per lag level (raw feature first, then each lag).
  # Iterate over whatever is there instead of a hard-coded count, so changing
  # the number of lags cannot silently drop data or raise IndexError.
  for lag_output in results[key]:
    # each entry is (feature name, period_start, correlation, p_value)
    for out in lag_output:
      dict_results['feature'].append(out[0])
      dict_results['period_start'].append(out[1])
      dict_results['correlation'].append(out[2])
      dict_results['p_value'].append(out[3])

correlation_results_df = pd.DataFrame(dict_results)

# Save to csv
correlation_results_df.to_csv(dataPathSave + 'correlation_results_7.csv', index=False)

In [21]:
# Develop scores for each feature as:
  # base_score:
  #   |correlation_mean| / (correlation_std + epsilon)

  # adjusted_score:
  #   base_score * (1 / p_value_mean), with a small epsilon term
  #   (see CorrelationAnalyzer.score_features for the exact form)
# The analyzer is built with (None, None) because score_features only uses
# the DataFrame passed to it, not the stored feature/label.
correlation_analyzer = CorrelationAnalyzer(None, None)
summary = correlation_analyzer.score_features(correlation_results_df)

In [25]:
# Rank the features (raw and lagged names) by adjusted_score, best first.
summary_sorted = summary.sort_values(by='adjusted_score', ascending=False)

In [40]:
# Keep the names of the 480 highest-scoring features.
# NOTE(review): 480 is a hard-coded cut-off; its rationale is not shown in this notebook.
selected_features = list(summary_sorted.head(480).index)

In [41]:
# Build features from the list of selected features
# Lagged names ('<col>_lag<k>') are rebuilt with shift(k), so the first k rows
# of those columns are NaN; no rows are dropped here.
featureLagBuilder = FeatureLagSelector(selected_features)
X_selected = featureLagBuilder.fit_transform(features_group1)

In [42]:
# Delete variables that are no longer needed: X_selected has been built, so the
# full feature frame can be released. Cells above that use features_group1
# cannot be re-run after this without reloading the data.
del features_group1
gc.collect()

318

In [43]:
# Standardize X
# X_selected is overwritten with its standardized version, so re-running this
# cell standardizes the already standardized data again.
X_selected = FeatureGenerator(X_selected).standardize_df()

In [None]:
# Cluster features
# Passing 20 as max_k evaluates k = 2 .. 19 (the upper bound is exclusive).
# NOTE: this rebinds `results`, which previously held the per-feature correlation output.
cluster_analyzer = ClusterAnalyzer(X_selected)
best_k, best_score, results = cluster_analyzer.optimal_number_of_clusters(20)

# Save results
# The target path is dataPathSave + file name (plain string concatenation).
cluster_analyzer.save_results(dataPathSave, 'cluster_results7.pkl')