In [1]:
import json
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.colab import drive
from joblib import Parallel, delayed
from scipy.stats import pearsonr, zscore
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from scipy.spatial.distance import squareform
from collections import defaultdict

In [2]:
# Mount Google Drive at /content/drive; the data path used later in this notebook lives under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
class DataLoader:
  '''Reads a parquet file and discards unusable columns.

  data_path and file_name are concatenated as-is to form the file location,
  so data_path is expected to end with a path separator.
  '''

  def __init__(self, data_path, file_name):
    self.data_path = data_path
    self.file_name = file_name

  def load(self):
    '''Return the parquet contents without columns that contain any -inf
    value or that consist entirely of zeros.'''
    frame = pd.read_parquet(self.data_path + self.file_name)

    # Per-column boolean flags for the two drop conditions.
    has_negative_inf = frame.eq(-np.inf).any()
    is_all_zero = frame.eq(0).all()

    keep = ~(has_negative_inf | is_all_zero)
    return frame.loc[:, keep]

In [4]:
class FeatureEngineer:
  '''Derives order-book/volume features and produces standardized frames.

  df is the frame every method reads from; epsilon is added to denominators
  so the ratio features stay finite when a quantity is zero.
  '''

  def __init__(self, df, epsilon=1e-6):
    self.df = df
    self.epsilon = epsilon

  def _scaled(self, frame, labels):
    '''Fit a new StandardScaler on `frame` and wrap the result in a DataFrame
    that uses `labels` as columns and the index of self.df.'''
    values = StandardScaler().fit_transform(frame)
    return pd.DataFrame(values, columns=labels, index=self.df.index)

  def develop_features_from_qty_volume(self):
    '''
      Build three features from the bid_qty, ask_qty, buy_qty, sell_qty and
      volume columns: bid/ask imbalance, log1p of the buy/sell ratio, and the
      z-score of volume. The result shares the index of self.df.
    '''
    data = self.df
    eps = self.epsilon
    bid, ask = data['bid_qty'], data['ask_qty']

    features = {
        'imbalance': (bid - ask) / (bid + ask + eps),
        'buy_sell_ratio': np.log1p(data['buy_qty'] / (data['sell_qty'] + eps)),
        'volume_z': zscore(data['volume']),
    }
    return pd.DataFrame(features, index=data.index)

  def standardize_columns(self, columns):
    '''Standardize the given columns and append the untouched label column.'''
    scaled = self._scaled(self.df[columns], columns)
    return pd.concat([scaled, self.df['label']], axis=1)

  def standardize_columns_test(self, columns):
    '''Standardize the given columns of a test frame (no label column is attached).'''
    return self._scaled(self.df[columns], columns)

  def standardize_df(self):
    '''Standardize every column of the frame.'''
    return self._scaled(self.df, self.df.columns)

In [5]:
class PCAProcessor:
  '''Runs a separate PCA on each group of columns of a feature frame.

  X_selected: DataFrame holding every column named in `groups`.
  groups: list of column-name lists, one per PCA.
  path_to_save_results / file_name: concatenated as-is to form the JSON path
    written by transform_data (unused by transform_data_test).

  NOTE(review): every PCA here is fit on X_selected itself. When X_selected is
  a test frame the components are therefore fit on test data, not reused from
  training - confirm this is intended.
  '''

  def __init__(self, X_selected, groups, path_to_save_results, file_name):
    self.X_selected = X_selected
    self.groups = groups
    self.path_to_save_results = path_to_save_results
    self.file_name = file_name

  def _pca_transform(self, g, n_components):
    '''Fit a PCA with n_components on the columns in g and return the projected data.'''
    pca = PCA(n_components=n_components)
    return pca.fit_transform(self.X_selected[g])

  def _pca_explained_variance(self, g):
    '''Return the explained-variance ratio of every component of a full PCA on the columns in g.'''
    pca = PCA()
    pca.fit(self.X_selected[g])
    return pca.explained_variance_ratio_

  def _penalized_score(self, g, alpha = 0.005):
    '''Pick the component count k that maximizes
    (cumulative explained variance of the first k components) - alpha * k.

    Returns (best_k, g); on ties the smallest k wins (np.argmax behaviour).
    '''
    explained_variance = self._pca_explained_variance(g)

    # One cumulative sum instead of re-summing a growing prefix for every k.
    ks = np.arange(1, len(explained_variance) + 1)
    scores = np.cumsum(explained_variance) - alpha * ks

    return np.argmax(scores) + 1, g

  def _best_number_of_components(self, alpha = 0.005):
    '''Finds the best number of components for each group utilizing the penalized score'''
    results = Parallel(n_jobs=-1)(
        delayed(self._penalized_score)(g, alpha) for g in self.groups
    )

    self.n_components = []
    self.groups_features = []

    for n, g in results:
      self.n_components.append(n)
      self.groups_features.append(g)
    return self.n_components, self.groups_features

  def _components_frame(self, groups, n_components):
    '''Project each group onto its requested number of components and join
    the results column-wise. Columns are named G<group>P<component>, both
    1-based, and the frame keeps the index of X_selected.

    Raises ValueError when groups and n_components differ in length, since
    pairing them with zip would otherwise silently drop the surplus entries.
    '''
    groups = list(groups)
    n_components = list(n_components)
    if len(groups) != len(n_components):
      raise ValueError(
          f"Got {len(groups)} feature groups but {len(n_components)} component counts"
      )

    list_with_all_pca = []
    for idx, (g, n) in enumerate(zip(groups, n_components), start=1):
      g_pca = self._pca_transform(g, n)

      col_names = [f"G{idx}P{j+1}" for j in range(n)]
      list_with_all_pca.append(
          pd.DataFrame(g_pca, columns=col_names, index=self.X_selected.index)
      )

    return pd.concat(list_with_all_pca, axis=1)

  def transform_data(self, penalizing_term_alpha = 0.005):
    '''Select the component count per group with the penalized score, project
    every group, and write [n_components, groups] to the JSON results file.

    Returns a df that contains all the pca components.
    '''
    self.n_components, self.groups_features = self._best_number_of_components(penalizing_term_alpha)

    df_pca = self._components_frame(self.groups_features, self.n_components)

    # Save the data to JSON (numpy integers are not JSON serializable, hence int()).
    self.n_components = [int(x) for x in self.n_components]
    best_PCA = [self.n_components, self.groups_features]
    with open(self.path_to_save_results + self.file_name, "w") as f:
        json.dump(best_PCA, f, indent=2)

    return df_pca

  def transform_data_test(self, n_components):
    '''Project every group in self.groups using the given, previously chosen
    component counts (one per group, same order). Nothing is written to disk.

    Raises ValueError when len(n_components) != len(self.groups).
    '''
    return self._components_frame(self.groups, n_components)


In [6]:
class LagCreator:
  '''Creates lagged copies of feature columns.

  lag_periods is the largest lag generated; lags 1..lag_periods are produced.
  '''

  def __init__(self, lag_periods = 5):
    self.lag_periods = lag_periods

  def lag_features(self, X):
    '''Return a frame with columns "<col>_lag<k>" for every column of X and
    every k in 1..lag_periods, ordered column by column. Any row that contains
    a NaN (which includes the first lag_periods rows) is dropped.'''
    lags = range(1, self.lag_periods + 1)
    pairs = ((column, lag) for column in X.columns for lag in lags)

    pieces = []
    for column, lag in pairs:
      series = X[column].shift(lag)
      series.name = f"{column}_lag{lag}"
      pieces.append(series)

    return pd.concat(pieces, axis=1).dropna()

  def match_index_lags(self, X_lagged, Y):
    '''Select from Y (a series with a single column) the entries whose labels
    are in the index of X_lagged.'''
    return Y.loc[X_lagged.index]


In [7]:
class LoadGroups:
  '''Loads PCA component counts and feature groups from a JSON file.

  The file is expected to hold a two-element sequence: element 0 is returned
  as the component counts and element 1 as the feature groups (a list of
  lists). data_path and file_name are concatenated as-is.
  '''

  def __init__(self, data_path, file_name):
    self.data_path = data_path
    self.file_name = file_name
    # Feature groups; filled in the first time the JSON file is read.
    self.g = None

  def _load_json(self):
    '''Return the parsed contents of the JSON file.'''
    with open(self.data_path + self.file_name, 'r') as f:
      return json.load(f)

  def create_components_groups_split(self):
    '''Read the file and return (component_counts, feature_groups).'''
    groups = self._load_json()
    self.g = groups[1]
    return groups[0], self.g

  def flatten_groups(self):
    '''Return every feature name of every group as one flat list.

    Loads the groups from disk if they have not been read yet, so this no
    longer fails when called before create_components_groups_split().
    '''
    if getattr(self, 'g', None) is None:
      self.create_components_groups_split()
    return [item for sublist in self.g for item in sublist]

In [8]:
class ModelPredictor:
  '''Loads a serialized model, predicts with it, and writes a submission CSV.

  model_name and submission_name are file names; both are concatenated as-is
  to `path`, which is used for reading the model and writing the submission.
  '''

  def __init__(self, model_name, submission_name, path):
    self.path = path
    self.model_name = model_name
    self.submission_name = submission_name

  def _load_model(self):
    '''Deserialize the model file with joblib.'''
    # NOTE(security): joblib.load unpickles the file and can execute arbitrary
    # code - only load model files from a trusted source.
    return joblib.load(self.path + self.model_name)

  def predict(self, X):
    '''Load the model, predict on X, and keep the result in self.y for save_results().'''
    self.model = self._load_model()
    self.y = self.model.predict(X)
    return self.y

  def save_results(self, n_pad=5):
    '''Write the predictions from the last predict() call as a CSV with
    columns ID (1-based row number) and prediction, and return that frame.

    n_pad zeros are placed in front of the predictions. The default of 5
    keeps the previous hard-coded behaviour; pass the number of leading rows
    that were removed from the feature frame before prediction (for example
    the lag length) so the submission has one row per original input row.
    '''
    combined = np.concatenate((np.zeros(n_pad), self.y))

    submission = pd.DataFrame({
        'ID': np.arange(1, len(combined) + 1),
        'prediction': combined
    })

    submission.to_csv(self.path + self.submission_name, index=False)

    return submission

In [9]:
# Filepaths
# Every file below is read from dataPath; the submission CSV is written there too.
dataPath = '/content/drive/MyDrive/Colab Notebooks/DRW/data/'
# Test-set features (parquet).
fileNameTest = 'test.parquet'
# JSON read by LoadGroups: element 0 = PCA component counts, element 1 = feature groups.
fileNameGroups = 'best_features_5_1.json'
# Serialized model that ModelPredictor loads with joblib.
modelName = 'final_xgboost_model_5_1.pkl'

In [10]:
# Load the test data
# DataLoader.load() also drops columns that contain -inf or are entirely zero.
dataLoader = DataLoader(dataPath, fileNameTest)
df_test = dataLoader.load()

In [11]:
# Load the number of components and the features for each PCA
# components: one component count per group; groups: one list of column names per group;
# flat_groups: every column name from all groups in a single list.
groupLoader = LoadGroups(dataPath, fileNameGroups)
components, groups = groupLoader.create_components_groups_split()
flat_groups = groupLoader.flatten_groups()

In [12]:
# Features creation
# 1) derive imbalance / buy-sell ratio / volume z-score and standardize them;
# 2) standardize the grouped raw columns that feed the per-group PCA.
# NOTE(review): both scalers are fit on the test frame itself rather than reused
# from training - confirm this matches how the model's inputs were built.
featureEngineer = FeatureEngineer(df_test)
df_qty_vol = featureEngineer.develop_features_from_qty_volume()
standardized_df_qty_vol = FeatureEngineer(df_qty_vol).standardize_df()
df_scaled_X_features = featureEngineer.standardize_columns_test(flat_groups)

In [13]:
# PCA features by groups
# The two '0' arguments fill the save-path/file-name slots, which transform_data_test never uses.
# NOTE(review): transform_data_test fits each PCA on the test frame - confirm this is intended.
PCAProc = PCAProcessor(df_scaled_X_features, groups, '0', '0')
X_pca = PCAProc.transform_data_test(components)

In [14]:
# Create the test feature matrix X: PCA components next to the standardized quantity/volume features
X = pd.concat([X_pca, standardized_df_qty_vol], axis=1)

In [15]:
# Lag the features with lag 5
# LagCreator() defaults to lag_periods=5; lag_features drops every row containing NaN,
# which includes the first 5 rows (they have no complete lag history).
lagger = LagCreator()
X_lagged = lagger.lag_features(X)

In [16]:
# Create a prediction and build a submission
# predict() must run before save_results(): it stores the predictions that save_results() writes.
# save_results() puts 5 zeros in front of the predictions - presumably to stand in for
# the 5 rows dropped by the lagging step; verify the row count against the test set.
prediction = ModelPredictor(modelName, 'submission7.csv', dataPath)

y = prediction.predict(X_lagged)
subm = prediction.save_results()

In [17]:
# Plain-text preview of the submission frame (ID and prediction columns).
print(subm)

            ID  prediction
0            1    0.000000
1            2    0.000000
2            3    0.000000
3            4    0.000000
4            5    0.000000
...        ...         ...
538145  538146   -0.012840
538146  538147    0.037183
538147  538148    0.072205
538148  538149    0.021216
538149  538150   -0.003651

[538150 rows x 2 columns]
