In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from typing import Dict, List, Optional, Tuple
from joblib import delayed, Parallel
from enum import Enum
from tqdm import tqdm
from numba import njit, prange
import gc


from sklearn.preprocessing import minmax_scale
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans

# Import the Keras libraries and packages
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout

import warnings

# Initialising the RNN: creates an empty Keras Sequential model (no layers are added in this cell).
regressor = Sequential()

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found,
# so the attached datasets are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
# Relative location of the input directory.
DATA_DIR = "../input"

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



/kaggle/input/optiver-trading-at-the-close/public_timeseries_testing_util.py
/kaggle/input/optiver-trading-at-the-close/train.csv
/kaggle/input/optiver-trading-at-the-close/example_test_files/sample_submission.csv
/kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv
/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv
/kaggle/input/optiver-trading-at-the-close/optiver2023/competition.cpython-310-x86_64-linux-gnu.so
/kaggle/input/optiver-trading-at-the-close/optiver2023/__init__.py
/kaggle/input/optiver-v1/__results__.html
/kaggle/input/optiver-v1/submission.csv
/kaggle/input/optiver-v1/base_features.feather
/kaggle/input/optiver-v1/__notebook__.ipynb
/kaggle/input/optiver-v1/__output__.json
/kaggle/input/optiver-v1/kmeans_features.feather
/kaggle/input/optiver-v1/custom.css


In [2]:
# data configurations
# Both flags are passed to get_all_features: when True the corresponding
# feature table is read from a saved feather file instead of being recomputed.
USE_PRECOMPUTE_FEATURES = False  # Load precomputed features for train.csv from private dataset (just for speed up)
USE_PRECOMPUTE_KMEAN = False  # Load precomputed K-means features instead of regenerating them

# Data Extraction

In [3]:
# Load the raw competition training data.
train_data = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/train.csv')

# Drop rows that are missing any of the fields the feature code divides by or logs.
train_data = train_data.dropna(subset=['imbalance_size', 'bid_price', 'ask_price', 'wap'], how='any')

# Missing far/near prices fall back to the reference price.
# Assign the filled column back explicitly: calling fillna(inplace=True) on a
# column selection is chained assignment, which is deprecated and stops
# updating the parent frame under pandas copy-on-write.
for price_col in ('far_price', 'near_price'):
    train_data[price_col] = train_data[price_col].fillna(train_data['reference_price'])

# Re-number rows 0..n-1 after the drop so positional and label indexing agree.
train_data = train_data.reset_index(drop=True)

# Extract Features


## Base Features

In [4]:
from numba import njit, prange
import numpy as np
import gc

@njit(parallel=True)
def get_ema(data, window=14):
    """
    Calculate the Exponential Moving Average (EMA) of every column of a 2-D array.

    The EMA of each column is seeded at row ``window - 1`` with the simple mean
    of the first ``window`` observations and then updated recursively with
    ``alpha = 2 / (window + 1)``. Rows before ``window - 1`` are left at 0.

    Parameters:
    - data (numpy.ndarray): 2-D array of shape (rows, cols); each column is one price series.
    - window (int): EMA calculation window.

    Returns:
    - ema_values (numpy.ndarray): float array of shape (rows, cols) with the EMA values.
      If ``data`` has fewer than ``window`` rows the result is all zeros.
    """
    rows, cols = data.shape
    ema_values = np.zeros((rows, cols))

    # Without this guard the seed write below would index past the end of the
    # array; numba does not bounds-check by default, so that would silently
    # corrupt memory instead of raising.
    if rows < window:
        return ema_values

    alpha = 2 / (window + 1)

    # Columns are independent, so only the column loop is parallel.
    for col in prange(cols):
        ema_values[window - 1, col] = np.mean(data[:window, col])

        # Each step depends on the previous row, so this must be a plain
        # sequential range (prange requires no cross-iteration dependency).
        for i in range(window, rows):
            ema_values[i, col] = (data[i, col] - ema_values[i - 1, col]) * alpha + ema_values[i - 1, col]

    return ema_values

@njit(parallel=True)
def get_rsi(data, period=14):
    """
    Calculate the Relative Strength Index (RSI) of every column of a 2-D array.

    For each column the row-to-row differences are split into gains and losses.
    The first ``period`` entries of the average gain/loss are set to the mean of
    the first ``period`` gains/losses; later entries are smoothed recursively as
    ``(previous * (period - 1) + current) / period``.

    Parameters:
    - data (numpy.ndarray): 2-D array of shape (rows, cols); each column is one price series.
    - period (int): RSI calculation period.

    Returns:
    - rsi_values (numpy.ndarray): float array of shape (rows, cols) with the RSI values.
      Entries where the average loss is 0 come from a division by zero
      (NaN when the average gain is also 0).
    """
    rows, cols = data.shape
    rsi_values = np.zeros((rows, cols))

    # Columns are independent, so only the column loop is parallel.
    for col in prange(cols):
        # First difference of the series; the first row has no predecessor and stays 0.
        delta = np.zeros(rows)
        delta[1:] = data[1:, col] - data[:-1, col]

        gain = np.where(delta > 0, delta, 0)
        loss = -np.where(delta < 0, delta, 0)

        avg_gain = np.zeros(rows)
        avg_loss = np.zeros(rows)

        # Seed the smoothing with the plain mean over the first `period` rows.
        avg_gain[:period] = np.mean(gain[:period])
        avg_loss[:period] = np.mean(loss[:period])

        # Each step depends on the previous one, so this must be a plain
        # sequential range (prange requires no cross-iteration dependency).
        for i in range(period, rows):
            avg_gain[i] = (avg_gain[i - 1] * (period - 1) + gain[i]) / period
            avg_loss[i] = (avg_loss[i - 1] * (period - 1) + loss[i]) / period

        rs = avg_gain / avg_loss
        rsi_values[:, col] = 100 - (100 / (1 + rs))

    return rsi_values

@njit(parallel=True)
def get_macd(data, short_window=12, long_window=26, signal_window=9):
    """
    Calculate Moving Average Convergence Divergence (MACD) for every column of a 2-D array.

    The short and long EMAs start from 0 at row 0 and are updated recursively
    from row 1 onward. The MACD line is ``short_ema - long_ema``, the signal
    line is an EMA of the MACD line (also starting from 0 at row 0), and the
    histogram is their difference. Row 0 of all three outputs is therefore 0.

    Parameters:
    - data (numpy.ndarray): 2-D array of shape (rows, cols); each column is one price series.
    - short_window (int): Short-term EMA window for MACD calculation.
    - long_window (int): Long-term EMA window for MACD calculation.
    - signal_window (int): Signal line window for MACD calculation.

    Returns:
    - macd_values (numpy.ndarray): MACD line, shape (rows, cols).
    - signal_line_values (numpy.ndarray): Signal line, shape (rows, cols).
    - histogram_values (numpy.ndarray): MACD histogram (MACD minus signal), shape (rows, cols).
    """
    rows, cols = data.shape
    macd_values = np.zeros((rows, cols))
    signal_line_values = np.zeros((rows, cols))
    histogram_values = np.zeros((rows, cols))

    short_alpha = 2 / (short_window + 1)
    long_alpha = 2 / (long_window + 1)
    signal_alpha = 2 / (signal_window + 1)

    # Columns are independent, so only the column loop is parallel.
    for col in prange(cols):
        # NOTE(review): the EMAs are seeded with 0 at row 0 rather than with the
        # first observation, so early rows are biased toward 0. This matches the
        # existing feature values and is kept as is - confirm whether it is intended.
        short_ema = np.zeros(rows)
        long_ema = np.zeros(rows)
        signal_line = np.zeros(rows)

        # Sequential recursion: each value depends on the previous row, so a
        # plain range is required (prange forbids cross-iteration dependencies).
        for i in range(1, rows):
            short_ema[i] = (data[i, col] - short_ema[i - 1]) * short_alpha + short_ema[i - 1]
            long_ema[i] = (data[i, col] - long_ema[i - 1]) * long_alpha + long_ema[i - 1]

        macd_values[:, col] = short_ema - long_ema

        for i in range(1, rows):
            signal_line[i] = (macd_values[i, col] - signal_line[i - 1]) * signal_alpha + signal_line[i - 1]

        signal_line_values[:, col] = signal_line
        histogram_values[:, col] = macd_values[:, col] - signal_line

    return macd_values, signal_line_values, histogram_values

@njit(parallel=True)
def get_bband(data, window=20, num_std_dev=2):
    """
    Compute rolling Bollinger Bands for every column of a 2-D array.

    For each row that has a full trailing window, the middle band is the mean
    of the window and the upper/lower bands sit ``num_std_dev`` standard
    deviations above/below it. Rows without a full window stay at 0.

    Parameters:
    - data (numpy.ndarray): 2-D array of shape (rows, cols); each column is one price series.
    - window (int): Length of the trailing window.
    - num_std_dev (int): Band width in standard deviations.

    Returns:
    - upper_bands (numpy.ndarray): Upper band, same shape and dtype as ``data``.
    - mid_bands (numpy.ndarray): Middle band (rolling mean), same shape and dtype as ``data``.
    - lower_bands (numpy.ndarray): Lower band, same shape and dtype as ``data``.
    """
    n_obs, n_series = data.shape
    mid_bands = np.zeros_like(data)
    upper_bands = np.zeros_like(data)
    lower_bands = np.zeros_like(data)

    # Index of the first row for which a complete window is available.
    first_full = window - 1

    for col in prange(n_series):
        for end in prange(first_full, n_obs):
            start = end - first_full
            chunk = data[start : end + 1, col]

            mid_bands[end, col] = np.mean(chunk)
            spread = np.std(chunk)

            # Bands are offset from the stored middle value.
            upper_bands[end, col] = mid_bands[end, col] + num_std_dev * spread
            lower_bands[end, col] = mid_bands[end, col] - num_std_dev * spread

    return upper_bands, mid_bands, lower_bands

In [5]:
def generate_base_features(df):
    """
    Add technical-analysis, log-return and order-book features to ``df``.

    Note that the new columns are written into ``df`` itself; the returned
    frame is a column selection of it.

    Parameters:
    - df (DataFrame): Base dataframe. The code reads 'stock_id',
      'seconds_in_bucket', the six price columns listed in ``prices`` and the
      size columns referenced in the expressions below.

    Returns:
    - DataFrame with every column of ``df`` except "row_id" and "target"
    """

    # ------------------------------ Technical Analysis and Log returns  ------------------------------

    prices = ['reference_price', 'far_price', 'near_price', 'bid_price', 'ask_price', 'wap']

    def prefixed(prefix, columns):
        # Build the output column names for one indicator, e.g. 'ema_wap'.
        return [f'{prefix}_{col}' for col in columns]

    # Indicators are computed per stock over that stock's rows in frame order.
    for _, stock_prices in tqdm(df.groupby('stock_id')[prices]):
        rows = stock_prices.index
        price_cols = stock_prices.columns

        # EMA
        df.loc[rows, prefixed('ema', price_cols)] = get_ema(stock_prices.values)

        # RSI
        df.loc[rows, prefixed('rsi', price_cols)] = get_rsi(stock_prices.values)

        # MACD line, signal line and histogram
        macd, macd_signal, macd_hist = get_macd(stock_prices.values)
        df.loc[rows, prefixed('macd', price_cols)] = macd
        df.loc[rows, prefixed('macd_sig', price_cols)] = macd_signal
        df.loc[rows, prefixed('macd_hist', price_cols)] = macd_hist

        # Bollinger Bands
        band_upper, band_mid, band_lower = get_bband(stock_prices.values, window=20, num_std_dev=2)
        df.loc[rows, prefixed('bband_upper', price_cols)] = band_upper
        df.loc[rows, prefixed('bband_mid', price_cols)] = band_mid
        df.loc[rows, prefixed('bband_lower', price_cols)] = band_lower

        # Log Returns (first row of each stock has no predecessor and is set to 0)
        for col in prices:
            df.loc[rows, f'log_return_{col}'] = np.log(stock_prices[col]).diff().fillna(0)

    # ----------------------------------------------------------------------------------------------

    # 0 for the first 300 seconds of the bucket, 1 afterwards.
    df['state'] = np.where(df['seconds_in_bucket'] < 300, 0, 1)

    # Row-wise derived columns, evaluated in this order.
    derived_expressions = (
        ('volume', '(ask_size / ask_price) + (bid_size / bid_price)'),
        ('mid_price', '(ask_price + bid_price) / 2'),
        ('price_spread', 'ask_price - bid_price'),
        ('far_near_diff', '(far_price - near_price)'),
        ('ask_bid_size_imb', '(bid_size - ask_size) / (bid_size + ask_size)'),
        ('imb_mch_size_imb', '(imbalance_size - matched_size)/(imbalance_size + matched_size)'),
    )
    for name, expression in derived_expressions:
        df[name] = df.eval(expression)

    excluded = ["row_id", "target"]
    features = [c for c in df.columns if c not in excluded]

    gc.collect()
    return df[features]

## K-means Features

Next, we use K-means clustering on the correlation matrix of WAP log returns between stocks to divide the stocks into clusters. Once we have these clusters, we compute per-cluster statistics (mean, max, min, std, and skew) of selected features at each `time_id` to generate new features.

In [6]:
def generate_kmeans_features(base_feature_df, n_clusters=7):
    """
    Cluster stocks with K-means and append per-cluster aggregate features.

    Stocks are clustered on the rows of the stock-by-stock correlation matrix
    of 'log_return_wap'. Every row of ``base_feature_df`` gets a 'group_id'
    (the cluster of its stock) and, for each statistic in
    mean/max/min/std/skew, one 'kmeans_<feature>_<stat>' column holding that
    statistic over all stocks of the same cluster at the same 'time_id'.

    Parameters:
    - base_feature_df (DataFrame): DataFrame with base features. Must contain
      'time_id', 'stock_id' and the aggregated feature columns listed below.
    - n_clusters (int): Number of K-means clusters (default 7).

    Returns:
    - DataFrame with additional features (input rows and order are preserved)
    """

    # ---------------------------Create Group ID --------------------------------------
    # One column of WAP log returns per stock, one row per time_id.
    wap_returns = base_feature_df.pivot(index='time_id', columns='stock_id', values='log_return_wap')

    # Each stock is described by its vector of correlations with every stock.
    corr = wap_returns.corr()

    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init=7).fit(corr.values)

    # stock_id -> cluster label (plain ints so the mapped column is int64).
    mapping = {stock_id: int(label) for stock_id, label in zip(corr.index, kmeans.labels_)}

    # Vectorised lookup instead of a row-wise apply; stocks without a cluster get -1.
    group_ids = base_feature_df['stock_id'].map(mapping).fillna(-1).astype('int64')
    feature_with_groupID_df = base_feature_df.assign(group_id=group_ids)

    gc.collect()
    # ---------------------------Merge with base features -----------------------------

    # Features aggregated within each (time_id, cluster) group, in output column order.
    agg_columns = ['imbalance_buy_sell_flag', 'wap', 'log_return_wap',
                   'ask_bid_size_imb', 'imb_mch_size_imb', 'far_near_diff',
                   'log_return_bid_price', 'log_return_ask_price']

    # Only rows that belong to a real cluster contribute to the aggregates;
    # unassigned rows (-1) receive NaN through the left merge below.
    clustered = feature_with_groupID_df
    if (group_ids == -1).any():
        clustered = clustered[clustered['group_id'] != -1]

    # A single groupby on (time_id, group_id) replaces filtering and grouping
    # the frame once per cluster and concatenating the pieces.
    grouped = clustered.groupby(['time_id', 'group_id'])[agg_columns]

    kmeans_df = feature_with_groupID_df
    for stat in ['mean', 'max', 'min', 'std', 'skew']:
        # Reduce precision of the aggregates to save memory.
        kmeans_data = grouped.agg(stat).astype(np.float32)

        # Prefix/suffix the aggregated columns so they do not clash with the base features.
        kmeans_data = kmeans_data.rename(columns={col: f'kmeans_{col}_{stat}' for col in agg_columns})
        kmeans_data = kmeans_data.reset_index()

        # Broadcast the cluster-level statistic back to every row of that cluster and time_id.
        kmeans_df = kmeans_df.merge(kmeans_data, on=['time_id', 'group_id'], how='left')
        gc.collect()

    return kmeans_df
    

In [7]:
def get_all_features(train_data, USE_PRECOMPUTE_FEATURES=True, USE_PRECOMPUTE_KMEAN=True):
    """
    Get the dataframe that combines all features

    Parameters:
    - train_data (DataFrame): The original dataframe downloaded from csv file
    - USE_PRECOMPUTE_FEATURES (bool): If True, read the base features from the
      saved feather file; otherwise compute them from ``train_data`` and write
      them to "base_features.feather" in the working directory.
    - USE_PRECOMPUTE_KMEAN (bool): If True, read the K-means features from the
      saved feather file; otherwise compute them from the base features and
      write them to "kmeans_features.feather" in the working directory.

    Returns:
    - DataFrame with the base features plus the K-means features
    """

    # ----------------------------------------Base features---------------------------------------
    # Extract important base features and store them into base_feature_df
    if USE_PRECOMPUTE_FEATURES:
        # Same dataset directory as the precomputed K-means features below.
        base_feature_df = pd.read_feather("/kaggle/input/optiver-v1/base_features.feather")
    else:
        base_feature_df = generate_base_features(train_data)
        base_feature_df.to_feather("base_features.feather")

    gc.collect()

    # --------------------------------------KMeans features---------------------------------------

    if USE_PRECOMPUTE_KMEAN:
        kmeans_df = pd.read_feather("/kaggle/input/optiver-v1/kmeans_features.feather")
    else:
        kmeans_df = generate_kmeans_features(base_feature_df)
        kmeans_df.to_feather("kmeans_features.feather")

    gc.collect()
    return kmeans_df

In [8]:
# Build the combined feature table (base features plus K-means features);
# the two flags choose between loading saved feather files and recomputing.
features_df = get_all_features(train_data, USE_PRECOMPUTE_FEATURES, USE_PRECOMPUTE_KMEAN)
gc.collect()

# Show the resulting DataFrame as the cell output.
features_df

100%|██████████| 200/200 [00:34<00:00,  5.77it/s]
  if _pandas_api.is_sparse(col):
  if _pandas_api.is_sparse(col):


Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,...,kmeans_log_return_bid_price_std,kmeans_log_return_ask_price_std,kmeans_imbalance_buy_sell_flag_skew,kmeans_wap_skew,kmeans_log_return_wap_skew,kmeans_ask_bid_size_imb_skew,kmeans_imb_mch_size_imb_skew,kmeans_far_near_diff_skew,kmeans_log_return_bid_price_skew,kmeans_log_return_ask_price_skew
0,0,0,0,3180602.69,1,0.999812,13380276.64,0.999812,0.999812,0.999812,...,0.000000,0.000000,-0.057484,0.000000,0.000000,0.266765,-0.076977,0.000000,0.000000,0.000000
1,1,0,0,166603.91,-1,0.999896,1642214.25,0.999896,0.999896,0.999896,...,0.000000,0.000000,-0.280947,0.000000,0.000000,0.064381,0.619109,0.000000,0.000000,0.000000
2,2,0,0,302879.87,-1,0.999561,1819368.03,0.999561,0.999561,0.999403,...,0.000000,0.000000,0.162070,0.000000,0.000000,-0.150177,0.872909,0.000000,0.000000,0.000000
3,3,0,0,11917682.27,-1,1.000171,18389745.62,1.000171,1.000171,0.999999,...,0.000000,0.000000,-0.057484,0.000000,0.000000,0.266765,-0.076977,0.000000,0.000000,0.000000
4,4,0,0,447549.96,-1,0.999532,17860614.95,0.999532,0.999532,0.999394,...,0.000000,0.000000,-0.057484,0.000000,0.000000,0.266765,-0.076977,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237755,195,480,540,2440722.89,-1,1.000317,28280361.74,0.999734,0.999734,1.000317,...,0.000162,0.000147,0.054388,-0.435793,-0.296564,0.487839,0.748021,-0.591784,1.475623,1.971861
5237756,196,480,540,349510.47,-1,1.000643,9187699.11,1.000129,1.000386,1.000643,...,0.000162,0.000147,0.054388,-0.435793,-0.296564,0.487839,0.748021,-0.591784,1.475623,1.971861
5237757,197,480,540,0.00,0,0.995789,12725436.10,0.995789,0.995789,0.995789,...,0.000189,0.000193,-0.245740,-1.269168,-0.889086,0.392227,0.964835,0.780605,-1.330022,-0.676966
5237758,198,480,540,1000898.84,1,0.999210,94773271.05,0.999210,0.999210,0.998970,...,0.000124,0.000120,0.404687,0.279527,-3.608968,0.488096,0.982521,0.478664,-4.117099,-4.271659


## Nearest-Neighbor Features

Then we build nearest-neighbor features based on different metrics for both time-id and stock-id pairs in the dataset.

### Class Hierarchy:
#### Neighbors Class:
1. Represents a general structure for nearest neighbors.
2. Generates random neighbors if the metric is 'random', otherwise uses k-Nearest Neighbors (kNN) algorithm.
3. Has methods for rearranging feature values and creating aggregated nearest neighbor features.

#### TimeIdNeighbors Class (Inherits from Neighbors):
1. Specialized version for time-id based nearest neighbors.
2. Implements the rearrange_feature_values method to process feature data for time-id pairs.

#### StockIdNeighbors Class (Inherits from Neighbors):
1. Specialized version for stock-id based nearest neighbors.
2. Implements the rearrange_feature_values method to process feature data for stock-id pairs.

In [9]:
class Neighbors:
    """
    Base class holding a nearest-neighbor index over the rows of a feature pivot.

    Subclasses define how the raw frame is pivoted (``process_feature_data``)
    and how feature values are gathered over the neighbors
    (``rearrange_feature_values``).
    """

    def __init__(self,
                name: str,
                df: pd.DataFrame,
                feature_col: str,
                p: float,
                metric: str = 'minkowski',
                metric_params: Optional[Dict] = None,
                exclude_self: bool = True,
                n_neighbors_max: int = 80):
        """
        Initialize Neighbors class.

        Parameters:
        - name: Identifier for the neighbor type.
        - df: DataFrame containing feature data.
        - feature_col: Name of feature
        - p: Parameter for the Minkowski distance metric.
        - metric: Type of distance metric to be used; 'random' draws random
          row positions instead of running kNN.
        - metric_params: Additional parameters for the distance metric.
        - exclude_self: Flag indicating whether to exclude self from neighbors.
          It is only stored on the instance here; the neighbor search itself
          does not apply it.
        - n_neighbors_max: Maximum number of neighbors to consider.
        """

        self.name = name
        self.exclude_self = exclude_self
        self.p = p
        self.n_neighbors_max = n_neighbors_max
        self.metric = metric

        # Generate pivot dataframe (one row per entity to find neighbors for)
        pivot = self.process_feature_data(df, feature_col)

        # Generate random neighbors or use kNN.
        # self.neighbors has shape (len(pivot), n_neighbors_max) and holds row positions of pivot.
        if metric == 'random':
            self.neighbors = np.random.randint(len(pivot), size=(len(pivot), n_neighbors_max))
        else:
            # The index is queried with the same rows it was fitted on, so each
            # row's own position is among the candidates returned for it.
            nn = NearestNeighbors(n_neighbors=n_neighbors_max, p=p, metric=metric, metric_params=metric_params)
            nn.fit(pivot)
            _, self.neighbors = nn.kneighbors(pivot, return_distance=True)

        # Placeholder for feature-related attributes (filled by rearrange_feature_values)
        self.columns = self.index = self.feature_values = self.feature_col = None

    def process_feature_data(self, df: pd.DataFrame, feature_col: str) -> pd.DataFrame:
        """
        Placeholder method for turning raw data into the pivot used for the neighbor search.

        Declared here because __init__ calls it; subclasses must override it.

        Parameters:
        - df (DataFrame): DataFrame containing raw data.
        - feature_col (str): Name of the feature column to process.

        Returns:
        - Processed feature pivot DataFrame.
        """
        raise NotImplementedError()

    def rearrange_feature_values(self, df: pd.DataFrame, feature_col: str) -> None:
        """
        Placeholder method for rearranging feature values based on neighbors.

        Parameters:
        - df (DataFrame): DataFrame containing raw data.
        - feature_col (str): Name of the feature column to process.
        """
        raise NotImplementedError()
    
    
class TimeIdNeighbors(Neighbors):
    """Nearest neighbors between time_ids, each described by its per-stock feature values."""

    def rearrange_feature_values(self, df: pd.DataFrame, feature_col: str) -> None:
        """
        Process time-id based nearest neighbor features.

        Stores the pivot's column/index labels and, for every time row, the sum
        of the feature over that row's neighbors.

        Parameters:
        - df (DataFrame): DataFrame containing raw data.
        - feature_col (str): Name of the feature column to process.
        """

        # Process feature data for time-id neighbors
        feature_pivot = self.process_feature_data(df, feature_col)

        self.columns, self.index, self.feature_col = list(feature_pivot.columns), list(feature_pivot.index), feature_col

        # Row-indexing with the (n_rows, k) neighbor matrix gives (n_rows, k, n_cols);
        # summing over axis 1 collapses the neighbor dimension.
        self.feature_values = np.sum(feature_pivot.values[self.neighbors, :], axis=1)

    def process_feature_data(self, df: pd.DataFrame, feature_col: str) -> pd.DataFrame:
        """
        Process feature data for time-id neighbors.

        Pivots to one row per time_id and one column per stock_id, fills
        missing cells with the column mean and min-max scales the result.

        Parameters:
        - df (DataFrame): DataFrame containing raw data.
        - feature_col (str): Name of the feature column to process.

        Returns:
        - Processed feature pivot DataFrame (default integer row/column labels).
        """
        # Pivot once and reuse it for both the values and the fill means.
        pivot = df.pivot(index='time_id', columns='stock_id', values=feature_col)
        scaled = minmax_scale(pivot.fillna(pivot.mean()))
        return pd.DataFrame(scaled)

    def __repr__(self) -> str:
        return f"time-id NN (name={self.name}, metric={self.metric}, p={self.p})"
    
class StockIdNeighbors(Neighbors):
    """Nearest neighbors between stock_ids, each described by its per-time feature values."""

    def rearrange_feature_values(self, df: pd.DataFrame, feature_col: str) -> None:
        """
        Process stock-id based nearest neighbor features.

        Stores the pivot's column/index labels and, for every stock row, the
        mean of the feature over that row's neighbors. The result has the same
        shape as the pivot (one row per stock, one column per time_id).

        Parameters:
        - df (DataFrame): DataFrame containing raw data.
        - feature_col (str): Name of the feature column to process.
        """
        # Process feature data for stock-id neighbors
        feature_pivot = self.process_feature_data(df, feature_col)

        self.columns, self.index, self.feature_col = list(feature_pivot.columns), list(feature_pivot.index), feature_col

        # self.neighbors holds ROW positions of the stock-by-time pivot, so it
        # must index axis 0. Row-indexing with the (n_stocks, k) matrix gives
        # (n_stocks, k, n_times); averaging over axis 1 collapses the neighbors.
        # Indexing the time axis and clipping the indices (as done previously)
        # mixed stock positions with time columns and hid the mismatch.
        self.feature_values = np.mean(feature_pivot.values[self.neighbors, :], axis=1)

    def process_feature_data(self, df: pd.DataFrame, feature_col: str) -> pd.DataFrame:
        """
        Process feature data for stock-id neighbors.

        Pivots to one row per stock_id and one column per time_id, fills
        missing cells with the column mean and min-max scales the result.

        Parameters:
        - df (DataFrame): DataFrame containing raw data.
        - feature_col (str): Name of the feature column to process.

        Returns:
        - Processed feature pivot DataFrame (default integer row/column labels).
        """
        # Pivot once and reuse it for both the values and the fill means.
        pivot = df.pivot(index='stock_id', columns='time_id', values=feature_col)
        scaled = minmax_scale(pivot.fillna(pivot.mean()))
        return pd.DataFrame(scaled)

    def __repr__(self) -> str:
        return f"stock-id NN (name={self.name}, metric={self.metric}, p={self.p})"

In [10]:
def get_mapping(nn_type, feature, n_neighbors=3, p=2, df=None):
    """
    Build a lookup from row position to the positions of its nearest neighbors.

    Parameters
    - nn_type (str): 'time'/'stock'
    - feature (str): Feature column the neighbor search is based on
    - n_neighbors (int): Number of neighbors returned per entry
    - p (float): Power parameter of the Minkowski metric
    - df (DataFrame, optional): Frame to search in; defaults to the
      notebook-level ``features_df``.

    Returns:
    - A dict that mapping stock_id/time_id to the array of their neighbors' index.
      Keys and values are row positions in the pivot built by the neighbor class.

    Raises:
    - ValueError: If ``nn_type`` is neither 'time' nor 'stock'.
    """

    # Validate first so an unknown type fails with a clear message instead of
    # an unbound local variable further down.
    if nn_type not in ('time', 'stock'):
        raise ValueError(f"nn_type must be 'time' or 'stock', got {nn_type!r}")

    if df is None:
        df = features_df

    if nn_type == 'time':
        nn = TimeIdNeighbors(name="time_id_neighbors", df=df, feature_col=feature, p=p, metric="minkowski", n_neighbors_max=n_neighbors)
    else:
        nn = StockIdNeighbors(name="stock_id_neighbors", df=df, feature_col=feature, p=p, metric="minkowski", n_neighbors_max=n_neighbors)

    # One entry per row of the neighbor matrix: position -> array of neighbor positions.
    return dict(enumerate(nn.neighbors))

In [11]:
# Stock-level neighbor lookup based on WAP log returns, using get_mapping's
# defaults (n_neighbors=3, p=2); the bare name displays the dict as cell output.
test = get_mapping('stock', 'log_return_wap')
test

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


{0: array([  0, 160, 105]),
 1: array([  1, 130, 160]),
 2: array([  2, 105, 160]),
 3: array([  3, 160, 105]),
 4: array([  4, 160, 130]),
 5: array([  5,  45, 130]),
 6: array([  6, 160, 105]),
 7: array([  7, 141, 140]),
 8: array([  8, 130, 160]),
 9: array([  9, 160, 105]),
 10: array([ 10, 160, 105]),
 11: array([ 11, 112,  45]),
 12: array([ 12, 123, 160]),
 13: array([ 13, 141, 140]),
 14: array([ 14,  45, 112]),
 15: array([ 15, 130, 112]),
 16: array([ 16, 160, 105]),
 17: array([ 17, 141, 140]),
 18: array([ 18, 130, 160]),
 19: array([ 19, 160, 105]),
 20: array([ 20, 160, 105]),
 21: array([ 21, 160, 105]),
 22: array([ 22, 183, 140]),
 23: array([ 23, 140, 183]),
 24: array([ 24, 160, 105]),
 25: array([ 25, 160, 105]),
 26: array([ 26,  49, 160]),
 27: array([ 27,  23, 140]),
 28: array([ 28, 105, 160]),
 29: array([ 29, 160, 105]),
 30: array([ 30, 148, 195]),
 31: array([ 31, 105, 160]),
 32: array([ 32, 160, 105]),
 33: array([ 33, 130, 151]),
 34: array([ 34, 160, 12