In [1]:
# Local packages
import os
from typing import Union, Tuple, List
import warnings

warnings.filterwarnings("ignore")

# 3rd party modules
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy import stats
from sklearn.preprocessing import StandardScaler, MinMaxScaler

ori_data = pd.read_csv("data/stock.csv")
from src.datautils.data_loading import real_data_loading

In [7]:
data, time = real_data_loading("stock", 150)
data = np.array(data)
print(f"Original data preview:\n{data[:2, :10, :2]}\n")

Original data preview:
[[[0.58561066 0.58714961]
  [0.57986472 0.5831197 ]
  [0.56912581 0.57502714]
  [0.5796437  0.57980907]
  [0.58998155 0.59055009]
  [0.5911766  0.59505413]
  [0.5817391  0.59589611]
  [0.58522598 0.58401885]
  [0.5880662  0.58670248]
  [0.59207694 0.59379531]]

 [[0.17674232 0.17773007]
  [0.17727497 0.17755951]
  [0.17807193 0.17878179]
  [0.18155661 0.18029239]
  [0.1792389  0.1796305 ]
  [0.17845009 0.17766104]
  [0.17842568 0.17719812]
  [0.17375777 0.17692604]
  [0.17324137 0.17268665]
  [0.17415219 0.17320642]]]



In [2]:
if ori_data.columns[0] == "Unnamed: 0":
    ori_data = ori_data.drop(["Unnamed: 0"], axis=1)
ori_data

Unnamed: 0,Idx,Open,High,Low,Close,Adj_Close,Volume
0,0,0.193767,0.194468,0.197752,0.195663,0.195663,0.061479
1,0,0.192324,0.192243,0.195193,0.193290,0.193290,0.064805
2,0,0.195943,0.194814,0.194233,0.191215,0.191215,0.146998
3,0,0.200789,0.200194,0.201437,0.198794,0.198794,0.049875
4,0,0.199065,0.200377,0.203777,0.200473,0.200473,0.063465
...,...,...,...,...,...,...,...
25741,3677,0.161547,0.163753,0.165218,0.163663,0.163663,0.296797
25742,3677,0.160535,0.160911,0.163444,0.160736,0.160736,0.101586
25743,3677,0.162161,0.161215,0.163146,0.159982,0.159982,0.116064
25744,3677,0.159286,0.160870,0.161711,0.161356,0.161356,0.133597


In [3]:
no = ori_data.shape[0]
z_scores = stats.zscore(ori_data, axis=0, nan_policy="omit")
z_filter = np.nanmax(np.abs(z_scores), axis=1) < 3
ori_data = ori_data[z_filter]
ori_data

Unnamed: 0,Idx,Open,High,Low,Close,Adj_Close,Volume
0,0,0.193767,0.194468,0.197752,0.195663,0.195663,0.061479
1,0,0.192324,0.192243,0.195193,0.193290,0.193290,0.064805
2,0,0.195943,0.194814,0.194233,0.191215,0.191215,0.146998
3,0,0.200789,0.200194,0.201437,0.198794,0.198794,0.049875
4,0,0.199065,0.200377,0.203777,0.200473,0.200473,0.063465
...,...,...,...,...,...,...,...
25741,3677,0.161547,0.163753,0.165218,0.163663,0.163663,0.296797
25742,3677,0.160535,0.160911,0.163444,0.160736,0.160736,0.101586
25743,3677,0.162161,0.161215,0.163146,0.159982,0.159982,0.116064
25744,3677,0.159286,0.160870,0.161711,0.161356,0.161356,0.133597


In [4]:
def imputer(
    curr_data: np.ndarray, impute_vals: List, zero_fill: bool = True
) -> np.ndarray:
    """Impute missing data given values for each columns.

    Args:
        curr_data (np.ndarray): Data before imputation.
        impute_vals (list): Values to be filled for each column.
        zero_fill (bool, optional): Whather to Fill with zeros the cases where
            impute_val is nan. Defaults to True.

    Returns:
        np.ndarray: Imputed data.
    """

    curr_data = pd.DataFrame(data=curr_data)
    impute_vals = pd.Series(impute_vals)

    # Impute data
    imputed_data = curr_data.fillna(impute_vals)

    # Zero-fill, in case the `impute_vals` for a particular feature is `nan`.
    imputed_data = imputed_data.fillna(0.0)

    # Check for any N/A values
    if imputed_data.isnull().any().any():
        raise ValueError("NaN values remain after imputation")

    return imputed_data.to_numpy()

In [5]:
scaling_method = "minmax"
impute_method = "median"

if scaling_method == "minmax":
    scaler = MinMaxScaler()
    scaler.fit(ori_data)
    params = [scaler.data_min_, scaler.data_max_]

elif scaling_method == "standard":
    scaler = StandardScaler()
    scaler.fit(ori_data)
    params = [scaler.mean_, scaler.var_]

# Imputation values
if impute_method == "median":
    impute_vals = ori_data.median()
elif impute_method == "mode":
    impute_vals = stats.mode(ori_data).mode[0]
else:
    raise ValueError("Imputation method should be `median` or `mode`")

In [6]:
index = "Idx"
max_seq_len = 100
uniq_id = np.unique(ori_data[index])
padding_value = -1.0
no = len(uniq_id)
dim = len(ori_data.columns) - 1
output = np.empty([no, 100, dim])  # Shape:[no, max_seq_len, dim]
output.fill(padding_value)
time = []
print(ori_data[ori_data[index] == uniq_id[1]])

# For each uniq id

    Idx      Open      High       Low     Close  Adj_Close    Volume
7     1  0.486096  0.496400  0.495676  0.494497   0.494497  0.021747
8     1  0.485228  0.488788  0.492230  0.489877   0.489877  0.015313
9     1  0.483517  0.486737  0.489156  0.487366   0.487366  0.019826
10    1  0.484631  0.485478  0.481034  0.483715   0.483715  0.026271
11    1  0.491080  0.490512  0.486519  0.486341   0.486341  0.025191
12    1  0.482568  0.489402  0.490165  0.488868   0.488868  0.026080
13    1  0.476969  0.484301  0.482233  0.485610   0.485610  0.021698


In [7]:
for i in tqdm(range(no)):
    # Extract the time-series data with a certain admissionid

    curr_data = ori_data[ori_data[index] == uniq_id[i]].to_numpy()

    # Impute missing data
    curr_data = imputer(curr_data, impute_vals)

    # Normalize data
    curr_data = scaler.transform(curr_data)

    # Extract time and assign to the preprocessed data (Excluding ID)
    curr_no = len(curr_data)

    # Pad data to `max_seq_len`
    if curr_no >= max_seq_len:
        output[i, :, :] = curr_data[:max_seq_len, 1:]  # Shape: [1, max_seq_len, dim]
        time.append(max_seq_len)
    else:
        output[i, :curr_no, :] = curr_data[:, 1:]  # Shape: [1, max_seq_len, dim]
        time.append(curr_no)
time

100%|██████████| 3676/3676 [00:02<00:00, 1309.60it/s]


[7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 6,
 7,
 7,
 7,
 7,
 4,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 6,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 6,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 5,
 7,
 7,
 7,
 6,
 7,
 6,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 3,
 7,
 7,
 7,
 4,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 2,
 7,
 7,
 7,
 7,
 7,
 7,
 5,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 3,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 6,
 7,
 7,
 7,
 7,
 7,
 6,
 7,
 7,
 7,
 7,
 7,
 4,
 5,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 6,
 7,
 7,
 7,
 5,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 5,
 7,
 7,


In [9]:
from sklearn.model_selection import train_test_split

train_data, test_data, train_time, test_time = train_test_split(
    output, time, test_size=0.3, random_state=1
)
train_time

[7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 4,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 5,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 6,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 6,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 6,
 7,
 7,
 7,
 2,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 4,
 7,
 7,
 7,
 2,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 6,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 4,
 7,
 7,
 7,
 6,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 2,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 6,
 7,
 7,
 7,
 7,
 7,
 6,
 7,
 7,
 7,
 7,
 7,
 4,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 4,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 5,
 7,
 7,
 7,
 7,
 7,
