In [1]:
# For data processing
import numpy as np
import pandas as pd

# Save features as a pickle file
import pickle

In [2]:
# Read back the pickled list of per-study-period datasets.
# NOTE: pickle.load executes arbitrary code on malicious input; only load trusted files.
with open('../datasets/datasets-list', mode='rb') as fh:
    datasets = pickle.load(fh)

In [3]:
# NOTE: this only binds a second name to the same list object; no copy is made,
# so mutating `datasets_combined` also mutates `datasets`.
datasets_combined = datasets # Create checkpoint
len(datasets_combined)

26

## Features Selection

Let $T_{study}$ denote the total number of days in a study period and $n_i$ represent the number of stocks $s$ in $S$ having complete historical data available at the end of each study period $i$. Moreover, we denote the adjusted closing price and the opening price of any stock $s \in S$ at time $t$ by $cp^{(s)}_t$ and $op^{(s)}_t$, respectively.

Given a prediction day $t:=\tau$, we have the following inputs and prediction task.

Input: We have the historical opening prices, $op^{(s)}_t, t \in \{ 0, 1, ..., \tau -1, \tau\}$, (including the opening price of the prediction day, $op^{(s)}_\tau$) as well as the historical adjusted closing prices, $cp^{(s)}_t, t \in \{ 0, 1, ..., \tau -1\}$, (excluding the closing price of the prediction day, $cp^{(s)}_\tau$).

Task: Out of all n stocks, predict k stocks with highest and k stocks with lowest intraday return $ir_{\tau, 0} = \dfrac{cp_\tau}{op_\tau} - 1$.

**NOTE:** In the original paper, the authors used all the stocks that could be scraped from the web and then split the data into 26 datasets (one per study period). Among these 26 datasets, some may contain all 492 stocks that were originally scraped, while others may contain only 251 stocks. That is why the notation says $s \in S$: each dataset can have a different number of stocks, and that set is a subset of all the originally scraped stocks.

In our case, however, we only keep those stocks which have all entries filled from 1990-01-02 to 2018-12-31. So, we have 251 stocks in every dataset.



For LaTex markdown, refer to this page: [here](https://ashki23.github.io/markdown-latex.html)

### Feature generation for Random Forest

For any stock $s \in S$ and any time $t \in \{ 241, 242, ..., T_{study} \}$, the feature set we provide to the random forest comprises 3 signals:

1. Intraday return: $ir^{(s)}_{t, m} := \dfrac{cp^{(s)}_{t-m}}{op^{(s)}_{t-m}} - 1$,


2. Returns with respect to last closing price: $cr^{(s)}_{t, m} := \dfrac{cp^{(s)}_{t-1}}{cp^{(s)}_{t-1-m}} - 1$,


3. Returns with respect to opening price: $or^{(s)}_{t, m} := \dfrac{op^{(s)}_{t}}{cp^{(s)}_{t-m}} - 1$,

where $m \in \{ 1, 2, 3, ..., 20 \} \cup \{ 40, 60, 80, ..., 240 \}$, which gives $31$ values of $m$ per signal and hence $3 \times 31 = 93$ features. By this choice of $m$, we consider the corresponding returns of each trading day in the first month, whereas for the subsequent 11 months we only consider the corresponding multi-period returns of each month.

![ir-cr-and-or-calculation.png](https://i.postimg.cc/T3sjRDQ1/ir-cr-and-or-calculation.png)

In [4]:
# Helper: element-wise ratio that never leaves uninitialised values behind
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator with broadcasting; 0.0 where denominator == 0."""
    out = np.zeros(np.broadcast(numerator, denominator).shape, dtype=np.float64)
    # `out=` is required together with `where=`; otherwise the masked-out
    # positions would contain arbitrary memory.
    np.divide(numerator, denominator, out=out, where=denominator != 0)
    return out


# This function will generate new features for Random Forest
def generate_features_rf(curr_dataset):
    """Build the 93 random-forest features (ir, cr, or) for one study period.

    Parameters
    ----------
    curr_dataset : array-like, shape (n_stocks, T_study, >=2)
        Prices per stock and trading day. Index 0 on the last axis is read as
        the (adjusted) closing price and index 1 as the opening price.

    Returns
    -------
    numpy.ndarray, shape (n_stocks, T_study, 93), dtype float64
        For every day index ``t`` (0-based) with a complete look-back window,
        the last axis holds, in this order and for each lag ``m`` in
        ``[1..20] U [40, 60, ..., 240]``:

        * ``ir[t, m] = cp[t-m]   / op[t-m]   - 1``   (columns  0..30)
        * ``cr[t, m] = cp[t-1]   / cp[t-1-m] - 1``   (columns 31..61)
        * ``or[t, m] = op[t]     / cp[t-m]   - 1``   (columns 62..92)

        Rows whose look-back window is incomplete (``t < 241``) are NaN.
        A ratio with a zero denominator is treated as 0, i.e. the return is -1.

    Notes
    -----
    The largest look-back is ``t - 1 - 240``, so the first day with valid
    features is ``t = 241``. Starting at 240 would index day ``-1``, which
    NumPy silently wraps to the *last* day of the period (look-ahead leakage).
    """
    prices = np.asarray(curr_dataset, dtype=np.float64)

    # Number of stocks and total amount of days in the current study period
    n_stocks, T_study = prices.shape[0], prices.shape[1]
    print("current dataset has", T_study, " days.")

    close = prices[:, :, 0]
    open_ = prices[:, :, 1]

    # m = [1, 2, 3, ..., 20] U [40, 60, 80, ..., 240]
    M = np.concatenate((np.arange(1, 21), np.arange(40, 241, 20)))

    # Everything starts as NaN; only rows with a full look-back get overwritten
    container = np.full((n_stocks, T_study, M.shape[0] * 3), np.nan)

    # 0-based day indices for which every t-1-m is a valid (non-negative) index
    t = np.arange(M.max() + 1, T_study)
    if t.size == 0:
        # Study period too short to build a single feature row
        return container

    # lagged[j, i] = t[j] - M[i]; indexing a (n_stocks, T_study) array with it
    # yields (n_stocks, n_days, n_lags) without any reshape tricks
    lagged = t[:, None] - M[None, :]

    cp_t_m = close[:, lagged]          # cp_(t-m)
    op_t_m = open_[:, lagged]          # op_(t-m)
    cp_t_1_m = close[:, lagged - 1]    # cp_(t-1-m)

    # Trailing singleton axis so the per-day value broadcasts across all lags
    cp_t_1 = close[:, t - 1][:, :, None]   # cp_(t-1)
    op_t = open_[:, t][:, :, None]         # op_t

    ir_t_m = _safe_ratio(cp_t_m, op_t_m) - 1
    cr_t_m = _safe_ratio(cp_t_1, cp_t_1_m) - 1
    or_t_m = _safe_ratio(op_t, cp_t_m) - 1

    # Put the ir, cr and or inside the container (feature axis: ir | cr | or)
    container[:, t, :] = np.concatenate((ir_t_m, cr_t_m, or_t_m), axis=2)

    return container

In [6]:
# One feature array per study period, each shaped
# (stocks, trading days in that study period, 93), built by applying
# generate_features_rf to every dataset in datasets_combined.
containers = [generate_features_rf(study_period) for study_period in datasets_combined]

current dataset has 1013  days.
current dataset has 1012  days.
current dataset has 1011  days.
current dataset has 1011  days.
current dataset has 1011  days.
current dataset has 1011  days.
current dataset has 1011  days.
current dataset has 1009  days.
current dataset has 1004  days.
current dataset has 1004  days.
current dataset has 1004  days.
current dataset has 1004  days.
current dataset has 1008  days.
current dataset has 1007  days.
current dataset has 1006  days.
current dataset has 1007  days.
current dataset has 1007  days.
current dataset has 1008  days.
current dataset has 1009  days.
current dataset has 1006  days.
current dataset has 1006  days.
current dataset has 1006  days.
current dataset has 1006  days.
current dataset has 1008  days.
current dataset has 1007  days.
current dataset has 1005  days.


In [7]:
# Sanity check: number of feature arrays produced from datasets_combined.
len(containers)

26

In [9]:
# Persist the feature arrays as two pickle files: the first 13 study periods
# go to part1, the remaining ones to part2.
for out_path, chunk in (
    ('../features/rf-features-part1', containers[:13]),
    ('../features/rf-features-part2', containers[13:]),
):
    with open(out_path, 'wb') as fh:
        pickle.dump(chunk, fh)