In [1]:
import sklearn
import pandas as pd
import numpy as np
from datetime import datetime,timedelta
import collections
from itertools import chain
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn import set_config
from sklearn.compose import make_column_selector, make_column_transformer
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

# 1. Data
### 1.1 Load/Import Data


Fill in the local data paths below.

In [4]:
# Relative locations of the raw trade and quote CSV extracts loaded below.
TRADE_PATH = '../data/raw_data/2020-01-02/AAPL_trades.csv'
QUOTE_PATH = '../data/raw_data/2020-01-02/AAPL_quotes.csv'

In [5]:
# Read the raw trade and quote CSV files (in that order) with pandas' default parsing.
trades, quotes = (
    pd.read_csv(Path(csv_location)) for csv_location in (TRADE_PATH, QUOTE_PATH)
)

In [6]:
# Summarize the raw trades frame: column names, non-null counts and dtypes.
trades.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 20 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   Unnamed: 0.1                            100000 non-null  int64  
 1   Unnamed: 0                              100000 non-null  int64  
 2   Time                                    100000 non-null  object 
 3   Date                                    100000 non-null  object 
 4   Exchange                                100000 non-null  object 
 5   Symbol                                  100000 non-null  object 
 6   Trade_Volume                            100000 non-null  int64  
 7   Trade_Price                             100000 non-null  float64
 8   Sale_Condition                          100000 non-null  object 
 9   Source_of_Trade                         100000 non-null  object 
 10  Trade_Stop_Stock_Indicator              0 non

Note: All column information of trades and quotes data and valid entries for each column can be found at https://www.nyse.com/publicdocs/nyse/data/Daily_TAQ_Client_Spec_v3.0.pdf

In [7]:
# Preview the first five raw trade records.
trades.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Time,Date,Exchange,Symbol,Trade_Volume,Trade_Price,Sale_Condition,Source_of_Trade,Trade_Stop_Stock_Indicator,Trade_Correction_Indicator,Sequence_Number,Trade_Id,Trade_Reporting_Facility,Participant_Timestamp,Trade_Reporting_Facility_TRF_Timestamp,Trade_Through_Exempt_Indicator,Date.1,YearMonth
0,0,0,2020-01-02 04:00:00.064010,2020-01-02,P,AAPL,3801,295.05,@ T,N,,0,1185,1,,40000063617792,,1,2020-01-02,202001
1,1,1,2020-01-02 04:00:02.828485,2020-01-02,P,AAPL,1,295.08,@FTI,N,,0,1195,2,,40002828108800,,1,2020-01-02,202001
2,2,2,2020-01-02 04:00:06.250392,2020-01-02,Q,AAPL,6,295.25,@ TI,N,,0,1197,1,,40006250366823,,0,2020-01-02,202001
3,3,3,2020-01-02 04:00:06.429757,2020-01-02,P,AAPL,1,295.08,@ TI,N,,0,1198,3,,40006429377792,,0,2020-01-02,202001
4,4,4,2020-01-02 04:00:28.894835,2020-01-02,P,AAPL,3,295.1,@ TI,N,,0,1205,4,,40028894459136,,0,2020-01-02,202001


In [8]:
# Preview the first five raw quote records.
# NOTE(review): the saved output of this cell shows Symbol WISH dated 2023-04-13,
# while QUOTE_PATH names an AAPL 2020-01-02 file -- confirm the quotes file holds
# the same symbol/day as the trades before merging the two streams.
quotes.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Time,Exchange,Symbol,Bid_Price,Bid_Size,Offer_Price,Offer_Size,Quote_Condition,Sequence_Number,National_BBO_Indicator,FINRA_BBO_Indicator,FINRA_ADF_MPID_Indicator,Quote_Cancel_Correction,Source_Of_Quote,Retail_Interest_Indicator,Short_Sale_Restriction_Indicator,LULD_BBO_Indicator,SIP_Generated_Message_Identifier,NBBO_LULD_Indicator,Participant_Timestamp,FINRA_ADF_Timestamp,FINRA_ADF_Market_Participant_Quote_Indicator,Security_Status_Indicator,Date,YearMonth
0,0,0,2023-04-13 03:59:00.017425,Z,WISH,0.0,0.0,0.0,0.0,L,932,1,,,,N,,2,,,,35900015280000,,,,2023-04-13,202304
1,1,1,2023-04-13 03:59:00.171749,K,WISH,0.0,0.0,0.0,0.0,L,2279,1,,,,N,,2,,,,35900170533000,,,,2023-04-13,202304
2,2,2,2023-04-13 04:00:00.008906,K,WISH,5.0,1.0,9.81,2.0,R,3756,4,,,,N,,2,,,,40000000902000,,,,2023-04-13,202304
3,3,3,2023-04-13 04:00:00.008930,K,WISH,7.5,1.0,9.81,2.0,R,3758,4,,,,N,,2,,,,40000000902000,,,,2023-04-13,202304
4,4,4,2023-04-13 04:00:00.008986,K,WISH,7.5,1.0,8.4,1.0,R,3765,4,,,,N,,2,,,,40000000902000,,,,2023-04-13,202304


### 1.2 Data Visualization & Preliminary Analysis

### 1.3 Data Cleaning

Before we move towards feature generation and building machine learning models, we have to clean the dataset. The necessary steps to clean the trades and quotes data include:
1. Get rid of unnecessary columns.
2. Get rid of the invalid trades and quotes.
3. Event reconstruction.
4. Only keep the national best bid/offer (NBBO) or last MQU.
5. Identify the last active quote and assign it to the corresponding trade.

## SCIKIT-LEARN DESIGN

https://arxiv.org/pdf/1309.0238.pdf

Scikit-Learn’s API is remarkably well designed. These are the main design components of Scikit-Learn.

All objects share a consistent and simple interface:

### Estimators

Any object that can estimate some parameters based on a dataset is called an estimator (e.g., a SimpleImputer is an estimator). The estimation itself is performed by the fit() method, and it takes a dataset as a parameter, or two for supervised learning algorithms—the second dataset contains the labels. Any other parameter needed to guide the estimation process is considered a hyperparameter (such as a SimpleImputer’s strategy), and it must be set as an instance variable (generally via a constructor parameter).

### Transformers

Some estimators (such as a SimpleImputer) can also transform a dataset; these are called transformers. Once again, the API is simple: the transformation is performed by the transform() method with the dataset to transform as a parameter. It returns the transformed dataset. This transformation generally relies on the learned parameters, as is the case for a SimpleImputer. All transformers also have a convenience method called fit_transform(), which is equivalent to calling fit() and then transform() (but sometimes fit_transform() is optimized and runs much faster).


### Predictors

Finally, some estimators, given a dataset, are capable of making predictions; they are called predictors. For example, the LinearRegression model in the previous chapter was a predictor: given a country’s GDP per capita, it predicted life satisfaction. A predictor has a predict() method that takes a dataset of new instances and returns a dataset of corresponding predictions. It also has a score() method that measures the quality of the predictions, given a test set (and the corresponding labels, in the case of supervised learning algorithms).

### ...

Reference to the base classes for all estimators in scikit-learn can be found at: https://github.com/scikit-learn/scikit-learn/blob/9aaed4987/sklearn/base.py#L153

In [9]:
from generators import *
from clean_data import *
from feature_generation import *
from preprocess_data import *

In [10]:
# Single-stage pipeline around the project's CleanData transformer.
# The step name is the lowercased class name, i.e. what make_pipeline would generate.
clean_pipeline = Pipeline(steps=[('cleandata', CleanData())])

In [11]:
# Fit the cleaning pipeline on the raw trades and transform them in one call,
# then preview the first five cleaned rows.
# NOTE(review): the saved output of this cell starts with a stray "test" line,
# presumably a leftover debug print in the cleaning code -- confirm and remove it there.
clean_trades = clean_pipeline.fit_transform(trades)
clean_trades.head()

test


Unnamed: 0,Unnamed: 0.1,Exchange,Symbol,Trade_Volume,Trade_Price,Sale_Condition,Source_of_Trade,Trade_Stop_Stock_Indicator,Trade_Correction_Indicator,Sequence_Number,Trade_Id,Trade_Reporting_Facility,Participant_Timestamp,Trade_Reporting_Facility_TRF_Timestamp,Trade_Through_Exempt_Indicator,Date.1
2020-01-02 09:15:03.454000,5140,D,AAPL,4,296.25,@ TI,N,,0,16067,706,N,2020-01-02 09:15:03.454000,91503460000000.0,0,2020-01-02
2020-01-02 09:15:06.873000,5141,D,AAPL,1,296.25,@ TI,N,,0,16087,707,N,2020-01-02 09:15:06.873000,91506880000000.0,0,2020-01-02
2020-01-02 09:15:07.893000,5142,D,AAPL,100,296.15,@ T,N,,0,16088,708,N,2020-01-02 09:15:07.893000,91507900000000.0,0,2020-01-02
2020-01-02 09:15:08.263000,5143,D,AAPL,7,296.25,@ TI,N,,0,16090,709,N,2020-01-02 09:15:08.263000,91508270000000.0,0,2020-01-02
2020-01-02 09:15:08.271318,5144,P,AAPL,1,296.15,@ TI,N,,0,16091,1544,,2020-01-02 09:15:08.271318,,0,2020-01-02


In [12]:
# Clean the raw quotes with the same pipeline object and preview the first five rows.
# NOTE(review): fit_transform re-fits the pipeline that was previously fitted on the
# trades; this is only safe if CleanData keeps no fitted state -- confirm.
clean_quotes = clean_pipeline.fit_transform(quotes)
clean_quotes.head()

test


Unnamed: 0,Unnamed: 0.1,Exchange,Symbol,Bid_Price,Bid_Size,Offer_Price,Offer_Size,Quote_Condition,Sequence_Number,National_BBO_Indicator,FINRA_BBO_Indicator,FINRA_ADF_MPID_Indicator,Quote_Cancel_Correction,Source_Of_Quote,Retail_Interest_Indicator,Short_Sale_Restriction_Indicator,LULD_BBO_Indicator,SIP_Generated_Message_Identifier,NBBO_LULD_Indicator,Participant_Timestamp,FINRA_ADF_Timestamp,FINRA_ADF_Market_Participant_Quote_Indicator,Security_Status_Indicator
2023-04-13 09:15:19.129700,675,Z,WISH,7.6,1.0,8.43,1.0,R,2375908,0,,,,N,,2,,,,2023-04-13 09:15:19.129700,,,
2023-04-13 09:15:23.799144,676,Z,WISH,7.6,1.0,8.43,2.0,R,2377449,0,,,,N,,2,,,,2023-04-13 09:15:23.799144,,,
2023-04-13 09:16:32.721118,677,P,WISH,7.96,62.0,8.03,1.0,R,2405009,4,,,,N,,2,,,,2023-04-13 09:16:32.721118,,,
2023-04-13 09:17:05.001843,678,B,WISH,5.74,1.0,0.0,0.0,R,2418043,0,,,,N,,2,,,,2023-04-13 09:17:05.001843,,,
2023-04-13 09:17:05.194170,679,B,WISH,5.74,1.0,10.27,1.0,R,2418203,0,,,,N,,2,,,,2023-04-13 09:17:05.194170,,,


### 1.4 Reconstructing Events

In [13]:
# Columns carried from each cleaned frame into the combined event stream.
trade_features = [
    'Symbol',
    'Trade_Volume',
    'Trade_Price',
    'Trade_Id',
    'Trade_Reporting_Facility',
    'Participant_Timestamp',
    'Is_Quote',
]
quote_features = [
    'Symbol',
    'Bid_Price',
    'Bid_Size',
    'Offer_Price',
    'Offer_Size',
    'Participant_Timestamp',
    'Is_Quote',
]

# Tag every row with its origin so trades and quotes stay distinguishable after merging.
clean_trades['Is_Quote'] = False
clean_quotes['Is_Quote'] = True

In [14]:
# Stack the selected trade and quote columns into one event table.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed
# in pandas 2.0; the resulting rows, column order and fresh 0..n-1 index are the same.
all_events = pd.concat(
    [clean_trades[trade_features], clean_quotes[quote_features]],
    ignore_index=True,
    sort=False,
)
# Name the new positional index so it can serve as a record id (RID) and as a sort key.
all_events.index.name = "RID"
# Order events chronologically; ties on the timestamp are broken by RID, which keeps
# the original stacking order (trades were stacked before quotes).
all_events = all_events.sort_values(by=['Participant_Timestamp', all_events.index.name])
all_events.head(10)

Unnamed: 0_level_0,Symbol,Trade_Volume,Trade_Price,Trade_Id,Trade_Reporting_Facility,Participant_Timestamp,Is_Quote,Bid_Price,Bid_Size,Offer_Price,Offer_Size
RID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,AAPL,4.0,296.25,706.0,N,2020-01-02 09:15:03.454000,False,,,,
1,AAPL,1.0,296.25,707.0,N,2020-01-02 09:15:06.873000,False,,,,
2,AAPL,100.0,296.15,708.0,N,2020-01-02 09:15:07.893000,False,,,,
3,AAPL,7.0,296.25,709.0,N,2020-01-02 09:15:08.263000,False,,,,
4,AAPL,1.0,296.15,1544.0,,2020-01-02 09:15:08.271318,False,,,,
5,AAPL,1.0,296.15,3.0,,2020-01-02 09:15:08.274097,False,,,,
6,AAPL,50.0,296.15,1439.0,,2020-01-02 09:15:09.340550,False,,,,
7,AAPL,70.0,296.12,1440.0,,2020-01-02 09:15:09.340550,False,,,,
8,AAPL,100.0,296.12,1441.0,,2020-01-02 09:15:09.340550,False,,,,
9,AAPL,71.0,296.11,1442.0,,2020-01-02 09:15:09.340550,False,,,,


### 1.5 Preprocess Data

All preprocessing steps are implemented according to the paper: The Participant Timestamp: Get The Most Out Of TAQ Data https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3984827

Trade direction is assigned using the tick test, as described in the paper: Inferring Trade Direction from Intraday Data by Charles M. C. Lee, Mark J. Ready https://www.jstor.org/stable/2328845

#### Labeling valid quotes with the AND (`&`) operator:

|Is_Quote|valid_quotes|Desired|
|---|---|---|
|False|False|False|
|False|True|False|
|True|False|False|
|True|True|True|

In [15]:
# Single-stage pipeline around the project's PreprocessData transformer.
# The step name is the lowercased class name, i.e. what make_pipeline would generate.
preprocess_pipeline = Pipeline(steps=[('preprocessdata', PreprocessData())])

In [16]:
# Run the combined event stream through the preprocessing pipeline and preview 20 rows.
# Compared with all_events, the rendered output shows four extra columns:
# MOX, Valid_Quotes, Trade_Sign and Participant_Timestamp_f.
df_prepared = preprocess_pipeline.fit_transform(all_events)
df_prepared.head(20)

Unnamed: 0_level_0,Symbol,Trade_Volume,Trade_Price,Trade_Id,Trade_Reporting_Facility,Participant_Timestamp,Is_Quote,Bid_Price,Bid_Size,Offer_Price,Offer_Size,MOX,Valid_Quotes,Trade_Sign,Participant_Timestamp_f
RID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,AAPL,4.0,296.25,706.0,N,2020-01-02 09:15:03.454000,False,,,,,0,False,1.0,1577957000.0
1,AAPL,1.0,296.25,707.0,N,2020-01-02 09:15:06.873000,False,,,,,1,False,1.0,1577957000.0
2,AAPL,100.0,296.15,708.0,N,2020-01-02 09:15:07.893000,False,,,,,2,False,-1.0,1577957000.0
3,AAPL,7.0,296.25,709.0,N,2020-01-02 09:15:08.263000,False,,,,,3,False,1.0,1577957000.0
4,AAPL,1.0,296.15,1544.0,,2020-01-02 09:15:08.271318,False,,,,,4,False,-1.0,1577957000.0
5,AAPL,1.0,296.15,3.0,,2020-01-02 09:15:08.274097,False,,,,,5,False,-1.0,1577957000.0
6,AAPL,50.0,296.15,1439.0,,2020-01-02 09:15:09.340550,False,,,,,6,False,-1.0,1577957000.0
7,AAPL,70.0,296.12,1440.0,,2020-01-02 09:15:09.340550,False,,,,,6,False,-1.0,1577957000.0
8,AAPL,100.0,296.12,1441.0,,2020-01-02 09:15:09.340550,False,,,,,6,False,-1.0,1577957000.0
9,AAPL,71.0,296.11,1442.0,,2020-01-02 09:15:09.340550,False,,,,,6,False,-1.0,1577957000.0


In [17]:
# Spot-check: list every event whose MOX value equals 11.
df_prepared.loc[df_prepared['MOX'].eq(11)]

Unnamed: 0_level_0,Symbol,Trade_Volume,Trade_Price,Trade_Id,Trade_Reporting_Facility,Participant_Timestamp,Is_Quote,Bid_Price,Bid_Size,Offer_Price,Offer_Size,MOX,Valid_Quotes,Trade_Sign,Participant_Timestamp_f
RID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
15,AAPL,1.0,296.11,1447.0,,2020-01-02 09:15:14.394604,False,,,,,11,False,-1.0,1577957000.0


In [18]:
# `set_config` (a function, not a module) is already imported from sklearn in the
# import cell near the top of the notebook, so it is not re-imported here.

# Ask scikit-learn to render estimators as diagrams in notebook output.
set_config(display='diagram')

# Leaving the pipeline as the last expression of the cell displays it.
preprocess_pipeline

## 2. Feature Generating Pipeline

## 2.1 Generating Features

#### 2.1.1

#### 2.1.2 Return and Imbalance

#### 2.1.3

## 2.2 Pipeline

### 2.2.2 Feature Generation Pipeline

In [19]:
# Build the working sample for feature generation from the first 40,000 events.
# - Slice first, so only the sample is materialised rather than a copy of the whole frame.
# - Rows with no Trade_Volume get 0, via the vectorised fillna instead of a per-row lambda.
# - assign/reset_index return new frames, so df_prepared is never modified and the
#   cell is safe to re-run.
df_test = (
    df_prepared.iloc[:40000]
    .assign(Trade_Volume=lambda events: events['Trade_Volume'].fillna(0))
    .reset_index(drop=True)
)
df_test.head()

Unnamed: 0,Symbol,Trade_Volume,Trade_Price,Trade_Id,Trade_Reporting_Facility,Participant_Timestamp,Is_Quote,Bid_Price,Bid_Size,Offer_Price,Offer_Size,MOX,Valid_Quotes,Trade_Sign,Participant_Timestamp_f
0,AAPL,4.0,296.25,706.0,N,2020-01-02 09:15:03.454000,False,,,,,0,False,1.0,1577957000.0
1,AAPL,1.0,296.25,707.0,N,2020-01-02 09:15:06.873000,False,,,,,1,False,1.0,1577957000.0
2,AAPL,100.0,296.15,708.0,N,2020-01-02 09:15:07.893000,False,,,,,2,False,-1.0,1577957000.0
3,AAPL,7.0,296.25,709.0,N,2020-01-02 09:15:08.263000,False,,,,,3,False,1.0,1577957000.0
4,AAPL,1.0,296.15,1544.0,,2020-01-02 09:15:08.271318,False,,,,,4,False,-1.0,1577957000.0


In [20]:
def _doubling_windows(first_edge, n_windows=9):
    """Return ``n_windows`` contiguous ``(start, end)`` windows.

    The first window is ``(0, first_edge)``; each later window starts where the
    previous one ended and ends at twice that value.
    """
    edges = [0] + [first_edge * 2 ** step for step in range(n_windows)]
    return list(zip(edges[:-1], edges[1:]))


# Return span per clock; only the calendar clock has a span configured.
RETURN_CALENDAR_SPAN = 5
RETURN_TRANSACTION_SPAN = None
RETURN_VOLUME_SPAN = None

# Nine doubling windows per clock: (0, x), (x, 2x), (2x, 4x), ..., (128x, 256x).
CALENDAR_DELTAS = _doubling_windows(.1)
TRANSACTION_DELTAS = _doubling_windows(1)
VOLUME_DELTAS = _doubling_windows(100)

In [21]:
# df_test = parent_generator_ret_imb(df_test, CALENDAR_DELTAS[:1])

In [22]:
# Settings handed to FeatureGeneration: calendar clock with its return span and windows.
params = dict(
    return_span=RETURN_CALENDAR_SPAN,
    clock_mode='calendar',
    deltas=CALENDAR_DELTAS,
)

In [None]:
# Single-stage pipeline around the project's FeatureGeneration transformer, configured
# through `params`. The step name is what make_pipeline would generate.
feature_pipeline = Pipeline(
    steps=[('featuregeneration', FeatureGeneration(params=params))]
)
# Fit and transform the sampled events in one call.
df = feature_pipeline.fit_transform(df_test)