# Example Pipelines Notebook
(Not actively maintained — may contain bugs)

In [1]:
import warnings
import pandas as pd
from pathlib import Path
from sklearn.pipeline import make_pipeline

from generators import *
from clean_data import *
from preprocess_data import *
from feature_generation import *

# Ignore all warnings raised from this point on (no category or module filter is given).
warnings.filterwarnings('ignore')

In [2]:
# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

# 1. Data
### 1.1 Load Sample Data

In [3]:
# Relative locations of the gzip-compressed sample trade and quote files.
TRADES = "../sample_data/trades.csv.gz"
QUOTES = "../sample_data/quotes.csv.gz"

In [4]:
# Read the gzip-compressed sample CSV files into DataFrames.
trades = pd.read_csv(Path(TRADES), compression='gzip')
quotes = pd.read_csv(Path(QUOTES), compression='gzip')

In [5]:
# Summarize the raw trades frame: column names, non-null counts and dtypes.
trades.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 18 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Unnamed: 0                              1000 non-null   int64  
 1   Time                                    1000 non-null   object 
 2   Date                                    1000 non-null   object 
 3   Exchange                                1000 non-null   object 
 4   Symbol                                  1000 non-null   object 
 5   Trade_Volume                            1000 non-null   int64  
 6   Trade_Price                             1000 non-null   float64
 7   Sale_Condition                          1000 non-null   object 
 8   Source_of_Trade                         1000 non-null   object 
 9   Trade_Stop_Stock_Indicator              0 non-null      float64
 10  Trade_Correction_Indicator              1000 non-null   int64

Note: All column information of trades and quotes data and valid entries for each column can be found at https://www.nyse.com/publicdocs/nyse/data/Daily_TAQ_Client_Spec_v3.0.pdf

In [6]:
# Preview the first rows of the raw trades.
trades.head()

Unnamed: 0.1,Unnamed: 0,Time,Date,Exchange,Symbol,Trade_Volume,Trade_Price,Sale_Condition,Source_of_Trade,Trade_Stop_Stock_Indicator,Trade_Correction_Indicator,Sequence_Number,Trade_Id,Trade_Reporting_Facility,Participant_Timestamp,Trade_Reporting_Facility_TRF_Timestamp,Trade_Through_Exempt_Indicator,YearMonth
0,0,2020-01-06 04:00:00.064682,2020-01-06,P,AAPL,488,295.43,@ T,N,,0,1192,1,,40000064303360,,1,202001
1,1,2020-01-06 04:00:00.104067,2020-01-06,P,AAPL,100,295.26,@FT,N,,0,1195,2,,40000103686400,,1,202001
2,2,2020-01-06 04:00:00.104069,2020-01-06,P,AAPL,100,295.25,@FT,N,,0,1196,3,,40000103686400,,1,202001
3,3,2020-01-06 04:00:00.197526,2020-01-06,P,AAPL,25,295.12,@FTI,N,,0,1198,4,,40000197144064,,1,202001
4,4,2020-01-06 04:00:00.197530,2020-01-06,P,AAPL,1,295.08,@FTI,N,,0,1199,5,,40000197144064,,1,202001


In [7]:
# Preview the first rows of the raw quotes.
quotes.head()

Unnamed: 0.1,Unnamed: 0,Time,Exchange,Symbol,Bid_Price,Bid_Size,Offer_Price,Offer_Size,Quote_Condition,Sequence_Number,National_BBO_Indicator,FINRA_BBO_Indicator,FINRA_ADF_MPID_Indicator,Quote_Cancel_Correction,Source_Of_Quote,Retail_Interest_Indicator,Short_Sale_Restriction_Indicator,LULD_BBO_Indicator,SIP_Generated_Message_Identifier,NBBO_LULD_Indicator,Participant_Timestamp,FINRA_ADF_Timestamp,FINRA_ADF_Market_Participant_Quote_Indicator,Security_Status_Indicator,Date,YearMonth
0,0,2020-01-06 04:00:00.065815,P,AAPL,278.0,7.0,0.0,0.0,R,2251,2,,,,N,,0,,,,40000065434368,,,,2020-01-06,202001
1,1,2020-01-06 04:00:00.065819,P,AAPL,278.0,14.0,0.0,0.0,R,2252,2,,,,N,,0,,,,40000065436672,,,,2020-01-06,202001
2,2,2020-01-06 04:00:00.065822,P,AAPL,278.0,14.0,298.28,1.0,R,2253,4,,,,N,,0,,,,40000065441792,,,,2020-01-06,202001
3,3,2020-01-06 04:00:00.065919,P,AAPL,291.2,1.0,298.28,1.0,R,2254,4,,,,N,,0,,,,40000065541888,,,,2020-01-06,202001
4,4,2020-01-06 04:00:00.065966,P,AAPL,295.26,1.0,298.28,1.0,R,2255,4,,,,N,,0,,,,40000065584896,,,,2020-01-06,202001


### 1.2 Data Visualization & Preliminary Analysis

### 1.3 Data Cleaning

Before we move towards feature generation and building machine learning models, we have to clean the dataset. The necessary steps to clean the trades and quotes data include:
1. Get rid of unnecessary columns.
2. Get rid of the invalid trades and quotes.
3. Event reconstruction.
4. Only keep the national best bid/offer (NBBO) or last MQU.
5. Identify the last active quote and assign it to the corresponding trade.

## SCIKIT-LEARN DESIGN

https://arxiv.org/pdf/1309.0238.pdf

Scikit-Learn’s API is remarkably well designed. These are the main design components of Scikit-Learn.

All objects share a consistent and simple interface:

### Estimators

Any object that can estimate some parameters based on a dataset is called an estimator (e.g., a SimpleImputer is an estimator). The estimation itself is performed by the fit() method, and it takes a dataset as a parameter, or two for supervised learning algorithms—the second dataset contains the labels. Any other parameter needed to guide the estimation process is considered a hyperparameter (such as a SimpleImputer’s strategy), and it must be set as an instance variable (generally via a constructor parameter).

### Transformers

Some estimators (such as a SimpleImputer) can also transform a dataset; these are called transformers. Once again, the API is simple: the transformation is performed by the transform() method with the dataset to transform as a parameter. It returns the transformed dataset. This transformation generally relies on the learned parameters, as is the case for a SimpleImputer. All transformers also have a convenience method called fit_transform(), which is equivalent to calling fit() and then transform() (but sometimes fit_transform() is optimized and runs much faster).


### Predictors

Finally, some estimators, given a dataset, are capable of making predictions; they are called predictors. For example, a LinearRegression model is a predictor: given a country’s GDP per capita, it can predict life satisfaction. A predictor has a predict() method that takes a dataset of new instances and returns a dataset of corresponding predictions. It also has a score() method that measures the quality of the predictions, given a test set (and the corresponding labels, in the case of supervised learning algorithms).

### ...

Reference to the base classes for all estimators in scikit-learn can be found at: https://github.com/scikit-learn/scikit-learn/blob/9aaed4987/sklearn/base.py#L153

In [13]:
# Single-step pipeline wrapping the project's CleanData transformer.
# NOTE(review): the keyword spellings below (`dropped_after_hourse`,
# `droped_irregular_hours`) are kept exactly as written; they presumably match
# CleanData's signature in clean_data.py — verify there before renaming.
clean_pipeline = make_pipeline(
    CleanData(
        dropped_after_hourse=False,
        droped_irregular_hours=False,
    ),
)

In [14]:
# Fit the cleaning pipeline on the raw trades and keep the transformed result.
clean_trades = clean_pipeline.fit_transform(trades)
# Bare last expression: display the cleaned trades.
clean_trades

test


Unnamed: 0,Exchange,Symbol,Trade_Volume,Trade_Price,Sale_Condition,Source_of_Trade,Trade_Stop_Stock_Indicator,Trade_Correction_Indicator,Sequence_Number,Trade_Id,Trade_Reporting_Facility,Participant_Timestamp,Trade_Reporting_Facility_TRF_Timestamp,Trade_Through_Exempt_Indicator
2020-01-06 04:00:00.064303,P,AAPL,488,295.43,@ T,N,,0,1192,1,,2020-01-06 04:00:00.064303,,1
2020-01-06 04:00:00.103686,P,AAPL,100,295.26,@FT,N,,0,1195,2,,2020-01-06 04:00:00.103686,,1
2020-01-06 04:00:00.103686,P,AAPL,100,295.25,@FT,N,,0,1196,3,,2020-01-06 04:00:00.103686,,1
2020-01-06 04:00:00.197144,P,AAPL,25,295.12,@FTI,N,,0,1198,4,,2020-01-06 04:00:00.197144,,1
2020-01-06 04:00:00.197144,P,AAPL,1,295.08,@FTI,N,,0,1199,5,,2020-01-06 04:00:00.197144,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-06 06:45:16.264262,Q,AAPL,188,294.52,@FT,N,,0,3394,612,,2020-01-06 06:45:16.264262,,1
2020-01-06 06:45:16.264262,Q,AAPL,113,294.51,@FT,N,,0,3395,613,,2020-01-06 06:45:16.264262,,1
2020-01-06 06:45:18.768598,Q,AAPL,387,294.51,@FT,N,,0,3396,614,,2020-01-06 06:45:18.768598,,1
2020-01-06 06:45:18.769039,P,AAPL,100,294.50,@FT,N,,0,3397,385,,2020-01-06 06:45:18.769039,,1


In [15]:
# Re-fit the same clean_pipeline object, this time on the raw quotes.
clean_quotes = clean_pipeline.fit_transform(quotes)
# Preview the first rows of the cleaned quotes.
clean_quotes.head()

test


Unnamed: 0,Exchange,Symbol,Bid_Price,Bid_Size,Offer_Price,Offer_Size,Quote_Condition,Sequence_Number,National_BBO_Indicator,FINRA_BBO_Indicator,FINRA_ADF_MPID_Indicator,Quote_Cancel_Correction,Source_Of_Quote,Retail_Interest_Indicator,Short_Sale_Restriction_Indicator,LULD_BBO_Indicator,SIP_Generated_Message_Identifier,NBBO_LULD_Indicator,Participant_Timestamp,FINRA_ADF_Timestamp,FINRA_ADF_Market_Participant_Quote_Indicator,Security_Status_Indicator
2020-01-06 04:00:00.065434,P,AAPL,278.0,7.0,0.0,0.0,R,2251,2,,,,N,,0,,,,2020-01-06 04:00:00.065434,,,
2020-01-06 04:00:00.065436,P,AAPL,278.0,14.0,0.0,0.0,R,2252,2,,,,N,,0,,,,2020-01-06 04:00:00.065436,,,
2020-01-06 04:00:00.065441,P,AAPL,278.0,14.0,298.28,1.0,R,2253,4,,,,N,,0,,,,2020-01-06 04:00:00.065441,,,
2020-01-06 04:00:00.065541,P,AAPL,291.2,1.0,298.28,1.0,R,2254,4,,,,N,,0,,,,2020-01-06 04:00:00.065541,,,
2020-01-06 04:00:00.065584,P,AAPL,295.26,1.0,298.28,1.0,R,2255,4,,,,N,,0,,,,2020-01-06 04:00:00.065584,,,


### 1.4 Reconstructing Events

In [16]:
# Tag each cleaned frame with its event type so trades and quotes remain
# distinguishable once they are combined into a single event stream.
clean_trades['Is_Quote'] = False
clean_quotes['Is_Quote'] = True

# Columns carried forward from the trades frame.
trade_features = [
    'Symbol',
    'Trade_Volume',
    'Trade_Price',
    'Trade_Id',
    'Trade_Reporting_Facility',
    'Participant_Timestamp',
    'Is_Quote',
]

# Columns carried forward from the quotes frame.
quote_features = [
    'Symbol',
    'Bid_Price',
    'Bid_Size',
    'Offer_Price',
    'Offer_Size',
    'Participant_Timestamp',
    'Is_Quote',
]

In [19]:
# Stack the selected trade and quote columns into one event stream, name the
# index "RID", and order events by participant timestamp with the index as
# the secondary sort key.
all_events = (
    pd.concat([clean_trades[trade_features], clean_quotes[quote_features]])
    .rename_axis("RID")
    .sort_values(by=['Participant_Timestamp', 'RID'])
)
all_events.head(10)

Unnamed: 0_level_0,Symbol,Trade_Volume,Trade_Price,Trade_Id,Trade_Reporting_Facility,Participant_Timestamp,Is_Quote,Bid_Price,Bid_Size,Offer_Price,Offer_Size
RID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-01-06 04:00:00.064303,AAPL,488.0,295.43,1.0,,2020-01-06 04:00:00.064303,False,,,,
2020-01-06 04:00:00.065434,AAPL,,,,,2020-01-06 04:00:00.065434,True,278.0,7.0,0.0,0.0
2020-01-06 04:00:00.065436,AAPL,,,,,2020-01-06 04:00:00.065436,True,278.0,14.0,0.0,0.0
2020-01-06 04:00:00.065441,AAPL,,,,,2020-01-06 04:00:00.065441,True,278.0,14.0,298.28,1.0
2020-01-06 04:00:00.065541,AAPL,,,,,2020-01-06 04:00:00.065541,True,291.2,1.0,298.28,1.0
2020-01-06 04:00:00.065584,AAPL,,,,,2020-01-06 04:00:00.065584,True,295.26,1.0,298.28,1.0
2020-01-06 04:00:00.065954,AAPL,,,,,2020-01-06 04:00:00.065954,True,295.26,1.0,298.0,1.0
2020-01-06 04:00:00.066140,AAPL,,,,,2020-01-06 04:00:00.066140,True,295.26,1.0,297.49,1.0
2020-01-06 04:00:00.067948,AAPL,,,,,2020-01-06 04:00:00.067948,True,295.26,1.0,297.13,1.0
2020-01-06 04:00:00.068447,AAPL,,,,,2020-01-06 04:00:00.068447,True,295.26,1.0,296.98,1.0


### 1.5 Preprocess Data

All preprocessing steps are implemented according to the paper *The Participant Timestamp: Get The Most Out Of TAQ Data*: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3984827

Trade direction is assigned using the tick test, as described in the paper *Inferring Trade Direction from Intraday Data* by Charles M. C. Lee and Mark J. Ready: https://www.jstor.org/stable/2328845

#### Labeling valid quotes using the AND (&) operator:

|Is_Quote|valid_quotes|Desired|
|---|---|---|
|False|False|False|
|False|True|False|
|True|False|False|
|True|True|True|

In [20]:
# Single-step pipeline wrapping the project's PreprocessData transformer.
preprocess_pipeline = make_pipeline(PreprocessData())

In [21]:
# Fit the preprocessing pipeline on the combined event stream and keep the result.
df_prepared = preprocess_pipeline.fit_transform(all_events)
# Preview the first 20 prepared events.
df_prepared.head(20)

Unnamed: 0_level_0,Symbol,Trade_Volume,Trade_Price,Trade_Id,Trade_Reporting_Facility,Participant_Timestamp,Is_Quote,Bid_Price,Bid_Size,Offer_Price,Offer_Size,MOX,Valid_Quotes,Trade_Sign,Participant_Timestamp_f
RID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2020-01-06 04:00:00.064303,AAPL,488.0,295.43,1.0,,2020-01-06 04:00:00.064303,False,,,,,0,False,1.0,1578283000.0
2020-01-06 04:00:00.065441,AAPL,,,,,2020-01-06 04:00:00.065441,True,278.0,14.0,298.28,1.0,1,True,,1578283000.0
2020-01-06 04:00:00.065541,AAPL,,,,,2020-01-06 04:00:00.065541,True,291.2,1.0,298.28,1.0,2,True,,1578283000.0
2020-01-06 04:00:00.065584,AAPL,,,,,2020-01-06 04:00:00.065584,True,295.26,1.0,298.28,1.0,3,True,,1578283000.0
2020-01-06 04:00:00.065954,AAPL,,,,,2020-01-06 04:00:00.065954,True,295.26,1.0,298.0,1.0,4,True,,1578283000.0
2020-01-06 04:00:00.066140,AAPL,,,,,2020-01-06 04:00:00.066140,True,295.26,1.0,297.49,1.0,5,True,,1578283000.0
2020-01-06 04:00:00.067948,AAPL,,,,,2020-01-06 04:00:00.067948,True,295.26,1.0,297.13,1.0,6,True,,1578283000.0
2020-01-06 04:00:00.068447,AAPL,,,,,2020-01-06 04:00:00.068447,True,295.26,1.0,296.98,1.0,7,True,,1578283000.0
2020-01-06 04:00:00.068493,AAPL,,,,,2020-01-06 04:00:00.068493,True,295.26,1.0,296.98,11.0,8,False,,1578283000.0
2020-01-06 04:00:00.068493,AAPL,,,,,2020-01-06 04:00:00.068493,True,295.26,1.0,296.0,6.0,8,True,,1578283000.0


In [22]:
# Spot check: show every prepared event whose MOX value is 11.
df_prepared.loc[df_prepared['MOX'].eq(11)]

Unnamed: 0_level_0,Symbol,Trade_Volume,Trade_Price,Trade_Id,Trade_Reporting_Facility,Participant_Timestamp,Is_Quote,Bid_Price,Bid_Size,Offer_Price,Offer_Size,MOX,Valid_Quotes,Trade_Sign,Participant_Timestamp_f
RID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2020-01-06 04:00:00.197144,AAPL,25.0,295.12,4.0,,2020-01-06 04:00:00.197144,False,,,,,11,False,-1.0,1578283000.0
2020-01-06 04:00:00.197144,AAPL,1.0,295.08,5.0,,2020-01-06 04:00:00.197144,False,,,,,11,False,-1.0,1578283000.0
2020-01-06 04:00:00.197144,AAPL,20.0,295.0,6.0,,2020-01-06 04:00:00.197144,False,,,,,11,False,-1.0,1578283000.0
2020-01-06 04:00:00.197144,AAPL,,,,,2020-01-06 04:00:00.197144,True,295.0,1.0,295.55,1.0,11,True,,1578283000.0


In [23]:
# NOTE(review): this import sits mid-notebook; the other imports live in the first cell.
from sklearn import set_config

# Set scikit-learn's `display` option to 'diagram'.
set_config(display='diagram')

# Bare last expression: display the preprocessing pipeline object.
preprocess_pipeline

## 2. Feature Generating Pipeline

## 2.1 Generating Features

#### 2.1.1

#### 2.1.2 Return and Imbalance

#### 2.1.3

## 2.2 Pipeline

### 2.2.2 Feature Generation Pipeline

In [24]:
# Build the feature-generation input from the first 40,000 prepared events
# (positional slice). The chain returns a new frame, so df_prepared is left
# untouched and re-running this cell always gives the same result.
df_test = (
    df_prepared.iloc[:40000]
    # Quote rows have no trade volume (NaN); treat that as zero volume.
    # fillna is vectorized and avoids relying on `np`, which this notebook
    # never imports explicitly.
    .assign(Trade_Volume=lambda events: events['Trade_Volume'].fillna(0))
    # Replace the RID index with a fresh 0..n-1 integer index.
    .reset_index(drop=True)
)
df_test.head()

Unnamed: 0,Symbol,Trade_Volume,Trade_Price,Trade_Id,Trade_Reporting_Facility,Participant_Timestamp,Is_Quote,Bid_Price,Bid_Size,Offer_Price,Offer_Size,MOX,Valid_Quotes,Trade_Sign,Participant_Timestamp_f
0,AAPL,488.0,295.43,1.0,,2020-01-06 04:00:00.064303,False,,,,,0,False,1.0,1578283000.0
1,AAPL,0.0,,,,2020-01-06 04:00:00.065441,True,278.0,14.0,298.28,1.0,1,True,,1578283000.0
2,AAPL,0.0,,,,2020-01-06 04:00:00.065541,True,291.2,1.0,298.28,1.0,2,True,,1578283000.0
3,AAPL,0.0,,,,2020-01-06 04:00:00.065584,True,295.26,1.0,298.28,1.0,3,True,,1578283000.0
4,AAPL,0.0,,,,2020-01-06 04:00:00.065954,True,295.26,1.0,298.0,1.0,4,True,,1578283000.0


In [25]:
# Return spans, one per clock mode; only the calendar span is set here.
RETURN_CALENDAR_SPAN = 5
RETURN_TRANSACTION_SPAN = None
RETURN_VOLUME_SPAN = None

# Look-back windows as (start, end) pairs, one list per clock mode.
# After the first window, each boundary is double the previous one.
CALENDAR_DELTAS = [
    (0, .1), (.1, .2), (.2, .4),
    (.4, .8), (.8, 1.6), (1.6, 3.2),
    (3.2, 6.4), (6.4, 12.8), (12.8, 25.6),
]
TRANSACTION_DELTAS = [
    (0, 1), (1, 2), (2, 4),
    (4, 8), (8, 16), (16, 32),
    (32, 64), (64, 128), (128, 256),
]
VOLUME_DELTAS = [
    (0, 100), (100, 200), (200, 400),
    (400, 800), (800, 1600), (1600, 3200),
    (3200, 6400), (6400, 12800), (12800, 25600),
]

In [26]:
# df_test = parent_generator_ret_imb(df_test, CALENDAR_DELTAS[:1])

In [27]:
# Feature-generation settings for the calendar clock: return span plus the
# calendar look-back windows.
params = dict(
    return_span=RETURN_CALENDAR_SPAN,
    clock_mode='calendar',
    deltas=CALENDAR_DELTAS,
)

In [None]:
# Single-step pipeline wrapping the project's FeatureGeneration transformer.
feature_pipeline = make_pipeline(FeatureGeneration())

# NOTE(review): `params` is passed positionally as the second argument of
# fit_transform, which scikit-learn pipelines normally treat as `y` — confirm
# that FeatureGeneration expects its settings dict in that slot.
df = feature_pipeline.fit_transform(df_test, params)