In [1]:
import sklearn
import pandas as pd
import numpy as np
from datetime import datetime,timedelta
import collections
from itertools import chain
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn import set_config
from sklearn.compose import make_column_selector, make_column_transformer

In [3]:
import warnings
warnings.filterwarnings('ignore')

# 0. Machine Learning Basics

"Machine Learning is the study of computer algorithms that improve automatically through experience." — Tom Mitchell, *Machine Learning*, McGraw Hill, 1997

#### Different types of machine learning:
-  Supervised Learning
    - e.g. Regression, Classification (including Decision Trees)
- Unsupervised Learning
    - e.g. Clustering, Dimensionality Reduction
- Semi-Supervised Learning
- Reinforcement Learning

Interview Question: What's the difference between supervised and unsupervised learning?

#### A Typical Machine Learning Pipeline:
<img src="images/machine_learning_pipeline.png" />

Interview Question: What is a training/validation/test set?

# 1. Data

### 1.1 Load/Import Data

In [4]:
# Load the raw AAPL trade and quote extracts. The paths are relative to the
# directory the notebook is run from.
trades = pd.read_csv(Path("../data/raw_data/AAPL_trades.csv"))
quotes = pd.read_csv(Path("../data/raw_data/AAPL_quotes.csv"))

In [5]:
trades.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 283504 entries, 0 to 283503
Data columns (total 17 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   Unnamed: 0                              283504 non-null  int64  
 1   Time                                    283504 non-null  object 
 2   Date                                    283504 non-null  object 
 3   Exchange                                283504 non-null  object 
 4   Symbol                                  283504 non-null  object 
 5   Trade_Volume                            283504 non-null  int64  
 6   Trade_Price                             283504 non-null  float64
 7   Sale_Condition                          283504 non-null  object 
 8   Source_of_Trade                         283504 non-null  object 
 9   Trade_Stop_Stock_Indicator              0 non-null       float64
 10  Trade_Correction_Indicator              2835

|Trades Data|Description|
|---|---|
|Unnamed: 0 | dummy index |
|Time| Time the trade was published by SIP|
|Date| Date the trade was published |
|Exchange| The ID of the exchange where the trade took place|
|Symbol| Stock Symbol|
|Trade_Volume | The number of shares traded |
|Trade_Price | The share price of this trade |
|Sale_Condition | The special condition associated with the trade|
|Source_of_Trade | CTA/UTP |
|Trade_Stop_Stock_Indicator | CTA |
|Trade_Correction_Indicator |  |
|Sequence_Number | Message sequence number |
|Trade_Id | Identifier for tracking trades. Unique per participant, per symbol, within a trading session |
|Trade_Reporting_Facility | The ID of the Trade Reporting Facility |
|Participant_Timestamp | Time when the trade was reported|
|Trade_Reporting_Facility_TRF_Timestamp | If from an Exchange or if the FINRA ADF does not have a proprietary quotation feed, then will be set to blank. If the FINRA ADF or a FINRA TRF provides a proprietary feed of its quotation information, then it’s set to be the time of the quotation|
|Trade_Through_Exempt_Indicator | Denotes whether or not a trade is exempt from Trade Through rules |

|Quotes Data|Description|
|---|---|
|Unnamed: 0 | dummy index |
|Time| Time the quote was published by SIP|
|Exchange|The exchange that issued the quote |
|Symbol| Stock Symbol|
|Bid_Price | The highest price any buyer is willing to pay for shares of this security |
|Bid_Size | The maximum number of shares the highest bidder is willing to buy |
|Offer_Price |The lowest price any seller is willing to take for shares of this security |
|Offer_Size | The maximum number of shares available at the offer price|
|Quote_Condition | Determines whether a quote qualifies for the Best Bid and Best Offer calculation |
|Sequence_Number | Message sequence number |
|National_BBO_Indicator | The effect this quote has on the NBBO |
|FINRA_BBO_Indicator | Indicates the effect this quote has on the FINRA BBO |
|FINRA_ADF_MPID_Indicator | Denotes  the type of appendage to be included |
|Quote_Cancel_Correction | Indicates that this record is a cancel or a correction of a previous quote|
|Source_Of_Quote | CTA or UTP |
|Retail_Interest_Indicator | Indicates the presence of Retail Price Improvement (RPI) interest between the Bid and the Offer |
|Short_Sale_Restriction_Indicator | Short Sale Restriction status |
|LULD_BBO_Indicator |  |
|SIP_Generated_Message_Identifier | Originator of the message |
|NBBO_LULD_Indicator | LULD Limit Price Band effect on the NBB and NBO |
|Participant_Timestamp | Time the quote was published by the Participant to the SIP |
|FINRA_ADF_Timestamp | A FINRA ADF- or a FINRA TRF-provided timestamp |
|FINRA_ADF_Market_Participant_Quote_Indicator | UTP - FINRA ADF Market Participant Quote Indicator representing the Top of book quotations for each FINRA ADF participant |
|Security_Status_Indicator |  |
|Date | Date the quote was published |
|YearMonth| Year and month of the quote, formatted as YYYYMM (e.g. 202001) |

Note: All column information of trades and quotes data and valid entries for each column can be found at https://www.nyse.com/publicdocs/nyse/data/Daily_TAQ_Client_Spec_v3.0.pdf

In [6]:
pd.set_option('display.max_columns', None)

In [7]:
trades.head()

Unnamed: 0.1,Unnamed: 0,Time,Date,Exchange,Symbol,Trade_Volume,Trade_Price,Sale_Condition,Source_of_Trade,Trade_Stop_Stock_Indicator,Trade_Correction_Indicator,Sequence_Number,Trade_Id,Trade_Reporting_Facility,Participant_Timestamp,Trade_Reporting_Facility_TRF_Timestamp,Trade_Through_Exempt_Indicator
0,0,2020-01-02 04:00:00.064010,2020-01-02,P,AAPL,3801,295.05,@ T,N,,0,1185,1,,40000063617792,,1
1,1,2020-01-02 04:00:02.828485,2020-01-02,P,AAPL,1,295.08,@FTI,N,,0,1195,2,,40002828108800,,1
2,2,2020-01-02 04:00:06.250392,2020-01-02,Q,AAPL,6,295.25,@ TI,N,,0,1197,1,,40006250366823,,0
3,3,2020-01-02 04:00:06.429757,2020-01-02,P,AAPL,1,295.08,@ TI,N,,0,1198,3,,40006429377792,,0
4,4,2020-01-02 04:00:28.894835,2020-01-02,P,AAPL,3,295.1,@ TI,N,,0,1205,4,,40028894459136,,0


In [8]:
quotes.head(10)

Unnamed: 0.1,Unnamed: 0,Time,Exchange,Symbol,Bid_Price,Bid_Size,Offer_Price,Offer_Size,Quote_Condition,Sequence_Number,National_BBO_Indicator,FINRA_BBO_Indicator,FINRA_ADF_MPID_Indicator,Quote_Cancel_Correction,Source_Of_Quote,Retail_Interest_Indicator,Short_Sale_Restriction_Indicator,LULD_BBO_Indicator,SIP_Generated_Message_Identifier,NBBO_LULD_Indicator,Participant_Timestamp,FINRA_ADF_Timestamp,FINRA_ADF_Market_Participant_Quote_Indicator,Security_Status_Indicator,Date,YearMonth
0,0,2020-01-02 04:00:00.065165,P,AAPL,278.0,7.0,0.0,0.0,R,2228,2,,,,N,,0,,,,40000064785664,,,,2020-01-02,202001
1,1,2020-01-02 04:00:00.065167,P,AAPL,278.0,14.0,0.0,0.0,R,2229,2,,,,N,,0,,,,40000064787456,,,,2020-01-02,202001
2,2,2020-01-02 04:00:00.065170,P,AAPL,293.72,9.0,0.0,0.0,R,2230,2,,,,N,,0,,,,40000064790784,,,,2020-01-02,202001
3,3,2020-01-02 04:00:00.065681,P,AAPL,293.72,9.0,327.56,1.0,R,2231,4,,,,N,,0,,,,40000065302272,,,,2020-01-02,202001
4,4,2020-01-02 04:00:00.065738,P,AAPL,293.72,9.0,320.0,1.0,R,2232,4,,,,N,,0,,,,40000065358592,,,,2020-01-02,202001
5,5,2020-01-02 04:00:00.065738,P,AAPL,293.72,9.0,310.0,1.0,R,2233,4,,,,N,,0,,,,40000065360384,,,,2020-01-02,202001
6,6,2020-01-02 04:00:00.065744,P,AAPL,293.72,9.0,300.0,1.0,R,2235,4,,,,N,,0,,,,40000065366528,,,,2020-01-02,202001
7,7,2020-01-02 04:00:00.065813,P,AAPL,293.72,9.0,299.97,5.0,R,2237,4,,,,N,,0,,,,40000065433856,,,,2020-01-02,202001
8,8,2020-01-02 04:00:00.065816,P,AAPL,293.72,9.0,295.88,5.0,R,2238,4,,,,N,,0,,,,40000065437440,,,,2020-01-02,202001
9,9,2020-01-02 04:00:00.068515,P,AAPL,295.0,1.0,295.88,5.0,R,2241,4,,,,N,,0,,,,40000068136192,,,,2020-01-02,202001


In [9]:
# Stack trade rows on top of quote rows. The result carries the union of both
# column sets, so trade-only fields are NaN on quote rows and vice versa.
# Each frame keeps its own row labels, so the combined index contains duplicates.
all_events = pd.concat([trades, quotes], axis=0)

In [10]:
all_events.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2208691 entries, 0 to 1925186
Data columns (total 36 columns):
 #   Column                                        Dtype  
---  ------                                        -----  
 0   Unnamed: 0                                    int64  
 1   Time                                          object 
 2   Date                                          object 
 3   Exchange                                      object 
 4   Symbol                                        object 
 5   Trade_Volume                                  float64
 6   Trade_Price                                   float64
 7   Sale_Condition                                object 
 8   Source_of_Trade                               object 
 9   Trade_Stop_Stock_Indicator                    float64
 10  Trade_Correction_Indicator                    float64
 11  Sequence_Number                               int64  
 12  Trade_Id                                      float64
 1

In [11]:
all_events.head(10)

Unnamed: 0.1,Unnamed: 0,Time,Date,Exchange,Symbol,Trade_Volume,Trade_Price,Sale_Condition,Source_of_Trade,Trade_Stop_Stock_Indicator,Trade_Correction_Indicator,Sequence_Number,Trade_Id,Trade_Reporting_Facility,Participant_Timestamp,Trade_Reporting_Facility_TRF_Timestamp,Trade_Through_Exempt_Indicator,Bid_Price,Bid_Size,Offer_Price,Offer_Size,Quote_Condition,National_BBO_Indicator,FINRA_BBO_Indicator,FINRA_ADF_MPID_Indicator,Quote_Cancel_Correction,Source_Of_Quote,Retail_Interest_Indicator,Short_Sale_Restriction_Indicator,LULD_BBO_Indicator,SIP_Generated_Message_Identifier,NBBO_LULD_Indicator,FINRA_ADF_Timestamp,FINRA_ADF_Market_Participant_Quote_Indicator,Security_Status_Indicator,YearMonth
0,0,2020-01-02 04:00:00.064010,2020-01-02,P,AAPL,3801.0,295.05,@ T,N,,0.0,1185,1.0,,40000063617792,,1.0,,,,,,,,,,,,,,,,,,,
1,1,2020-01-02 04:00:02.828485,2020-01-02,P,AAPL,1.0,295.08,@FTI,N,,0.0,1195,2.0,,40002828108800,,1.0,,,,,,,,,,,,,,,,,,,
2,2,2020-01-02 04:00:06.250392,2020-01-02,Q,AAPL,6.0,295.25,@ TI,N,,0.0,1197,1.0,,40006250366823,,0.0,,,,,,,,,,,,,,,,,,,
3,3,2020-01-02 04:00:06.429757,2020-01-02,P,AAPL,1.0,295.08,@ TI,N,,0.0,1198,3.0,,40006429377792,,0.0,,,,,,,,,,,,,,,,,,,
4,4,2020-01-02 04:00:28.894835,2020-01-02,P,AAPL,3.0,295.1,@ TI,N,,0.0,1205,4.0,,40028894459136,,0.0,,,,,,,,,,,,,,,,,,,
5,5,2020-01-02 04:00:30.021361,2020-01-02,P,AAPL,2.0,295.1,@ TI,N,,0.0,1206,5.0,,40030020981248,,0.0,,,,,,,,,,,,,,,,,,,
6,6,2020-01-02 04:00:31.900055,2020-01-02,P,AAPL,7.0,295.1,@ TI,N,,0.0,1208,6.0,,40031899679744,,0.0,,,,,,,,,,,,,,,,,,,
7,7,2020-01-02 04:00:33.047715,2020-01-02,P,AAPL,5.0,295.1,@ TI,N,,0.0,1209,7.0,,40033047341056,,0.0,,,,,,,,,,,,,,,,,,,
8,8,2020-01-02 04:00:33.118294,2020-01-02,P,AAPL,5.0,295.1,@ TI,N,,0.0,1210,8.0,,40033117919744,,0.0,,,,,,,,,,,,,,,,,,,
9,9,2020-01-02 04:00:33.118809,2020-01-02,P,AAPL,10.0,295.1,@ TI,N,,0.0,1211,9.0,,40033118435584,,0.0,,,,,,,,,,,,,,,,,,,


### 1.2 Data Cleaning and Preprocessing

## SCIKIT-LEARN DESIGN

https://arxiv.org/pdf/1309.0238.pdf

Scikit-Learn’s API is remarkably well designed. These are the main design components of Scikit-Learn.

All objects share a consistent and simple interface:

### Estimators

Any object that can estimate some parameters based on a dataset is called an estimator (e.g., a SimpleImputer is an estimator). The estimation itself is performed by the fit() method, and it takes a dataset as a parameter, or two for supervised learning algorithms—the second dataset contains the labels. Any other parameter needed to guide the estimation process is considered a hyperparameter (such as a SimpleImputer’s strategy), and it must be set as an instance variable (generally via a constructor parameter).

### Transformers

Some estimators (such as a SimpleImputer) can also transform a dataset; these are called transformers. Once again, the API is simple: the transformation is performed by the transform() method with the dataset to transform as a parameter. It returns the transformed dataset. This transformation generally relies on the learned parameters, as is the case for a SimpleImputer. All transformers also have a convenience method called fit_transform(), which is equivalent to calling fit() and then transform() (but sometimes fit_transform() is optimized and runs much faster).


### Predictors

Finally, some estimators, given a dataset, are capable of making predictions; they are called predictors. For example, a LinearRegression model is a predictor: given a country’s GDP per capita, it can predict life satisfaction. A predictor has a predict() method that takes a dataset of new instances and returns a dataset of corresponding predictions. It also has a score() method that measures the quality of the predictions, given a test set (and the corresponding labels, in the case of supervised learning algorithms).

### ...

Reference to the base classes for all estimators in scikit-learn can be found at: https://github.com/scikit-learn/scikit-learn/blob/9aaed4987/sklearn/base.py#L153

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin
from sortedcollections import OrderedSet
import time

In [13]:
class PreprocessData(BaseEstimator, TransformerMixin):
    """Clean a combined trades/quotes frame and index it by participant time.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    returns a new DataFrame without modifying the one it was given, so the
    same raw frame can be transformed repeatedly.

    Parameters
    ----------
    dropped_after_hourse : bool, default True
        When true, keep only rows whose time of day lies inside the regular
        session window (``_SESSION_OPEN`` to ``_SESSION_CLOSE``).
    droped_irregular_hours : bool, default True
        When true, additionally trim the first and last 15 minutes of the
        regular session (``_TRIMMED_OPEN`` to ``_TRIMMED_CLOSE``).

    Notes
    -----
    The parameter spellings are kept as-is because they are part of the
    public constructor signature (and of ``get_params``/``set_params``).
    """

    # Window bounds are zero-padded "HH:MM:SS" strings so that plain string
    # comparison orders them chronologically. The regular session is taken to
    # be 09:30-16:00, consistent with the 15-minute trim to 09:45-15:45.
    _SESSION_OPEN = "09:30:00"
    _SESSION_CLOSE = "16:00:00"
    _TRIMMED_OPEN = "09:45:00"
    _TRIMMED_CLOSE = "15:45:00"

    def __init__(self, dropped_after_hourse=True, droped_irregular_hours=True):
        self.dropped_after_hourse = dropped_after_hourse
        self.droped_irregular_hours = droped_irregular_hours

    def fit(self, X, y=None):
        """Do nothing and return ``self``; present for pipeline compatibility."""
        return self

    def generate_mox_identifier(self, df):
        """Generate MOX Identifier
           Referenced paper: The Participant Timestamp: Get The Most Out Of TAQ Data
           Link: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3984827

        Every distinct index value receives an integer id, numbered from 0 in
        order of first appearance; rows sharing an index value share the id.
        The column ``MOX_Identifiers`` is written onto ``df`` itself and the
        same object is returned.
        """
        # factorize numbers distinct values by first appearance, which is what
        # the id needs, without converting timestamps to floats first.
        df['MOX_Identifiers'] = pd.factorize(df.index)[0]
        return df

    @staticmethod
    def _clock_between(index, start, end):
        """Return a boolean array: is each timestamp's HH:MM:SS within [start, end]?

        The comparison is done on the second-resolution clock string, so a
        timestamp such as 15:45:00.7 still counts as "15:45:00".
        """
        clock = index.strftime("%H:%M:%S")
        return np.asarray(clock >= start) & np.asarray(clock <= end)

    def transform(self, X):
        """Return a cleaned copy of ``X`` indexed by participant timestamp.

        Steps, in order:

        1. Drop the helper columns ``Unnamed: 0`` and ``Time`` when present.
        2. Parse ``Date`` and combine it with the time of day encoded in
           ``Participant_Timestamp`` (HHMMSS followed by fractional-second
           digits, truncated to microseconds); use the result as the index.
        3. Drop columns that contain no data at all.
        4. Remove rows with a negative trade price, trade volume or bid
           price, trades whose ``Trade_Reporting_Facility`` is ``'D'``, and
           quotes whose offer is below the bid. A check is skipped when its
           column is not available.
        5. Apply the session-hour filters selected in the constructor.
        6. Sort by timestamp and add ``MOX_Identifiers``.

        Raises
        ------
        KeyError
            If ``Date`` or ``Participant_Timestamp`` is missing from ``X``.
        """
        # drop() returns a new frame; the caller's X is never written to.
        frame = X.drop(columns=['Unnamed: 0', 'Time'], errors='ignore')

        dates = pd.to_datetime(frame['Date'])
        clock = pd.to_datetime(
            frame["Participant_Timestamp"].astype(str).str.zfill(15), format="%H%M%S%f"
        )
        # Keep only the time-of-day part, at microsecond resolution.
        time_of_day = (clock - clock.dt.normalize()).dt.floor("us")
        stamps = dates + time_of_day

        # assign() copies, so the parsed columns land on a fresh frame.
        frame = frame.assign(Date=dates, Participant_Timestamp=stamps)
        frame.index = stamps.values

        # remove columns that are entirely NA
        frame = frame.dropna(axis=1, how="all")

        # Rows are selected with a positional boolean mask rather than dropped
        # by label: the timestamp index is not unique, and a label-based drop
        # would also discard valid rows that share a timestamp with a bad one.
        available = frame.columns
        invalid = np.zeros(len(frame), dtype=bool)

        # invalid trades / quotes with negative values (NaN compares False)
        for name in ('Trade_Price', 'Trade_Volume', 'Bid_Price'):
            if name in available:
                invalid |= (frame[name] < 0).to_numpy()
        if 'Trade_Reporting_Facility' in available:
            invalid |= (frame['Trade_Reporting_Facility'] == 'D').to_numpy()
        # crossed quotes
        if 'Offer_Price' in available and 'Bid_Price' in available:
            invalid |= (frame['Offer_Price'] < frame['Bid_Price']).to_numpy()

        keep = ~invalid

        # drop pre-market / after-hours rows if specified
        if self.dropped_after_hourse:
            keep &= self._clock_between(
                frame.index, self._SESSION_OPEN, self._SESSION_CLOSE
            )

        # remove first and last 15 minutes of regular trading hours
        if self.droped_irregular_hours:
            keep &= self._clock_between(
                frame.index, self._TRIMMED_OPEN, self._TRIMMED_CLOSE
            )

        frame = frame.loc[keep]

        # A stable sort keeps the incoming relative order of rows that share a
        # timestamp, so repeated runs give the same row order.
        frame = frame.sort_index(kind="mergesort")

        # assign MOX Identifiers
        return self.generate_mox_identifier(frame)

In [14]:
process_pipeline = make_pipeline(
    PreprocessData()
)

In [15]:
df_clean = process_pipeline.fit_transform(all_events)

Visualize the pipeline

In [16]:
# import the set_config function from sklearn
from sklearn import set_config

# ask scikit-learn to render estimators as diagrams
set_config(display='diagram')

# display the preprocessing pipeline 'process_pipeline'
process_pipeline


In [17]:
df_clean.head(20)

Unnamed: 0,Date,Exchange,Symbol,Trade_Volume,Trade_Price,Sale_Condition,Source_of_Trade,Trade_Correction_Indicator,Sequence_Number,Trade_Id,Trade_Reporting_Facility,Participant_Timestamp,Trade_Reporting_Facility_TRF_Timestamp,Trade_Through_Exempt_Indicator,Bid_Price,Bid_Size,Offer_Price,Offer_Size,Quote_Condition,National_BBO_Indicator,Source_Of_Quote,Retail_Interest_Indicator,Short_Sale_Restriction_Indicator,SIP_Generated_Message_Identifier,NBBO_LULD_Indicator,Security_Status_Indicator,YearMonth,MOX_Identifiers
2020-01-02 09:45:00.001258,2020-01-02,N,AAPL,,,,,,2025557,,,2020-01-02 09:45:00.001258,,,297.11,1.0,297.15,1.0,R,2.0,N,,0.0,,A,,202001.0,0
2020-01-02 09:45:00.001451,2020-01-02,N,AAPL,,,,,,2025564,,,2020-01-02 09:45:00.001451,,,297.11,1.0,297.2,1.0,R,0.0,N,,0.0,,A,,202001.0,1
2020-01-02 09:45:00.001459,2020-01-02,N,AAPL,,,,,,2025565,,,2020-01-02 09:45:00.001459,,,297.11,2.0,297.2,1.0,R,2.0,N,,0.0,,A,,202001.0,2
2020-01-02 09:45:00.001518,2020-01-02,N,AAPL,,,,,,2025566,,,2020-01-02 09:45:00.001518,,,297.11,2.0,297.28,1.0,R,0.0,N,,0.0,,A,,202001.0,3
2020-01-02 09:45:00.001538,2020-01-02,N,AAPL,,,,,,2025567,,,2020-01-02 09:45:00.001538,,,297.11,3.0,297.28,1.0,R,2.0,N,,0.0,,A,,202001.0,4
2020-01-02 09:45:00.012368,2020-01-02,Z,AAPL,,,,,,2026234,,,2020-01-02 09:45:00.012368,,,297.08,2.0,297.13,2.0,R,2.0,N,,0.0,,A,,202001.0,5
2020-01-02 09:45:00.012486,2020-01-02,N,AAPL,,,,,,2026243,,,2020-01-02 09:45:00.012486,,,297.11,2.0,297.28,1.0,R,2.0,N,,0.0,,A,,202001.0,6
2020-01-02 09:45:00.038245,2020-01-02,K,AAPL,,,,,,2026679,,,2020-01-02 09:45:00.038245,,,297.04,1.0,297.13,1.0,R,0.0,N,,0.0,,A,,202001.0,7
2020-01-02 09:45:00.079739,2020-01-02,K,AAPL,,,,,,2027059,,,2020-01-02 09:45:00.079739,,,297.04,1.0,297.13,2.0,R,0.0,N,,0.0,,A,,202001.0,8
2020-01-02 09:45:00.117069,2020-01-02,Q,AAPL,,,,,,2027420,,,2020-01-02 09:45:00.117069,,,297.11,1.0,297.13,2.0,R,2.0,N,,0.0,,A,,202001.0,9


In [18]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1812932 entries, 2020-01-02 09:45:00.001258 to 2020-01-02 15:45:00.776189
Data columns (total 28 columns):
 #   Column                                  Dtype         
---  ------                                  -----         
 0   Date                                    datetime64[ns]
 1   Exchange                                object        
 2   Symbol                                  object        
 3   Trade_Volume                            float64       
 4   Trade_Price                             float64       
 5   Sale_Condition                          object        
 6   Source_of_Trade                         object        
 7   Trade_Correction_Indicator              float64       
 8   Sequence_Number                         int64         
 9   Trade_Id                                float64       
 10  Trade_Reporting_Facility                object        
 11  Participant_Timestamp                   datetime64[ns]


## 2. Feature Generation

In [19]:
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector, make_column_transformer

In [20]:
import sys
sys.path.insert(1, '../feature_generation')

In [21]:
from generators import parent_generator

In [22]:
# Split the cleaned columns by position: the first 14 are treated as trade
# fields and the rest as quote fields (MOX_Identifiers, being last, lands in
# the quote group).
# NOTE(review): the cut at 14 depends on the current column order of df_clean;
# re-check it if the preprocessing step changes which columns survive.
raw_trade_features, raw_quote_features = df_clean.columns.values[:14], df_clean.columns.values[14:]
raw_trade_features

array(['Date', 'Exchange', 'Symbol', 'Trade_Volume', 'Trade_Price',
       'Sale_Condition', 'Source_of_Trade', 'Trade_Correction_Indicator',
       'Sequence_Number', 'Trade_Id', 'Trade_Reporting_Facility',
       'Participant_Timestamp', 'Trade_Reporting_Facility_TRF_Timestamp',
       'Trade_Through_Exempt_Indicator'], dtype=object)

In [23]:
raw_quote_features

array(['Bid_Price', 'Bid_Size', 'Offer_Price', 'Offer_Size',
       'Quote_Condition', 'National_BBO_Indicator', 'Source_Of_Quote',
       'Retail_Interest_Indicator', 'Short_Sale_Restriction_Indicator',
       'SIP_Generated_Message_Identifier', 'NBBO_LULD_Indicator',
       'Security_Status_Indicator', 'YearMonth', 'MOX_Identifiers'],
      dtype=object)

### 2.1 Some Features to Consider

#### References: 
-  How and When are High-Frequency Stock Returns Predictable?
https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4095405

#### Features for Trades Data:
-  Trade Side (Tick Test)

#### Features for Quotes Data:
-  $\small\text{Effective Spread} = \text{Offer Price} - \text{Bid Price}$ (this is the quoted bid–ask spread; the generated column is named `Effective_Spread`)
-  $\small\text{Mid Price} = \large\frac{\text{(Offer Price + Bid Price)}}{2}$
-  $ \text{Microprice} = \large\frac{\text{Offer Price} \times \text{Offer Size} + \text{Bid Price} \times \text{Bid Size}}{\text{Offer Size} + \text{Bid Size}}$
-  $ \text{Imbalance} = \large\frac{\text{Bid Size}}{\text{Offer Size}} $

In [24]:
trade_features_to_generate = ["Trade_Side"]
# quote_features_to_generate = []
quote_features_to_generate = ["Effective_Spread", "Midprice", "Microprice", "Imbalance"]

In [25]:
class GenerateTradeFeatures(BaseEstimator, TransformerMixin):
    """Add the requested trade features to a frame via ``parent_generator``.

    Parameters
    ----------
    features : iterable of str
        Names of the features to generate. A name that is already a column
        of the input is skipped.
    """

    def __init__(self, features):
        self.features = features

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` after generating every requested feature it lacks.

        Each missing feature is produced by ``parent_generator(X, name)``;
        the first element of its result replaces ``X`` and the second is
        ignored.
        """
        # Snapshot of the columns present on entry; names are checked against
        # this snapshot, not against columns added while looping.
        present_on_entry = X.columns
        for name in self.features:
            if name in present_on_entry:
                continue
            X, _ = parent_generator(X, name)
        return X
        

In [26]:
class GenerateQuoteFeatures(BaseEstimator, TransformerMixin):
    """Add the requested quote features to a frame via ``parent_generator``.

    Parameters
    ----------
    features : iterable of str
        Names of the features to generate. A name that is already a column
        of the input is skipped.
    """

    def __init__(self, features):
        self.features = features

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` after generating every requested feature it lacks.

        Each missing feature is produced by ``parent_generator(X, name)``;
        the first element of its result replaces ``X`` and the second is
        ignored.
        """
        # Snapshot of the columns present on entry; names are checked against
        # this snapshot, not against columns added while looping.
        present_on_entry = X.columns
        for name in self.features:
            if name in present_on_entry:
                continue
            X, _ = parent_generator(X, name)
        return X


In [27]:
trade_pipeline = make_pipeline(GenerateTradeFeatures(trade_features_to_generate))
quote_pipeline = make_pipeline(GenerateQuoteFeatures(quote_features_to_generate))

In [28]:
# Send the trade columns through the trade pipeline and the quote columns
# through the quote pipeline. The transformer outputs are placed side by side
# in the order listed here (trade block first, then quote block).
generating_features = make_column_transformer(
    (trade_pipeline, raw_trade_features),
    (quote_pipeline, raw_quote_features)
)

In [29]:
df_copy = df_clean.copy()

In [30]:
data_prepared = generating_features.fit_transform(df_copy)

In [31]:
# Column labels for the transformed array. The order mirrors the column
# transformer's output: raw trade columns, generated trade features, raw quote
# columns, generated quote features.
# NOTE(review): this assumes each generator appends its new columns after the
# existing ones, in the order the feature names are listed — confirm.
column_names = np.concatenate((raw_trade_features, trade_features_to_generate, \
                             raw_quote_features, quote_features_to_generate), axis=0)

In [32]:
# Wrap the array returned by the column transformer in a labelled DataFrame.
data_prepared_fr = pd.DataFrame(
    data_prepared,
    # Column labels are taken from the manually assembled `column_names`,
    # so its order must match the transformer's output order.
    columns=column_names,
    # Reuse the timestamp index of `df_copy` so that each row keeps
    # the index label it had before the transformation.
    index=df_copy.index)
# The resulting frame is inspected in the following cell.

In [33]:
data_prepared_fr.head(20)

Unnamed: 0,Date,Exchange,Symbol,Trade_Volume,Trade_Price,Sale_Condition,Source_of_Trade,Trade_Correction_Indicator,Sequence_Number,Trade_Id,Trade_Reporting_Facility,Participant_Timestamp,Trade_Reporting_Facility_TRF_Timestamp,Trade_Through_Exempt_Indicator,Trade_Side,Bid_Price,Bid_Size,Offer_Price,Offer_Size,Quote_Condition,National_BBO_Indicator,Source_Of_Quote,Retail_Interest_Indicator,Short_Sale_Restriction_Indicator,SIP_Generated_Message_Identifier,NBBO_LULD_Indicator,Security_Status_Indicator,YearMonth,MOX_Identifiers,Effective_Spread,Midprice,Microprice,Imbalance
2020-01-02 09:45:00.001258,2020-01-02,N,AAPL,,,,,,2025557,,,2020-01-02 09:45:00.001258,,,,297.11,1.0,297.15,1.0,R,2.0,N,,0.0,,A,,202001.0,0,0.04,297.13,297.13,1.0
2020-01-02 09:45:00.001451,2020-01-02,N,AAPL,,,,,,2025564,,,2020-01-02 09:45:00.001451,,,,297.11,1.0,297.2,1.0,R,0.0,N,,0.0,,A,,202001.0,1,0.09,297.155,297.155,1.0
2020-01-02 09:45:00.001459,2020-01-02,N,AAPL,,,,,,2025565,,,2020-01-02 09:45:00.001459,,,,297.11,2.0,297.2,1.0,R,2.0,N,,0.0,,A,,202001.0,2,0.09,297.155,297.14,2.0
2020-01-02 09:45:00.001518,2020-01-02,N,AAPL,,,,,,2025566,,,2020-01-02 09:45:00.001518,,,,297.11,2.0,297.28,1.0,R,0.0,N,,0.0,,A,,202001.0,3,0.17,297.195,297.166667,2.0
2020-01-02 09:45:00.001538,2020-01-02,N,AAPL,,,,,,2025567,,,2020-01-02 09:45:00.001538,,,,297.11,3.0,297.28,1.0,R,2.0,N,,0.0,,A,,202001.0,4,0.17,297.195,297.1525,3.0
2020-01-02 09:45:00.012368,2020-01-02,Z,AAPL,,,,,,2026234,,,2020-01-02 09:45:00.012368,,,,297.08,2.0,297.13,2.0,R,2.0,N,,0.0,,A,,202001.0,5,0.05,297.105,297.105,1.0
2020-01-02 09:45:00.012486,2020-01-02,N,AAPL,,,,,,2026243,,,2020-01-02 09:45:00.012486,,,,297.11,2.0,297.28,1.0,R,2.0,N,,0.0,,A,,202001.0,6,0.17,297.195,297.166667,2.0
2020-01-02 09:45:00.038245,2020-01-02,K,AAPL,,,,,,2026679,,,2020-01-02 09:45:00.038245,,,,297.04,1.0,297.13,1.0,R,0.0,N,,0.0,,A,,202001.0,7,0.09,297.085,297.085,1.0
2020-01-02 09:45:00.079739,2020-01-02,K,AAPL,,,,,,2027059,,,2020-01-02 09:45:00.079739,,,,297.04,1.0,297.13,2.0,R,0.0,N,,0.0,,A,,202001.0,8,0.09,297.085,297.1,0.5
2020-01-02 09:45:00.117069,2020-01-02,Q,AAPL,,,,,,2027420,,,2020-01-02 09:45:00.117069,,,,297.11,1.0,297.13,2.0,R,2.0,N,,0.0,,A,,202001.0,9,0.02,297.12,297.123333,0.5
