In [1]:
import sklearn
import pandas as pd
import numpy as np
from datetime import datetime,timedelta
import collections
from itertools import chain
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn import set_config

In [3]:
import warnings
warnings.filterwarnings('ignore')

### Loading the Data
(modify later)

In [4]:
AAPL_trades = pd.read_csv(Path("../data/raw_data/AAPL_trades.csv"))

In [5]:
AAPL_trades.head(10)

Unnamed: 0.1,Unnamed: 0,Time,Date,Exchange,Symbol,Trade_Volume,Trade_Price,Sale_Condition,Source_of_Trade,Trade_Stop_Stock_Indicator,Trade_Correction_Indicator,Sequence_Number,Trade_Id,Trade_Reporting_Facility,Participant_Timestamp,Trade_Reporting_Facility_TRF_Timestamp,Trade_Through_Exempt_Indicator
0,0,2020-01-02 04:00:00.064010,2020-01-02,P,AAPL,3801,295.05,@ T,N,,0,1185,1,,40000063617792,,1
1,1,2020-01-02 04:00:02.828485,2020-01-02,P,AAPL,1,295.08,@FTI,N,,0,1195,2,,40002828108800,,1
2,2,2020-01-02 04:00:06.250392,2020-01-02,Q,AAPL,6,295.25,@ TI,N,,0,1197,1,,40006250366823,,0
3,3,2020-01-02 04:00:06.429757,2020-01-02,P,AAPL,1,295.08,@ TI,N,,0,1198,3,,40006429377792,,0
4,4,2020-01-02 04:00:28.894835,2020-01-02,P,AAPL,3,295.1,@ TI,N,,0,1205,4,,40028894459136,,0
5,5,2020-01-02 04:00:30.021361,2020-01-02,P,AAPL,2,295.1,@ TI,N,,0,1206,5,,40030020981248,,0
6,6,2020-01-02 04:00:31.900055,2020-01-02,P,AAPL,7,295.1,@ TI,N,,0,1208,6,,40031899679744,,0
7,7,2020-01-02 04:00:33.047715,2020-01-02,P,AAPL,5,295.1,@ TI,N,,0,1209,7,,40033047341056,,0
8,8,2020-01-02 04:00:33.118294,2020-01-02,P,AAPL,5,295.1,@ TI,N,,0,1210,8,,40033117919744,,0
9,9,2020-01-02 04:00:33.118809,2020-01-02,P,AAPL,10,295.1,@ TI,N,,0,1211,9,,40033118435584,,0


In [30]:
AAPL_quotes = pd.read_csv(Path("../data/raw_data/AAPL_quotes.csv"))
AAPL_quotes.head(10)

Unnamed: 0.1,Unnamed: 0,Time,Exchange,Symbol,Bid_Price,Bid_Size,Offer_Price,Offer_Size,Quote_Condition,Sequence_Number,...,Short_Sale_Restriction_Indicator,LULD_BBO_Indicator,SIP_Generated_Message_Identifier,NBBO_LULD_Indicator,Participant_Timestamp,FINRA_ADF_Timestamp,FINRA_ADF_Market_Participant_Quote_Indicator,Security_Status_Indicator,Date,YearMonth
0,0,2020-01-02 04:00:00.065165,P,AAPL,278.0,7.0,0.0,0.0,R,2228,...,0,,,,40000064785664,,,,2020-01-02,202001
1,1,2020-01-02 04:00:00.065167,P,AAPL,278.0,14.0,0.0,0.0,R,2229,...,0,,,,40000064787456,,,,2020-01-02,202001
2,2,2020-01-02 04:00:00.065170,P,AAPL,293.72,9.0,0.0,0.0,R,2230,...,0,,,,40000064790784,,,,2020-01-02,202001
3,3,2020-01-02 04:00:00.065681,P,AAPL,293.72,9.0,327.56,1.0,R,2231,...,0,,,,40000065302272,,,,2020-01-02,202001
4,4,2020-01-02 04:00:00.065738,P,AAPL,293.72,9.0,320.0,1.0,R,2232,...,0,,,,40000065358592,,,,2020-01-02,202001
5,5,2020-01-02 04:00:00.065738,P,AAPL,293.72,9.0,310.0,1.0,R,2233,...,0,,,,40000065360384,,,,2020-01-02,202001
6,6,2020-01-02 04:00:00.065744,P,AAPL,293.72,9.0,300.0,1.0,R,2235,...,0,,,,40000065366528,,,,2020-01-02,202001
7,7,2020-01-02 04:00:00.065813,P,AAPL,293.72,9.0,299.97,5.0,R,2237,...,0,,,,40000065433856,,,,2020-01-02,202001
8,8,2020-01-02 04:00:00.065816,P,AAPL,293.72,9.0,295.88,5.0,R,2238,...,0,,,,40000065437440,,,,2020-01-02,202001
9,9,2020-01-02 04:00:00.068515,P,AAPL,295.0,1.0,295.88,5.0,R,2241,...,0,,,,40000068136192,,,,2020-01-02,202001


### Data Preprocessing

In [27]:
class QuotesPreprocess:
    
    def __init__(self, drop_after_hours=True):
        self.drop_after_hours = drop_after_hours
        return

    # Author: Jason
    def _clean_quotes(self, quotes, drop_after_hours):
 
        """Cleans Quotes by removing quotes with invalid spreads, quotes with bid or offer price of 0, and quotes outside of market hours

        Sets Index to Effective Date + Participant Timestamp
        """

        quotes = quotes.drop(columns=quotes.columns[0:2])

        # parse date and time
        quotes["Date"] = pd.to_datetime(quotes["Date"])
        quotes["Participant_Timestamp"] = pd.to_datetime(
            quotes["Participant_Timestamp"].astype(str).str.zfill(15), format="%H%M%S%f"
        )

        # convert datetime to index
        quotes.index = quotes["Date"].apply(lambda x: x) + quotes["Participant_Timestamp"].apply(
            lambda x: timedelta(hours=x.hour, minutes=x.minute, seconds=x.second, microseconds=x.microsecond)
        )

        quotes = quotes.sort_index()

        quotes = quotes.dropna(axis=1, how="all")

        quotes = quotes[quotes["Offer_Price"] > quotes["Bid_Price"]]  # removed quotes with invalid spreads
        quotes = quotes[quotes["Bid_Price"] > 0]  # bid and offer price >0

        # drop after hours for quotes, preserve if want to prepend lob
        if drop_after_hours:
            quotes["Date"] = quotes.index.date

            grouped_quotes = quotes.groupby("Date").groups

            # drop trade data outside of market hours

            for day in grouped_quotes.keys():
                subset = quotes[quotes["Date"] == day]
                grouped_quotes[day] = subset[subset.index < datetime.strptime(f"{day} 16:00:00", "%Y-%m-%d %H:%M:%S")]
                grouped_quotes[day] = subset[subset.index > datetime.strptime(f"{day} 09:30:00", "%Y-%m-%d %H:%M:%S")]
            new_quotes = pd.concat(list(grouped_quotes.values())).sort_index()

            return new_quotes
        else:
            return quotes

    def transform(self, X):
        return self._clean_quotes(X, drop_after_hours = self.drop_after_hours)

In [33]:
class TradesPreprocess:
    
    def __init__(self):
        return
    
    def _clean_trades(self, trades):

        """Cleans trade data by dropping columns, converting datetime to index, and dropping rows with 0 trade volume or price
        Sets Index to be Effective Date + Participant Timestamp
        """
        trades = trades.drop(columns=trades.columns[0:2])

        # convert datetime to index using participant timestamp
        trades["Date"] = pd.to_datetime(trades["Date"])
        trades["Participant_Timestamp"] = pd.to_datetime(
            trades["Participant_Timestamp"].astype(str).str.zfill(15), format="%H%M%S%f"
        )
        trades.index = trades["Date"].apply(lambda x: x) + trades["Participant_Timestamp"].apply(
            lambda x: timedelta(hours=x.hour, minutes=x.minute, seconds=x.second, microseconds=x.microsecond)
        )

        trades = trades.sort_index()
        
        trades = trades.dropna(axis=1, how="all")

        trades = trades[trades["Trade_Volume"] > 0]

        trades = trades[trades["Trade_Price"] > 0]

        grouped_trades = trades.groupby("Date").groups

        # drop trade data outside of market hours

        for day in grouped_trades.keys():
            subset = trades[trades["Date"] == day]
            grouped_trades[day] = subset[subset.index < datetime.strptime(f"{day.date()} 16:00:00", "%Y-%m-%d %H:%M:%S")]
            grouped_trades[day] = subset[subset.index > datetime.strptime(f"{day.date()} 09:30:00", "%Y-%m-%d %H:%M:%S")]

        new_trades = pd.concat(list(grouped_trades.values())).sort_index()

        return new_trades
    
    
    def transform(self, X):
        return self._clean_trades(X)

In [28]:
prepq_pipe = make_pipeline(QuotesPreprocess())
prepq_pipe.transform(AAPL_quotes)

Unnamed: 0,Exchange,Symbol,Bid_Price,Bid_Size,Offer_Price,Offer_Size,Quote_Condition,Sequence_Number,National_BBO_Indicator,Source_Of_Quote,Retail_Interest_Indicator,Short_Sale_Restriction_Indicator,SIP_Generated_Message_Identifier,NBBO_LULD_Indicator,Participant_Timestamp,Security_Status_Indicator,Date,YearMonth
2020-01-02 09:30:00.089010,Q,AAPL,296.00,87.0,296.40,1.0,R,261711,0,N,,0,,,1900-01-01 09:30:00.089010122,,2020-01-02,202001
2020-01-02 09:30:00.095108,Q,AAPL,296.00,107.0,296.40,1.0,R,261792,0,N,,0,,,1900-01-01 09:30:00.095108640,,2020-01-02,202001
2020-01-02 09:30:00.134049,P,AAPL,296.09,1.0,296.30,66.0,R,262392,0,N,,0,,,1900-01-01 09:30:00.134049792,,2020-01-02,202001
2020-01-02 09:30:00.134062,P,AAPL,296.09,1.0,296.29,1.0,R,262393,2,N,,0,,,1900-01-01 09:30:00.134062592,,2020-01-02,202001
2020-01-02 09:30:00.134071,Z,AAPL,295.91,1.0,296.81,1.0,R,262384,0,N,,0,,,1900-01-01 09:30:00.134071000,,2020-01-02,202001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-02 19:59:52.611817,Q,AAPL,301.19,1.0,302.00,6.0,R,30868585,2,N,,0,,,1900-01-01 19:59:52.611817002,,2020-01-02,202001
2020-01-02 19:59:53.854647,K,AAPL,301.10,46.0,301.25,2.0,R,30868587,0,N,,0,,,1900-01-01 19:59:53.854647000,,2020-01-02,202001
2020-01-02 19:59:53.855780,K,AAPL,301.10,46.0,301.25,2.0,R,30868588,0,N,,0,,,1900-01-01 19:59:53.855780000,,2020-01-02,202001
2020-01-02 19:59:57.963903,K,AAPL,301.10,46.0,301.25,1.0,R,30868600,2,N,,0,,,1900-01-01 19:59:57.963903000,,2020-01-02,202001


In [29]:
set_config(display='diagram')
prepq_pipe

In [34]:
prept_pipe = make_pipeline(TradesPreprocess())
prept_pipe.transform(AAPL_trades)

Unnamed: 0,Date,Exchange,Symbol,Trade_Volume,Trade_Price,Sale_Condition,Source_of_Trade,Trade_Correction_Indicator,Sequence_Number,Trade_Id,Trade_Reporting_Facility,Participant_Timestamp,Trade_Reporting_Facility_TRF_Timestamp,Trade_Through_Exempt_Indicator
2020-01-02 09:30:00.134336,2020-01-02,K,AAPL,250,296.24,@,N,0,19803,1100,,1900-01-01 09:30:00.134336000,,0
2020-01-02 09:30:00.134371,2020-01-02,K,AAPL,50,296.24,@ I,N,0,19804,1101,,1900-01-01 09:30:00.134371000,,0
2020-01-02 09:30:00.134532,2020-01-02,K,AAPL,108,296.21,@,N,0,19805,1102,,1900-01-01 09:30:00.134532000,,0
2020-01-02 09:30:00.157191,2020-01-02,P,AAPL,1,296.28,@F I,N,0,19878,1763,,1900-01-01 09:30:00.157191936,,1
2020-01-02 09:30:00.157191,2020-01-02,P,AAPL,1,296.28,@ Q,N,0,19879,1764,,1900-01-01 09:30:00.157191936,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-02 19:59:52.611412,2020-01-02,K,AAPL,100,301.19,@FT,N,0,3133007,19384,,1900-01-01 19:59:52.611412000,,1
2020-01-02 19:59:52.611424,2020-01-02,Z,AAPL,100,301.19,@FT,N,0,3133008,32716,,1900-01-01 19:59:52.611424000,,1
2020-01-02 19:59:57.963903,2020-01-02,K,AAPL,70,301.21,@ TI,N,0,3133023,19386,,1900-01-01 19:59:57.963903000,,1
2020-01-02 19:59:57.963903,2020-01-02,K,AAPL,1,301.20,@ TI,N,0,3133022,19385,,1900-01-01 19:59:57.963903000,,1


In [35]:
prept_pipe