## Example Notebook on how to make Pandas FASTER! 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numba import jit

In [2]:
print(np.__version__)

1.24.3


### 1.Reading using Pandas is SLOW!

It's ironic that the first mistake related to using Pandas for tasks with large datasets is to read them into your environment with Pandas! Use datatable instead. Other options: Dask, Vaex, cuDF(with NVIDIA GPU)

https://datatable.readthedocs.io/en/latest/start/install.html#install-latest-dev-version

Install with pip:

pip install git+https://github.com/h2oai/datatable


In [4]:
import datatable as dt

In [9]:
%time
pd_df = pd.read_csv("data/quotes_NVDA_20200106-20200107.csv")

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.01 µs


  pd_df = pd.read_csv("data/quotes_NVDA_20200106-20200107.csv")


In [10]:
%time
df_dt = dt.fread("data/quotes_NVDA_20200106-20200107.csv").to_pandas()

CPU times: user 1 µs, sys: 1 µs, total: 2 µs
Wall time: 3.81 µs


Generally when you deal with datasets that are gigabytes in size, it's gonna make a huge difference...

In [11]:
%time
df_quotes = dt.fread("data/quotes_NVDA_20200106-20200107.csv").to_pandas()
df_trades = dt.fread("data/trades_NVDA_20200106-20200107.csv").to_pandas()

CPU times: user 1 µs, sys: 1e+03 ns, total: 2 µs
Wall time: 3.81 µs


### 2. Check the memory usage of your Pandas Dataframe

For one day of trades and quotes for one ticker, we have ...

In [12]:
print (f'Trades size: {df_trades.shape}')
print (f'Quotes size: {df_quotes.shape}')

Trades size: (73920, 18)
Quotes size: (681656, 26)


In [13]:
memory_usage = df_trades.memory_usage(deep=True) + df_quotes.memory_usage(deep=True)
memory_usage_in_mbs = np.sum(memory_usage / 1024 ** 2)
print (f'Total memory usage: {memory_usage_in_mbs} MB')

Total memory usage: 111.68913269042969 MB


#### It's not too bad for one day...

How about a year of data for one Ticker? Then we are looking at:

111MB x 300 / 1024 ≈ 32.5195 GB

Large enough to care about memory usage in this case ...

Some data type info below ...

<img src="https://pbpython.com/images/pandas_dtypes.png" alt="Python Data type" style="width: 400px;"/>

##### Ideally, you want to cast floats and integers to their smallest subtypes.

In [14]:
def reduce_memory_usage(df, verbose=True):
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                    c_min > np.finfo(np.float16).min
                    and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                    c_min > np.finfo(np.float32).min
                    and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

Source code:
https://gist.github.com/BexTuychiev/99883092ca8864e4495aeb8aa5390f19#file-9004-py

In [15]:
reduce_memory_usage(df_quotes)

Mem. usage decreased to 102.71 Mb (14.6% reduction)


Unnamed: 0,C0,Time,Exchange,Symbol,Bid_Price,Bid_Size,Offer_Price,Offer_Size,Quote_Condition,Sequence_Number,...,Short_Sale_Restriction_Indicator,LULD_BBO_Indicator,SIP_Generated_Message_Identifier,NBBO_LULD_Indicator,Participant_Timestamp,FINRA_ADF_Timestamp,FINRA_ADF_Market_Participant_Quote_Indicator,Security_Status_Indicator,Date,YearMonth
0,0,2020-01-06 04:00:00.037344,P,NVDA,0.000,0.0,366.000,2.0,R,1423,...,False,,,,40000036930816,,,,2020-01-06,202001
1,1,2020-01-06 04:00:00.037399,P,NVDA,0.000,0.0,280.000,1.0,R,1430,...,False,,,,40000037020928,,,,2020-01-06,202001
2,2,2020-01-06 04:00:00.037421,P,NVDA,175.000,1.0,280.000,1.0,R,1431,...,False,,,,40000037026816,,,,2020-01-06,202001
3,3,2020-01-06 04:00:00.037421,P,NVDA,216.000,1.0,280.000,1.0,R,1432,...,False,,,,40000037027584,,,,2020-01-06,202001
4,4,2020-01-06 04:00:00.037427,P,NVDA,230.000,1.0,280.000,1.0,R,1433,...,False,,,,40000037030400,,,,2020-01-06,202001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
681651,681651,2020-01-06 19:59:41.267964,Q,NVDA,235.250,1.0,237.500,5.0,R,28815631,...,False,,,,195941267946370,,,,2020-01-06,202001
681652,681652,2020-01-06 19:59:41.272177,K,NVDA,236.875,1.0,237.375,2.0,R,28815632,...,False,,,,195941271920000,,,,2020-01-06,202001
681653,681653,2020-01-06 19:59:41.272204,K,NVDA,236.875,1.0,237.500,1.0,R,28815633,...,False,,,,195941271920000,,,,2020-01-06,202001
681654,681654,2020-01-06 19:59:41.291267,Q,NVDA,236.250,1.0,237.500,5.0,R,28815634,...,False,,,,195941291250633,,,,2020-01-06,202001


In [16]:
df_quotes = reduce_memory_usage(df_quotes)
df_trades = reduce_memory_usage(df_trades)

Mem. usage decreased to 102.71 Mb (0.0% reduction)
Mem. usage decreased to 6.91 Mb (13.3% reduction)


  return arr.astype(dtype, copy=True)


In [17]:
df_quotes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681656 entries, 0 to 681655
Data columns (total 26 columns):
 #   Column                                        Non-Null Count   Dtype         
---  ------                                        --------------   -----         
 0   C0                                            681656 non-null  int32         
 1   Time                                          681656 non-null  datetime64[ns]
 2   Exchange                                      681656 non-null  object        
 3   Symbol                                        681656 non-null  object        
 4   Bid_Price                                     681656 non-null  float16       
 5   Bid_Size                                      681656 non-null  float16       
 6   Offer_Price                                   681656 non-null  float16       
 7   Offer_Size                                    681656 non-null  float16       
 8   Quote_Condition                               681656 n

Also maybe drop some unnecessary columns because 'Object' is the most memory-consuming data type... 

In [18]:
df_quotes.drop(['C0', 'National_BBO_Indicator', 'FINRA_BBO_Indicator',
       'FINRA_ADF_MPID_Indicator', 'Quote_Cancel_Correction',
       'Source_Of_Quote', 'Retail_Interest_Indicator',
       'Short_Sale_Restriction_Indicator', 'LULD_BBO_Indicator',
       'SIP_Generated_Message_Identifier', 'NBBO_LULD_Indicator','FINRA_ADF_Timestamp',
       'FINRA_ADF_Market_Participant_Quote_Indicator',
       'Security_Status_Indicator'], axis=1, inplace=True)

In [19]:
df_quotes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681656 entries, 0 to 681655
Data columns (total 12 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   Time                   681656 non-null  datetime64[ns]
 1   Exchange               681656 non-null  object        
 2   Symbol                 681656 non-null  object        
 3   Bid_Price              681656 non-null  float16       
 4   Bid_Size               681656 non-null  float16       
 5   Offer_Price            681656 non-null  float16       
 6   Offer_Size             681656 non-null  float16       
 7   Quote_Condition        681656 non-null  object        
 8   Sequence_Number        681656 non-null  int32         
 9   Participant_Timestamp  681656 non-null  int64         
 10  Date                   681656 non-null  datetime64[s] 
 11  YearMonth              681656 non-null  int32         
dtypes: datetime64[ns](1), datetime64[s](1), floa

In [20]:
df_trades.drop(['C0','Sale_Condition', 'Source_of_Trade',
       'Trade_Stop_Stock_Indicator', 'Trade_Correction_Indicator', 'Trade_Reporting_Facility_TRF_Timestamp',
       'Trade_Through_Exempt_Indicator', 'YearMonth'], axis=1, inplace = True)

In [21]:
df_trades.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73920 entries, 0 to 73919
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Time                      73920 non-null  datetime64[ns]
 1   Date                      73920 non-null  datetime64[s] 
 2   Exchange                  73920 non-null  object        
 3   Symbol                    73920 non-null  object        
 4   Trade_Volume              73920 non-null  int32         
 5   Trade_Price               73920 non-null  float16       
 6   Sequence_Number           73920 non-null  int32         
 7   Trade_Id                  73920 non-null  int16         
 8   Trade_Reporting_Facility  73920 non-null  object        
 9   Participant_Timestamp     73920 non-null  int64         
dtypes: datetime64[ns](1), datetime64[s](1), float16(1), int16(1), int32(2), int64(1), object(3)
memory usage: 4.2+ MB


### 3. Some data cleaning

In [22]:
from datetime import timedelta
import time

For example, we want to get the correct value of the column Participate Timestamp, i.e. converting Participate Timestamp from integer representation to datetime.



#### Remember to use vectorization than loop!! And that vectorization works faster with numpy array!!

Note that lambda function is also a 'loop'...

For data type,
 
    df_trades['Date'] --> pandas.Series

    df_trades['Date'].values --> numpy.ndarray

In [5]:
def v_convertParticipantTimestamp(pts, date):
    """
    Convert participant timestamps to the correct datetime representation.

    Parameters:
    - pts: numpy.ndarray
    - date: numpy.ndarray

    Returns:
    - datetime64[ns] Series: A pandas Series of datetime64[ns]

    """
    date = pd.to_datetime(date)
    pts = pd.to_datetime(np.char.zfill(pts.astype(str), 15),format="%H%M%S%f")

    return date + pd.to_timedelta(
        pts.hour * 60 * 60 * 1e9 +   # Convert hours to nanoseconds
        pts.minute * 60 * 1e9 +     # Convert minutes to nanoseconds
        pts.second * 1e9 +          # Convert seconds to nanoseconds
        pts.microsecond * 1e3       # Convert microseconds to nanoseconds
    )

In [26]:
start = time.time()
df_trades['Participant_Timestamp_date'] = v_convertParticipantTimestamp(df_trades['Participant_Timestamp'].values, df_trades['Date'].values)
df_quotes['Participant_Timestamp_date'] = v_convertParticipantTimestamp(df_quotes['Participant_Timestamp'].values, df_quotes['Date'].values)
print (f'Total time: {time.time()-start}s')

Total time: 1.4279651641845703s


We can also remove the after hours trading

In [37]:
def drop_after_hours(df, pts):
    """
    Drop rows from the DataFrame based on timestamps outside the range 09:00:00 to 16:00:00.

    Parameters:
    - df: (pd.DataFrame)
    - pts: (pd.Series): 
    """
    mask = (pts.dt.time < pd.Timestamp("09:00:00").time()) | \
           (pts.dt.time > pd.Timestamp("16:00:00").time())
    drop_idx = df[mask].index
    df.drop(drop_idx, inplace=True)
    return

In [38]:
start = time.time()
drop_after_hours(df_trades, df_trades['Participant_Timestamp_date'])
drop_after_hours(df_quotes, df_quotes['Participant_Timestamp_date'])
print (f'Total time: {time.time() - start}')

Total time: 0.3738057613372803


In [29]:
df_trades['Participant_Timestamp_date']

1011    2020-01-06 09:00:00.684000
1012    2020-01-06 09:00:00.687000
1013    2020-01-06 09:00:04.890000
1014    2020-01-06 09:00:14.750000
1015    2020-01-06 09:00:14.752000
                   ...            
73219   2020-01-06 15:59:59.104637
73220   2020-01-06 15:59:59.279263
73221   2020-01-06 15:59:59.402718
73222   2020-01-06 15:59:59.921022
73223   2020-01-06 15:59:59.970230
Name: Participant_Timestamp_date, Length: 72213, dtype: datetime64[ns]

### 3. Reconstructing Events

In [30]:
df_trades['Is_Quote'] = False
df_quotes['Is_Quote'] = True
trade_features = ['Participant_Timestamp_date', 'Symbol', 'Is_Quote', 'Trade_Volume', 'Trade_Price', 'Trade_Id', 'Trade_Reporting_Facility']
quote_features = ['Participant_Timestamp_date', 'Symbol', 'Is_Quote', 'Bid_Price', 'Bid_Size', 'Offer_Price', 'Offer_Size']

In [31]:
df1 = df_trades[trade_features]
df2 = df_quotes[quote_features]

df_all = pd.concat([df1, df2], ignore_index=True)
df_all = df_all.sort_values(by=['Participant_Timestamp_date']).reset_index(drop=True)
df_all.head(15)

Unnamed: 0,Participant_Timestamp_date,Symbol,Is_Quote,Trade_Volume,Trade_Price,Trade_Id,Trade_Reporting_Facility,Bid_Price,Bid_Size,Offer_Price,Offer_Size
0,2020-01-06 09:00:00.684000,NVDA,False,17.0,232.5,90.0,N,,,,
1,2020-01-06 09:00:00.687000,NVDA,False,10.0,232.25,91.0,N,,,,
2,2020-01-06 09:00:03.784965,NVDA,True,,,,,221.875,1.0,243.0,1.0
3,2020-01-06 09:00:04.890000,NVDA,False,10.0,232.5,92.0,N,,,,
4,2020-01-06 09:00:14.750000,NVDA,False,138.0,232.5,93.0,N,,,,
5,2020-01-06 09:00:14.752000,NVDA,False,2.0,232.5,94.0,N,,,,
6,2020-01-06 09:00:14.752000,NVDA,False,3.0,232.5,95.0,N,,,,
7,2020-01-06 09:00:14.752000,NVDA,False,2.0,232.5,96.0,N,,,,
8,2020-01-06 09:00:14.752000,NVDA,False,10.0,232.5,97.0,N,,,,
9,2020-01-06 09:00:14.752000,NVDA,False,1.0,232.5,98.0,N,,,,


### 3. Some Feature Generation

Generate int representation of recalculated Participate Timestamp, again, using vectorization

In [32]:
%time
df_all['Participant_Timestamp_f']= (df_all["Participant_Timestamp_date"].astype(int) / 1e9)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs


In [33]:
df_all.head(15)

Unnamed: 0,Participant_Timestamp_date,Symbol,Is_Quote,Trade_Volume,Trade_Price,Trade_Id,Trade_Reporting_Facility,Bid_Price,Bid_Size,Offer_Price,Offer_Size,Participant_Timestamp_f
0,2020-01-06 09:00:00.684000,NVDA,False,17.0,232.5,90.0,N,,,,,1578301000.0
1,2020-01-06 09:00:00.687000,NVDA,False,10.0,232.25,91.0,N,,,,,1578301000.0
2,2020-01-06 09:00:03.784965,NVDA,True,,,,,221.875,1.0,243.0,1.0,1578301000.0
3,2020-01-06 09:00:04.890000,NVDA,False,10.0,232.5,92.0,N,,,,,1578301000.0
4,2020-01-06 09:00:14.750000,NVDA,False,138.0,232.5,93.0,N,,,,,1578301000.0
5,2020-01-06 09:00:14.752000,NVDA,False,2.0,232.5,94.0,N,,,,,1578301000.0
6,2020-01-06 09:00:14.752000,NVDA,False,3.0,232.5,95.0,N,,,,,1578301000.0
7,2020-01-06 09:00:14.752000,NVDA,False,2.0,232.5,96.0,N,,,,,1578301000.0
8,2020-01-06 09:00:14.752000,NVDA,False,10.0,232.5,97.0,N,,,,,1578301000.0
9,2020-01-06 09:00:14.752000,NVDA,False,1.0,232.5,98.0,N,,,,,1578301000.0


#### Question: iloc or loc?

When choosing a row or multiple rows, iloc is faster.
    
    e.g. df.iloc[:100]

When choosing columns with their labels, loc is better.

    e.g. df.iloc[:,['col1', 'col2']]



In [39]:
class CalendarMode:
    def __init__(self, df, delta1, delta2, start_idx, end_idx) -> None:
        self.df = df
        self.delta1 = delta1
        self.delta2 = delta2
        self.start_idx = start_idx
        self.end_idx = end_idx
    
    @classmethod
    def from_deltas(cls, df, delta1, delta2):
        timestamps = df_all['Participant_Timestamp_f'].sort_values().values
        #'left' always give you the index of the first suitable location found is given.
        #‘right’ return the last such index'
        start_idx = np.searchsorted(timestamps, timestamps - delta2, side='left')
        end_idx = np.searchsorted(timestamps, timestamps - delta1, side='right')
        return cls(
            df = df,
            delta1 = delta1,
            delta2 = delta2,
            start_idx = start_idx,
            end_idx = end_idx
        )
    
    #Breath
    def getBreath(self) -> pd.DataFrame:
        if 'Breath' in self.df.columns:
            return self.df['Breath']
        else:
            self.df['Breath'] = [self.df.iloc[start:end]['Is_Quote'].sum() \
                                       for start, end in zip(self.start_idx, self.end_idx)]
            return self.df['Breath']
    
    #Immediacy
    def getImmediacy(self) -> pd.Series:
        if 'Immediacy' in self.df.columns:
            return self.df['Immediacy']
        else:
            breath = self.getBreath()
            self.df['Immediacy'] = np.where(breath == 0, np.nan, \
                                            (self.delta2-self.delta1) / breath)
            return self.df['Immediacy']

    #VolumeAll
    def getVolumeAll(self) -> pd.Series:
        if 'VolumeAll' in self.df.columns:
            return self.df['VolumeAll']
        else:
            self.df['VolumeAll'] = [self.df.iloc[start:end]['Trade_Volume'].sum() \
                                          for start, end in zip(self.start_idx, self.end_idx)]
            return self.df['VolumeAll']




In [40]:
delta1, delta2 = 0, 5 #compute features from last 5 seconds
caln_f = CalendarMode.from_deltas(df_all, delta1, delta2)

In [41]:
start = time.time()
caln_f.getBreath()
print (f'Total time: {time.time()-start}s')

Total time: 19.472888946533203s


In [42]:
start = time.time()
caln_f.getImmediacy()
print (f'Total time: {time.time()-start}s')

Total time: 0.004512786865234375s


In [43]:
start = time.time()
caln_f.getVolumeAll()
print (f'Total time: {time.time()-start}s')

Total time: 21.975414037704468s


#### Tips:

When you have to loop through the whole dataframe, looping a numpy ndarray is much faster. 

For example, use:

    for v in df.values