## Example Notebook on how to make Pandas FASTER! 

Requires Manual Installation of Python Packages (See Below)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from numba import jit

In [4]:
print(np.__version__)

1.26.2


### 1.Reading using Pandas is SLOW!

It's ironic that the first mistake related to using Pandas for tasks with large datasets is to read them into your environment with Pandas! Use datatable instead. Other options: Dask, Vaex, cuDF(with NVIDIA GPU)

https://datatable.readthedocs.io/en/latest/start/install.html#install-latest-dev-version

Install with pip:

```
pip install git+https://github.com/h2oai/datatable
```


In [5]:
! pip install git+https://github.com/h2oai/datatable

Collecting git+https://github.com/h2oai/datatable
  Cloning https://github.com/h2oai/datatable to /tmp/pip-req-build-m__ixmbv
  Running command git clone --filter=blob:none --quiet https://github.com/h2oai/datatable /tmp/pip-req-build-m__ixmbv
  Resolved https://github.com/h2oai/datatable to commit b1a87103c55433dc2069df28a460f3ad15450025
  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: datatable
  Building wheel for datatable (pyproject.toml) ... [?25ldone
[?25h  Created wheel for datatable: filename=datatable-1.2.0a0+build.1702084526.jbohne-cp311-cp311-linux_x86_64.whl size=101495759 sha256=e516c1c0e6a4d14466f1c7419e25af5f0c1af6ebce33f3a58975eb69c881412b
  Stored in directory: /tmp/pip-ephem-wheel-cache-ipswpjbz/wheels/aa/fb/e9/03c7d1eea9cab7949ad57abf48baa07aa6b21e391876119739
Successfully built datatable
Installing collected packages: datatable
Successfully installed datat

In [6]:
import datatable as dt

In [7]:
TRADES="../sample_data/trades.csv.gz"
QUOTES="../sample_data/quotes.csv.gz"

In [8]:
trades = pd.read_csv(Path(TRADES),compression='gzip')
quotes = pd.read_csv(Path(QUOTES),compression='gzip')

In [9]:
%time
pd_df = pd.read_csv(Path(TRADES),compression='gzip')

CPU times: user 6 µs, sys: 1 µs, total: 7 µs
Wall time: 13.8 µs


In [10]:
%time
df_dt = dt.fread(Path(TRADES)).to_pandas()

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.44 µs


Generally when you deal with datasets that are gigabytes in size, it's gonna make a huge difference...

In [11]:
%time
df_quotes = dt.fread(Path(QUOTES)).to_pandas()
df_trades = dt.fread(Path(TRADES)).to_pandas()

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs


### 2. Check the memory usage of your Pandas Dataframe

For one day of trades and quotes for one ticker, we have ...

In [12]:
print (f'Trades size: {df_trades.shape}')
print (f'Quotes size: {df_quotes.shape}')

Trades size: (1000, 18)
Quotes size: (1000, 26)


In [13]:
memory_usage = df_trades.memory_usage(deep=True) + df_quotes.memory_usage(deep=True)
memory_usage_in_mbs = np.sum(memory_usage / 1024 ** 2)
print (f'Total memory usage: {memory_usage_in_mbs} MB')

Total memory usage: 0.29589080810546875 MB


#### It's not too bad for one day...

How about a year of data for one Ticker? Then we are looking at:

111MB x 300 / 1024 ≈ 32.5195 GB

Large enough to care about memory usage in this case ...

Some data type info below ...

<img src="https://pbpython.com/images/pandas_dtypes.png" alt="Python Data type" style="width: 400px;"/>

##### Ideally, you want to cast floats and integers to their smallest subtypes.

In [14]:
def reduce_memory_usage(df, verbose=True):
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                    c_min > np.finfo(np.float16).min
                    and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                    c_min > np.finfo(np.float32).min
                    and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

Source code:
https://gist.github.com/BexTuychiev/99883092ca8864e4495aeb8aa5390f19#file-9004-py

In [15]:
reduce_memory_usage(df_quotes)

Mem. usage decreased to 0.15 Mb (15.7% reduction)


Unnamed: 0,C0,Time,Exchange,Symbol,Bid_Price,Bid_Size,Offer_Price,Offer_Size,Quote_Condition,Sequence_Number,...,Short_Sale_Restriction_Indicator,LULD_BBO_Indicator,SIP_Generated_Message_Identifier,NBBO_LULD_Indicator,Participant_Timestamp,FINRA_ADF_Timestamp,FINRA_ADF_Market_Participant_Quote_Indicator,Security_Status_Indicator,Date,YearMonth
0,0,2020-01-06 04:00:00.065815,P,AAPL,278.00,7.0,0.00,0.0,R,2251,...,False,,,,40000065434368,,,,2020-01-06,202001
1,1,2020-01-06 04:00:00.065819,P,AAPL,278.00,14.0,0.00,0.0,R,2252,...,False,,,,40000065436672,,,,2020-01-06,202001
2,2,2020-01-06 04:00:00.065822,P,AAPL,278.00,14.0,298.25,1.0,R,2253,...,False,,,,40000065441792,,,,2020-01-06,202001
3,3,2020-01-06 04:00:00.065919,P,AAPL,291.25,1.0,298.25,1.0,R,2254,...,False,,,,40000065541888,,,,2020-01-06,202001
4,4,2020-01-06 04:00:00.065966,P,AAPL,295.25,1.0,298.25,1.0,R,2255,...,False,,,,40000065584896,,,,2020-01-06,202001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,2020-01-06 05:04:50.678214,P,AAPL,294.25,22.0,295.00,8.0,R,66828,...,False,,,,50450677824000,,,,2020-01-06,202001
996,996,2020-01-06 05:04:50.678365,Q,AAPL,295.00,1.0,295.25,2.0,R,66829,...,False,,,,50450678348740,,,,2020-01-06,202001
997,997,2020-01-06 05:04:52.251912,Q,AAPL,295.00,1.0,295.25,2.0,R,66866,...,False,,,,50452251894963,,,,2020-01-06,202001
998,998,2020-01-06 05:04:52.931792,Q,AAPL,294.75,1.0,295.25,2.0,R,66870,...,False,,,,50452931776108,,,,2020-01-06,202001


In [16]:
df_quotes = reduce_memory_usage(df_quotes)
df_trades = reduce_memory_usage(df_trades)

Mem. usage decreased to 0.15 Mb (0.0% reduction)
Mem. usage decreased to 0.09 Mb (12.7% reduction)


In [17]:
df_quotes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 26 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   C0                                            1000 non-null   int16         
 1   Time                                          1000 non-null   datetime64[ns]
 2   Exchange                                      1000 non-null   object        
 3   Symbol                                        1000 non-null   object        
 4   Bid_Price                                     1000 non-null   float16       
 5   Bid_Size                                      1000 non-null   float16       
 6   Offer_Price                                   1000 non-null   float16       
 7   Offer_Size                                    1000 non-null   float16       
 8   Quote_Condition                               1000 non-null   object 

Also maybe drop some unnecessary columns because 'Object' is the most memory-consuming data type... 

In [18]:
df_quotes.drop(['C0', 'National_BBO_Indicator', 'FINRA_BBO_Indicator',
       'FINRA_ADF_MPID_Indicator', 'Quote_Cancel_Correction',
       'Source_Of_Quote', 'Retail_Interest_Indicator',
       'Short_Sale_Restriction_Indicator', 'LULD_BBO_Indicator',
       'SIP_Generated_Message_Identifier', 'NBBO_LULD_Indicator','FINRA_ADF_Timestamp',
       'FINRA_ADF_Market_Participant_Quote_Indicator',
       'Security_Status_Indicator'], axis=1, inplace=True)

In [19]:
df_quotes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Time                   1000 non-null   datetime64[ns]
 1   Exchange               1000 non-null   object        
 2   Symbol                 1000 non-null   object        
 3   Bid_Price              1000 non-null   float16       
 4   Bid_Size               1000 non-null   float16       
 5   Offer_Price            1000 non-null   float16       
 6   Offer_Size             1000 non-null   float16       
 7   Quote_Condition        1000 non-null   object        
 8   Sequence_Number        1000 non-null   int32         
 9   Participant_Timestamp  1000 non-null   int64         
 10  Date                   1000 non-null   datetime64[s] 
 11  YearMonth              1000 non-null   int32         
dtypes: datetime64[ns](1), datetime64[s](1), float16(4), int32(2), i

In [20]:
df_trades.drop(['C0','Sale_Condition', 'Source_of_Trade',
       'Trade_Stop_Stock_Indicator', 'Trade_Correction_Indicator', 'Trade_Reporting_Facility_TRF_Timestamp',
       'Trade_Through_Exempt_Indicator', 'YearMonth'], axis=1, inplace = True)

In [21]:
df_trades.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Time                      1000 non-null   datetime64[ns]
 1   Date                      1000 non-null   datetime64[s] 
 2   Exchange                  1000 non-null   object        
 3   Symbol                    1000 non-null   object        
 4   Trade_Volume              1000 non-null   int16         
 5   Trade_Price               1000 non-null   float16       
 6   Sequence_Number           1000 non-null   int16         
 7   Trade_Id                  1000 non-null   int16         
 8   Trade_Reporting_Facility  0 non-null      object        
 9   Participant_Timestamp     1000 non-null   int64         
dtypes: datetime64[ns](1), datetime64[s](1), float16(1), int16(3), int64(1), object(3)
memory usage: 54.8+ KB


### 3. Some data cleaning

In [22]:
from datetime import timedelta
import time

For example, we want to get the correct value of the column Participate Timestamp, i.e. converting Participate Timestamp from integer representation to datetime.



#### Remember to use vectorization than loop!! And that vectorization works faster with numpy array!!

Note that lambda function is also a 'loop'...

For data type,
 
    df_trades['Date'] --> pandas.Series

    df_trades['Date'].values --> numpy.ndarray

In [23]:
def v_convertParticipantTimestamp(pts, date):
    """
    Convert participant timestamps to the correct datetime representation.

    Parameters:
    - pts: numpy.ndarray
    - date: numpy.ndarray

    Returns:
    - datetime64[ns] Series: A pandas Series of datetime64[ns]

    """
    date = pd.to_datetime(date)
    pts = pd.to_datetime(np.char.zfill(pts.astype(str), 15),format="%H%M%S%f")

    return date + pd.to_timedelta(
        pts.hour * 60 * 60 * 1e9 +   # Convert hours to nanoseconds
        pts.minute * 60 * 1e9 +     # Convert minutes to nanoseconds
        pts.second * 1e9 +          # Convert seconds to nanoseconds
        pts.microsecond * 1e3       # Convert microseconds to nanoseconds
    )

In [24]:
start = time.time()
df_trades['Participant_Timestamp_date'] = v_convertParticipantTimestamp(df_trades['Participant_Timestamp'].values, df_trades['Date'].values)
df_quotes['Participant_Timestamp_date'] = v_convertParticipantTimestamp(df_quotes['Participant_Timestamp'].values, df_quotes['Date'].values)
print (f'Total time: {time.time()-start}s')

Total time: 0.03335165977478027s


We can also remove the after hours trading

In [25]:
def drop_after_hours(df, pts):
    """
    Drop rows from the DataFrame based on timestamps outside the range 09:00:00 to 16:00:00.

    Parameters:
    - df: (pd.DataFrame)
    - pts: (pd.Series): 
    """
    mask = (pts.dt.time < pd.Timestamp("09:00:00").time()) | \
           (pts.dt.time > pd.Timestamp("16:00:00").time())
    drop_idx = df[mask].index
    df.drop(drop_idx, inplace=True)
    return

In [26]:
start = time.time()
drop_after_hours(df_trades, df_trades['Participant_Timestamp_date'])
drop_after_hours(df_quotes, df_quotes['Participant_Timestamp_date'])
print (f'Total time: {time.time() - start}')

Total time: 0.017377138137817383


In [27]:
df_trades['Participant_Timestamp_date']

Series([], Name: Participant_Timestamp_date, dtype: datetime64[ns])

### 3. Reconstructing Events

In [28]:
df_trades['Is_Quote'] = False
df_quotes['Is_Quote'] = True
trade_features = ['Participant_Timestamp_date', 'Symbol', 'Is_Quote', 'Trade_Volume', 'Trade_Price', 'Trade_Id', 'Trade_Reporting_Facility']
quote_features = ['Participant_Timestamp_date', 'Symbol', 'Is_Quote', 'Bid_Price', 'Bid_Size', 'Offer_Price', 'Offer_Size']

In [29]:
df1 = df_trades[trade_features]
df2 = df_quotes[quote_features]

df_all = pd.concat([df1, df2], ignore_index=True)
df_all = df_all.sort_values(by=['Participant_Timestamp_date']).reset_index(drop=True)
df_all.head(15)

Unnamed: 0,Participant_Timestamp_date,Symbol,Is_Quote,Trade_Volume,Trade_Price,Trade_Id,Trade_Reporting_Facility,Bid_Price,Bid_Size,Offer_Price,Offer_Size


### 3. Some Feature Generation

Generate int representation of recalculated Participate Timestamp, again, using vectorization

In [30]:
%time
df_all['Participant_Timestamp_f']= (df_all["Participant_Timestamp_date"].astype(int) / 1e9)

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 12.2 µs


In [31]:
df_all.head(15)

Unnamed: 0,Participant_Timestamp_date,Symbol,Is_Quote,Trade_Volume,Trade_Price,Trade_Id,Trade_Reporting_Facility,Bid_Price,Bid_Size,Offer_Price,Offer_Size,Participant_Timestamp_f


#### Question: iloc or loc?

When choosing a row or multiple rows, iloc is faster.
    
    e.g. df.iloc[:100]

When choosing columns with their labels, loc is better.

    e.g. df.iloc[:,['col1', 'col2']]



In [32]:
class CalendarMode:
    def __init__(self, df, delta1, delta2, start_idx, end_idx) -> None:
        self.df = df
        self.delta1 = delta1
        self.delta2 = delta2
        self.start_idx = start_idx
        self.end_idx = end_idx
    
    @classmethod
    def from_deltas(cls, df, delta1, delta2):
        timestamps = df_all['Participant_Timestamp_f'].sort_values().values
        #'left' always give you the index of the first suitable location found is given.
        #‘right’ return the last such index'
        start_idx = np.searchsorted(timestamps, timestamps - delta2, side='left')
        end_idx = np.searchsorted(timestamps, timestamps - delta1, side='right')
        return cls(
            df = df,
            delta1 = delta1,
            delta2 = delta2,
            start_idx = start_idx,
            end_idx = end_idx
        )
    
    #Breath
    def getBreath(self) -> pd.DataFrame:
        if 'Breath' in self.df.columns:
            return self.df['Breath']
        else:
            self.df['Breath'] = [self.df.iloc[start:end]['Is_Quote'].sum() \
                                       for start, end in zip(self.start_idx, self.end_idx)]
            return self.df['Breath']
    
    #Immediacy
    def getImmediacy(self) -> pd.Series:
        if 'Immediacy' in self.df.columns:
            return self.df['Immediacy']
        else:
            breath = self.getBreath()
            self.df['Immediacy'] = np.where(breath == 0, np.nan, \
                                            (self.delta2-self.delta1) / breath)
            return self.df['Immediacy']

    #VolumeAll
    def getVolumeAll(self) -> pd.Series:
        if 'VolumeAll' in self.df.columns:
            return self.df['VolumeAll']
        else:
            self.df['VolumeAll'] = [self.df.iloc[start:end]['Trade_Volume'].sum() \
                                          for start, end in zip(self.start_idx, self.end_idx)]
            return self.df['VolumeAll']




In [33]:
delta1, delta2 = 0, 5 #compute features from last 5 seconds
caln_f = CalendarMode.from_deltas(df_all, delta1, delta2)

In [34]:
start = time.time()
caln_f.getBreath()
print (f'Total time: {time.time()-start}s')

Total time: 0.002733945846557617s


In [35]:
start = time.time()
caln_f.getImmediacy()
print (f'Total time: {time.time()-start}s')

Total time: 0.002003192901611328s


In [36]:
start = time.time()
caln_f.getVolumeAll()
print (f'Total time: {time.time()-start}s')

Total time: 0.001116037368774414s


#### Tips:

When you have to loop through the whole dataframe, looping a numpy ndarray is much faster. 

For example, use:

    for v in df.values