In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the order-book sample CSV into a DataFrame.
# NOTE(review): the path is hard-coded and relative (it looks like a Colab Drive
# mount); adjust it when running from a different working directory.
data = pd.read_csv('./drive/MyDrive/blockhouse_assignment/first_25000_rows.csv')

In [3]:
# Preview the first rows to sanity-check the column layout.
data.head()

Unnamed: 0,ts_recv,ts_event,rtype,publisher_id,instrument_id,action,side,depth,price,size,...,ask_sz_08,bid_ct_08,ask_ct_08,bid_px_09,ask_px_09,bid_sz_09,ask_sz_09,bid_ct_09,ask_ct_09,symbol
0,2024-10-21T11:54:29.221230963Z,2024-10-21T11:54:29.221064336Z,10,2,38,C,B,1,233.62,2,...,155,1,7,233.25,234.13,55,400,2,1,AAPL
1,2024-10-21T11:54:29.223936626Z,2024-10-21T11:54:29.223769812Z,10,2,38,A,B,0,233.67,2,...,155,1,7,233.25,234.13,55,400,2,1,AAPL
2,2024-10-21T11:54:29.225196809Z,2024-10-21T11:54:29.225030400Z,10,2,38,A,B,0,233.67,3,...,155,1,7,233.25,234.13,55,400,2,1,AAPL
3,2024-10-21T11:54:29.712600612Z,2024-10-21T11:54:29.712434212Z,10,2,38,A,B,2,233.52,200,...,155,1,7,233.25,234.13,55,400,2,1,AAPL
4,2024-10-21T11:54:29.764839221Z,2024-10-21T11:54:29.764673165Z,10,2,38,C,B,2,233.52,200,...,155,1,7,233.25,234.13,55,400,2,1,AAPL


In [4]:
# Inspect dtypes and per-column non-null counts; a non-null count equal to the
# number of rows for every column means there are no missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 74 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ts_recv        5000 non-null   object 
 1   ts_event       5000 non-null   object 
 2   rtype          5000 non-null   int64  
 3   publisher_id   5000 non-null   int64  
 4   instrument_id  5000 non-null   int64  
 5   action         5000 non-null   object 
 6   side           5000 non-null   object 
 7   depth          5000 non-null   int64  
 8   price          5000 non-null   float64
 9   size           5000 non-null   int64  
 10  flags          5000 non-null   int64  
 11  ts_in_delta    5000 non-null   int64  
 12  sequence       5000 non-null   int64  
 13  bid_px_00      5000 non-null   float64
 14  ask_px_00      5000 non-null   float64
 15  bid_sz_00      5000 non-null   int64  
 16  ask_sz_00      5000 non-null   int64  
 17  bid_ct_00      5000 non-null   int64  
 18  ask_ct_0

In [5]:
# defining order flow imbalance function for any level

def _level_order_flow(px, sz, rise_sign):
  """Per-row order flow for one side of the book at a single level.

  Each row is compared with the row before it (by position):
    * price moved up   ->  rise_sign * current size
    * price moved down -> -rise_sign * current size
    * price unchanged  ->  current size - previous size
  The first row has no predecessor and gets a flow of 0.

  Args:
    px: 1-D array-like of quoted prices.
    sz: 1-D array-like of quoted sizes, same length as `px`.
    rise_sign: +1 for the bid side, -1 for the ask side.

  Returns:
    numpy array of flows with the same length as the inputs.
  """
  px = np.asarray(px)
  sz = np.asarray(sz)
  # Work in a signed dtype so negating sizes cannot wrap around.
  sz = sz.astype(np.result_type(sz.dtype, np.int64), copy=False)

  flow = np.zeros(len(sz), dtype=sz.dtype)
  if len(sz) > 1:
    px_now, px_prev = px[1:], px[:-1]
    sz_now, sz_prev = sz[1:], sz[:-1]
    # NaN prices fail both comparisons and fall through to the size
    # difference, which is what an if/elif/else chain would do.
    flow[1:] = np.select(
        [px_now > px_prev, px_now < px_prev],
        [rise_sign * sz_now, -rise_sign * sz_now],
        default=sz_now - sz_prev,
    )
  return flow


def OFI(data, level):
  """Compute the order flow imbalance (OFI) at one book level.

  Side effect: adds (or overwrites) three columns on `data`, where XX is the
  zero-padded level: `bid_order_flow_XX`, `ask_order_flow_XX` and
  `order_flow_imbalance_XX` (bid flow minus ask flow, per row).

  Rows are compared by position, so the frame does not need a default
  RangeIndex (a filtered or re-indexed frame works as well).

  Args:
    data: DataFrame with columns `bid_px_XX`, `bid_sz_XX`, `ask_px_XX`,
      `ask_sz_XX` for the requested level.
    level: integer book level (0 = best level).

  Returns:
    The sum of the per-row order flow imbalance over all rows.
  """
  # Zero-pad to two digits so that level 3 -> '03' and level 10 -> '10'.
  level = f'{int(level):02d}'

  bid_flow = _level_order_flow(data[f'bid_px_{level}'], data[f'bid_sz_{level}'], 1)
  ask_flow = _level_order_flow(data[f'ask_px_{level}'], data[f'ask_sz_{level}'], -1)

  # Plain numpy arrays are assigned positionally, independent of the index.
  data[f'bid_order_flow_{level}'] = bid_flow
  data[f'ask_order_flow_{level}'] = ask_flow
  data[f'order_flow_imbalance_{level}'] = bid_flow - ask_flow

  return data[f'order_flow_imbalance_{level}'].sum()

In [6]:
# OFI at the best level (columns suffixed `_00`). Note that OFI() also writes
# the per-row bid/ask flow and imbalance columns for that level onto `data`.
best_level_OFI = OFI(data, 0)
print(f'Best-Level OFI: {best_level_OFI}')

Best-Level OFI: -71026


In [7]:
#defining function calculating multi-level ofi
def multi_level_OFI(data, tot_lev):
  """Depth-normalised OFI for the first `tot_lev` book levels.

  The scaling factor is the average quoted depth: for each level the mean over
  rows of (bid size + ask size) / 2, then averaged across the levels.

  Side effect: calls OFI(data, i) for every level, which adds the per-level
  order flow columns to `data`.

  Args:
    data: DataFrame with `bid_sz_XX` / `ask_sz_XX` (and the price columns OFI
      needs) for levels 0 .. tot_lev - 1.
    tot_lev: number of book levels to include.

  Returns:
    1-D numpy array of length `tot_lev`: each level's summed OFI divided by
    the average depth.

  Raises:
    ZeroDivisionError: if `data` has no rows or `tot_lev` is 0.
  """
  # Loop-invariant factor: 1 / (2 * number of rows).
  row_scale = 1/(2*data.shape[0])

  depth_total = 0.0
  for i in range(tot_lev):
    level = f'{i:02d}'
    depth_total = depth_total + row_scale*(data[f'bid_sz_{level}'] + data[f'ask_sz_{level}']).sum()

  avg_depth = depth_total/tot_lev

  # Build an explicit array so the division below does not depend on
  # `avg_depth` happening to be a numpy scalar.
  level_ofi = np.array([OFI(data, i) for i in range(tot_lev)])

  return level_ofi/avg_depth

In [8]:
# Normalised OFI for levels 0-9. This call also creates the
# order_flow_imbalance_00 .. _09 columns on `data` (via OFI), which the
# integrated-OFI step below reads.
ofi_vec = multi_level_OFI(data, 10)

In [9]:
# Show the depth-normalised OFI, one entry per level.
print(f'Multi-Level OFI: {ofi_vec}')

Multi-Level OFI: [-363.93937083 -325.57575653 -172.1520435   569.36686663   -9.31548695
  330.41284107  579.65591932   73.18139969  -85.01790951   43.38003989]


In [16]:
# calculating integrated OFI
from sklearn.decomposition import PCA

def integrated_OFI(data, ofi_vec):
  """Collapse a multi-level OFI vector into one number using PCA weights.

  A one-component PCA is fitted on the per-row `order_flow_imbalance_XX`
  columns of `data`; `ofi_vec` is then projected onto that first component,
  with the component scaled by its L1 norm.

  Args:
    data: DataFrame that already holds `order_flow_imbalance_XX` for every
      level in `ofi_vec` (these columns are written by OFI()).
    ofi_vec: multi-level OFI values, one entry per level along the last axis.

  Returns:
    The projection of `ofi_vec` onto the L1-normalised first component.

  Raises:
    KeyError: if a required `order_flow_imbalance_XX` column is missing.
  """
  # Take the number of levels from the vector instead of hard-coding 10.
  n_levels = np.atleast_1d(ofi_vec).shape[-1]

  # Historical per-row OFI, one column per level.
  level_cols = [f'order_flow_imbalance_{i:02d}' for i in range(n_levels)]
  data_hist = data[level_cols].to_numpy(dtype=float)

  pca = PCA(n_components=1)
  pca.fit(data_hist)

  # NOTE(review): the sign of a principal component is a convention of the
  # solver, so the sign of the result follows that convention — confirm this
  # is acceptable for downstream use.
  w1 = pca.components_[0]

  integrated_ofi = np.dot(ofi_vec, w1)/np.linalg.norm(w1, ord=1)

  return integrated_ofi

In [13]:
# Needs the order_flow_imbalance_00 .. _09 columns on `data`, so
# multi_level_OFI(data, 10) must have been run first.
ofi_I = integrated_OFI(data, ofi_vec)

In [15]:
# Show the single integrated OFI value.
print(f'Integrated OFI: {ofi_I}')

Integrated OFI: 125.20076231989887


Cross-asset OFI cannot be calculated here because the data covers only a single stock (AAPL), as the check below confirms.

In [18]:
# Count the distinct ticker symbols present in the data.
data['symbol'].nunique()

1