In [1]:
import os 
import sys 
import csv 
import yaml
import pickle 
import numpy as np 
import pandas as pd 
from tqdm import tqdm 
import datetime 

In [2]:
# Fixed configuration and user inputs shared by the rest of the notebook.

# Fixed UTC+05:30 offset; convert_ns_to_time() uses it to render epoch values.
IST = datetime.timezone(datetime.timedelta(hours=5, minutes=30))
 
# One name per column of the raw log, in file order. The position of a name in
# this list is what `usecols` (bottom of this cell) is derived from.
header_list = ['trading_date','data_source','stream_id','exchange_epoch_nanos','server_epoch_nanos','capture_epoch_nanos',
'contract_id','venue_token','contract_name','market_segment_id','symbol','contract_type','strike_price','expiry date',
'display_factor','handle_value','currency_unit','is_touchline_change','feed_message_id','inpacket_sequence_num',
'is_last_in_feed_message','event_type','packet_sequence_num','contract_sequence_num','is_implied','transact_epoch_nanos',
'bp1','bq1','boc1','has_hidden_qty_bid','ap1','aq1','aoc1','has_hidden_qty_ask','bp2','bq2','boc2','ap2','aq2','aoc2',
'bp3','bq3','boc3','ap3','aq3','aoc3','bp4','bq4','boc4','ap4','aq4','aoc4','bp5','bq5','boc5','ap5','aq5','aoc5',
'implied_bp1','implied_bq1','implied_ap1','implied_aq1','implied_bp2','implied_bq2','implied_ap2','implied_aq2',
'previous_mid','mid','is_mid_change','weighted_mid','contract_status','side','price','qty','order_count','qpos',
'old_qpos','oid1','oid2','priority','old_price','old_qty','aggressor_type','qty_in_last_trade','level','capture_ts timestamptz']

# Subset of header_list that get_iter() actually loads. It is listed in the same
# relative order as header_list; the names are passed to read_csv alongside the
# integer `usecols`, so keep that ordering when editing.
header_list_to_keep = ['exchange_epoch_nanos','server_epoch_nanos','contract_name','symbol','expiry date','is_touchline_change','event_type','transact_epoch_nanos',
'bp1','bq1','boc1','has_hidden_qty_bid','ap1','aq1','aoc1','has_hidden_qty_ask','bp2','bq2','boc2','ap2','aq2','aoc2',
'bp3','bq3','boc3','ap3','aq3','aoc3','bp4','bq4','boc4','ap4','aq4','aoc4','bp5','bq5','boc5','ap5','aq5','aoc5',
'previous_mid','mid','is_mid_change','weighted_mid','side','price','qty','order_count','qpos',
'old_qpos','oid1','oid2','priority','old_price','old_qty','aggressor_type','qty_in_last_trade','level'
]

# Candidate columns for a distinct-value inspection.
# NOTE(review): several of these (e.g. 'market_segment_id', 'contract_type') are
# not in header_list_to_keep, so chunks from get_iter() will not contain them.
cols_to_check = ['market_segment_id','contract_type',
'display_factor','handle_value','is_touchline_change','inpacket_sequence_num',
'is_last_in_feed_message','contract_sequence_num','is_implied','contract_status','qpos',
'old_qpos']

# Columns that get divided by the lot size near the end of the notebook.
# NOTE(review): 'has_hidden_qty_bid' / 'has_hidden_qty_ask' look like flags
# rather than quantities - confirm they are meant to be rescaled.
qty_cols = ['bq1','has_hidden_qty_bid',
    'aq1','has_hidden_qty_ask','bq2','aq2','bq3','aq3',
    'bq4','aq4','bq5','aq5','qty',
    'old_qty','qty_in_last_trade'
]

# Columns presumably holding nanosecond epoch values (going by their names).
# NOTE(review): not referenced in the visible part of the notebook, and
# 'capture_epoch_nanos' is not in header_list_to_keep.
ns_cols = ['exchange_epoch_nanos' , 'server_epoch_nanos' , 'capture_epoch_nanos' , 'transact_epoch_nanos' , 'priority']

# User-tunable settings. Keys read elsewhere in this notebook: parent_dir, date,
# chunk_size, contract_dir, underlying, near, far, farfar.
with open('user_input.yaml' , 'r' ) as f : 
    inputs = yaml.safe_load( f ) 


# Integer positions (within header_list) of the columns to load.
usecols = [header_list.index(col) for col in header_list_to_keep ]

In [3]:
# path to obo file :
date_dir = os.path.join(inputs['parent_dir'], inputs['date'])

# Sort the listing so the same file is chosen on every run when several match;
# os.listdir() returns entries in arbitrary order.
for file in sorted(os.listdir(date_dir)):
    if file.startswith('obo'):
        file_path = os.path.join(date_dir, file)
        break
else:
    # Without this, `file_path` would be undefined on a fresh kernel, or silently
    # reuse a stale value left over from an earlier run.
    raise FileNotFoundError(f"No file starting with 'obo' found in {date_dir}")

file_path

'/data/ashmit_data/20250519/obo_20250519.log'

In [4]:
def get_iter():
    """Open a fresh reader over the obo log.

    Reads the module-level ``file_path`` without a header row, loading only the
    columns selected by ``usecols`` and naming them ``header_list_to_keep``.
    With ``inputs['chunk_size']`` set, pandas hands back an iterator that yields
    DataFrames of at most that many rows.

    Returns:
        Whatever ``pd.read_csv`` returns for these options; each call starts
        again from the beginning of the file.
    """
    reader_options = dict(
        header=None,
        names=header_list_to_keep,
        chunksize=inputs['chunk_size'],
        # 82 is the position of 'aggressor_type' in header_list; presumably the
        # intent is to force that column to be read as text - TODO confirm.
        dtype={82: str},
        usecols=usecols,
    )
    return pd.read_csv(file_path, **reader_options)

In [5]:
def get_unique(cols_to_check):
    """Print the distinct non-null values of each requested column.

    Streams every chunk from ``get_iter()``, gathers the per-chunk unique
    values of each column, and finally prints one line per column with the
    sorted unique values (via ``np.unique``).

    Args:
        cols_to_check: iterable of column names present in the chunks.

    Returns:
        None; results are printed.
    """
    check_dict = {col: [] for col in cols_to_check}
    for chunk in get_iter():
        for col in cols_to_check:
            # Drop nulls on the one column only (no copy of the whole chunk)
            # and grow the list in place instead of rebuilding it each time.
            check_dict[col].extend(chunk[col].dropna().unique().tolist())
    for col, values in check_dict.items():
        print(f'for {col} : {np.unique(values)}')

In [6]:
def convert_ns_to_time(time_ns, tz=None):
    """Render an epoch timestamp in nanoseconds as ``HH:MM:SS.nnnnnnnnn``.

    Args:
        time_ns: integer nanoseconds since the Unix epoch.
        tz: ``datetime.tzinfo`` used for the wall-clock part; defaults to the
            module-level ``IST``.

    Returns:
        The time of day as a string with exactly nine fractional digits.
    """
    if tz is None:
        tz = IST
    # Split with integer arithmetic. Going through float64 (time_ns / 1e9)
    # cannot represent ~19 significant digits, so values just below a second
    # boundary used to round up into the next second.
    seconds, nanos = divmod(int(time_ns), 1_000_000_000)
    clock = datetime.datetime.fromtimestamp(seconds, tz=tz)
    return f"{clock.strftime('%H:%M:%S')}.{nanos:09d}"

In [7]:
# Spot check of the formatter; the captured output for this value is '15:33:08.764685036'.
convert_ns_to_time( 1747648988764685036 )

'15:33:08.764685036'

In [8]:
# get_unique(['event_type' , 'qpos' , 'level'])
# get_unique(['order_count' , 'has_hidden_qty_bid' , 'aggressor_type'])

In [9]:
# Write the first chunk to temp.csv for manual inspection, showing nulls as '--'.
(
    next(get_iter())
    .fillna('--')
    .to_csv('temp.csv', index=False)
)

In [10]:
def get_lot_size_path(inputs):
    """Return the path of the lot-size cache file for the near contract.

    Args:
        inputs: mapping with string entries ``'contract_dir'`` and ``'near'``.

    Returns:
        ``<contract_dir>/<near>_NSE_FO_FUT_LOT_SIZE.pickel``.
    """
    contract_dir = inputs['contract_dir']
    cache_name = inputs['near'] + '_NSE_FO_FUT_LOT_SIZE.pickel'
    return os.path.join(contract_dir, cache_name)

def get_contract_master_path(inputs):
    """Find the contract-master file for the month of ``inputs['date']``.

    Looks in ``inputs['contract_dir']`` for regular files whose name starts
    with ``YYYY_MM`` of the configured date.

    Args:
        inputs: mapping with ``'contract_dir'`` (directory path) and ``'date'``
            (anything whose string form ``pd.to_datetime`` can parse).

    Returns:
        Full path of the matching file; when several match, the one whose
        name sorts first.

    Raises:
        ValueError: if no file matches.
    """
    contract_dir = inputs['contract_dir']
    # Parse the string form: a bare integer such as 20250519 would otherwise be
    # interpreted by pandas as nanoseconds since the epoch.
    year_month_str = pd.to_datetime(str(inputs['date'])).strftime('%Y_%m')

    candidates = []
    # Sorted so the choice is deterministic; os.listdir() order is arbitrary.
    for name in sorted(os.listdir(contract_dir)):
        full_path = os.path.join(contract_dir, name)
        if name.startswith(year_month_str) and os.path.isfile(full_path):
            candidates.append(full_path)

    if not candidates:
        raise ValueError(f"Contrate Path for {inputs['date']} not found.")
    return candidates[0]

# Month number (1-12) -> single-letter month code used in contract symbols:
# F=Jan, G=Feb, H=Mar, J=Apr, K=May, M=Jun, N=Jul, Q=Aug, U=Sep, V=Oct, X=Nov, Z=Dec
month_to_nse_code = dict(zip(range(1, 13), 'FGHJKMNQUVXZ'))

def convert_date_to_code(date):
    """Turn a date into ``<month letter><two-digit year>``, e.g. ``'K25'``.

    Args:
        date: a date string, or any object whose ``str()`` form
            ``pd.to_datetime`` can parse.

    Returns:
        The month letter from ``month_to_nse_code`` followed by the last two
        digits of the year.
    """
    # str() leaves plain strings untouched and stringifies everything else.
    parsed = pd.to_datetime(str(date))
    month_letter = month_to_nse_code[int(parsed.strftime('%m'))]
    return month_letter + parsed.strftime('%y')

def get_lot_size(inputs):
    """Return a ``{symbol: lot size}`` dict for futures contracts.

    Loads the dict from the pickle cache at ``get_lot_size_path(inputs)`` when
    it can be read. Otherwise rebuilds it from the gzip-compressed contract
    master (columns 2, 3, 7 and 12, expected to be named 'type', 'expiry_date',
    'symbol' and 'lotsize'), writes the cache, and returns the result. Keys
    have the form ``NSEFNO_<symbol>_<month letter><yy>``.

    Args:
        inputs: configuration mapping forwarded to ``get_lot_size_path`` and
            ``get_contract_master_path``.

    Returns:
        dict mapping the composed symbol to its lot size.
    """
    lot_path = get_lot_size_path(inputs)
    try:
        # NOTE: pickle.load can execute arbitrary code - only point this at
        # cache files written by this function.
        with open(lot_path, 'rb') as f:
            return pickle.load(f)
    except (OSError, EOFError, pickle.UnpicklingError,
            AttributeError, ImportError, IndexError):
        # Cache missing or unreadable: fall through and rebuild it. Unlike a
        # bare `except:`, this lets KeyboardInterrupt and real bugs surface.
        pass

    df = pd.read_csv(
        get_contract_master_path(inputs),
        compression='gzip',
        usecols=[2, 3, 7, 12],
    )
    # Explicit copy so the column assignment below does not write into a
    # filtered view of `df` (avoids SettingWithCopyWarning).
    futures = df[df['type'] == 'FUT'].copy()
    futures['symbol'] = (
        'NSEFNO_' + futures['symbol'] + '_'
        + futures['expiry_date'].apply(convert_date_to_code)
    )
    lot_dict = futures.set_index('symbol')['lotsize'].to_dict()

    with open(lot_path, 'wb') as f:
        pickle.dump(lot_dict, f)
    return lot_dict

In [11]:
# Build the three contract symbols: NSEFNO_<underlying>_<contract code>.
near_symbol, far_symbol, farfar_symbol = (
    '_'.join(['NSEFNO', inputs['underlying'], inputs[leg]])
    for leg in ('near', 'far', 'farfar')
)

near_symbol, far_symbol

('NSEFNO_HFCL_K25', 'NSEFNO_HFCL_M25')

In [12]:
# The near and far contracts must share one lot size; stop here if they differ.
lot_dict = get_lot_size(inputs)
if lot_dict[near_symbol] != lot_dict[far_symbol]:
    raise ValueError(f'Lot size for {near_symbol} and {far_symbol} are not the same ... ')

# Only reached when both contracts agree, so either entry can be used.
lotsize = lot_dict[near_symbol]
lotsize

4150

In [13]:
# Take the first chunk, drop rows for the far-far contract, express the
# quantity columns in lots, and dump the result to temp2.csv with nulls as '--'.
example = next(get_iter())
# .copy() makes the filtered frame independent, so the column assignment below
# does not raise SettingWithCopyWarning.
example = example.loc[example['contract_name'] != farfar_symbol].copy()
# NOTE(review): qty_cols includes the has_hidden_qty_* columns - confirm they
# should be divided by the lot size as well.
example[qty_cols] = example[qty_cols] / lotsize
example.fillna('--').to_csv('temp2.csv', index=False)