## Import Libraries 

In [None]:
import os 
import yaml 
import random 
import pickle
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

random.seed( 17 )

In [None]:
# Global matplotlib defaults applied to every figure drawn below.
plt.rcParams.update({
    'font.size': 14,
    'figure.figsize': (15, 5),
    'lines.linewidth': 2,
})

In [None]:
# parent dir
parent_dir = '/data/NSE/bindata_indices/'
# Keep only the sub-directories whose name starts with '20'; the names are
# parsed as YYYYMMDD just below.
date_name = [
    name
    for name in os.listdir(parent_dir)
    if name.startswith('20') and os.path.isdir(os.path.join(parent_dir, name))
]
dates = pd.to_datetime(date_name, format='%Y%m%d').strftime('%d-%m-%Y')
print( f'Number of Dates: {len(date_name)}')

In [None]:
# strftime patterns used by the notebook.
date_format = '%Y%m%d'  # compact year-month-day, the format used for the `dates` list
date_time_format = '%d-%m-%Y %H:%M:%S'  # day-first date followed by a clock time

## Example workflow for a date range

In [None]:
# Load the run configuration. Later cells read the keys 'start_date',
# 'end_date', 'exp', 'underlying', 'strike_range' and 'dt' from it.
# NOTE(review): the name `input` shadows the Python builtin; it is kept because
# the rest of the notebook reads the settings through this name.
with open('user_input.yaml', 'r') as f:
    input = yaml.safe_load(f)

In [None]:
def drop_weekends(date_range: pd.DatetimeIndex) -> pd.DatetimeIndex:
    """Return ``date_range`` with every Saturday and Sunday entry filtered out."""
    # pandas numbers weekdays Monday=0 ... Sunday=6, so 5 and 6 are the weekend.
    is_weekend = date_range.weekday.isin([5, 6])
    keep = ~is_weekend
    return date_range[keep]

In [None]:
# user input : date range
# Every calendar day between the configured start and end, minus weekends,
# rendered as strings in `date_format`.
dates = drop_weekends(
    pd.date_range(start=input['start_date'], end=input['end_date'], freq='D')
).strftime(date_format=date_format).to_list()

dates

In [None]:
def format_dates(date_list):
    """Convert ``YYYYMMDD`` date strings into a list of ``DD-MM-YYYY`` strings."""
    as_timestamps = pd.to_datetime(date_list, format='%Y%m%d')
    day_first = as_timestamps.strftime('%d-%m-%Y')
    return day_first.tolist()

In [None]:
dates_formated = format_dates( dates )
dates_formated

In [None]:
def get_log_file_path( date_name : str , parent_dir : str =  '/data/NSE/bindata_indices/' ) : 
    """Return the path ``<parent_dir>/<date_name>/bin_data_archival_<date_name>.log``."""
    log_file_name = f'bin_data_archival_{date_name}.log'
    return os.path.join(parent_dir, date_name, log_file_name)

In [None]:
dates_log_path = [ get_log_file_path( date ) for date in dates ]
dates_log_path

In [None]:
# Preview: print the first 20 rows (columns 0, 1, 2, 4, 8 and 12 only) of every
# date's log file, separated by a dashed line.
for path , date in zip( dates_log_path , dates_formated ) : 
    small_chunk_data = pd.read_csv( path  ,  nrows  = 20 , usecols= [0,1,2,4,8,12] )
    print(f'Log file for {date}')
    print( small_chunk_data )
    print('-'*10)

In [None]:
import datetime 
import calendar 
import holidays 

In [None]:
# Month number (1 = January ... 12 = December) -> single-letter month code:
# F G H J K M N Q U V X Z, in calendar order.
month_to_nse_code = {
    month: letter
    for month, letter in enumerate('FGHJKMNQUVXZ', start=1)
}

In [None]:
def last_thursday(year, month):
    """Return the ``datetime.date`` of the final Thursday in the given month."""
    _, days_in_month = calendar.monthrange(year, month)
    month_end = datetime.date(year, month, days_in_month)
    # Thursday is weekday 3 (Monday=0); the modulo gives how many days
    # (0-6) the month end lies past the most recent Thursday.
    days_past_thursday = (month_end.weekday() - 3) % 7
    return month_end - datetime.timedelta(days=days_past_thursday)

In [None]:
def get_date_code( date : str ) : 
    """Return the month letter plus two-digit year for a ``YYYYMMDD`` date string.

    The letter is looked up in ``month_to_nse_code``; a January 2025 date, for
    example, yields ``'F25'``.
    """
    parsed = pd.to_datetime(date, format='%Y%m%d')
    month_number = int(parsed.strftime('%m'))
    two_digit_year = parsed.strftime('%y')
    return month_to_nse_code[month_number] + str(two_digit_year)

### User Inputs : 

In [None]:
last_thursday( 2025 , 3 )

In [None]:
get_date_code( input['exp'] )

In [None]:
# One per-second DatetimeIndex (09:15:01 through 15:30:00) for every requested date.
time_indexs = [
    pd.date_range(start=f'{date} 09:15:01', end=f'{date} 15:30:00', freq='s')
    for date in dates
]
time_indexs

In [None]:
def groupby_to_nested_dict(df, group_cols):
    """Split ``df`` into nested dictionaries, one level per column in ``group_cols``.

    The outermost keys are the distinct values of ``group_cols[0]``, the next
    level those of ``group_cols[1]``, and so on.  The leaves are the matching
    rows with a fresh 0..n-1 index.  With no grouping columns the frame itself
    is returned, re-indexed.
    """
    if not group_cols:
        return df.reset_index(drop=True)

    first_col, remaining_cols = group_cols[0], group_cols[1:]
    nested = {}
    for group_key, group_rows in df.groupby(first_col):
        nested[group_key] = groupby_to_nested_dict(group_rows, remaining_cols)
    return nested

In [None]:
def process_spot_data(spot_rows:list,date:str,time_index): 
    """Build the spot price series for one date on the ``time_index`` grid.

    Args:
        spot_rows: list of DataFrames holding the underlying's rows; each needs
            a ``Time`` column (``HH:MM:SS`` strings) and a ``Close`` column.
        date: the trading date as ``DD-MM-YYYY``.
        time_index: DatetimeIndex the result is reindexed onto.

    Returns:
        DataFrame indexed by ``time_index`` with ``Close`` divided by 100 and
        gap-filled (forward, then backward), plus a ``Spot Return %`` column
        holding the percentage change of ``Close``.

    Raises:
        ValueError: if ``spot_rows`` is empty or contains no rows at all.
    """
    # Checking before concatenating means an empty list produces the intended
    # message rather than pandas' generic "No objects to concatenate".
    non_empty = [rows for rows in spot_rows if not rows.empty]
    if not non_empty : 
        raise ValueError(f'No Underlying found on {date}')

    result_df = pd.concat(non_empty, ignore_index=True )
    result_df['Time'] = pd.to_datetime(date + ' ' + result_df['Time'] , format = '%d-%m-%Y %H:%M:%S')

    # NOTE(review): the raw Close is presumably quoted in 1/100 units - confirm.
    result_df['Close'] = result_df['Close']/100
    result_df = result_df.set_index('Time').reindex( time_index )
    # Timestamps without a tick take the previous price; leading gaps take the first one.
    result_df['Close'] = result_df['Close'].ffill().bfill()

    result_df['Spot Return %'] = result_df['Close'].pct_change()*100

    return result_df 

In [None]:
def _prepare_option_leg(leg_df, close_name, date, time_index):
    """Return one option leg as a single ``close_name`` column reindexed onto ``time_index``."""
    leg = leg_df[['Time', 'Close']].copy()
    leg = leg.rename(columns={'Close': close_name})
    leg['Time'] = pd.to_datetime(date + ' ' + leg['Time'], format='%d-%m-%Y %H:%M:%S')
    return leg.set_index('Time').reindex(time_index)


def _build_strike_frame(legs, strike, date, time_index, nresult_df):
    """Return ``(frame, problem)`` for one strike; ``problem`` is None on success."""
    if 'PE' not in legs:
        return pd.DataFrame(), f'No Conjugate Put Option is Present for {strike} Call Option on {date}'

    frame = _prepare_option_leg(legs['PE'], 'PE_Close', date, time_index)
    if 'CE' not in legs:
        return frame, f'No Conjugate Call Option is Present for {strike} Put Option on {date}'

    call_leg = _prepare_option_leg(legs['CE'], 'CE_Close', date, time_index)
    frame['CE_Close'] = call_leg['CE_Close']

    # Timestamps without a tick take the previous price; leading gaps take the first one.
    cols_to_fill = ['CE_Close', 'PE_Close']
    frame[cols_to_fill] = frame[cols_to_fill].ffill().bfill()

    frame['Orb2'] = frame['CE_Close'] - frame['PE_Close'] - nresult_df['Close'] + strike
    frame['Orb2_diff'] = frame['Orb2'].diff()
    return frame, None


def process_strike_dict(strike_dict:dict,date:str,time_index,nresult_df) : 
    """Replace every strike's ``{'PE': ..., 'CE': ...}`` entry with one combined frame.

    For a strike that has both legs the new frame is indexed by ``time_index``
    and holds ``PE_Close``, ``CE_Close``, ``Orb2`` (call - put - spot + strike)
    and ``Orb2_diff`` (first difference of ``Orb2``).  ``strike_dict`` is
    modified in place.

    Args:
        strike_dict: mapping strike -> mapping option type -> DataFrame with
            ``Time`` (``HH:MM:SS`` strings) and ``Close`` columns.
        date: the trading date as ``DD-MM-YYYY``.
        time_index: DatetimeIndex every leg is reindexed onto.
        nresult_df: spot frame whose ``Close`` column is subtracted in ``Orb2``.

    Returns:
        List of strikes that could not be built (a leg is missing or its data
        could not be processed); the caller is expected to drop them.
    """
    rm_strikes = []
    for strike in list(strike_dict) :
        try : 
            df , problem = _build_strike_frame( strike_dict[strike] , strike , date , time_index , nresult_df )
        except (KeyError, TypeError, ValueError) as exc : 
            # Malformed data for this strike: still drop it (best effort), but
            # report the real cause instead of blaming a missing conjugate option.
            df , problem = pd.DataFrame() , f'Could not process strike {strike} on {date} ({exc!r})'

        if problem is not None : 
            print(f'{problem} \n This strike price will be droped .... ') 
            rm_strikes.append( strike )

        strike_dict[strike] = df 
    return rm_strikes 

In [None]:
def process_options_data( strike_rows:list, date:str, time_index , nresult_df ): 
    """Build the per-strike option frames for one date.

    Args:
        strike_rows: list of DataFrames of option rows carrying ``Time``,
            ``Close``, ``Strike`` and ``Type`` columns.
        date: the trading date as ``DD-MM-YYYY``.
        time_index: DatetimeIndex every option leg is reindexed onto.
        nresult_df: processed spot frame for the same date.

    Returns:
        dict mapping strike -> combined frame produced by
        ``process_strike_dict``; strikes it reports as unusable are removed.

    Raises:
        ValueError: if ``strike_rows`` is empty or contains no rows at all.
    """
    # Checking before concatenating means an empty list produces the intended
    # message rather than pandas' generic "No objects to concatenate".
    non_empty = [rows for rows in strike_rows if not rows.empty]
    if not non_empty : 
        raise ValueError(f'!!!!!Strike Bounds are Out of Range for {date}!!!')

    strike_df = pd.concat(non_empty, ignore_index=True )
    # NOTE(review): the raw Close is presumably quoted in 1/100 units - confirm.
    strike_df['Close'] = strike_df['Close']/100

    strike_dict = groupby_to_nested_dict( strike_df , ['Strike'  , 'Type' ] )
    rm_strikes = process_strike_dict( strike_dict , date , time_index , nresult_df )

    for key in rm_strikes : 
        strike_dict.pop( key , None )

    return strike_dict 

In [None]:
def get_pickel_name( input ) : 
    """Return the cache file name: every value of ``input`` except the last, joined by ``_``, plus ``.pkl``."""
    all_values = [*input.values()]
    name_parts = map(str, all_values[:-1])
    return '_'.join(name_parts) + '.pkl'

In [None]:
def create_main_dict( dates_formated , dates_log_path , time_indexs ) : 
    """Build the per-date dictionary of processed spot and option data.

    Each log file is streamed in chunks.  Rows of the underlying contract and
    rows of its options are only collected while reading; the spot series is
    processed and the strike window applied once the whole file has been read.
    Processing inside the chunk loop would raise as soon as the first chunk
    held no underlying rows, and would derive the strike window from a
    partially read spot series.

    Reads the module-level ``input`` configuration (keys ``underlying``,
    ``exp`` and ``strike_range``).

    Args:
        dates_formated: dates as ``DD-MM-YYYY`` strings (used as result keys).
        dates_log_path: log file path for each date, in the same order.
        time_indexs: DatetimeIndex for each date, in the same order.

    Returns:
        dict mapping date -> ``{'Spot': DataFrame, 'Strike': dict}`` as
        produced by ``process_spot_data`` and ``process_options_data``.

    Raises:
        ValueError: propagated from the two helpers when a date has no
            underlying rows or no option rows inside the strike window.
    """
    # The contract symbol and the window half-width depend only on the user
    # input, so compute them once.
    symbol = input['underlying'] + '_' + get_date_code( input['exp'] )
    half_range = input['strike_range']/2
    chunk_size = 100000 # Tune as needed

    main_dict = {}
    for date , path , time_index in zip( dates_formated , dates_log_path , time_indexs): 
        spot_rows = []
        option_rows = []

        for chunk in pd.read_csv(path, chunksize=chunk_size , usecols= [1,2,8]):
            symbols = chunk['Symbol']
            # Rows of the underlying contract itself (exact symbol match).
            spot_rows.append( chunk[symbols == symbol] )
            # Option rows: symbols that extend the contract symbol.
            is_option = symbols.str.startswith(symbol, na=False) & (symbols != symbol)
            if is_option.any() :
                # NOTE: all option rows of the contract are held in memory until
                # the strike window is known.
                option_rows.append( chunk[is_option] )

        spot_df = process_spot_data( spot_rows , date , time_index )

        # get the strike range : 
        # spot at 09:15:05 (fifth sample when time_index starts at 09:15:01)
        spot = spot_df['Close'].iloc[4]
        min_strike = spot - half_range
        max_strike = spot + half_range

        strike_rows = []
        if option_rows :
            options = pd.concat(option_rows, ignore_index=True)
            # Strike and option type are the 4th and 6th '_'-separated fields of the symbol.
            split_cols = options['Symbol'].str.split('_', expand=True)
            options['Strike'] = split_cols[3].astype(float)
            options['Type'] = split_cols[5]
            # between() is inclusive on both ends, matching >= min and <= max.
            strike_rows.append( options[options['Strike'].between(min_strike, max_strike)] )

        # save to dict (an empty strike_rows makes process_options_data raise ValueError)
        main_dict[date] = {
            'Spot': spot_df ,
            'Strike': process_options_data(strike_rows , date , time_index , spot_df) ,
        }
    return main_dict 

In [None]:
pkl_name = get_pickel_name( input )
pkl_name 

In [None]:
# Reuse the cached result for this configuration when it exists; otherwise
# build it from the log files and cache it.
# NOTE: pickle.load can execute arbitrary code - only load cache files that
# this notebook wrote itself.
try : 
    with open(pkl_name , 'rb' ) as f : 
        main_dict = pickle.load(f)
except (FileNotFoundError, EOFError, pickle.UnpicklingError) : 
    # Missing, truncated or corrupt cache: rebuild. Any other failure
    # (permissions, interrupts, ...) is left to surface instead of silently
    # triggering an expensive rebuild.
    main_dict = create_main_dict( dates_formated , dates_log_path , time_indexs )
    with open(pkl_name , 'wb' ) as f : 
        pickle.dump( main_dict , f )

In [None]:
# Print the first rows of the spot frame and of every strike frame, date by date.
for date, day_data in main_dict.items():
    print(F'--------FOR {date}-----------')
    print('SPOT DATA : ')
    print(day_data['Spot'].head())
    print('STRIKE DATA:  ')
    for strike, df in day_data['Strike'].items():
        print(f'\t\t STRIKE = {strike}')
        print(df.head())

In [None]:
# Store the (09:15:00, 15:30:00) timestamp pair of each date under 'Stamp';
# the plotting code uses it for the x-axis limits and hourly ticks.
for date in main_dict:
    session_open, session_close = (
        pd.to_datetime(f'{date} {clock}', format='%d-%m-%Y %H:%M:%S')
        for clock in ('09:15:00', '15:30:00')
    )
    main_dict[date]['Stamp'] = (session_open, session_close)

In [None]:

def plot_time_series( df , col  , strike , start_end_stamp , date ,  x : bool = True , save_dir = None  ): 
    """Plot one column of ``df`` against its index with hourly x ticks.

    Args:
        df: frame whose index supplies the x values.
        col: name of the column to plot.
        strike: strike shown in the title (any printable value).
        start_end_stamp: ``(start, end)`` timestamps used as x-axis limits and
            as the range of the hourly ticks.
        date: date shown in the title; parsed day-first to add the weekday name.
        x: when True (default) the first row is skipped.
        save_dir: path passed to ``plt.savefig``; nothing is saved when None.

    The title also reads ``underlying`` and ``exp`` from the module-level
    ``input`` configuration.
    """
    start_time , end_time = start_end_stamp
    # Computed once and reused for both tick positions and labels.
    hourly_ticks = pd.date_range(start=start_time, end=end_time, freq='1h')
    series = df[col][1:] if x else df[col]

    plt.figure()
    plt.plot( series.index , series  , 'o-' , color = 'r' , markersize=0.1)
    plt.xlim( start_time , end_time )
    plt.xticks( ticks = hourly_ticks , labels = hourly_ticks.time )
    plt.grid()
    plt.title( f"{col} for {input['underlying']}; exp: {input['exp']}; strike: {strike}; date : {date} {pd.to_datetime( date , dayfirst=True).day_name()}")
    # Run the layout last so the title and tick labels are taken into account.
    plt.tight_layout()
    if save_dir is not None : 
        plt.savefig( save_dir )
    plt.show()

In [None]:
# for date  in main_dict.keys()  : 
#     for strike , df in main_dict[date]['Strike'].items(): 
#         plot_time_series(  df  ,'Orb2' , strike , main_dict[date]['Stamp'] , date ,x = False , save_dir = f'Date-Range-Plots/Orb/Orb_strike_{strike}.png')

In [None]:
# for date  in main_dict.keys()  : 
#     for strike , df in main_dict[date]['Strike'].items(): 
#         plot_time_series(  df  ,'Orb2_diff' , strike , main_dict[date]['Stamp'] , date , save_dir = f'Date-Range-Plots/Orb_diff/Orb_diff_strike_{strike}.png')

In [None]:
# for date  in main_dict.keys()  : 
#         plot_time_series(  main_dict[date]['Spot']  ,'Spot Return %' , '-' , main_dict[date]['Stamp'] , date  , save_dir = f'Date-Range-Plots/Spot-Return/spot_return_{date}.png')

In [None]:
# One DatetimeIndex per date with a step of input['dt'] minutes, starting at 09:15:01.
# 'min' is used because the 'T' alias for minutes is deprecated in recent pandas.
filtered_time_indexs = [
    pd.date_range(start=date + ' ' + "09:15:01", end=date + ' ' + "15:30:00", freq=f"{input['dt']}min")
    for date in dates
]
filtered_time_indexs

In [None]:
date = dates_formated[0]
strikes = [48300.0 , 50000.0 ]

# Restrict the first date's spot frame and the two selected strike frames to
# the coarser dt-minute grid.
coarse_index = filtered_time_indexs[0]
day_data = main_dict[date]
for strike in strikes:
    day_data['Strike'][strike] = day_data['Strike'][strike].reindex(coarse_index)
day_data['Spot'] = day_data['Spot'].reindex(coarse_index)

In [None]:
for strike in strikes:
    strike_frame = main_dict[date]['Strike'][strike]
    # NOTE(review): the +1000 shift presumably keeps the value positive before
    # the log is taken - confirm. Exact zeros are treated as missing and filled.
    x = strike_frame['Orb2'] + 1000
    x = x.replace(0, np.nan).ffill().bfill()
    # Number of samples that are still non-positive.
    print( ( x <= 0 ).sum())
    strike_frame['orb_log_return'] = np.log(x / x.shift(1))

spot_close = main_dict[date]['Spot']['Close']
main_dict[date]['Spot']['log_return'] = np.log(spot_close / spot_close.shift(1))

In [None]:
plt.rcParams['lines.linewidth'] = 2

In [None]:
# Overlay, for the first date, the spot log return (green) and the log return
# of the shifted Orb2 series at strike 48300 (red).
date = dates_formated[0]

x = main_dict[date]['Spot']['log_return']
y = main_dict[date]['Strike'][48300.0]['orb_log_return']

start_time , end_time = main_dict[date]['Stamp']
plt.figure()
# [1:] skips the first sample: it has no predecessor, so its return is NaN.
plt.plot( x.index[1:] , y[1:]  , 'o-' , color = 'r' , markersize=0.1 , label = 'Orb Log Diff' , alpha = 0.5 )
plt.plot( x.index[1:] , x[1:]  , 'o-' , color = 'g' , markersize=0.1 , label = 'Spot Log Diff', alpha = 0.5 )
plt.xlim( start_time , end_time )
plt.tight_layout()
# Hourly ticks labelled with the clock time only.
plt.xticks(
    ticks = pd.date_range(start=start_time, end=end_time, freq='1h') , 
    labels = pd.date_range( start = start_time  , end = end_time, freq = '1h' ).time
)
plt.grid()
plt.legend()
plt.title( f" strike = 48300.0; date : {date} {pd.to_datetime( date , dayfirst=True).day_name()}")
plt.show()

In [None]:
result = np.corrcoef( x[1:]  , y[1:]) 
result

In [None]:
# Overlay, for the first date, the spot log return (green) and the Orb2 first
# difference at strike 48300 (red), then compute their correlation matrix.
date = dates_formated[0]

x = main_dict[date]['Spot']['log_return']
y = main_dict[date]['Strike'][48300.0]['Orb2_diff']

start_time , end_time = main_dict[date]['Stamp']
plt.figure()
# [1:] skips the first sample of both series.
plt.plot( x.index[1:] , y[1:]  , 'o-' , color = 'r' , markersize=0.1 , label = 'Orb Diff' , alpha = 0.5 )
plt.plot( x.index[1:] , x[1:]  , 'o-' , color = 'g' , markersize=0.1 , label = 'Spot Log Diff', alpha = 0.5 )
plt.xlim( start_time , end_time )
plt.tight_layout()
# Hourly ticks labelled with the clock time only.
plt.xticks(
    ticks = pd.date_range(start=start_time, end=end_time, freq='1h') , 
    labels = pd.date_range( start = start_time  , end = end_time, freq = '1h' ).time
)
plt.grid()
plt.legend()
plt.title( f" strike = 48300.0; date : {date} {pd.to_datetime( date , dayfirst=True).day_name()}")
plt.show()


# 2x2 Pearson correlation matrix of the two series (first sample excluded).
result = np.corrcoef( x[1:]  , y[1:]) 
result

In [None]:
# Overlay, for the first date, the spot log return (green) and the Orb2 first
# difference at strike 50000 (red), then compute their correlation matrix.
date = dates_formated[0]

x = main_dict[date]['Spot']['log_return']
y = main_dict[date]['Strike'][50000.0]['Orb2_diff']

start_time , end_time = main_dict[date]['Stamp']
plt.figure()
# [1:] skips the first sample of both series.
plt.plot( x.index[1:] , y[1:]  , 'o-' , color = 'r' , markersize=0.1 , label = 'Orb Diff' , alpha = 0.5 )
plt.plot( x.index[1:] , x[1:]  , 'o-' , color = 'g' , markersize=0.1 , label = 'Spot Log Diff', alpha = 0.5 )
plt.xlim( start_time , end_time )
plt.tight_layout()
# Hourly ticks labelled with the clock time only.
plt.xticks(
    ticks = pd.date_range(start=start_time, end=end_time, freq='1h') , 
    labels = pd.date_range( start = start_time  , end = end_time, freq = '1h' ).time
)
plt.grid()
plt.legend()
plt.title( f" strike = 50000.0; date : {date} {pd.to_datetime( date , dayfirst=True).day_name()}")
plt.show()


# 2x2 Pearson correlation matrix of the two series (first sample excluded).
result = np.corrcoef( x[1:]  , y[1:]) 
result

In [None]:
# Overlay the spot log return (green) and the log return of the shifted Orb2
# series at strike 50000 (red) for the current `date`.
x = main_dict[date]['Spot']['log_return']
y = main_dict[date]['Strike'][50000.0]['orb_log_return']

start_time , end_time = main_dict[date]['Stamp']
plt.figure()
# [1:] skips the first sample: it has no predecessor, so its return is NaN.
plt.plot( x.index[1:] , y[1:]  , 'o-' , color = 'r' , markersize=0.1 , label = 'Orb Log Diff' , alpha = 0.5 )
plt.plot( x.index[1:] , x[1:]  , 'o-' , color = 'g' , markersize=0.1 , label = 'Spot Log Diff', alpha = 0.5 )
plt.xlim( start_time , end_time )
plt.tight_layout()
# Hourly ticks labelled with the clock time only.
plt.xticks(
    ticks = pd.date_range(start=start_time, end=end_time, freq='1h') , 
    labels = pd.date_range( start = start_time  , end = end_time, freq = '1h' ).time
)
plt.grid()
plt.legend()
plt.title( f" strike = 50000.0; date : {date} {pd.to_datetime( date , dayfirst=True).day_name()}")
plt.show()

In [None]:
result = np.corrcoef( x[1:]  , y[1:]) 
result

In [None]:
# Summary statistics of the selected strike frames and of the spot frame for
# the current `date`.
for strike in strikes : 
    print(f'For {date} and Strike = {strike}')
    print(main_dict[date]['Strike'][strike].describe())
    print('-'*50)

print(main_dict[date]['Spot'].describe())

In [None]:
result = np.corrcoef( x[1:]  , y[1:]) 

In [None]:
result 

In [None]:
# Scatter plot of Orb2 at strike 48300 against the spot close for the current `date`.
x = main_dict[date]['Spot']['Close']
y = main_dict[date]['Strike'][48300.0]['Orb2']

plt.figure( figsize = (12,6))
plt.scatter( x , y, s = 2  )
plt.xlabel('Spot')
plt.ylabel('Orb2')
plt.title('Strike = 48300.0 ')
plt.grid()
plt.show()

In [None]:
# Time-series plot of the `x` (spot close) and `y` (Orb2, strike 48300) series
# set in the previous cell, drawn on a shared axis.
start_time , end_time = main_dict[date]['Stamp']
plt.figure()
plt.plot( x.index[1:] , y[1:]  , 'o-' , color = 'r' , markersize=0.1 , label = 'Orb' , alpha = 0.5 )
plt.plot( x.index[1:] , x[1:]  , 'o-' , color = 'g' , markersize=0.1 , label = 'Spot', alpha = 0.5 )
plt.xlim( start_time , end_time )
plt.tight_layout()
# Hourly ticks labelled with the clock time only.
plt.xticks(
    ticks = pd.date_range(start=start_time, end=end_time, freq='1h') , 
    labels = pd.date_range( start = start_time  , end = end_time, freq = '1h' ).time
)
plt.grid()
plt.legend()
plt.title( f" strike = 48300.0; date : {date} {pd.to_datetime( date , dayfirst=True).day_name()}")
plt.show()

In [None]:
# Scatter plot of Orb2 at strike 50000 against the spot close for the current `date`.
x = main_dict[date]['Spot']['Close']
y = main_dict[date]['Strike'][50000.0]['Orb2']

plt.figure( figsize = (12,6))
plt.scatter( x , y, s = 2  )
plt.xlabel('Spot')
plt.ylabel('Orb2')
plt.title('Strike = 50000.0 ')
plt.grid()
plt.show()

In [None]:
# Time-series plot of the `x` (spot close) and `y` (Orb2, strike 50000) series
# set in the previous cell, drawn on a shared axis.
start_time , end_time = main_dict[date]['Stamp']
plt.figure()
plt.plot( x.index[1:] , y[1:]  , 'o-' , color = 'r' , markersize=0.1 , label = 'Orb' , alpha = 0.5 )
plt.plot( x.index[1:] , x[1:]  , 'o-' , color = 'g' , markersize=0.1 , label = 'Spot', alpha = 0.5 )
plt.xlim( start_time , end_time )
plt.tight_layout()
# Hourly ticks labelled with the clock time only.
plt.xticks(
    ticks = pd.date_range(start=start_time, end=end_time, freq='1h') , 
    labels = pd.date_range( start = start_time  , end = end_time, freq = '1h' ).time
)
plt.grid()
plt.legend()
plt.title( f" strike = 50000.0; date : {date} {pd.to_datetime( date , dayfirst=True).day_name()}")
plt.show()

In [None]:
# For every date, build the correlation matrix of the per-strike Orb2 first
# differences and append a 'Spot' column holding each strike's correlation
# with the spot return.
orb_dict = {}
orb_df = pd.DataFrame()


for date in main_dict.keys() : 
    # One column of Orb2_diff per strike; columns after the first are aligned
    # to the index of the first strike that was inserted.
    orb_dict[date] =  pd.DataFrame() 
    spot_return_corr = []
    for strike , df in main_dict[date]['Strike'].items() : 
        orb_dict[date][strike] = df['Orb2_diff']
        spot_return_corr.append(main_dict[date]['Spot']['Spot Return %'].corr(df['Orb2_diff']))
    # Strike-by-strike correlation matrix; the list below is in the same
    # strike order as its rows.
    orb_dict[date] = orb_dict[date].corr()
    orb_dict[date]['Spot'] = spot_return_corr 


In [None]:
import seaborn as sns

In [None]:
# Draw, save and show one correlation heatmap per date.
# NOTE(review): savefig assumes the Date-Range-Plots/Corr/ folder already exists.
for date , corr in orb_dict.items() : 

    plt.figure(figsize=(10, 8))
    sns.heatmap(corr, annot=False, cmap='coolwarm', fmt=".2f")
    plt.title(f"Orb2 First Diff Correlation Matrix for \n {input['underlying']}; exp: {input['exp']}; on {date} {pd.to_datetime( date , dayfirst=True).day_name()}")
    plt.xlabel("Strike Price")
    plt.ylabel("Strike Price")
    plt.tight_layout()
    plt.savefig(f"Date-Range-Plots/Corr/Orb_diff_corr_{date}.png")
    plt.show()

In [None]:
date = dates_formated[0]
# Row and column positions of every negative entry in the first date's correlation table.
negative_mask = orb_dict[date] < 0
row, col = np.where(negative_mask)

In [None]:
# Row positions must be looked up in the row labels (index); the column labels
# include an extra 'Spot' entry and only coincide with the index for the strikes.
orb_dict[date].index[row] , orb_dict[date].columns[col]

In [None]:
orb_dict[date].iloc[row[0] , col[0]]

In [None]:
orb_dict[date]