In [2]:
import pandas as pd
import numpy as np

import os
import sys

sys.path.append("../../")
from src.config.paths import RAW_DIR, PROCESSED_DIR


In [None]:
"""
1. Feature Engineering
Feature Engineering for Predicting Hedging Pressure

We can infer delta-hedging activity using several key metrics even without direct options data:

1. Gamma Exposure Proxy
- Approximate market-wide gamma exposure using rolling volatility expansion model
- Formula: Γt = ∂²P/∂S² 
- When gamma exposure is high:
  - Market makers must hedge aggressively
  - Expect mean-reverting price action (buying dips, selling rallies)

2. Price & Volume Impact Metrics
- Monitor stock price deviations from normal liquidity levels
- Track unusual price shifts relative to rolling volatility
- Identify high volume periods that signal hidden gamma exposure

3. VWAP & Institutional Execution Pressure  
- Institutions typically execute trades at VWAP (Volume Weighted Average Price)
- Market makers actively hedge around VWAP deviations
- VWAP dislocations provide predictable mean reversion opportunities
"""



def load_data(ticker: str):
    df = pd.read_csv(RAW_DIR / f'{ticker}_1min.csv')
    required_columns = ['timestamp', 'open', 'high', 'low', 'close', 'volume']
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"Column {col} not found in {ticker}_1min.csv")
    return df

df = load_data('AAPL')
df.tail()

#2025-02-14 19:59:00
#2023-03-01 04:00:00	

Unnamed: 0,timestamp,open,high,low,close,volume
458940,2023-03-01 04:04:00,146.754,146.8431,146.7243,146.8233,2949
458941,2023-03-01 04:03:00,146.8233,146.8233,146.8035,146.8233,1061
458942,2023-03-01 04:02:00,146.7243,146.8233,146.7243,146.8233,1049
458943,2023-03-01 04:01:00,146.8035,146.8233,146.655,146.8233,1318
458944,2023-03-01 04:00:00,146.1897,146.8233,146.1897,146.8233,1265


In [4]:

def load_data(ticker: str):
    df = pd.read_csv(PROCESSED_DIR / f'{ticker}_features.csv')
    # required_columns = ['timestamp', 'open', 'high', 'low', 'close', 'volume']
    # for col in required_columns:
    #     if col not in df.columns:
    #         raise ValueError(f"Column {col} not found in {ticker}_1min.csv")
    return df

df = load_data('AAPL')
df.head()
#2025-02-14 19:59:00	
#2023-03-01 04:00:00	

Unnamed: 0,timestamp,open,high,low,close,volume,log_returns,daily_volatility,vpin,hurst,asc,vwpd
0,2025-02-14 19:59:00,244.65,244.65,244.5,244.64,794.0,0.0,0.0,0.0,0.512431,0.0,0.0
1,2025-02-14 19:58:00,244.7,244.75,244.65,244.65,1003.0,4.1e-05,0.0,0.558152,0.512431,0.0,0.0
2,2025-02-14 19:57:00,244.7,244.75,244.68,244.7,35.0,0.000204,0.0,0.566594,0.512431,0.0,0.0
3,2025-02-14 19:56:00,244.75,244.75,244.67,244.67,245.0,-0.000123,0.0,0.381801,0.512431,0.0,0.0
4,2025-02-14 19:55:00,244.71,244.74,244.7,244.73,19.0,0.000245,0.0,0.387405,0.512431,0.0,0.0


In [None]:

def preprocess_data(df: pd.DataFrame, ticker: str): 
    # Convert timestamp into proper time-series format
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df.set_index('timestamp', inplace=True)

    # Remove outliers like spikes & errorneous entries
    df = df[df['volume'] > 0]
    df = df[df['close'] > 0]
    df = df[df['high'] > 0]
    df = df[df['low'] > 0]

    # Normalize price series to avoid bias
    df['close'] = df['close'] / df['close'].iloc[0]
    df['high'] = df['high'] / df['high'].iloc[0]
    df['low'] = df['low'] / df['low'].iloc[0]

    # Save structured data for feature engineering.
    df.to_csv(PROCESSED_DIR / f'{ticker}_1min_processed.csv', index=False)

preprocess_data(df, 'MMM')


In [23]:
def feature_engineering(df: pd.DataFrame, ticker: str):
    pass


# Gamma Exposure Proxy: Measures how much hedging pressure exists / gamma exposure proxy (infers market marker hedging pressure)
# Formula: Gamma_proxy,t = |sigma_t - sigma_t-1| / sigma_t-1

# Realized Volatility (RV): Detects regime changes (momentum vs. mean-reversion)
# Formula: RV_t = sqrt(sum((log P_i - log P_i-1)^2)) from i=t-N to t

# VWAP Spread: Measures how far price deviates from VWAP
# Formula: VWAP_t = sum(P_i * V_i) / sum(V_i)

# Order Flow Imbalance: Detects buy/sell dominance
# Formula: Imbalance_t = (sum(Buy Volume) - sum(Sell Volume)) / sum(Total Volume)

print(df.head())
print(df.columns)

                       open    high     low   close  volume
timestamp                                                  
2025-02-14 19:32:00  147.66  147.66  147.66  147.66       2
2025-02-14 19:31:00  148.41  148.41  148.41  148.41       1
2025-02-14 19:28:00  147.66  147.66  147.66  147.66       2
2025-02-14 19:25:00  148.60  148.60  148.58  148.58       2
2025-02-14 19:23:00  148.72  148.72  148.58  148.72      21
Index(['open', 'high', 'low', 'close', 'volume'], dtype='object')


In [None]:
class FeatureEngineering:
    def __init__(self, df: pd.DataFrame, ticker: str):
        self.df = df
        self.ticker = ticker

    def gamma_exposure_proxy(self):
        pass

    def imbalance(self):
        pass

    def vwap_spread(self):
        pass
    
    def realized_volatility(self):
        pass



In [26]:
imbalance = df['volume'].rolling(window=2).sum()
buy_volume = df['volume'][df['close'] > df['close'].shift(1)]
sell_volume = df['volume'][df['close'] < df['close'].shift(1)]

print(imbalance)
print(buy_volume)
print(sell_volume)


timestamp
2025-02-14 19:32:00       NaN
2025-02-14 19:31:00       3.0
2025-02-14 19:28:00       3.0
2025-02-14 19:25:00       4.0
2025-02-14 19:23:00      23.0
                        ...  
2023-03-01 06:55:00    1141.0
2023-03-01 06:31:00    1107.0
2023-03-01 06:30:00     458.0
2023-03-01 06:21:00     473.0
2023-03-01 04:20:00     581.0
Name: volume, Length: 271623, dtype: float64
timestamp
2025-02-14 19:31:00         1
2025-02-14 19:25:00         2
2025-02-14 19:23:00        21
2025-02-14 18:30:00    288666
2025-02-14 18:13:00         1
                        ...  
2023-03-01 07:35:00       100
2023-03-01 07:31:00      1088
2023-03-01 07:28:00      1895
2023-03-01 06:55:00       841
2023-03-01 04:20:00       300
Name: volume, Length: 121211, dtype: int64
timestamp
2025-02-14 19:28:00         2
2025-02-14 19:00:00    288666
2025-02-14 18:51:00         3
2025-02-14 18:36:00         1
2025-02-14 18:35:00         5
                        ...  
2023-03-01 07:23:00       119
2023-03-01 0

In [25]:
# vmap 
P_i = df['close']
V_i = df['volume']
VWAP_t = np.sum(P_i * V_i) / np.sum(V_i)

print(VWAP_t)



98.27838585381825


In [24]:
# realized volatility

P_i = df['close']
P_i_1 = df['close'].shift(1)
RV_t = np.sqrt(np.sum((np.log(P_i) - np.log(P_i_1))**2))

print(RV_t)

1.261466860438353


In [19]:
sigma_t = df['close'].rolling(window=2).std()
sigma_t_1 = df['close'].rolling(window=3).std()
gamma_proxy = abs((sigma_t - sigma_t_1)) / sigma_t_1

print(gamma_proxy)


timestamp
2025-02-14 19:32:00         NaN
2025-02-14 19:31:00         NaN
2025-02-14 19:28:00    0.224745
2025-02-14 19:25:00    0.328921
2025-02-14 19:23:00    0.828088
                         ...   
2023-03-01 06:55:00    0.224743
2023-03-01 06:31:00    0.272419
2023-03-01 06:30:00    1.000000
2023-03-01 06:21:00    0.224745
2023-03-01 04:20:00    0.249501
Name: close, Length: 271623, dtype: float64
