In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:

# Load the dataset
df = pd.read_csv('../data/CL_F_data.csv')

window = 5  # look-back / look-ahead horizon, in rows (trading days); must be >= 2
TRADING_DAYS_PER_YEAR = 252  # annualization factor

# Rolling statistics of log returns over `window` --------------------------------
ret_roll = df['log_return'].rolling(window)
df['realized_vol'] = ret_roll.std() * np.sqrt(TRADING_DAYS_PER_YEAR)  # annualized
df['rolling_mean'] = ret_roll.mean()  # Rolling mean
df['rolling_std'] = ret_roll.std()    # Rolling standard deviation (not annualized)

# Established range-based volatility estimators -----------------------------------
# Log price ratios shared by the estimators below.
log_hl = np.log(df['High'] / df['Low'])
log_co = np.log(df['Close'] / df['Open'])
log_ho = np.log(df['High'] / df['Open'])
log_lo = np.log(df['Low'] / df['Open'])

## 1) Parkinson volatility (single-day, not annualized)
df['parkinson_vol'] = np.sqrt((1 / (4 * np.log(2))) * (log_hl ** 2))

## 2) Garman–Klass (single-day, not annualized)
df['garman_klass'] = np.sqrt(
    0.5 * (log_hl ** 2) - (2 * np.log(2) - 1) * (log_co ** 2)
)

## 3) Rogers–Satchell (single-day, not annualized)
# Keep the daily variance around: Yang–Zhang needs its rolling mean.
rs_daily_var = log_ho * (log_ho - log_co) + log_lo * (log_lo - log_co)
df['rogers_satchell'] = np.sqrt(rs_daily_var)

## 4) Yang–Zhang
###   a) Overnight & open-to-close returns
df['overnight_ret'] = np.log(df['Open'] / df['Close'].shift(1))
df['open_close_ret'] = log_co

###   b) rolling variances
# Weight from Yang & Zhang (2000): k = 0.34 / (1.34 + (n + 1) / (n - 1)).
# The previous constant k = 0.34 ignored the window length.
k = 0.34 / (1.34 + (window + 1) / (window - 1))
ov_var = df['overnight_ret'].rolling(window).var()
oc_var = df['open_close_ret'].rolling(window).var()
# Average the daily RS variance over the same window. Using the single-day value
# (as before) mixed a 1-day term with two `window`-day terms.
rs_var = rs_daily_var.rolling(window).mean()

###   c) combine and annualize by sqrt(252)
yz_var = ov_var + k * oc_var + (1 - k) * rs_var
df['yang_zhang'] = np.sqrt(yz_var * TRADING_DAYS_PER_YEAR)

## 5) Volume dynamics: daily percentage change in trading volume ------------------
# A zero volume on the previous row makes pct_change() return +/-inf, which
# dropna() keeps and StandardScaler later rejects; treat it as missing instead.
df['volume_change'] = df['Volume'].pct_change().replace([np.inf, -np.inf], np.nan)

# Target: does realized volatility over the NEXT `window` rows exceed the current one?
# Shift before dropping rows so "window rows ahead" is measured on the full calendar.
df['future_vol'] = df['realized_vol'].shift(-window)
# One dropna removes both the rolling warm-up rows and the last `window` rows
# (no future value). The explicit copy avoids SettingWithCopyWarning on the
# column assignments below.
df = df.dropna().copy()
df['target'] = (df['future_vol'] > df['realized_vol']).astype(int)

## 6) Add new features to improve AUC scores --------------------------------------
df['return_lag1'] = df['log_return'].shift(1)
df['volume_lag1'] = df['Volume'].shift(1)

def rolling_stats(series, window):
    """Return ``(rolling mean, rolling std)`` of ``series`` over ``window`` rows.

    Both results are computed from the same ``series.rolling(window)`` object,
    so the first ``window - 1`` entries of each are NaN.
    """
    windowed = series.rolling(window)
    rolling_mean = windowed.mean()
    rolling_std = windowed.std()
    return rolling_mean, rolling_std

returns = df['log_return']

# Rolling mean and standard deviation of log returns over 5 and 10 rows.
mean_5, sd_5 = rolling_stats(returns, 5)
mean_10, sd_10 = rolling_stats(returns, 10)
df['ma_5'] = mean_5
df['std_5'] = sd_5
df['ma_10'] = mean_10
df['std_10'] = sd_10

# Difference between the current log return and the one five rows earlier.
df['momentum_5'] = returns - returns.shift(5)

# 5-row rolling standard deviation of log returns.
df['volatility_5'] = returns.rolling(5).std()

# Drop every row that still contains a NaN (e.g. from the rolling windows and shifts).
df.dropna(inplace=True)

In [None]:
# set predictors and target
candidate_features = [
    'log_return', 'rolling_mean', 'rolling_std', 'volume_change',
    'parkinson_vol', 'garman_klass', 'rogers_satchell', 'yang_zhang',
    'return_lag1', 'volume_lag1', 'ma_5', 'std_5', 'ma_10', 'std_10',
    'momentum_5', 'volatility_5',
]

# With window == 5, 'rolling_std', 'std_5' and 'volatility_5' are all the 5-row
# rolling std of log_return, and 'rolling_mean' / 'ma_5' are the same rolling mean.
# Under an L2 penalty the weight is split across such copies, so the |coef| ranking
# below would understate them. Keep only the first of each near-identical group.
# The threshold is deliberately close to 1 so that only (numerically) duplicated
# columns are removed, not merely correlated ones.
DUPLICATE_CORR = 0.9999
feature_corr = df[candidate_features].corr().abs()
feature = []
for name in candidate_features:
    # A NaN correlation (e.g. constant column) compares False, so the column is kept.
    if not any(feature_corr.loc[name, kept] > DUPLICATE_CORR for kept in feature):
        feature.append(name)

dropped_features = [name for name in candidate_features if name not in feature]
if dropped_features:
    print(f"Dropped duplicate features: {dropped_features}")

predictor = df[feature]
target = df['target']

# train/test split, shuffle set to false to be time-aware
# (the first 80% of rows train the model, the last 20% are held out)
X_train, X_test, y_train, y_test = train_test_split(
    predictor, target, test_size=1/5, shuffle=False
)

# scale inputs; the scaler is fitted on the training rows only so that no
# statistics from the held-out period leak into training
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# build and fit model
model = LogisticRegression(
    penalty='l2', C=1.0, solver='liblinear', class_weight='balanced'
)
model.fit(X_train_scaled, y_train)

# extract and sort coefficients to see feature importance
# (inputs are standardized, so coefficient magnitudes are on a comparable scale)
coef_df = (
    pd.DataFrame({
        'feature': feature,
        'coef':    model.coef_[0]
    })
    .assign(abs_coef=lambda d: d.coef.abs())
    .sort_values('abs_coef', ascending=False)
)
print(coef_df)