In [34]:
import pandas as pd
from statsmodels.tsa.stattools import grangercausalitytests
import warnings
# Silence every warning category for the remainder of the session.
# NOTE(review): this also hides deprecation and numerical warnings raised
# during the tests below; consider narrowing the filter.
warnings.filterwarnings("ignore")

# Analysis settings: highest lag order to test and the p-value cutoff.
max_lag = 30
significance_level = 0.05

# Hourly dataset; `vol` is the series the tests below try to explain.
df = pd.read_csv('hourly_data_ff.csv')

# Candidate predictors: every column except the target and the timestamp.
features = df.columns.difference(['vol', 'timestamp'])

# Filled by the test loop: feature name -> list of significant lag orders.
useful_features = dict()


In [35]:
# Screen each candidate column as a Granger-cause of `vol`.
# The frame is passed as [vol, feature]: statsmodels tests whether the
# second column helps predict the first one.
for feature in features:
    print(f"\nTesting Granger causality for feature: {feature}")
    try:
        pair = df[['vol', feature]].dropna()
        outcome_by_lag = grangercausalitytests(pair, maxlag=max_lag, verbose=False)

        # Collect the lag orders whose SSR F-test p-value (element 1 of the
        # 'ssr_ftest' tuple) falls below the significance threshold.
        hits = []
        for order in range(1, max_lag + 1):
            p_value = outcome_by_lag[order][0]['ssr_ftest'][1]
            if p_value < significance_level:
                hits.append(order)

        if not hits:
            print("  No significant lags")
        else:
            useful_features[feature] = hits
            print(f"  Significant lags: {hits}")
    except Exception as err:
        # Best effort: report the failure and move on to the next column.
        print(f"  Error testing feature {feature}: {err}")


Testing Granger causality for feature: ask_depth
  Significant lags: [1, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]

Testing Granger causality for feature: ask_slope
  No significant lags

Testing Granger causality for feature: ask_volume
  No significant lags

Testing Granger causality for feature: bid_depth
  Significant lags: [1, 2, 3, 4, 5, 6, 7, 8, 9]

Testing Granger causality for feature: bid_slope
  No significant lags

Testing Granger causality for feature: bid_volume
  Significant lags: [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]

Testing Granger causality for feature: difference
  No significant lags

Testing Granger causality for feature: spread
  Significant lags: [1, 2, 3, 4, 5, 6, 7, 8]

Testing Granger causality for feature: volume_difference
  Significant lags: [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]



In [36]:
# Recap: one line per retained feature with its significant lag orders.
for name in useful_features:
    print(f"{name}: lags {useful_features[name]}")

ask_depth: lags [1, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
bid_depth: lags [1, 2, 3, 4, 5, 6, 7, 8, 9]
bid_volume: lags [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
spread: lags [1, 2, 3, 4, 5, 6, 7, 8]
volume_difference: lags [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]


In [37]:
# Load the sentiment table and pull the aggregate score out as an array.
sent = pd.read_csv('sentiment.csv')
sentiment = sent['Aggregate sentiment'].to_numpy()

In [38]:
# Granger-causality screen of the sentiment columns against `vol`.
# NOTE: `useful_features` is re-initialised here, so the results collected
# by the earlier order-book loop are discarded from this point on.
useful_features = {}

# Define features and deduplicate while preserving order
features = ['Aggregate sentiment', 'fng_feature']
features = list(dict.fromkeys(features))

# grangercausalitytests (with its default constant term) raises ValueError
# unless the input has more than 3 * maxlag + 1 rows, so guard on that bound
# instead of on max_lag alone.
min_required_obs = 3 * max_lag + 1

# Loop over features
for feat in features:
    try:
        # Pair `vol` with the sentiment column and drop incomplete rows.
        # NOTE(review): concat aligns on the index. Neither `df` nor `sent`
        # is given a date index in the cells shown, so rows are presumably
        # matched by position (row i of the hourly data with row i of the
        # sentiment file), not by date. Confirm both files share the same
        # sampling and ordering, or index both on their date column first.
        data = pd.concat([df['vol'], sent[feat]], axis=1, join='inner').dropna()

        # Check for enough observations
        if len(data) <= min_required_obs:
            print(f'{feat}: not enough observations for max_lag={max_lag}')
            continue

        # Run Granger test: does `feat` (second column) help predict `vol`?
        results = grangercausalitytests(data, maxlag=max_lag, verbose=False)

        # Keep the lag orders whose SSR F-test p-value is below the threshold
        sig_lags = [
            lag for lag in range(1, max_lag + 1)
            if results[lag][0]['ssr_ftest'][1] < significance_level
        ]

        if sig_lags:
            useful_features[feat] = sig_lags
            print(f'{feat}: significant lags {sig_lags}')
        else:
            print(f'{feat}: no significant lags')

    except Exception as e:
        # Best effort: report the failure and continue with the next column.
        print(f'{feat}: error – {e}')

Aggregate sentiment: significant lags [1, 2]
fng_feature: significant lags [1, 2, 3, 4, 5, 6, 7, 8, 18, 19]
