In [None]:
# Advanced Modeling for Brent Oil Prices

## 1. Setup and Imports

# Notebook-wide imports and plotting configuration
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Import custom modules
from src.utils.data_loader import load_brent_prices, load_events_data
from src.utils.visualization import plot_regime_analysis, plot_event_impact
from src.models.advanced_models import VectorAutoregressionModel, MarkovSwitchingModel, GaussianMixtureRegimeDetection

# Import statsmodels
from statsmodels.tsa.stattools import grangercausalitytests, coint
from statsmodels.tsa.api import VAR, VARMAX
from statsmodels.tsa.vector_ar.var_model import VARResults
from arch import arch_model

# Set style
# Apply matplotlib's bundled 'seaborn-v0_8-darkgrid' style sheet to all figures
# created after this point.
plt.style.use('seaborn-v0_8-darkgrid')
# Make seaborn's "husl" palette the default colour cycle for subsequent plots.
sns.set_palette("husl")

In [None]:
# Load cleaned data
df = pd.read_csv('../data/processed/cleaned_prices.csv', parse_dates=['Date'])
events_df = pd.read_csv('../data/processed/events_dataset.csv', parse_dates=['Date'])

# Fail fast if a column used by later cells is absent, so a schema problem is
# reported here with a clear message instead of as a KeyError mid-analysis.
required_columns = {
    'cleaned_prices.csv': (df, ['Date', 'Price', 'Return', 'Volatility_30']),
    'events_dataset.csv': (events_df, ['Date', 'Event Name', 'Category', 'Impact_Score']),
}
for file_name, (frame, expected_columns) in required_columns.items():
    missing_columns = [col for col in expected_columns if col not in frame.columns]
    if missing_columns:
        raise KeyError(f"{file_name} is missing required column(s): {missing_columns}")

print(f"Data loaded: {len(df)} observations")
print(f"Events loaded: {len(events_df)} events")

# Display recent data
print("\nRecent Data:")
print(df.tail())

In [None]:
# Placeholder macroeconomic series, for demonstration only.
# A real analysis would substitute observed macroeconomic data here.
np.random.seed(42)
n = len(df)
day_index = np.arange(n)

# The random components are drawn in a fixed order (GDP, inflation, USD, VIX)
# so the seeded generator reproduces the same values on every run.
gdp_growth = np.random.normal(2.5, 1.5, n) + 0.01 * np.sin(2 * np.pi * day_index / 252)
inflation = np.random.normal(2.0, 1.0, n) + 0.0005 * day_index
usd_index = np.random.normal(90, 10, n)
# The synthetic VIX is noise plus the absolute price return scaled by 100.
vix = np.random.normal(20, 5, n) + df['Return'].abs() * 100

macro_vars = pd.DataFrame({
    'Date': df['Date'],
    'GDP_Growth': gdp_growth,
    'Inflation': inflation,
    'USD_Index': usd_index,
    'VIX': vix,
})

# Join the price columns with the macro series on Date and index by Date
price_columns = ['Date', 'Price', 'Return', 'Volatility_30']
var_data = (
    df[price_columns]
    .merge(macro_vars, on='Date', how='inner')
    .set_index('Date')
)

print("VAR Data Shape:", var_data.shape)
print("\nVariables:")
print(var_data.columns.tolist())

In [None]:
def perform_granger_causality(data, max_lag=10, alpha=0.05):
    """
    Run pairwise Granger causality tests between every ordered pair of columns.

    For each (cause, effect) pair the test is run for lags 1..max_lag and the
    lag with the smallest SSR F-test p-value is kept. Only pairs whose smallest
    p-value is below ``alpha`` are returned. Note that taking the minimum over
    several lags applies no multiple-comparison correction.

    Parameters
    ----------
    data : pandas.DataFrame
        One column per variable; rows with NaN in either column of a pair are
        dropped for that pair.
    max_lag : int, default 10
        Largest lag order to test.
    alpha : float, default 0.05
        Significance threshold applied to the smallest p-value.

    Returns
    -------
    dict
        Maps ``(cause, effect)`` to ``{'best_lag', 'p_value', 'causal'}`` for
        significant pairs only. Pairs with ``max_lag * 2`` rows or fewer, or
        whose test raises, are omitted.
    """
    results = {}
    variables = data.columns.tolist()

    for cause in variables:
        for effect in variables:
            if cause == effect:
                continue

            # statsmodels tests whether the SECOND column helps predict the
            # FIRST, hence the [effect, cause] column order.
            test_data = data[[effect, cause]].dropna()
            if len(test_data) <= max_lag * 2:
                continue

            try:
                granger_test = grangercausalitytests(test_data, maxlag=max_lag, verbose=False)
                # Results are keyed by lag (1-based); take the SSR F-test p-value.
                p_values = [granger_test[lag][0]['ssr_ftest'][1] for lag in range(1, max_lag + 1)]
            except Exception as exc:
                # Best-effort: skip the failing pair, but say so. Printed rather
                # than warned because this notebook silences warnings globally.
                # A bare `except:` here would also hide KeyboardInterrupt.
                print(f"Granger test skipped for {cause} -> {effect}: {exc}")
                continue

            min_p_value = min(p_values)
            best_lag = p_values.index(min_p_value) + 1

            if min_p_value < alpha:
                results[(cause, effect)] = {
                    'best_lag': best_lag,
                    'p_value': min_p_value,
                    'causal': True
                }

    return results

# Run the pairwise Granger tests on the price, VIX and USD index columns.
# NOTE(review): 'Price' is a price level and is presumably non-stationary;
# consider testing 'Return' instead - confirm with the analysis owner.
print("Performing Granger causality tests...")
granger_input = var_data[['Price', 'VIX', 'USD_Index']].dropna()
granger_results = perform_granger_causality(granger_input, max_lag=5)

print("\nSignificant Granger Causality Results (p < 0.05):")
for (cause, effect), stats in granger_results.items():
    if not stats['causal']:
        continue
    print(f"{cause} -> {effect}: lag={stats['best_lag']}, p={stats['p_value']:.4f}")

In [None]:
def test_cointegration(series1, series2, series1_name, series2_name):
    """
    Test for cointegration between two series
    """
    # Drop NaN values and align indices
    aligned_data = pd.concat([series1, series2], axis=1).dropna()
    if len(aligned_data) < 10:
        return None
    
    try:
        coint_result = coint(aligned_data.iloc[:, 0], aligned_data.iloc[:, 1])
        
        return {
            'series1': series1_name,
            'series2': series2_name,
            'test_statistic': coint_result[0],
            'p_value': coint_result[1],
            'critical_values': coint_result[2],
            'is_cointegrated': coint_result[1] < 0.05
        }
    except:
        return None

# Check the price series for cointegration against each macro variable
print("Cointegration Tests:")
coint_tests = [
    ('Price', 'GDP_Growth'),
    ('Price', 'USD_Index'),
    ('Price', 'VIX'),
]

for var1, var2 in coint_tests:
    result = test_cointegration(var_data[var1], var_data[var2], var1, var2)
    # A falsy result means the test was skipped or failed; print nothing for it
    if not result:
        continue
    if result['is_cointegrated']:
        status = "COINTEGRATED"
    else:
        status = "Not cointegrated"
    print(f"{var1} vs {var2}: {status} (p={result['p_value']:.4f})")

In [None]:
# Prepare data for VAR
var_columns = ['Price', 'VIX', 'USD_Index']
var_model_data = var_data[var_columns].dropna()

# Create and fit VAR model
var_model = VectorAutoregressionModel(max_lags=10)
var_results = var_model.fit(var_model_data, lags=2)

# Generate forecasts
forecast_steps = 30
forecasts = var_model.forecast(steps=forecast_steps)

print(f"\nVAR Forecasts (next {forecast_steps} days):")
print(forecasts.head())

# Forecast x-axis. Each forecast step is one observation ahead, and the series
# is treated as trading-day data (this notebook uses a 252-day year), so the
# steps are placed on business days rather than calendar days.
# NOTE(review): assumes the price file holds no weekend rows - confirm.
forecast_dates = pd.date_range(
    start=var_model_data.index[-1] + pd.offsets.BDay(1),
    periods=forecast_steps,
    freq='B',
)

# Show only the most recent observations as context for the forecast
history_window = 100
recent_history = var_model_data.iloc[-history_window:]

# Plot forecasts: one panel per modelled variable
fig, axes = plt.subplots(len(var_columns), 1, figsize=(15, 12))

for ax, var in zip(axes, var_columns):
    # Historical data
    ax.plot(recent_history.index, recent_history[var],
            label='Historical', color='steelblue', linewidth=2)

    # Forecast (assumes `forecasts` exposes one column per variable - TODO confirm)
    ax.plot(forecast_dates, forecasts[var],
            label='Forecast', color='red', linewidth=2, linestyle='--')

    ax.set_title(f'{var} - VAR Forecast', fontsize=12)
    ax.set_ylabel(var)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../reports/var_forecasts.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Fit Markov Switching model
print("Fitting Markov Switching Model...")
n_regimes = 3  # single source of truth for the model, the plot loop and the ticks
ms_prices = df['Price'].dropna()
ms_model = MarkovSwitchingModel(n_regimes=n_regimes)
ms_results = ms_model.fit(ms_prices, model_type='mean_var')

# Get regime probabilities
regime_probs = ms_model.get_regime_probabilities()

# Dates of the rows actually passed to the model. Using the unfiltered
# df['Date'] would shift the x-axis whenever dropna() removed any price rows.
# NOTE(review): assumes the probabilities line up with the START of the fitted
# series, as the original slicing did - confirm against MarkovSwitchingModel.
ms_dates = df.loc[ms_prices.index, 'Date']
prob_dates = ms_dates.iloc[:len(regime_probs)]

# Plot regime probabilities
fig, axes = plt.subplots(3, 1, figsize=(15, 10))

# Plot price
axes[0].plot(df['Date'], df['Price'], color='steelblue', linewidth=1)
axes[0].set_ylabel('Price (USD)')
axes[0].set_title('Brent Price with Regime Probabilities')
axes[0].grid(True, alpha=0.3)

# Plot regime probabilities, one filled band per regime
for i in range(n_regimes):
    axes[1].fill_between(prob_dates,
                         regime_probs.iloc[:, i],
                         alpha=0.5, label=f'Regime {i+1}')
axes[1].set_ylabel('Probability')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

# Plot most likely regime.
# idxmax returns column LABELS; the tick positions below presume those labels
# are 0..n_regimes-1 - TODO confirm.
most_likely_regime = regime_probs.idxmax(axis=1)
axes[2].plot(ms_dates.iloc[:len(most_likely_regime)], most_likely_regime,
             color='green', linewidth=2, drawstyle='steps-post')
axes[2].set_ylabel('Regime')
axes[2].set_xlabel('Date')
axes[2].set_yticks(list(range(n_regimes)))
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../reports/markov_regimes.png', dpi=300, bbox_inches='tight')
plt.show()

# Print regime statistics
print("\nRegime Statistics:")
params = ms_model.get_regime_parameters()
for key, value in params.items():
    print(f"{key}: {value}")

In [None]:
# Regime detection with a Gaussian Mixture Model
print("\nGaussian Mixture Model for Regime Detection...")
gmm_model = GaussianMixtureRegimeDetection(n_regimes=3)

# Build the model inputs from the price series using a 30-observation window.
# NOTE(review): len(features) is presumably the number of rows, not the number
# of feature columns - the printed wording may be misleading; confirm.
features = gmm_model.extract_features(df['Price'], window=30)
print(f"Extracted {len(features)} features")

# Fit the mixture; the result is expected to carry a 'regime' column
features_with_regimes = gmm_model.fit(features)

# Two stacked panels: regime-coloured prices on top, regime weights below
fig, axes = plt.subplots(2, 1, figsize=(15, 10))
price_ax, prob_ax = axes

# NOTE(review): dates and prices are taken from the START of df; if feature
# extraction drops the leading window rows this is misaligned - confirm.
n_labelled = len(features_with_regimes)
scatter = price_ax.scatter(
    df['Date'][:n_labelled],
    df['Price'][:n_labelled],
    c=features_with_regimes['regime'],
    cmap='viridis',
    s=10,
    alpha=0.6,
)
price_ax.set_ylabel('Price (USD)')
price_ax.set_title('Price Colored by GMM Regime')
plt.colorbar(scatter, ax=price_ax, label='Regime')

# Per-regime statistics, one row per regime after transposing
regime_stats = pd.DataFrame(gmm_model.regime_stats).T
bar_colors = ['red', 'green', 'blue']
prob_ax.bar(regime_stats.index, regime_stats['probability'],
            color=bar_colors, alpha=0.6)
prob_ax.set_xlabel('Regime')
prob_ax.set_ylabel('Probability')
prob_ax.set_title('Regime Probabilities')
prob_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../reports/gmm_regimes.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nGMM Regime Statistics:")
print(regime_stats)

In [None]:
# Analyze impact of major events
print("\nEvent Study Analysis")

# Select major events
major_events = events_df[events_df['Impact_Score'] >= 2]

# Number of observations compared on each side of an event
pre_window = 30
post_window = 30

# Declared up front so the result frame has these columns even when no event
# qualifies; otherwise sort_values('Absolute_Impact') raises KeyError.
impact_columns = ['Event', 'Date', 'Category', 'Pre_Event_Return',
                  'Post_Event_Return', 'Impact', 'Absolute_Impact']
event_impacts = []

for _, event in major_events.iterrows():
    event_date = event['Date']
    event_name = event['Event Name']

    # Find the observation closest in time to the event (positional index)
    event_idx = (df['Date'] - event_date).abs().argmin()
    actual_event_date = df.iloc[event_idx]['Date']

    # Keep only events with a complete window on both sides. The bounds are
    # inclusive: event_idx == pre_window still leaves exactly pre_window rows
    # before it, and event_idx + post_window == len(df) still fits.
    if not (pre_window <= event_idx <= len(df) - post_window):
        continue

    # Mean return (in %) before and after the event. Series.mean() skips NaN,
    # so one missing return does not turn the whole impact into NaN.
    pre_returns = df['Return'].iloc[event_idx - pre_window:event_idx]
    post_returns = df['Return'].iloc[event_idx:event_idx + post_window]
    pre_mean_return = pre_returns.mean() * 100
    post_mean_return = post_returns.mean() * 100

    # Calculate impact
    impact = post_mean_return - pre_mean_return

    event_impacts.append({
        'Event': event_name,
        'Date': actual_event_date.date(),
        'Category': event['Category'],
        'Pre_Event_Return': pre_mean_return,
        'Post_Event_Return': post_mean_return,
        'Impact': impact,
        'Absolute_Impact': abs(impact)
    })

# Create impact DataFrame
impact_df = pd.DataFrame(event_impacts, columns=impact_columns).sort_values(
    'Absolute_Impact', ascending=False)

if impact_df.empty:
    print("\nNo major events with a complete pre/post window were found.")
else:
    print("\nTop 10 Events by Absolute Impact:")
    print(impact_df[['Event', 'Date', 'Category', 'Impact']].head(10).to_string())

    # Plot event impacts
    fig, ax = plt.subplots(figsize=(15, 8))

    # Sort by impact
    impact_df_sorted = impact_df.sort_values('Impact')
    bar_positions = range(len(impact_df_sorted))

    # Create bar plot: red for negative impact, green otherwise
    colors = ['red' if x < 0 else 'green' for x in impact_df_sorted['Impact']]
    bars = ax.barh(bar_positions, impact_df_sorted['Impact'], color=colors, alpha=0.6)
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(impact_df_sorted['Event'])
    ax.set_xlabel('Impact on Returns (%)')
    ax.set_title('Event Impact Analysis (30-day window)')
    ax.grid(True, alpha=0.3, axis='x')

    # Add value labels just past the end of each bar
    for bar in bars:
        width = bar.get_width()
        ax.text(width if width >= 0 else width - 0.5, bar.get_y() + bar.get_height()/2,
                f'{width:.1f}%', ha='left' if width >= 0 else 'right', va='center')

    plt.tight_layout()
    plt.savefig('../reports/event_impacts.png', dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
# Print a qualitative comparison of the modelling approaches
print("="*80)
print("MODEL COMPARISON")
print("="*80)

# Placeholder for quantitative comparison metrics; nothing in this cell fills it
models_metrics = {}

# One entry per printed line, in display order
model_comparison_lines = [
    # 1. Bayesian Change Point Model (from Task 2)
    "\n1. Bayesian Change Point Model:",
    "   • Strengths: Identifies structural breaks, provides uncertainty estimates",
    "   • Weaknesses: Computationally intensive, assumes abrupt changes",
    "   • Best for: Detecting regime shifts, event impact analysis",
    # 2. VAR Model
    "\n2. Vector Autoregression (VAR) Model:",
    "   • Strengths: Captures interdependencies, good for forecasting",
    "   • Weaknesses: Requires stationary data, sensitive to lag selection",
    "   • Best for: Multi-variable analysis, short-term forecasting",
    # 3. Markov Switching Model
    "\n3. Markov Switching Model:",
    "   • Strengths: Explicit regime modeling, handles smooth transitions",
    "   • Weaknesses: Complex estimation, may overfit",
    "   • Best for: Regime detection, risk management",
    # 4. GARCH Model
    "\n4. GARCH Model:",
    "   • Strengths: Captures volatility clustering, good for risk metrics",
    "   • Weaknesses: Assumes symmetric response, may miss structural breaks",
    "   • Best for: Volatility forecasting, risk measurement",
    # 5. Gaussian Mixture Model
    "\n5. Gaussian Mixture Model:",
    "   • Strengths: Flexible regime detection, handles complex distributions",
    "   • Weaknesses: No temporal structure, may be sensitive to initialization",
    "   • Best for: Market regime classification, feature-based analysis",
]
for line in model_comparison_lines:
    print(line)

print("\n" + "="*80)
print("RECOMMENDATIONS FOR STAKEHOLDERS:")
print("="*80)

# Recommendations grouped by audience, printed in the listed order
stakeholder_lines = [
    "\nFor Investors:",
    "   • Use Bayesian change point for risk management triggers",
    "   • Employ Markov switching for portfolio regime adjustments",
    "   • Monitor GARCH volatility for position sizing",
    "\nFor Policymakers:",
    "   • VAR models for understanding market interdependencies",
    "   • Event study analysis for policy impact assessment",
    "   • Regime detection for early warning systems",
    "\nFor Energy Companies:",
    "   • Combine multiple models for robust forecasting",
    "   • Use change point analysis for contract timing",
    "   • Implement regime-aware hedging strategies",
]
for line in stakeholder_lines:
    print(line)

In [None]:
# Print the list of possible extensions, then persist the event-study results
print("\n" + "="*80)
print("FUTURE WORK AND EXTENSIONS")
print("="*80)

# One entry per printed line, in display order
future_work_lines = [
    "\n1. Machine Learning Approaches:",
    "   • LSTM/GRU neural networks for price forecasting",
    "   • Random Forests for feature importance analysis",
    "   • XGBoost for regime classification",
    "\n2. Advanced Bayesian Methods:",
    "   • Bayesian Structural Time Series (BSTS)",
    "   • Gaussian Processes for uncertainty quantification",
    "   • Hierarchical models for multiple time series",
    "\n3. Alternative Data Integration:",
    "   • Satellite imagery for inventory tracking",
    "   • News sentiment analysis from financial media",
    "   • Shipping and logistics data for supply chain insights",
    "\n4. Real-time Analysis:",
    "   • Streaming data pipelines for live monitoring",
    "   • Automated event detection from news feeds",
    "   • Real-time change point alerts",
    "\n5. Causal Inference:",
    "   • Difference-in-differences for policy evaluation",
    "   • Synthetic control methods for counterfactual analysis",
    "   • Instrumental variable approaches for causal effects",
]
for line in future_work_lines:
    print(line)

# Write the event-impact table to CSV without the index column
impact_df.to_csv('../results/event_impact_analysis.csv', index=False)
print("\nResults saved to ../results/event_impact_analysis.csv")