In [None]:
# Brent Oil Price - Data Exploration

## 1. Setup and Imports

```python
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Import custom modules
from src.utils.data_loader import load_brent_prices, load_events_data
from src.utils.visualization import plot_price_timeline, plot_distributions
from src.utils.data_preparation import prepare_event_dataset

# Set style: apply one global matplotlib style sheet and one seaborn colour
# palette so every figure produced below shares the same look.
# NOTE: the 'seaborn-v0_8-*' style names exist from matplotlib 3.6 onwards;
# older releases expose this style sheet as 'seaborn-darkgrid'.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

In [None]:
# Load the Brent price history through the project loader.
# Later cells read the 'Date', 'Price', 'Return' and 'Log_Return' columns
# from this frame.
data_path = '../data/raw/brent_prices.csv'
df = load_brent_prices(data_path)

# Display basic info.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
print("Data Info:")
df.info()
print("\nFirst 5 rows:")
print(df.head())
print("\nLast 5 rows:")
print(df.tail())

In [None]:
# Headline statistics for the price level and the return series.
price_series = df['Price']
return_series = df['Return']
log_return_series = df['Log_Return']
first_day = df['Date'].min().date()
last_day = df['Date'].max().date()

stats = {
    'Total Observations': len(df),
    'Date Range': f"{first_day} to {last_day}",
    'Mean Price': f"${price_series.mean():.2f}",
    'Median Price': f"${price_series.median():.2f}",
    'Standard Deviation': f"${price_series.std():.2f}",
    'Minimum Price': f"${price_series.min():.2f}",
    'Maximum Price': f"${price_series.max():.2f}",
    'Average Daily Return': f"{return_series.mean()*100:.4f}%",
    # Daily log-return dispersion scaled by sqrt(252) to an annual figure.
    'Annualized Volatility': f"{log_return_series.std() * np.sqrt(252)*100:.2f}%",
}

print("Basic Statistics:")
print("\n".join(f"{label}: {value}" for label, value in stats.items()))

# Shape of the return distribution.
print("\nReturn Statistics:")
print(f"Skewness: {return_series.skew():.4f}")
print(f"Kurtosis: {return_series.kurtosis():.4f}")
# Mean/std of daily returns scaled by sqrt(252); no risk-free rate is subtracted.
sharpe_ratio = return_series.mean()/return_series.std()*np.sqrt(252)
print(f"Sharpe Ratio: {sharpe_ratio:.4f}")

In [None]:
# Draw the complete price history and store it with the other report figures.
# plt.savefig() writes whichever figure is current after the helper returns.
timeline_png = '../reports/price_timeline.png'
fig = plot_price_timeline(df)
plt.savefig(timeline_png, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Draw the distribution panels and store them with the other report figures.
# plt.savefig() writes whichever figure is current after the helper returns.
distribution_png = '../reports/distribution_analysis.png'
fig = plot_distributions(df)
plt.savefig(distribution_png, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
from statsmodels.tsa.stattools import adfuller, kpss

# ADF Test for stationarity
def test_stationarity(series, title="Series"):
    print(f"\n{title} Stationarity Tests:")
    
    # ADF Test
    adf_result = adfuller(series.dropna())
    print(f"ADF Statistic: {adf_result[0]:.4f}")
    print(f"p-value: {adf_result[1]:.4f}")
    print(f"Critical Values:")
    for key, value in adf_result[4].items():
        print(f"\t{key}: {value:.4f}")
    
    # KPSS Test
    try:
        kpss_result = kpss(series.dropna(), regression='c')
        print(f"\nKPSS Statistic: {kpss_result[0]:.4f}")
        print(f"p-value: {kpss_result[1]:.4f}")
    except:
        print("\nKPSS Test failed - series may have insufficient data")
    
    return adf_result, kpss_result if 'kpss_result' in locals() else None

# Test price and returns.
# Only the ADF result (first element of the returned pair) is kept. The tuple
# is indexed instead of unpacked into `_`, because assigning to `_` in a
# notebook shadows IPython's "last output" variable for the rest of the session.
adf_price = test_stationarity(df['Price'], "Price")[0]
adf_returns = test_stationarity(df['Return'], "Returns")[0]
adf_log_returns = test_stationarity(df['Log_Return'], "Log Returns")[0]

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Average daily prices within each calendar month before looking for seasonality.
monthly_data = df.resample('M', on='Date')['Price'].mean()

# Additive decomposition with a 12-observation (one year of months) cycle.
decomposition = seasonal_decompose(monthly_data.dropna(), model='additive', period=12)

# One stacked panel per component: (attribute on the result, y-axis label).
component_panels = [
    ('observed', 'Observed'),
    ('trend', 'Trend'),
    ('seasonal', 'Seasonal'),
    ('resid', 'Residual'),
]

fig, axes = plt.subplots(len(component_panels), 1, figsize=(15, 12))

for ax, (component, y_label) in zip(axes, component_panels):
    ax.plot(getattr(decomposition, component))
    ax.set_ylabel(y_label)

# Title only on the top panel, x-label only on the bottom one.
axes[0].set_title('Time Series Decomposition')
axes[-1].set_xlabel('Date')

plt.tight_layout()
plt.savefig('../reports/decomposition.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Rolling annualised volatility over several look-back windows (in rows).
# Side effect: each window adds a 'Vol_<window>D' column to df.
volatility_windows = [5, 10, 20, 30, 60, 90, 252]

fig, axes = plt.subplots(len(volatility_windows), 1, figsize=(15, 20))

for ax, window in zip(axes, volatility_windows):
    vol_column = f'Vol_{window}D'
    rolling_std = df['Log_Return'].rolling(window=window).std()
    df[vol_column] = rolling_std * np.sqrt(252)

    # Plot in percent.
    ax.plot(df['Date'], df[vol_column] * 100, color='purple', linewidth=1)
    ax.set_title(f'{window}-Day Rolling Volatility', fontsize=12)
    ax.set_ylabel('Volatility (%)')
    ax.grid(True, alpha=0.3)

    # Dashed reference line at the window's average volatility.
    mean_vol = df[vol_column].mean() * 100
    ax.axhline(y=mean_vol, color='red', linestyle='--', alpha=0.5,
               label=f'Mean: {mean_vol:.1f}%')
    ax.legend()

axes[-1].set_xlabel('Date')
plt.tight_layout()
plt.savefig('../reports/volatility_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Build the events table through the project helper.
events_df = prepare_event_dataset('../data/processed/events_dataset.csv')

# Print the descriptive columns for every event.
event_columns = ['Date', 'Event Name', 'Category', 'Expected Impact']
print("Major Events:")
print(events_df[event_columns].to_string())

# Overlay only events with an Impact_Score of at least 2 on the price timeline.
high_impact_events = events_df.loc[events_df['Impact_Score'] >= 2]
fig = plot_price_timeline(df, high_impact_events)
plt.savefig('../reports/price_with_events.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
from pandas.plotting import autocorrelation_plot

# Pairwise correlations between price, returns and volatility.
# NOTE(review): 'Volatility_30' is not created anywhere in this notebook (the
# rolling-volatility cell names its columns 'Vol_<n>D'); presumably the loader
# provides it - confirm, otherwise this selection raises KeyError.
metric_columns = ['Price', 'Return', 'Log_Return', 'Volatility_30']
correlation_metrics = df[metric_columns].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_metrics,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
    ax=ax,
)
ax.set_title('Correlation Matrix of Key Metrics', fontsize=14)
plt.tight_layout()
plt.savefig('../reports/correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

# Autocorrelation analysis: price on the left panel, returns on the right.
autocorr_panels = [
    ('Price', 'Price Autocorrelation'),
    ('Return', 'Returns Autocorrelation'),
]

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

for ax, (column, panel_title) in zip(axes, autocorr_panels):
    autocorrelation_plot(df[column].dropna(), ax=ax)
    ax.set_title(panel_title)

plt.tight_layout()
plt.savefig('../reports/autocorrelation.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Stress episodes, each given as an inclusive (start, end) date window.
crisis_periods = {
    '2008 Financial Crisis': ('2008-08-01', '2009-03-31'),
    '2014 Oil Price Crash': ('2014-06-01', '2015-01-31'),
    '2020 COVID-19': ('2020-02-01', '2020-05-31'),
    '2022 Russia-Ukraine': ('2022-02-01', '2022-04-30')
}

# 2x2 grid, flattened so panels can be paired with periods in order.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.flatten()

for ax, (period_name, (start_date, end_date)) in zip(axes, crisis_periods.items()):
    in_window = df['Date'].between(start_date, end_date)
    period_data = df[in_window]

    # A window with no rows leaves its panel untouched.
    if period_data.empty:
        continue

    ax.plot(period_data['Date'], period_data['Price'], color='red', linewidth=2)
    ax.set_title(period_name, fontsize=12)
    ax.set_ylabel('Price (USD)')
    ax.grid(True, alpha=0.3)

    # Percentage move from the first to the last row inside the window.
    window_prices = period_data['Price']
    start_price = window_prices.iloc[0]
    end_price = window_prices.iloc[-1]
    change_pct = (end_price - start_price) / start_price * 100

    ax.text(
        0.05, 0.95, f'Change: {change_pct:.1f}%',
        transform=ax.transAxes,
        fontsize=10,
        verticalalignment='top',
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),
    )

plt.tight_layout()
plt.savefig('../reports/crisis_periods.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Final text summary of the exploration.
# The qualitative labels printed next to each statistic are derived from the
# computed value, so the wording cannot contradict the number beside it.
returns_skew = df['Return'].skew()
returns_kurt = df['Return'].kurtosis()  # pandas reports excess kurtosis (normal == 0)

if returns_skew > 0:
    skew_label = "Positive skew"
elif returns_skew < 0:
    skew_label = "Negative skew"
else:
    skew_label = "Symmetric"

if returns_kurt > 0:
    kurt_label = "Heavy tails"
elif returns_kurt < 0:
    kurt_label = "Light tails"
else:
    kurt_label = "Normal-like tails"

# ADF null hypothesis is a unit root (non-stationary); reject it below this level.
adf_alpha = 0.05
price_verdict = "Stationary" if adf_price[1] < adf_alpha else "Non-stationary"
returns_verdict = "Stationary" if adf_returns[1] < adf_alpha else "Non-stationary"

print("="*80)
print("DATA EXPLORATION SUMMARY")
print("="*80)

print("\n1. DATA OVERVIEW:")
print(f"   • Period: {df['Date'].min().date()} to {df['Date'].max().date()}")
print(f"   • Observations: {len(df):,} daily prices")
print(f"   • Price Range: ${df['Price'].min():.2f} to ${df['Price'].max():.2f}")

print("\n2. STATISTICAL PROPERTIES:")
print(f"   • Average Price: ${df['Price'].mean():.2f}")
print(f"   • Volatility (Annualized): {df['Log_Return'].std() * np.sqrt(252)*100:.1f}%")
print(f"   • Returns Skewness: {returns_skew:.4f} ({skew_label})")
print(f"   • Returns Kurtosis: {returns_kurt:.4f} ({kurt_label})")

print("\n3. STATIONARITY:")
print(f"   • Price: {price_verdict} (ADF p-value: {adf_price[1]:.4f})")
print(f"   • Returns: {returns_verdict} (ADF p-value: {adf_returns[1]:.4f})")

# The remaining bullets are the analyst's qualitative reading of the plots
# above; they are not computed from the data.
print("\n4. KEY INSIGHTS:")
print("   • Clear structural breaks visible in price series")
print("   • High volatility clustering, especially during crises")
print("   • Returns show fat tails and volatility persistence")
print("   • Multiple regimes evident from visual inspection")

print("\n5. IMPLICATIONS FOR MODELING:")
print("   • Change point analysis is appropriate due to structural breaks")
print("   • Bayesian methods can handle uncertainty in regime changes")
print("   • Volatility modeling (e.g., GARCH) may be beneficial")
print("   • External events should be incorporated as potential triggers")

In [None]:
# Persist the working frame without its index. By this point df also carries
# the 'Vol_<n>D' columns added by the rolling-volatility cell.
output_path = '../data/processed/cleaned_prices.csv'
df.to_csv(output_path, index=False)
print(f"\nProcessed data saved to: {output_path}")