# In [None]:
# ercot_volatility_analysis.py (Enhanced with diagnostics)

import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from arch import arch_model
from matplotlib import cm
from scipy.stats import ttest_ind, mannwhitneyu
from statsmodels.stats.diagnostic import acorr_ljungbox, het_arch

# === Load dataset ===
# NOTE(review): absolute, machine-specific path; the script only runs where
# this file exists. The 'datetime' column is parsed to timestamps on load.
file_path = r"C:\Users\amina.talipova\Desktop\ercot\datasets\ERCOT_all_hourly_2016.csv"
full_df = pd.read_csv(file_path, parse_dates=['datetime'])

# === Define zones and colormap ===
zones = ['LZ_HOUSTON', 'LZ_NORTH', 'LZ_SOUTH', 'LZ_WEST']

# One discrete 'Blues' shade per zone. `cm.get_cmap` was removed in
# Matplotlib 3.9, so prefer `Colormap.resampled` (available since 3.6) and
# fall back to the legacy call only on older installs.
if hasattr(cm.Blues, 'resampled'):
    colorscale = cm.Blues.resampled(len(zones))
else:
    colorscale = cm.get_cmap('Blues', len(zones))

# Map each zone to a Plotly 'rgba(r,g,b,a)' string: the colormap returns
# channels in [0, 1], which are scaled to 0-255 and truncated to integers;
# alpha is fixed at 0.6.
# NOTE(review): index 0 of a sequential map such as 'Blues' is its lightest
# shade, which may be hard to see on a white plot background - confirm.
zone_colors = {
    zone: 'rgba({},{},{},{})'.format(
        *(int(channel * 255) for channel in colorscale(i)[:3]), 0.6
    )
    for i, zone in enumerate(zones)
}

# Observations before this date are treated as "pre-BESS", the rest as
# "post-BESS".
bess_start_date = pd.to_datetime('2024-01-01')

# 2x2 grid, one panel per zone, titled with the zone name.
fig_bess = make_subplots(
    rows=2, cols=2, subplot_titles=zones,
    vertical_spacing=0.12, horizontal_spacing=0.08
)

# One row of summary statistics per zone is appended by the per-zone loop.
summary_table = []

def _fit_garch(returns, label):
    """Fit a GARCH(1,1) model to ``returns`` scaled by 100.

    Args:
        returns: Series of log returns for one zone and one period.
        label: Text identifying the zone/period in any warning printed.

    Returns:
        The fitted ``arch`` result, or ``None`` when ``returns`` is empty or
        the fit raises. Returning ``None`` lets the caller test the outcome
        explicitly instead of relying on a variable that may never have been
        assigned (or that still holds the previous zone's result).
    """
    if returns.empty:
        print(f"[warn] {label}: no observations, GARCH fit skipped")
        return None
    try:
        return arch_model(returns * 100, vol='Garch', p=1, q=1).fit(disp='off')
    except Exception as exc:
        # Best effort by design: one failed fit must not abort the other
        # zones, but the failure is reported rather than silently swallowed.
        print(f"[warn] {label}: GARCH fit failed ({exc!r})")
        return None


for i, zone in enumerate(zones):
    price_col = f'{zone}_rtm'

    # Non-positive prices have no logarithm, so those rows are dropped before
    # computing log returns. Returns are then clipped to [-3, 3].
    df = full_df.loc[full_df[price_col] > 0, ['datetime', price_col]].copy()
    df['log_return'] = (
        np.log(df[price_col])
        .diff()
        .replace([np.inf, -np.inf], np.nan)
        .clip(-3, 3)
    )
    df = df.dropna(subset=['log_return'])

    df_pre = df[df['datetime'] < bess_start_date].copy()
    df_post = df[df['datetime'] >= bess_start_date].copy()

    # Separate fits for each period; a failed fit yields an all-NaN column.
    res_pre = _fit_garch(df_pre['log_return'], f'{zone} pre-BESS')
    res_post = _fit_garch(df_post['log_return'], f'{zone} post-BESS')
    df_pre['vol'] = res_pre.conditional_volatility if res_pre is not None else np.nan
    df_post['vol'] = res_post.conditional_volatility if res_post is not None else np.nan

    df_pre['year'] = df_pre['datetime'].dt.year
    df_post['year'] = df_post['datetime'].dt.year
    avg_yearly = pd.concat([df_pre, df_post]).groupby('year')['vol'].mean().reset_index()

    # Rolling average over 24 * 30 rows.
    # NOTE(review): that is 30 days only if rows are hourly and contiguous;
    # rows with non-positive prices were removed above - confirm.
    df_pre['rolling_vol'] = df_pre['vol'].rolling(24 * 30).mean()
    df_post['rolling_vol'] = df_post['vol'].rolling(24 * 30).mean()

    # Row-major position of this zone in the 2x2 subplot grid (1-based).
    r, c = i // 2 + 1, i % 2 + 1
    color = zone_colors[zone]

    fig_bess.add_trace(go.Scatter(
        x=df_pre['datetime'], y=df_pre['rolling_vol'], mode='lines',
        name=f'{zone} Pre-BESS', line=dict(color='lightgray', width=1.2),
        showlegend=(i == 0)
    ), row=r, col=c)

    fig_bess.add_trace(go.Scatter(
        x=df_post['datetime'], y=df_post['rolling_vol'], mode='lines',
        name=f'{zone} Post-BESS', line=dict(color=color, width=1.5),
        showlegend=(i == 0)
    ), row=r, col=c)

    fig_bess.add_trace(go.Scatter(
        x=avg_yearly['year'], y=avg_yearly['vol'],
        mode='lines+text', line=dict(color='black', dash='dash'),
        text=avg_yearly['vol'].round(1), textposition='top right',
        name=f'{zone} Yearly Avg', showlegend=False
    ), row=r, col=c)

    # Summary stats and statistical tests
    vol_pre = df_pre['vol'].dropna()
    vol_post = df_post['vol'].dropna()
    mean_pre = vol_pre.mean()
    mean_post = vol_post.mean()
    std_pre = vol_pre.std()
    std_post = vol_post.std()

    if vol_pre.empty or vol_post.empty:
        # A failed fit leaves nothing to compare; report NaN instead of
        # letting the tests raise on an empty sample.
        p_ttest = p_mw = np.nan
    else:
        _, p_ttest = ttest_ind(vol_pre, vol_post, equal_var=False)
        _, p_mw = mannwhitneyu(vol_pre, vol_post, alternative='two-sided')

    # Residual diagnostics (only on post)
    # NOTE(review): both tests run on the raw residuals; if the goal is to
    # check what the GARCH fit left unexplained, standardized residuals may
    # be the intended input - confirm.
    if res_post is not None:
        post_resid = res_post.resid.dropna()
        arch_lm_pval = het_arch(post_resid)[1]
        ljung_box_pval = acorr_ljungbox(
            post_resid, lags=[12], return_df=True
        )['lb_pvalue'].iloc[0]
    else:
        arch_lm_pval = ljung_box_pval = np.nan

    # Row order: zone, means, stds, t-test p, Mann-Whitney p, ARCH-LM p,
    # Ljung-Box p.
    summary_table.append([
        zone, round(mean_pre, 2), round(mean_post, 2),
        round(std_pre, 1), round(std_post, 1),
        round(p_ttest, 4), round(p_mw, 4),
        round(arch_lm_pval, 4), round(ljung_box_pval, 4)
    ])

# Set subplot font size and layout
# The subplot titles are stored as layout annotations; give each a 14pt font.
for title_annotation in fig_bess.layout.annotations:
    title_annotation['font'] = dict(size=14)

fig_bess.update_layout(
    height=950,
    # The title previously contained a mis-decoded en dash ("â€“").
    title="Rolling GARCH Volatility – Pre vs Post BESS",
    plot_bgcolor='white',
    font=dict(family="Arial", size=12),
    margin=dict(l=40, r=40, t=60, b=40)
)
fig_bess.show()

# Print statistical summary
# Column labels follow the order of the values appended per zone.
summary_columns = [
    "Zone",
    "Pre-BESS Mean",
    "Post-BESS Mean",
    "Pre Std",
    "Post Std",
    "p-value (t-test)",
    "p-value (MWU)",
    "ARCH Test (p)",
    "Ljung-Box (p)",
]
sum_df = pd.DataFrame(data=summary_table, columns=summary_columns)

# Heading and table are emitted on consecutive lines, table without the index.
print(
    "\nStatistical Summary of Volatility Pre vs Post BESS:",
    sum_df.to_string(index=False),
    sep="\n",
)
