# Weather Forecast Data Download (Nov 20-29, 2025)

This notebook downloads hourly weather **forecast** data from Open-Meteo API for multiple load areas across the PJM region.
Data is retrieved in UTC and converted to Eastern Time (ET).

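**Date Range**: November 20, 2025 00:00 UTC to November 30, 2025 23:00 UTC (the download runs through Nov 30 UTC, which fully covers Nov 29 once the timestamps are converted to Eastern Time)  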
**Note**: Uses forecast API for future dates

## 1. Import Required Libraries

In [46]:
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import pytz
import os
import time
import holidays
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import warnings
import holidays
import pickle

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import confusion_matrix, classification_report
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import CatBoostRegressor
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas/sklearn deprecation and
# chained-assignment warnings raised by later cells — consider narrowing it.
warnings.filterwarnings('ignore')
# Default plot appearance: matplotlib style sheet plus the seaborn "husl" palette.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

print("Libraries imported successfully")

Libraries imported successfully


## 2. Define Load Area Coordinates

Define the latitude and longitude for each load area in the PJM region.

In [12]:
# One row per load area: its code plus the single (lat, lon) point used for the
# weather request. The three lists are matched purely by position, so an entry
# added to or removed from one list must be mirrored in the other two.
# NOTE(review): if these are the standard PJM zone codes, a few points (e.g. BC,
# CE) look far from the usual service territory — verify against the source of
# these coordinates.
zone_coords = pd.DataFrame({
    'load_area': ['AECO', 'AEPAPT', 'AEPIMP', 'AEPKPT', 'AEPOPT', 'AP', 'BC', 'CE', 'DAY', 'DEOK',
                  'DOM', 'DPLCO', 'DUQ', 'EASTON', 'EKPC', 'JC', 'ME', 'OE', 'OVEC', 'PAPWR',
                  'PE', 'PEPCO', 'PLCO', 'PN', 'PS', 'RECO', 'SMECO', 'UGI', 'VMEU'],
    'lat': [39.45, 37.25, 38.45, 38.20, 39.90, 37.30, 40.80, 41.85, 39.75, 39.10,
            37.55, 38.90, 40.45, 39.55, 37.75, 40.35, 40.20, 41.10, 38.85, 40.70,
            40.00, 38.90, 40.95, 41.15, 40.75, 41.00, 38.40, 40.25, 37.30],
    'lon': [-74.50, -81.30, -81.60, -83.10, -82.90, -80.90, -79.95, -86.10, -84.20, -84.50,
            -77.45, -75.50, -79.90, -75.10, -84.30, -74.65, -76.00, -81.25, -82.85, -77.80,
            -75.20, -76.95, -77.40, -77.80, -74.15, -74.10, -76.70, -75.65, -76.00]
})

# Sanity check on the table size, then preview the first rows as the cell output
print(f"Total load areas: {len(zone_coords)}")
zone_coords.head()

Total load areas: 29


Unnamed: 0,load_area,lat,lon
0,AECO,39.45,-74.5
1,AEPAPT,37.25,-81.3
2,AEPIMP,38.45,-81.6
3,AEPKPT,38.2,-83.1
4,AEPOPT,39.9,-82.9


## 3. Configure API Parameters

In [15]:
# Requested window as inclusive calendar dates; the fetcher keeps rows from
# start_date 00:00 through end_date 23:00 on the UTC timestamps it receives.
start_date, end_date = "2025-11-20", "2025-11-30"

# Open-Meteo forecast endpoint (used because the window lies in the future)
api = "https://api.open-meteo.com/v1/forecast"

# Hourly variables to request, sent to the API as one comma-separated string
hourly_vars = ",".join([
    "temperature_2m",
    "relative_humidity_2m",
    "dew_point_2m",
    "precipitation",
    "wind_speed_10m",
])

# Make sure the weather output folder exists before anything is written to it
os.makedirs("data/weather", exist_ok=True)

print(f"Date range: {start_date} to {end_date}")
print(f"Variables: {hourly_vars}")
print("Using forecast API for future dates")

Date range: 2025-11-20 to 2025-11-30
Variables: temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,wind_speed_10m
Using forecast API for future dates


## 4. Define Weather Data Fetcher Function

In [18]:
def fetch_weather_data(lat, lon, start_date, end_date, tries=3):
    """Download the hourly forecast for one coordinate and trim it to a date window.

    The request always asks the forecast endpoint (module-level ``api``) for a
    16-day horizon of the variables in module-level ``hourly_vars``, in UTC.
    ``start_date`` / ``end_date`` are NOT sent to the API; they are only used to
    filter the returned rows afterwards.

    Args:
        lat: Latitude of the point to query.
        lon: Longitude of the point to query.
        start_date: First day to keep, as a 'YYYY-MM-DD' string (inclusive, from 00:00).
        end_date: Last day to keep, as a 'YYYY-MM-DD' string (inclusive, through 23:00).
        tries: Maximum number of request attempts.

    Returns:
        A DataFrame with columns ``datetime_beginning_utc`` (string), ``temp``,
        ``humidity``, ``dew_point``, ``precip`` and ``wind``; it may be empty if
        the forecast does not overlap the window. ``None`` if every attempt failed.
    """
    url = (f"{api}?latitude={lat}&longitude={lon}"
           f"&hourly={hourly_vars}&timezone=UTC&forecast_days=16")

    for k in range(1, tries + 1):
        try:
            res = requests.get(url, timeout=60)
            if res.status_code != 200:
                print(f"HTTP {res.status_code} for lat={lat}, lon={lon}")
            else:
                payload = res.json()
                if 'hourly' not in payload or 'time' not in payload['hourly']:
                    print(f"Unexpected response format for lat={lat}, lon={lon}")
                else:
                    hourly = payload['hourly']
                    frame = pd.DataFrame({
                        'datetime_beginning_utc': hourly['time'],
                        'temp': hourly['temperature_2m'],
                        'humidity': hourly['relative_humidity_2m'],
                        'dew_point': hourly['dew_point_2m'],
                        'precip': hourly['precipitation'],
                        'wind': hourly['wind_speed_10m']
                    })

                    # Parse the timestamps so the window comparison is chronological
                    stamps = pd.to_datetime(frame['datetime_beginning_utc'])
                    frame['datetime_beginning_utc'] = stamps
                    not_before = stamps >= start_date
                    not_after = stamps <= end_date + ' 23:00:00'
                    frame = frame[not_before & not_after].copy()

                    # Callers receive the timestamp column as plain strings again
                    frame['datetime_beginning_utc'] = frame['datetime_beginning_utc'].astype(str)
                    return frame
        except Exception as e:
            # Any failure (network, JSON decoding, missing key) counts as a failed attempt
            print(f"Error on attempt {k} for lat={lat}, lon={lon}: {e}")

        # Linearly growing pause between attempts; no pause after the last one
        if k < tries:
            time.sleep(0.7 * k)

    return None

## 5. Download Weather Data for All Load Areas

This cell downloads weather data for the specified date range across all load areas.

In [21]:
# One DataFrame per successfully downloaded zone; concatenated in the next step
all_data = []

print(f"Downloading weather data for {len(zone_coords)} load areas...\n")
print("=" * 60)

for idx, row in zone_coords.iterrows():
    zone, lat, lon = row['load_area'], row['lat'], row['lon']

    print(f"[{idx+1}/{len(zone_coords)}] Fetching {zone} (lat={lat}, lon={lon})...")

    # None means every attempt failed; an empty frame means nothing fell in the window
    df = fetch_weather_data(lat, lon, start_date, end_date)

    if df is None or df.empty:
        print(f"  ✗ Failed to fetch data for {zone}")
    else:
        # Tag the rows with their zone so the frames can be stacked later
        df['load_area'] = zone
        all_data.append(df)
        print(f"  ✓ Successfully fetched {len(df)} hourly records")

    # Brief pause between zones to stay polite to the API
    time.sleep(0.5)

print("=" * 60)
print(f"\nData collection complete! Total zones processed: {len(all_data)}/{len(zone_coords)}")

Downloading weather data for 29 load areas...

[1/29] Fetching AECO (lat=39.45, lon=-74.5)...
  ✓ Successfully fetched 264 hourly records
[2/29] Fetching AEPAPT (lat=37.25, lon=-81.3)...
  ✓ Successfully fetched 264 hourly records
[3/29] Fetching AEPIMP (lat=38.45, lon=-81.6)...
  ✓ Successfully fetched 264 hourly records
[4/29] Fetching AEPKPT (lat=38.2, lon=-83.1)...
  ✓ Successfully fetched 264 hourly records
[5/29] Fetching AEPOPT (lat=39.9, lon=-82.9)...
  ✓ Successfully fetched 264 hourly records
[6/29] Fetching AP (lat=37.3, lon=-80.9)...
  ✓ Successfully fetched 264 hourly records
[7/29] Fetching BC (lat=40.8, lon=-79.95)...
  ✓ Successfully fetched 264 hourly records
[8/29] Fetching CE (lat=41.85, lon=-86.1)...
  ✓ Successfully fetched 264 hourly records
[9/29] Fetching DAY (lat=39.75, lon=-84.2)...
  ✓ Successfully fetched 264 hourly records
[10/29] Fetching DEOK (lat=39.1, lon=-84.5)...
  ✓ Successfully fetched 264 hourly records
[11/29] Fetching DOM (lat=37.55, lon=-77.45).

## 6. Combine Data and Convert to Eastern Time

Combine all load area data and convert UTC timestamps to Eastern Time.

In [26]:
def _format_ept(ts):
    """Render one timestamp as 'M/D/YYYY H:MM:SS AM/PM' (no zero padding on month, day, hour).

    Built by hand instead of via strftime because the no-padding codes differ by
    platform ('%-m' on Unix, '%#m' on Windows). Missing timestamps map to NaN.
    """
    if pd.isna(ts):
        return np.nan
    hour_12 = (ts.hour + 11) % 12 + 1          # 0 -> 12, 13 -> 1, 12 -> 12
    meridiem = 'AM' if ts.hour < 12 else 'PM'
    return f"{ts.month}/{ts.day}/{ts.year} {hour_12}:{ts.minute:02d}:{ts.second:02d} {meridiem}"


if all_data:
    # Stack the per-zone frames into one long table
    combined_df = pd.concat(all_data, ignore_index=True)

    print(f"Combined dataset shape: {combined_df.shape}")
    # The UTC column still holds ISO-formatted strings here, so min/max are chronological
    print(f"Date range (UTC): {combined_df['datetime_beginning_utc'].min()} to {combined_df['datetime_beginning_utc'].max()}")

    # Convert UTC to Eastern Time
    print("\nConverting UTC to Eastern Time...")
    combined_df['datetime_beginning_utc'] = pd.to_datetime(combined_df['datetime_beginning_utc'])

    utc = pytz.UTC
    eastern = pytz.timezone('US/Eastern')
    et_time = combined_df['datetime_beginning_utc'].dt.tz_localize(utc).dt.tz_convert(eastern)

    # Drop the timezone so the text shows Eastern wall-clock time, then format
    # each value with the platform-independent helper (previously a bare
    # `except:` was used to switch between Unix and Windows strftime codes).
    et_time_no_tz = et_time.dt.tz_localize(None)
    combined_df['datetime_beginning_ept'] = et_time_no_tz.map(_format_ept)

    # Remove UTC column and reorder
    combined_df = combined_df.drop('datetime_beginning_utc', axis=1)
    column_order = ['datetime_beginning_ept', 'load_area', 'temp', 'humidity', 'dew_point', 'precip', 'wind']
    combined_df = combined_df[column_order]

    print("✓ Time conversion complete")

    # Display sample
    print("\nSample of data:")
    display(combined_df.head(10))
else:
    # NOTE: combined_df is not defined in this case, so the cells below cannot run
    print("⚠ No data was successfully downloaded!")

Combined dataset shape: (7656, 7)
Date range (UTC): 2025-11-20 00:00:00 to 2025-11-30 23:00:00

Converting UTC to Eastern Time...
✓ Time conversion complete

Sample of data:


Unnamed: 0,datetime_beginning_ept,load_area,temp,humidity,dew_point,precip,wind
0,11/19/2025 7:00:00 PM,AECO,5.8,90,4.3,0.0,14.9
1,11/19/2025 8:00:00 PM,AECO,5.5,90,4.0,0.0,13.7
2,11/19/2025 9:00:00 PM,AECO,5.3,89,3.6,0.0,12.8
3,11/19/2025 10:00:00 PM,AECO,5.2,89,3.5,0.0,12.2
4,11/19/2025 11:00:00 PM,AECO,5.2,89,3.5,0.0,11.8
5,11/20/2025 12:00:00 AM,AECO,5.2,89,3.6,0.0,10.9
6,11/20/2025 1:00:00 AM,AECO,5.2,88,3.4,0.0,12.4
7,11/20/2025 2:00:00 AM,AECO,5.0,89,3.3,0.0,10.8
8,11/20/2025 3:00:00 AM,AECO,4.9,88,3.1,0.0,10.5
9,11/20/2025 4:00:00 AM,AECO,4.6,88,2.8,0.0,10.7


In [28]:
# ============================================================================
# CONFIGURATION
# ============================================================================

# Directory paths (relative to src/)
FIGURES_DIR = "../figures"
OUTPUT_DIR = "../output"

# Create directories if they don't exist
os.makedirs(FIGURES_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Target 29 load areas (same codes, in the same order, as zone_coords['load_area'])
KEEP_AREAS = [
    "AECO", "AEPAPT", "AEPIMP", "AEPKPT", "AEPOPT", "AP", "BC", "CE", "DAY", "DEOK",
    "DOM", "DPLCO", "DUQ", "EASTON", "EKPC", "JC", "ME", "OE", "OVEC", "PAPWR",
    "PE", "PEPCO", "PLCO", "PN", "PS", "RECO", "SMECO", "UGI", "VMEU"
]

# Best Model Finding dates.
# NOTE(review): this was labelled "2025 test" but the values below are in 2024,
# while the downloaded forecast and PRED_START/PRED_END are in 2025 — confirm
# which year is intended before using these with prepare_data.
TEST_START = '2024-11-20 00:00:00'
TEST_END = '2024-11-29 23:00:00'

# Rolling window for peak days
WINDOW_SIZE = 10  # days
NUM_PEAK_DAYS = 2

In [42]:
# Work on a copy so the combined weather table built above stays untouched
df= combined_df.copy()
def parse_et(series):
    """Return ``series`` as timezone-aware timestamps, treating naive values as Eastern time.

    * Datetime input that already carries a timezone is returned unchanged.
    * Naive datetime input is localized to 'America/New_York'.
    * Any other input is parsed as 'M/D/YYYY H:MM:SS AM/PM' text first; values
      that fail are retried with pandas' default parser, and whatever still
      cannot be parsed becomes NaT. The result is then localized.

    Wall-clock times that are ambiguous or nonexistent because of a DST change
    become NaT rather than raising.
    """
    eastern = 'America/New_York'

    if not pd.api.types.is_datetime64_any_dtype(series):
        # Text input: strict format first, lenient parser only for the leftovers
        parsed = pd.to_datetime(series, format='%m/%d/%Y %I:%M:%S %p', errors='coerce')
        unparsed = parsed.isna()
        if unparsed.any():
            parsed[unparsed] = pd.to_datetime(series[unparsed], errors='coerce')
        return parsed.dt.tz_localize(eastern, ambiguous='NaT', nonexistent='NaT')

    parsed = pd.to_datetime(series)
    if parsed.dt.tz is not None:
        return parsed
    return parsed.dt.tz_localize(eastern, ambiguous='NaT', nonexistent='NaT')

# Map the download-stage column names onto the names used by the feature code
column_renames = {
    'datetime_beginning_ept': 'datetime',
    'load_area': 'region',
    'temp': 'temperature',
    'precip': 'precipitation',
    'wind': 'wind_speed'
}

# Columns carried forward; 'dew_point' is not in this list and is dropped here
keep_cols = ['datetime', 'region', 'temperature', 'humidity', 'precipitation', 'wind_speed']
df = df.rename(columns=column_renames)[keep_cols]

# Turn the Eastern-time text into timezone-aware timestamps, discard rows whose
# timestamp could not be resolved, and order the table chronologically
# (rows of different regions end up interleaved).
df['datetime'] = parse_et(df['datetime'])
df = (
    df.dropna(subset=['datetime'])
      .sort_values('datetime')
      .reset_index(drop=True)
)

In [48]:
def add_features(df):
    """Add temporal and calendar features.

    Args:
        df: DataFrame with a datetime-typed ``datetime`` column. Rows may belong
            to several regions and be in any order.

    Returns:
        A copy of ``df`` with these added columns:
        ``hour``, ``day_of_week``, ``month``, ``day_of_month``, ``day_of_year``,
        ``week_of_year``, ``is_weekend``, ``hour_sin``, ``hour_cos``,
        ``day_sin``, ``day_cos``, ``date``, ``is_holiday``,
        ``is_day_before_holiday``, ``is_day_after_holiday``,
        ``is_thanksgiving``, ``is_christmas``, ``is_new_years``, ``is_july4``
        and ``is_other_holiday``.

    Notes:
        The day-before/after flags are looked up from each row's calendar date.
        They were previously derived with ``shift(±24)``, which only equals a
        one-day offset when the frame holds a single region's gap-free hourly
        series; on a frame with interleaved regions it pointed at rows less
        than an hour away.
        NOTE(review): if the saved models were trained on the shift-based
        columns, confirm the training pipeline computes them the same way.
    """
    df = df.copy()

    # ===== TEMPORAL FEATURES =====
    df['hour'] = df['datetime'].dt.hour
    df['day_of_week'] = df['datetime'].dt.dayofweek  # Monday=0, Sunday=6
    df['month'] = df['datetime'].dt.month
    df['day_of_month'] = df['datetime'].dt.day
    df['day_of_year'] = df['datetime'].dt.dayofyear
    df['week_of_year'] = df['datetime'].dt.isocalendar().week
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

    # Cyclical encoding so that hour 23 / hour 0 and Sunday / Monday are neighbours
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['day_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['day_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

    # ===== HOLIDAY FEATURES =====
    us_holidays = holidays.US(years=range(2016, 2026))
    df['date'] = df['datetime'].dt.date
    one_day = timedelta(days=1)

    def per_date_flag(predicate):
        """Evaluate ``predicate`` once per distinct date and broadcast it to rows as 0/1."""
        lookup = {day: int(bool(predicate(day))) for day in df['date'].unique()}
        return df['date'].map(lookup.get).astype(int)

    df['is_holiday'] = per_date_flag(lambda day: day in us_holidays)

    # Day before/after holiday, decided from the calendar date itself
    df['is_day_before_holiday'] = per_date_flag(lambda day: (day + one_day) in us_holidays)
    df['is_day_after_holiday'] = per_date_flag(lambda day: (day - one_day) in us_holidays)

    # Thanksgiving - Cooking load, midday peak.
    # NOTE(review): the `holidays` package appears to name this day either
    # 'Thanksgiving' or 'Thanksgiving Day' depending on the installed release;
    # both are accepted so the flag does not silently become 0 — confirm.
    thanksgiving_names = ('Thanksgiving', 'Thanksgiving Day')
    df['is_thanksgiving'] = per_date_flag(lambda day: us_holidays.get(day) in thanksgiving_names)

    # Christmas - Low commercial, high residential heating
    df['is_christmas'] = per_date_flag(lambda day: us_holidays.get(day) == 'Christmas Day')

    # New Year's Day - Late night/early morning shift
    df['is_new_years'] = per_date_flag(lambda day: us_holidays.get(day) == "New Year's Day")

    # July 4th - Summer, outdoor, evening grilling/fireworks
    df['is_july4'] = per_date_flag(lambda day: us_holidays.get(day) == 'Independence Day')

    # Others: any holiday that is not one of the four named above
    df['is_other_holiday'] = (df['is_holiday']-df['is_thanksgiving']-df['is_christmas']-df['is_new_years']-df['is_july4'])

    return df

# Append the temporal and holiday feature columns to the cleaned weather table
df = add_features(df)

In [None]:
# Columns handed to prepare_data as model inputs; every name must exist in the
# frame returned by add_features.
FEATURE_COLS = [
    # Temporal
    'hour', 'day_of_week', 'month', 'day_of_month', 'day_of_year', 'week_of_year', 'is_weekend',
    'hour_sin', 'hour_cos', 'day_sin', 'day_cos',
    # Weather
    'temperature', 'humidity', 'wind_speed', 'precipitation',
    # Holiday flags: per-holiday indicators are listed; the aggregate 'is_holiday' column is not included
    'is_thanksgiving', 'is_christmas', 'is_new_years', 'is_july4', 'is_other_holiday', 'is_day_before_holiday', 'is_day_after_holiday'
]

In [50]:
# Timezone-naive Timestamps parsed from the configured TEST_START / TEST_END strings
test_start = pd.to_datetime(TEST_START)
test_end = pd.to_datetime(TEST_END)

def prepare_data(df, test_start, test_end, FEATURE_COLS, region=None):
    """Slice out the evaluation window and return its feature matrix and target.

    Only a test split is produced; no training split is built here.

    Args:
        df: Frame with a timezone-aware ``datetime`` column, a ``region`` column,
            the feature columns and the target column.
        test_start: Start of the window (inclusive); a timezone-naive value that
            is interpreted as 'America/New_York' wall-clock time.
        test_end: End of the window (inclusive); interpreted the same way.
        FEATURE_COLS: Names of the feature columns.
        region: Optional region code; when given, only that region's rows are used.

    Returns:
        ``(X_test, y_test, test_data)``: the feature values, the values of the
        module-level ``TARGET_COL`` column, and the filtered frame they came
        from. Rows with a missing value in any feature column are dropped.
    """
    eastern = 'America/New_York'
    frame = df if region is None else df[df['region'] == region].copy()

    # The bounds arrive timezone-naive; pin them to Eastern time for the comparison
    window_open = pd.Timestamp(test_start).tz_localize(eastern)
    window_close = pd.Timestamp(test_end).tz_localize(eastern)

    on_or_after = frame['datetime'] >= window_open
    on_or_before = frame['datetime'] <= window_close
    test_data = frame[on_or_after & on_or_before].copy()

    # Models cannot score rows with incomplete features
    test_data = test_data.dropna(subset=FEATURE_COLS)

    features = test_data[FEATURE_COLS].values
    target = test_data[TARGET_COL].values
    return features, target, test_data

# Confirmation that the definitions in this cell were executed
print("Helper functions defined")

Helper functions defined


In [52]:
# Alphabetically sorted list of the distinct region codes present in the feature table
regions = sorted(df['region'].unique())
print(f"Number of regions: {len(regions)}")
print(f"Regions: {regions}")

Number of regions: 29
Regions: ['AECO', 'AEPAPT', 'AEPIMP', 'AEPKPT', 'AEPOPT', 'AP', 'BC', 'CE', 'DAY', 'DEOK', 'DOM', 'DPLCO', 'DUQ', 'EASTON', 'EKPC', 'JC', 'ME', 'OE', 'OVEC', 'PAPWR', 'PE', 'PEPCO', 'PLCO', 'PN', 'PS', 'RECO', 'SMECO', 'UGI', 'VMEU']


In [None]:
import pickle
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Load the saved models
models_dir = os.path.join(OUTPUT_DIR, 'trained_models')
model_path = os.path.join(models_dir, 'hourly_load_models.pkl')

print("Loading saved models...")
# SECURITY: unpickling runs arbitrary code embedded in the file — only load
# model files that were produced by a trusted pipeline.
with open(model_path, 'rb') as f:
    hourly_best_models = pickle.load(f)
print(f"Loaded {len(hourly_best_models)} regional models")

# Define prediction period (calendar dates in Eastern time, both inclusive)
PRED_START = '2025-11-20'  # Adjust as needed
PRED_END = '2025-11-29'    # Adjust as needed

print(f"\nGenerating predictions for {PRED_START} to {PRED_END}")

# df['datetime'] is timezone-aware Eastern time, so the bounds must be as well.
# The upper bound is the midnight AFTER PRED_END and is exclusive, so that every
# hour of the last day is kept (comparing against '<= PRED_END' would stop at
# 00:00 of that day).
pred_start_ts = pd.Timestamp(PRED_START).normalize().tz_localize('America/New_York')
pred_end_ts = (pd.Timestamp(PRED_END).normalize() + pd.Timedelta(days=1)).tz_localize('America/New_York')
in_period = (df['datetime'] >= pred_start_ts) & (df['datetime'] < pred_end_ts)

# Generate predictions for all regions
predictions_list = []

for region in tqdm(regions, desc="Generating predictions"):
    # Each entry bundles the fitted estimator with the feature columns it expects
    model_info = hourly_best_models[region]
    model = model_info['model']
    feature_cols = model_info['feature_cols']

    # The feature table uses the renamed columns 'region' / 'datetime'
    pred_data = df.loc[in_period & (df['region'] == region)].copy()

    if pred_data.empty:
        print(f"Warning: No data for {region} in prediction period")
        continue

    # Fail with a readable message if the model needs columns this notebook does not build
    missing_cols = [col for col in feature_cols if col not in pred_data.columns]
    if missing_cols:
        raise KeyError(f"Model for {region} expects features missing from df: {missing_cols}")

    # Every stored method is scored through the same predict() call
    y_pred = model.predict(pred_data[feature_cols])

    # Store predictions with metadata; the timestamp column keeps its
    # 'datetime_beginning_ept' name in the prediction table
    pred_df = pred_data[['datetime']].rename(columns={'datetime': 'datetime_beginning_ept'})
    pred_df['region'] = region
    pred_df['predicted_load'] = y_pred
    pred_df['date'] = pred_df['datetime_beginning_ept'].dt.date
    pred_df['hour'] = pred_df['datetime_beginning_ept'].dt.hour

    predictions_list.append(pred_df)

if not predictions_list:
    raise ValueError(
        f"No predictions were generated for {PRED_START} to {PRED_END}; "
        "check that the downloaded weather data covers this period"
    )

# Combine all predictions
all_predictions = pd.concat(predictions_list, ignore_index=True)

print(f"\nTotal predictions generated: {len(all_predictions):,}")

# Create 3D structure: Pivot to get day x region x hour
predictions_pivot = all_predictions.pivot_table(
    index='date',
    columns=['region', 'hour'],
    values='predicted_load'
)

print(f"\nPredictions shape: {predictions_pivot.shape}")
print(f"  Days: {len(predictions_pivot.index)}")
print(f"  Regions: {len(regions)}")
print(f"  Hours: 24")

# Alternative: Create a proper 3D array
dates = sorted(all_predictions['date'].unique())
hours = list(range(24))

# 3D array (days, regions, hours); cells without a prediction stay NaN
predictions_3d = np.full((len(dates), len(regions), len(hours)), np.nan)

# Fill every cell in one vectorized assignment instead of scanning the whole
# table once per (day, region, hour). When a cell has several rows, the first
# one is used.
date_pos = {day: i for i, day in enumerate(dates)}
region_pos = {name: j for j, name in enumerate(regions)}
first_per_cell = all_predictions.drop_duplicates(subset=['date', 'region', 'hour'], keep='first')
day_idx = first_per_cell['date'].map(date_pos.get).to_numpy(dtype=int)
region_idx = first_per_cell['region'].map(region_pos.get).to_numpy(dtype=int)
hour_idx = first_per_cell['hour'].to_numpy(dtype=int)  # hours are 0..23, i.e. already positions
predictions_3d[day_idx, region_idx, hour_idx] = first_per_cell['predicted_load'].to_numpy(dtype=float)

print(f"\n3D Array shape: {predictions_3d.shape}")
print(f"  Dimension 0 (days): {predictions_3d.shape[0]}")
print(f"  Dimension 1 (regions): {predictions_3d.shape[1]}")
print(f"  Dimension 2 (hours): {predictions_3d.shape[2]}")

# Create a more accessible DataFrame format: one row per (date, region), one column per hour
print("\nCreating multi-index DataFrame...")
predictions_df = all_predictions.pivot_table(
    index=['date', 'region'],
    columns='hour',
    values='predicted_load'
).reset_index()

# Rename hour columns for clarity
hour_cols = {i: f'hour_{i:02d}' for i in range(24)}
predictions_df = predictions_df.rename(columns=hour_cols)

print(f"Predictions DataFrame shape: {predictions_df.shape}")
print(f"Columns: {list(predictions_df.columns[:5])}...")

# Display sample
print("\nSample predictions:")
print(predictions_df.head(10))

# Save predictions
predictions_output = os.path.join(OUTPUT_DIR, 'predictions')
os.makedirs(predictions_output, exist_ok=True)

# Save as CSV
csv_path = os.path.join(predictions_output, f'hourly_predictions_{PRED_START}_to_{PRED_END}.csv')
predictions_df.to_csv(csv_path, index=False)
print(f"\nPredictions saved to: {csv_path}")

# Save 3D array as numpy file
array_path = os.path.join(predictions_output, f'predictions_3d_{PRED_START}_to_{PRED_END}.npy')
np.save(array_path, predictions_3d)
print(f"3D array saved to: {array_path}")

# Save metadata describing the axes of the 3D array
metadata = {
    'dates': dates,
    'regions': regions,
    'hours': hours,
    'shape': predictions_3d.shape,
    'period': f"{PRED_START} to {PRED_END}"
}
metadata_path = os.path.join(predictions_output, f'predictions_metadata_{PRED_START}_to_{PRED_END}.pkl')
with open(metadata_path, 'wb') as f:
    pickle.dump(metadata, f)
print(f"Metadata saved to: {metadata_path}")

print("\n" + "="*70)
print("Prediction generation complete!")
print("="*70)