In [None]:
import os
from datetime import timedelta

import numpy as np
import pandas as pd

# 1. Load and Prepare Data

In [None]:
def load_consumption_data(filepath):
    """Load consumption data into a wide frame sorted by 'measured_at'.

    Two on-disk layouts are supported:

    * Long format: comma-separated, with 'measured_at', 'group_id' and
      'consumption_fwh' columns (one row per group per timestamp). It is
      pivoted so that every group becomes its own column.
    * Wide format: semicolon-separated with comma decimals, one column per
      group next to 'measured_at'.

    Args:
        filepath: Path of the CSV file. The file is opened twice (header
            sniff, then full read), so a one-shot stream is not suitable.

    Returns:
        DataFrame with a parsed 'measured_at' column followed by the group
        columns, sorted by 'measured_at' with a fresh 0..n-1 index.
    """
    # Only the header row is needed to tell the layouts apart. Parsing the whole
    # file as comma-separated would fail on wide files, because comma decimals
    # give their rows an inconsistent number of comma-delimited fields.
    header = pd.read_csv(filepath, nrows=0)

    if 'group_id' in header.columns:
        # Long format - pivot to wide format
        print("Detected long format data, pivoting to wide format...")
        df = pd.read_csv(filepath)
        df['measured_at'] = pd.to_datetime(df['measured_at'])
        df = df.pivot(index='measured_at', columns='group_id', values='consumption_fwh').reset_index()
    else:
        # Wide format (semicolon separated, comma decimal)
        df = pd.read_csv(filepath, sep=';', decimal=',')
        df['measured_at'] = pd.to_datetime(df['measured_at'])

    return df.sort_values('measured_at').reset_index(drop=True)

In [None]:
# Read the hourly consumption table, then report its size, time span and group count
input_file = 'data/merged_hourly_for_azure.csv'
df = load_consumption_data(input_file)

print("Loaded", len(df), "hourly records")
print("Date range:", df['measured_at'].min(), "to", df['measured_at'].max())
print("Number of groups:", df.shape[1] - 1)  # 'measured_at' is not counted as a group
df.head()

# 2. Create 48-Hour Validation Set

In [None]:
def create_48h_validation_set(df, validation_start_date):
    """
    Create 48-hour validation set with baseline predictions.
    Baseline: Uses same hour from one week earlier.

    Baseline rows are matched to validation rows by timestamp (validation
    timestamp minus exactly one week), not by position. An hour missing from
    the baseline week therefore yields NaN for that hour instead of shifting
    the remaining hours or raising on a length mismatch.

    Args:
        df: DataFrame with a datetime 'measured_at' column; every other column
            is treated as a consumption group
        validation_start_date: String like '2024-09-01' for validation start.
            A naive value is interpreted in the timezone of the data; a value
            that already carries an offset is converted to it.

    Returns:
        actual_df: Actual consumption for validation period
        baseline_df: Baseline predictions, carrying the same 'measured_at'
            values and row index as actual_df
    """
    val_start = pd.to_datetime(validation_start_date)
    # Match timezone of the data if it has one
    data_tz = df['measured_at'].dt.tz
    if data_tz is not None:
        if val_start.tzinfo is None:
            val_start = val_start.tz_localize(data_tz)
        else:
            # tz_localize refuses timestamps that are already tz-aware
            val_start = val_start.tz_convert(data_tz)
    val_end = val_start + timedelta(hours=47)  # 48 hours total (0-47)
    one_week = timedelta(weeks=1)

    # Get actual values for validation period
    actual_mask = (df['measured_at'] >= val_start) & (df['measured_at'] <= val_end)
    actual_df = df[actual_mask].copy()

    if len(actual_df) != 48:
        print(f"Warning: Expected 48 hours, got {len(actual_df)} hours")

    # Rows of the same 48-hour window one week earlier
    baseline_mask = (df['measured_at'] >= val_start - one_week) & (df['measured_at'] <= val_end - one_week)
    baseline_data = df[baseline_mask]

    if len(baseline_data) != 48:
        print(f"Warning: Baseline data has {len(baseline_data)} hours instead of 48")

    group_cols = [col for col in df.columns if col != 'measured_at']

    # Look up each validation hour's counterpart by timestamp. Duplicated
    # timestamps are collapsed to their first row because reindex needs unique labels.
    history = baseline_data.set_index('measured_at')[group_cols]
    history = history[~history.index.duplicated(keep='first')]
    wanted = pd.DatetimeIndex(actual_df['measured_at']) - one_week

    baseline_df = actual_df.copy()
    baseline_df[group_cols] = history.reindex(wanted).to_numpy()

    return actual_df, baseline_df

In [None]:
# Build the 48-hour validation window and its week-earlier baseline.
# The window starts at midnight on September 29, 2024.
val_start_48h = '2024-09-29T00:00:00'
actual_48h, baseline_48h = create_48h_validation_set(df, val_start_48h)

print("\n48-Hour Validation Set:")
print("Period:", actual_48h['measured_at'].min(), "to", actual_48h['measured_at'].max())
print("Number of hours:", len(actual_48h))
print("\nActual values sample:", actual_48h.head(), sep="\n")
print("\nBaseline values sample:", baseline_48h.head(), sep="\n")

# 3. Create 12-Month Validation Set


In [None]:
def _window_totals(df, group_cols, start, end):
    """Sum each group column over [start, end); return None when no row falls in the window."""
    in_window = (df['measured_at'] >= start) & (df['measured_at'] < end)
    rows = df.loc[in_window, group_cols]
    if len(rows) == 0:
        return None
    # min_count=1 keeps a group whose values are all missing as NaN instead of
    # reporting a misleading total of 0.
    return rows.sum(min_count=1).tolist()


def create_12m_validation_set(df, validation_start_month):
    """
    Create 12-month validation set with baseline predictions.
    Baseline: Uses same month from one year earlier.

    Each monthly total is the sum of a group's values over
    [month start, month start + 1 month). A month without any rows gets NaN for
    every group (and prints a warning); a group with only missing values
    inside a month gets NaN for that month.

    Args:
        df: DataFrame with hourly consumption data and a datetime
            'measured_at' column; every other column is treated as a group
        validation_start_month: String like '2024-10-01' for validation start.
            A naive value is interpreted in the timezone of the data; a value
            that already carries an offset is converted to it.

    Returns:
        actual_df: Actual monthly consumption for validation period
        baseline_df: Baseline predictions, labelled with the validation months
    """
    val_start = pd.to_datetime(validation_start_month)
    # Match timezone of the data if it has one
    data_tz = df['measured_at'].dt.tz
    if data_tz is not None:
        if val_start.tzinfo is None:
            val_start = val_start.tz_localize(data_tz)
        else:
            # tz_localize refuses timestamps that are already tz-aware
            val_start = val_start.tz_convert(data_tz)

    group_cols = [col for col in df.columns if col != 'measured_at']
    no_data = [np.nan] * len(group_cols)

    actual_data = []
    baseline_data = []

    for i in range(12):
        month_start = val_start + pd.DateOffset(months=i)
        month_end = month_start + pd.DateOffset(months=1)

        # Actual values for this month
        totals = _window_totals(df, group_cols, month_start, month_end)
        if totals is None:
            print(f"Warning: No data for {month_start.strftime('%Y-%m')}")
            totals = no_data
        actual_data.append([month_start] + totals)

        # Baseline: same month from one year earlier
        baseline_month_start = month_start - pd.DateOffset(years=1)
        baseline_month_end = baseline_month_start + pd.DateOffset(months=1)

        totals = _window_totals(df, group_cols, baseline_month_start, baseline_month_end)
        if totals is None:
            print(f"Warning: No baseline data for {baseline_month_start.strftime('%Y-%m')}")
            totals = no_data
        # Baseline rows carry the validation month so both frames line up row by row
        baseline_data.append([month_start] + totals)

    # Create DataFrames
    columns = ['measured_at'] + group_cols
    actual_df = pd.DataFrame(actual_data, columns=columns)
    baseline_df = pd.DataFrame(baseline_data, columns=columns)

    return actual_df, baseline_df

In [None]:
# Build the 12-month validation set: October 2023 through September 2024,
# with the same months one year earlier as the baseline
val_start_12m = '2023-10-01'
actual_12m, baseline_12m = create_12m_validation_set(df, val_start_12m)

print("\n12-Month Validation Set:")
print("Period:", actual_12m['measured_at'].min(), "to", actual_12m['measured_at'].max())
print("Number of months:", len(actual_12m))
print("\nActual monthly totals sample:", actual_12m.head(), sep="\n")
print("\nBaseline monthly totals sample:", baseline_12m.head(), sep="\n")

# 4. Save Validation Sets

In [None]:
def save_csv_fortum_format(df, filepath):
    """Save DataFrame in Fortum format (semicolon separator, comma decimal).

    Datetimes are written as '%Y-%m-%dT%H:%M:%S.000Z'. The trailing 'Z' is a
    literal in the format string, so timezone-aware columns are converted to
    UTC first to make the label true. Timezone-naive columns are written
    unchanged, i.e. they are assumed to already hold UTC values.

    Args:
        df: DataFrame to write; it is not modified.
        filepath: Destination of the CSV file. When it is a path, missing
            parent directories are created.
    """
    out = df
    tz_cols = [col for col, dtype in df.dtypes.items() if isinstance(dtype, pd.DatetimeTZDtype)]
    if tz_cols:
        # Work on a copy so the caller's frame keeps its original timezone
        out = df.copy()
        for col in tz_cols:
            out[col] = out[col].dt.tz_convert('UTC')

    # Only real paths have a parent directory; anything else goes straight to to_csv
    if isinstance(filepath, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(filepath))
        if parent:
            os.makedirs(parent, exist_ok=True)

    out.to_csv(filepath, sep=';', decimal=',', index=False, date_format='%Y-%m-%dT%H:%M:%S.000Z')
    print(f"Saved: {filepath}")

In [None]:
# Write the 48-hour actuals and their week-earlier baseline to the FVA data folder
save_csv_fortum_format(df=actual_48h, filepath='data/fva_data/validation_48h_actual.csv')
save_csv_fortum_format(df=baseline_48h, filepath='data/fva_data/validation_48h_baseline.csv')

In [None]:
# Write the 12-month actuals and their year-earlier baseline to the FVA data folder
save_csv_fortum_format(df=actual_12m, filepath='data/fva_data/validation_12m_actual.csv')
save_csv_fortum_format(df=baseline_12m, filepath='data/fva_data/validation_12m_baseline.csv')