# Fortum Junction 2025 - Baseline Forecast Model

This notebook implements the baseline forecasting model for the Fortum energy consumption challenge.

**Baseline Logic:**
- **48-hour forecast**: Use consumption from same hour 7 days earlier
- **12-month forecast**: Use total consumption from same month 1 year earlier

## 1. Import Libraries and Load Data

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timezone
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide pandas display settings: show every column, cap printed rows at 100
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

In [None]:
# Read the merged hourly dataset (swap in pd.read_parquet for a parquet input;
# adjust the path to your environment) and parse `measured_at` into
# timezone-aware UTC timestamps in the same step.
df = pd.read_csv('data/merged_hourly_for_azure.csv').assign(
    measured_at=lambda frame: pd.to_datetime(frame['measured_at'], utc=True)
)

# Quick overview of what was loaded
first_ts, last_ts = df['measured_at'].min(), df['measured_at'].max()
print(f"Data shape: {df.shape}")
print(f"\nDate range: {first_ts} to {last_ts}")
print(f"Number of groups: {df['group_id'].nunique()}")
print("\nFirst few rows:")
df.head()

## 2. Data Exploration

In [None]:
# Count nulls in every column to spot gaps before building the baseline
missing_per_column = df.isna().sum()
print("Missing values per column:")
print(missing_per_column)

In [None]:
# How many records does each group contribute? Large spread hints at gaps.
group_counts = df.groupby('group_id').size()
fewest, most, average = group_counts.min(), group_counts.max(), group_counts.mean()
print("\nRecords per group:")
print(f"Min: {fewest}")
print(f"Max: {most}")
print(f"Mean: {average:.0f}")

In [None]:
# Plot the most recent 168 hourly readings (one week) for the first group in the data
sample_group = df['group_id'].iloc[0]
sample_data = (
    df.loc[df['group_id'] == sample_group]
    .set_index('measured_at')
    .sort_index()
)
recent_week = sample_data['consumption_fwh'].iloc[-168:]  # Last week

fig, ax = plt.subplots(figsize=(15, 4))
ax.plot(recent_week.index, recent_week)
ax.set_title(f'Last Week of Consumption - Group {sample_group}')
ax.set_xlabel('Time')
ax.set_ylabel('Consumption (FWH)')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 3. Create 48-Hour Baseline Forecast

In [None]:
def create_48h_baseline(df, forecast_start=None, horizon_hours=48, lookback_days=7):
    """
    Create an hourly baseline forecast using the same hour from 7 days earlier.

    For every group and every forecast hour, the forecast is the consumption
    observed exactly ``lookback_days`` earlier. If that hour has no valid
    reading, the nearest (in time) valid reading of the same group is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format history with columns ``group_id``, ``measured_at`` and
        ``consumption_fwh``. ``measured_at`` is coerced to UTC (naive values
        are interpreted as UTC). The input frame is not modified.
    forecast_start : datetime-like, optional
        First forecast hour. Defaults to 2024-10-01 00:00 UTC. Naive values
        are interpreted as UTC.
    horizon_hours : int, default 48
        Number of hourly steps to forecast.
    lookback_days : int, default 7
        How many days back the baseline value is taken from.

    Returns
    -------
    pandas.DataFrame
        One row per forecast hour: a ``measured_at`` column (UTC) followed by
        one column per group id (sorted). A group without any valid reading
        gets NaN forecasts.

    Notes
    -----
    - Rows with a missing timestamp or missing consumption are ignored, so a
      NaN reading is never copied into the forecast.
    - Several readings for the same group and timestamp are averaged; without
      this, a label lookup would return more than one value per forecast hour.
    """
    # Get all unique group IDs
    group_ids = sorted(df['group_id'].unique())

    # Default forecast period: Oct 1, 2024 00:00 to Oct 2, 2024 23:00 (UTC)
    if forecast_start is None:
        forecast_start = datetime(2024, 10, 1, 0, 0, 0, tzinfo=timezone.utc)
    start = pd.Timestamp(forecast_start)
    if start.tzinfo is None:
        start = start.tz_localize(timezone.utc)
    else:
        start = start.tz_convert(timezone.utc)

    hourly_timestamps = pd.date_range(start=start, periods=horizon_hours, freq='h')
    # Timestamps whose observed consumption becomes the forecast
    baseline_timestamps = hourly_timestamps - pd.Timedelta(days=lookback_days)

    print(f"Creating {horizon_hours}-hour forecast for {len(group_ids)} groups...")

    # Work on a slim, cleaned copy so the caller's frame is never touched
    history = df[['group_id', 'measured_at', 'consumption_fwh']].copy()
    history['measured_at'] = pd.to_datetime(history['measured_at'], utc=True)
    history = history.dropna(subset=['measured_at', 'consumption_fwh'])

    all_forecasts = {}
    for i, group_id in enumerate(group_ids):
        if (i + 1) % 20 == 0:
            print(f"  Processing group {i+1}/{len(group_ids)}...")

        group_rows = history.loc[history['group_id'] == group_id]
        if group_rows.empty:
            # Nothing to copy from: keep the column but mark it as unknown
            all_forecasts[group_id] = np.full(len(hourly_timestamps), np.nan)
            continue

        # groupby yields a sorted, unique DatetimeIndex (duplicates averaged),
        # which is what reindex(method='nearest') requires.
        series = group_rows.groupby('measured_at')['consumption_fwh'].mean()

        # Exact match where the baseline hour exists, nearest reading otherwise
        all_forecasts[group_id] = series.reindex(
            baseline_timestamps, method='nearest'
        ).to_numpy()

    # Build dataframe all at once
    forecast_df = pd.DataFrame({'measured_at': hourly_timestamps})
    group_columns = pd.DataFrame(all_forecasts, index=forecast_df.index)
    forecast_df = pd.concat([forecast_df, group_columns], axis=1)

    print(f"✓ {horizon_hours}-hour forecast complete: {len(forecast_df)} rows x {len(group_ids)} groups")
    return forecast_df

In [None]:
# Build the hourly baseline for every group and preview its first rows
hourly_forecast = create_48h_baseline(df)
hourly_forecast.head(5)

## 4. Create 12-Month Baseline Forecast

In [None]:
def create_12m_baseline(df, forecast_start=None, n_months=12):
    """
    Create a monthly baseline forecast using the same month from 1 year earlier.

    For every group and every target month, the forecast is the total
    consumption the group recorded in the same calendar month one year before.
    If the group has no rows in that month, the mean of its available monthly
    totals is used instead (0 if the group has no rows at all).

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format history with columns ``group_id``, ``measured_at`` and
        ``consumption_fwh``. ``measured_at`` is coerced to UTC (naive values
        are interpreted as UTC) and months are bucketed in UTC. The input
        frame is not modified.
    forecast_start : datetime-like, optional
        Start of the forecast window. Defaults to 2024-10-01 00:00 UTC, i.e.
        the months Oct 2024 - Sep 2025. Naive values are interpreted as UTC.
    n_months : int, default 12
        Number of monthly steps to forecast.

    Returns
    -------
    pandas.DataFrame
        One row per target month: a ``measured_at`` column (month start, UTC)
        followed by one column per group id (sorted).
    """
    # Get all unique group IDs
    group_ids = sorted(df['group_id'].unique())

    # Default monthly timestamps: Oct 2024 - Sep 2025
    if forecast_start is None:
        forecast_start = datetime(2024, 10, 1, 0, 0, 0, tzinfo=timezone.utc)
    start = pd.Timestamp(forecast_start)
    if start.tzinfo is None:
        start = start.tz_localize(timezone.utc)
    else:
        start = start.tz_convert(timezone.utc)
    monthly_timestamps = pd.date_range(start=start, periods=n_months, freq='MS')

    # Same calendar month, one year earlier. The timezone is dropped explicitly
    # first because to_period() on tz-aware data emits a UserWarning.
    baseline_periods = monthly_timestamps.tz_localize(None).to_period('M') - 12

    print(f"Creating {n_months}-month forecast for {len(group_ids)} groups...")

    # Tag every row with its (UTC) calendar month once, up front
    history = df[['group_id', 'measured_at', 'consumption_fwh']].copy()
    measured_at_utc = pd.to_datetime(history['measured_at'], utc=True)
    history['year_month'] = measured_at_utc.dt.tz_localize(None).dt.to_period('M')

    all_forecasts = {}
    for i, group_id in enumerate(group_ids):
        if (i + 1) % 20 == 0:
            print(f"  Processing group {i+1}/{len(group_ids)}...")

        group_rows = history.loc[history['group_id'] == group_id]

        # One aggregation per group; a month appears here iff it has rows
        monthly_totals = group_rows.groupby('year_month')['consumption_fwh'].sum()

        # Fallback for months without history: average monthly consumption
        fallback = monthly_totals.mean() if len(monthly_totals) > 0 else 0

        all_forecasts[group_id] = [
            monthly_totals[period] if period in monthly_totals.index else fallback
            for period in baseline_periods
        ]

    # Build dataframe all at once
    forecast_df = pd.DataFrame({'measured_at': monthly_timestamps})
    group_columns = pd.DataFrame(all_forecasts, index=forecast_df.index)
    forecast_df = pd.concat([forecast_df, group_columns], axis=1)

    print(f"✓ {n_months}-month forecast complete: {len(forecast_df)} rows x {len(group_ids)} groups")
    return forecast_df

In [None]:
# Build the monthly baseline for every group and preview its first rows
monthly_forecast = create_12m_baseline(df)
monthly_forecast.head(5)

## 5. Format Output for Submission

In [None]:
def format_for_submission(forecast_df):
    """
    Prepare a forecast dataframe for the submission file.

    Only the ``measured_at`` column is changed: it is rendered as an ISO 8601
    string with millisecond precision and a ``Z`` suffix, e.g.
    ``2024-10-01T00:00:00.000Z``. The other submission requirements
    (semicolon delimiter, comma as decimal separator) are not applied here;
    they are applied when the frame is written with
    ``to_csv(sep=';', decimal=',')``.

    Parameters
    ----------
    forecast_df : pandas.DataFrame
        Forecast table with a ``measured_at`` column holding timestamps
        (tz-aware, tz-naive, or already formatted strings).

    Returns
    -------
    pandas.DataFrame
        A copy of ``forecast_df`` with ``measured_at`` as formatted strings.
        The input frame is not modified.
    """
    # Create a copy
    output_df = forecast_df.copy()

    # The 'Z' suffix asserts UTC, so convert to UTC before formatting (naive
    # values are interpreted as UTC). Parsing here also makes the function safe
    # to call on a frame whose timestamps were already formatted.
    timestamps_utc = pd.to_datetime(output_df['measured_at'], utc=True)

    # %f yields microseconds (6 digits); drop the last 3 to get milliseconds
    output_df['measured_at'] = timestamps_utc.dt.strftime('%Y-%m-%dT%H:%M:%S.%f').str[:-3] + 'Z'

    return output_df

In [None]:
# NOTE(review): this cell defines `format_for_submission` a second time. When
# the notebook runs top to bottom, this later definition is the one in effect;
# one of the two cells should be deleted.
def format_for_submission(forecast_df):
    """
    Prepare a forecast dataframe for the submission file.

    Only the ``measured_at`` column is changed: it is rendered as an ISO 8601
    string with millisecond precision and a ``Z`` suffix, e.g.
    ``2024-10-01T00:00:00.000Z``. The other submission requirements
    (semicolon delimiter, comma as decimal separator) are not applied here;
    they are applied when the frame is written with
    ``to_csv(sep=';', decimal=',')``.

    Parameters
    ----------
    forecast_df : pandas.DataFrame
        Forecast table with a ``measured_at`` column holding timestamps
        (tz-aware, tz-naive, or already formatted strings).

    Returns
    -------
    pandas.DataFrame
        A copy of ``forecast_df`` with ``measured_at`` as formatted strings.
        The input frame is not modified.
    """
    # Create a copy
    output_df = forecast_df.copy()

    # The 'Z' suffix asserts UTC, so convert to UTC before formatting (naive
    # values are interpreted as UTC). Parsing here also makes the function safe
    # to call on a frame whose timestamps were already formatted.
    timestamps_utc = pd.to_datetime(output_df['measured_at'], utc=True)

    # %f yields microseconds (6 digits); drop the last 3 to get milliseconds
    output_df['measured_at'] = timestamps_utc.dt.strftime('%Y-%m-%dT%H:%M:%S.%f').str[:-3] + 'Z'

    return output_df
    

In [None]:
# Apply the submission timestamp formatting to the hourly and monthly tables
hourly_submission, monthly_submission = (
    format_for_submission(frame) for frame in (hourly_forecast, monthly_forecast)
)

print("Hourly forecast shape:", hourly_submission.shape)
print("Monthly forecast shape:", monthly_submission.shape)

# Peek at the first formatted rows
print("\nSample hourly forecast:")
print(hourly_submission.head(3))

## 6. Save to CSV Files

In [None]:
# European CSV conventions shared by both files:
# ';' between fields, ',' as decimal mark, no index column, UTF-8 text
csv_options = dict(sep=';', decimal=',', index=False, encoding='utf-8')

hourly_submission.to_csv('data/baseline_48h_forecast.csv', **csv_options)
monthly_submission.to_csv('data/baseline_12m_forecast.csv', **csv_options)

print("✓ Files saved successfully!")
print("  - baseline_48h_forecast.csv")
print("  - baseline_12m_forecast.csv")

## 7. Quick Validation

In [None]:
# Verify file format by reading back the first rows of the saved 48-hour file.
# The leading mark reflects the outcome of each check (✓ pass, ✗ fail) instead
# of being printed unconditionally.
print("Validating 48-hour forecast file...")
test_df = pd.read_csv('data/baseline_48h_forecast.csv', sep=';', decimal=',', nrows=3)

# One column per group plus the timestamp column
expected_columns = len(df['group_id'].unique()) + 1
columns_mark = "✓" if len(test_df.columns) == expected_columns else "✗"
name_mark = "✓" if test_df.columns[0] == 'measured_at' else "✗"

print(f"{columns_mark} Columns: {len(test_df.columns)} (expected: {expected_columns})")
print(f"{name_mark} First column name: '{test_df.columns[0]}' (expected: 'measured_at')")
print(f"\nFirst few rows:\n{test_df.head()}")

In [None]:
# Read both saved files back in full and report row counts and null totals
hourly_check, monthly_check = (
    pd.read_csv(path, sep=';', decimal=',')
    for path in ('data/baseline_48h_forecast.csv', 'data/baseline_12m_forecast.csv')
)
hourly_missing = hourly_check.isna().sum().sum()
monthly_missing = monthly_check.isna().sum().sum()

print("\n48-hour forecast:")
print(f"  Rows: {len(hourly_check)} (expected: 48)")
print(f"  Missing values: {hourly_missing}")

print("\n12-month forecast:")
print(f"  Rows: {len(monthly_check)} (expected: 12)")
print(f"  Missing values: {monthly_missing}")

## 8. Visualize Sample Forecasts

In [None]:
# Plot the last week of history next to the 48-hour baseline for one group
sample_group = df['group_id'].iloc[0]

# History for that group, ordered in time; keep the final 168 hours (7 days)
hist_data = (
    df.loc[df['group_id'] == sample_group]
    .set_index('measured_at')
    .sort_index()
)
last_week = hist_data.tail(168)

# Forecast values and their timestamps
forecast_values = hourly_forecast[sample_group].to_numpy()
forecast_times = pd.to_datetime(hourly_forecast['measured_at'])

fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(last_week.index, last_week['consumption_fwh'], label='Historical', alpha=0.7)
ax.plot(forecast_times, forecast_values, label='48h Baseline Forecast', color='red', linewidth=2)
ax.axvline(forecast_times.iloc[0], color='black', linestyle='--', alpha=0.5, label='Forecast Start')
ax.set(
    title=f'Baseline Forecast - Group {sample_group}',
    xlabel='Time',
    ylabel='Consumption (FWH)',
)
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Plot historical monthly totals next to the 12-month baseline for the sample group
monthly_values = monthly_forecast[sample_group].to_numpy()
monthly_times = pd.to_datetime(monthly_forecast['measured_at'])

# Historical monthly totals; the timezone is dropped before converting to periods
hist_monthly = df.loc[df['group_id'] == sample_group].copy()
naive_timestamps = hist_monthly['measured_at'].dt.tz_localize(None)
hist_monthly['year_month'] = naive_timestamps.dt.to_period('M')
hist_monthly_totals = hist_monthly.groupby('year_month')['consumption_fwh'].sum()

fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(
    hist_monthly_totals.index.to_timestamp(),
    hist_monthly_totals.values,
    marker='o',
    label='Historical Monthly Totals',
    alpha=0.7,
)
ax.plot(
    monthly_times,
    monthly_values,
    marker='s',
    color='red',
    linewidth=2,
    markersize=8,
    label='12-Month Baseline Forecast',
)
ax.axvline(monthly_times.iloc[0], color='black', linestyle='--', alpha=0.5, label='Forecast Start')
ax.set(
    title=f'Monthly Baseline Forecast - Group {sample_group}',
    xlabel='Month',
    ylabel='Total Consumption (FWH)',
)
ax.legend()
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()