# 02 — Feature Importance
Random Forest feature importance (weather-only features) for demand, price, wind onshore, and residual load.

In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Load the cleaned dataset (path is relative to this notebook) and report its
# dimensions as (rows, columns).
DATA_PATH = '../cleaned_data.parquet'
df = pd.read_parquet(DATA_PATH)
print(f"Shape: {df.shape}")

Shape: (35056, 80)


In [2]:
# Weather-only predictors: keep every column whose name begins with one of the
# prefixes below. Time features are deliberately left out so the importances
# reflect the weather signal alone.
weather_prefixes = (
    'temp', 'pressure', 'humidity', 'wind_speed', 'wind_deg',
    'rain_1h', 'rain_3h', 'snow_3h', 'clouds_all',
)
# str.startswith accepts a tuple and is true when any of the prefixes matches.
weather_cols = [col for col in df.columns if col.startswith(weather_prefixes)]
feature_cols = weather_cols

# Residual load: demand remaining after subtracting variable renewables
# (onshore wind and solar generation).
df['residual_load'] = (
    df['total load actual']
    - df['generation wind onshore']
    - df['generation solar']
)

# Short target name -> column in df holding that target.
targets = dict(
    demand='total load actual',
    price='price actual',
    wind_onshore='generation wind onshore',
    residual_load='residual_load',
)

# Quick sanity report on the feature count, targets, and the derived column.
residual = df['residual_load']
print(f"Weather features: {len(feature_cols)}")
print(f"Targets: {list(targets)}")
print(f"\nResidual load stats:")
print(f"  Mean: {residual.mean():.0f} MW")
print(f"  Min:  {residual.min():.0f} MW")
print(f"  Max:  {residual.max():.0f} MW")

Weather features: 55
Targets: ['demand', 'price', 'wind_onshore', 'residual_load']

Residual load stats:
  Mean: 21800 MW
  Min:  6707 MW
  Max:  37276 MW


In [3]:
# Fit one Random Forest per target on the shared weather feature matrix and
# record, for each target, the held-out R² and the seven features with the
# largest importance. `results` maps target name -> {'r2', 'features'}.
results = {}

# Missing feature values are replaced with 0 before fitting.
X_all = df[feature_cols].fillna(0)

for name in targets:
    target_col = targets[name]
    print(f"\n--- {name} ({target_col}) ---")
    y_all = df[target_col]

    # 80/20 split with a fixed seed for reproducibility.
    # NOTE(review): rows are assigned to train/test at random; if df is ordered
    # in time, neighbouring rows can land on both sides of the split — confirm
    # whether a chronological split would be more appropriate here.
    X_train, X_test, y_train, y_test = train_test_split(
        X_all,
        y_all,
        test_size=0.2,
        random_state=42,
    )

    rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    rf.fit(X_train, y_train)

    # score() on the held-out rows gives the test-set R².
    r2_test = rf.score(X_test, y_test)

    # Label importances with their feature names, then keep the seven largest.
    importances = pd.Series(rf.feature_importances_, index=feature_cols)
    top7 = importances.nlargest(7)

    top_features = [
        {'feature': feat, 'importance': round(imp, 4)}
        for feat, imp in zip(top7.index, top7.values)
    ]
    results[name] = {'r2': round(r2_test, 3), 'features': top_features}

    print(f"R² (test): {r2_test:.3f}")
    print(f"Train: {len(X_train)}, Test: {len(X_test)}")
    print(f"Top 7: {top7.index.tolist()}")


--- demand (total load actual) ---


R² (test): 0.560
Train: 28044, Test: 7012
Top 7: ['humidity_valencia', 'humidity_bilbao', 'temp_barcelona', 'humidity_seville', 'humidity_barcelona', 'wind_deg_barcelona', 'wind_deg_seville']

--- price (price actual) ---


R² (test): 0.640
Train: 28044, Test: 7012
Top 7: ['pressure_barcelona', 'temp_max_seville', 'temp_min_valencia', 'pressure_bilbao', 'pressure_seville', 'humidity_valencia', 'wind_speed_madrid']

--- wind_onshore (generation wind onshore) ---


R² (test): 0.581
Train: 28044, Test: 7012
Top 7: ['wind_speed_madrid', 'pressure_bilbao', 'pressure_barcelona', 'pressure_seville', 'humidity_valencia', 'temp_max_barcelona', 'pressure_madrid']

--- residual_load (residual_load) ---


R² (test): 0.493
Train: 28044, Test: 7012
Top 7: ['pressure_bilbao', 'humidity_valencia', 'wind_deg_valencia', 'temp_barcelona', 'pressure_barcelona', 'humidity_bilbao', 'clouds_all_bilbao']


In [4]:
# Export the per-target results as JSON for the dashboard.
import os
os.makedirs('../dashboard/public/data', exist_ok=True)

# Serialize BEFORE opening the file: open(..., 'w') truncates the target, so a
# serialization error raised afterwards would leave an empty or partial
# feature_importance.json behind. Building the string first also means the
# results are encoded once and reused for the preview below.
results_json = json.dumps(results, indent=2)

with open('../dashboard/public/data/feature_importance.json', 'w') as f:
    f.write(results_json)

print('Saved feature_importance.json')
# Preview only the first 500 characters to keep the cell output short.
print(results_json[:500])

Saved feature_importance.json
{
  "demand": {
    "r2": 0.56,
    "features": [
      {
        "feature": "humidity_valencia",
        "importance": 0.1205
      },
      {
        "feature": "humidity_bilbao",
        "importance": 0.056
      },
      {
        "feature": "temp_barcelona",
        "importance": 0.0503
      },
      {
        "feature": "humidity_seville",
        "importance": 0.0374
      },
      {
        "feature": "humidity_barcelona",
        "importance": 0.0374
      },
      {
        "feature":
