# 02 — Feature Engineering
**Urban Energy Consumption Forecasting with LSTM**

This notebook demonstrates and validates every feature transformation applied
before the data reaches the LSTM.

Sections
1. Load preprocessed base series
2. Weather features (synthetic + real integration)
3. Cyclical time encodings (hour, day-of-week)
4. Full feature matrix (time encodings, public holiday flag)
5. MinMax scaling
6. Sliding-window sequence construction
7. Train / validation / test split (final dataset stats & validation)
8. Summary

In [None]:
import sys, warnings
sys.path.insert(0, '..')
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from config import RAW_DATA_FILE, FEATURE_COLUMNS, TARGET_COLUMN, LOOKBACK, HORIZON
from src.preprocess import DataPreprocessor, _cyclic_encode, _add_synthetic_weather, _add_time_features

# Notebook-wide plotting defaults: seaborn theme first, then the dpi override
# so the explicit matplotlib setting is applied last.
sns.set_theme(style='whitegrid')
plt.rcParams['figure.dpi'] = 120
print('Ready ✓')

## 1 · Load Base Series

In [None]:
# Build the preprocessor with the project-wide window sizes from `config`,
# then read the raw series it will operate on.
prep = DataPreprocessor(horizon=HORIZON, lookback=LOOKBACK)
df = prep.load_raw(RAW_DATA_FILE)

n_records = len(df)
print(f'Loaded {n_records:,} hourly records')
df.head()

## 2 · Weather Features

In [None]:
# Add the weather columns to a copy so the base series `df` is left as loaded,
# regardless of whether the helper mutates its argument.
df_w = _add_synthetic_weather(df.copy())
weather_cols = ['temperature', 'humidity', 'wind_speed', 'solar_irradiance']
titles = ['Temperature (°C)', 'Humidity (%)', 'Wind Speed (m/s)', 'Solar Irradiance (W/m²)']

# Fail early with a readable message instead of a KeyError inside the plot loop.
missing_cols = [col for col in weather_cols if col not in df_w.columns]
assert not missing_cols, f'Missing weather columns: {missing_cols}'

# Number of leading rows to plot: 30 days of hourly records.
PREVIEW_HOURS = 24 * 30

fig, axes = plt.subplots(2, 2, figsize=(14, 7))
for ax, col, title in zip(axes.ravel(), weather_cols, titles):
    sample = df_w[col].iloc[:PREVIEW_HOURS]
    ax.plot(sample.index, sample.values, linewidth=0.7)
    ax.set_title(title)
    ax.tick_params(axis='x', rotation=30)
fig.suptitle('Synthetic Weather Features — First 30 Days', y=1.01)
fig.tight_layout()
plt.show()

# Rich display rather than print(), consistent with the other summary tables
# shown in this notebook.
display(df_w[weather_cols].describe().round(3))

## 3 · Cyclical Time Encodings

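A raw integer hour feature (0–23) creates a discontinuity at midnight: hour 23 and hour 0 are adjacent in time but numerically 23 apart, so the LSTM would incorrectly treat them as very different. Cyclical encoding maps each value to a (sin, cos) pair on the unit circle, where neighbouring hours stay close together, which removes that artefact.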

In [None]:
# Encode every hour of the day and every weekday as (sin, cos) pairs.
hours = pd.Series(range(24))
dow   = pd.Series(range(7))
sin_h, cos_h = _cyclic_encode(hours, 24)
sin_d, cos_d = _cyclic_encode(dow, 7)
days = ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']

# One spec per panel; both panels are drawn by the same loop below.
panels = [
    dict(sin=sin_h, cos=cos_h, values=hours,
         tags=[str(h) for h in range(24)],
         cmap='hsv', size=80, font=7,
         title='Hourly Cyclical Encoding (sin vs cos)',
         xlabel='sin(hour)', ylabel='cos(hour)'),
    dict(sin=sin_d, cos=cos_d, values=dow,
         tags=days,
         cmap='tab10', size=100, font=8,
         title='Day-of-Week Cyclical Encoding',
         xlabel='sin(dow)', ylabel='cos(dow)'),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

for ax, panel in zip(axes, panels):
    ax.scatter(panel['sin'], panel['cos'], c=panel['values'],
               cmap=panel['cmap'], s=panel['size'], zorder=5)
    # Label each point with its hour number / weekday name.
    for pos, tag in enumerate(panel['tags']):
        ax.annotate(tag, (panel['sin'][pos], panel['cos'][pos]),
                    fontsize=panel['font'], ha='center', va='bottom')
    ax.set_title(panel['title'])
    ax.set_xlabel(panel['xlabel'])
    ax.set_ylabel(panel['ylabel'])
    # Equal aspect so the encoded points are not visually distorted.
    ax.set_aspect('equal')

fig.tight_layout()
plt.show()

## 4 · Full Feature Matrix

In [None]:
# Run the preprocessor's feature step on a copy of the base series.
df_feat = prep.add_features(df.copy())
feature_names = list(df_feat.columns)
print(f'Features present: {feature_names}')
print(f'Shape: {df_feat.shape}')
display(df_feat[FEATURE_COLUMNS].head())

# Correlation of all features with the target
corr_matrix = df_feat[FEATURE_COLUMNS].corr()
corr = corr_matrix[TARGET_COLUMN].drop(TARGET_COLUMN).sort_values()

# Red for negative correlations, green for everything else.
bar_colors = np.where(corr < 0, '#d62728', '#2ca02c').tolist()

fig, ax = plt.subplots(figsize=(8, 4))
corr.plot(kind='barh', color=bar_colors, ax=ax)
ax.axvline(0, color='black', linewidth=0.8)
ax.set_title(f'Feature Correlation with {TARGET_COLUMN}')
fig.tight_layout()
plt.show()

## 5 · MinMax Scaling

In [None]:
# NOTE(review): `fit=True` is applied to the whole feature frame here, i.e.
# before the train/val/test split in section 7. Presumably this fits the scaler
# on validation/test rows as well — confirm this matches the training pipeline.
scaled = prep.scale(df_feat, fit=True)
df_scaled = pd.DataFrame(data=scaled, index=df_feat.index, columns=FEATURE_COLUMNS)

print('Scaled value ranges:')
# Only the extremes matter for checking the scaling, so keep the min/max rows.
value_ranges = df_scaled.describe().loc[['min', 'max']]
display(value_ranges.round(4))

## 6 · Sliding-Window Sequences

In [None]:
# Build the (input window, target window) pairs from the scaled features.
X, y = prep.make_sequences(scaled, df_feat)

# Validate the claims made by the shape printout below before relying on them.
assert len(X) > 0, 'make_sequences produced no windows'
assert len(X) == len(y), f'X/y sample count mismatch: {len(X)} vs {len(y)}'
assert X.shape[1] == LOOKBACK, f'Expected lookback {LOOKBACK}, got {X.shape[1]}'

print(f'X shape: {X.shape}  →  (n_samples, lookback={LOOKBACK}, n_features={X.shape[2]})')
print(f'y shape: {y.shape}  →  (n_samples, horizon={HORIZON})')

# Visualise one window
# Clamp the index so the cell still runs on datasets with fewer than 201 windows.
sample_idx = min(200, len(X) - 1)
fig, axes = plt.subplots(1, 2, figsize=(14, 4))

# assumes feature columns 0 and 1 are power and temperature (FEATURE_COLUMNS
# order) — TODO confirm against config
axes[0].plot(X[sample_idx, :, 0], label='Power (scaled)')
axes[0].plot(X[sample_idx, :, 1], label='Temp (scaled)', alpha=0.7)
axes[0].set_title(f'Input Window #{sample_idx} — {LOOKBACK}h lookback')
axes[0].set_xlabel('Time step')
axes[0].legend()

# Targets are plotted against 1..HORIZON steps ahead.
hours_ahead = np.arange(1, HORIZON + 1)
axes[1].bar(hours_ahead, y[sample_idx], color='teal', alpha=0.8)
axes[1].set_title(f'Target Window #{sample_idx} — next {HORIZON}h (kWh)')
axes[1].set_xlabel('Hour ahead')
axes[1].set_ylabel('Global Active Power (kWh)')

fig.tight_layout()
plt.show()

## 7 · Train / Val / Test Split

In [None]:
# Scale the targets, then let the preprocessor split the windows into three sets.
y_s = prep.scale_y(y)
X_train, X_val, X_test, y_train, y_val, y_test = prep.train_val_test_split(X, y_s)

total = len(X)
print(f'Total windows   : {total:,}')
# Report each partition's size and its share of all windows.
for caption, part in [('Train           ', X_train),
                      ('Validation      ', X_val),
                      ('Test (hold-out) ', X_test)]:
    print(f'{caption}: {len(part):,}  ({len(part)/total*100:.1f}%)')

labels = ['Train', 'Validation', 'Test']
sizes  = [len(part) for part in (X_train, X_val, X_test)]
fig, ax = plt.subplots(figsize=(6, 4))
ax.pie(sizes, labels=labels, autopct='%1.1f%%',
       colors=['#1f77b4', '#ff7f0e', '#2ca02c'], startangle=140)
ax.set_title('Dataset Split')
plt.show()

## 8 · Summary

| Feature | Type | Rationale |
|---------|------|-----------|
| `Global_active_power` | Target / lagged feature | Main signal |
| `temperature` | Weather | Strong negative correlation with heating demand |
| `humidity` | Weather | Secondary climate driver |
| `wind_speed` | Weather | Affects ventilation / renewable supply |
| `solar_irradiance` | Weather | Cooling demand + PV offset |
| `hour_sin`, `hour_cos` | Cyclical time | Diurnal pattern without discontinuity |
| `dow_sin`, `dow_cos` | Cyclical time | Weekly pattern |
| `is_holiday` | Calendar | Demand reduction on public holidays |

→ Proceed to **03_Model_Training.ipynb** to train and evaluate the LSTM.