# Smart Meter — 15‑Minute Load Forecasting, EDA & Power BI Prep
This notebook assumes **15‑minute load readings** (`daily_consumption_load`) for 5 meters across 5 locations.
It performs:
- Cleaning & KPI engineering (Loss Ratio = `1 − billed/supplied`, Technical Loss kWh)
- EDA (trends, seasonality) at 15‑min/hourly/daily levels
- Per‑location forecasting with **HistGradientBoostingRegressor**
- **96‑step (next 24 hours)** recursive forecast at 15‑minute intervals
- Exports tidy CSVs for Power BI


In [12]:

# === 0) Imports & Config ===
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from pathlib import Path
from datetime import timedelta

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Paths (change if needed): the first candidate that exists on disk is used.
CANDIDATES = [
    Path('smart_meter_data.csv'),
]
DATA_PATH = next((p for p in CANDIDATES if p.exists()), None)
if DATA_PATH is None:
    # Report the locations that were actually searched so the message cannot
    # drift out of sync with CANDIDATES.
    raise FileNotFoundError(
        'Smart-meter CSV not found. Looked for: '
        + ', '.join(str(p.resolve()) for p in CANDIDATES)
    )

# Output folders for the exported tables and saved figures.
OUTDIR = Path('exports_15min'); OUTDIR.mkdir(parents=True, exist_ok=True)
FIGDIR = OUTDIR / 'figs'; FIGDIR.mkdir(parents=True, exist_ok=True)


In [13]:

# === 1) Load & normalize columns ===
# Read the CSV located at DATA_PATH; headers are mapped to canonical names below.
raw = pd.read_csv(DATA_PATH)

def guess(cols, keys):
    """Return the first column whose lowercased name contains any keyword.

    Columns are scanned in the order given; the first column that contains at
    least one of `keys` as a substring wins, regardless of keyword order.
    Returns None when no column matches.
    """
    for name in cols:
        for key in keys:
            if key in name.lower():
                return name
    return None

# Resolve each canonical role to a raw column by keyword match.
# `guess` scans columns in file order, so the first matching column wins.
col_dt  = guess(raw.columns, ['datetime','timestamp','date','time'])
col_msn = guess(raw.columns, ['msn','meter','serial','meter_id','id'])
col_loc = guess(raw.columns, ['location','city','site','region'])
col_load= guess(raw.columns, ['daily_consumption','consumption','load','kwh','energy_use'])
col_sup = guess(raw.columns, ['energy_supplied','supplied','supply_input'])   # optional
col_bil = guess(raw.columns, ['energy_billed','billed','metered'])            # optional

assert col_dt and col_msn and col_loc and col_load, 'Missing expected columns; check your CSV headers.'

# Rename only the columns that were actually found: optional columns that are
# absent would otherwise put a None key into the mapping.
found_roles = [
    (src, dst)
    for src, dst in [
        (col_dt, 'timestamp'), (col_msn, 'msn'), (col_loc, 'location'),
        (col_load, 'load_15m'), (col_sup, 'energy_supplied'), (col_bil, 'energy_billed'),
    ]
    if src is not None
]
rename_map = dict(found_roles)
if len(rename_map) != len(found_roles):
    # Two roles resolved to the same raw column; a dict would silently keep
    # only the last one and a required column would go missing downstream.
    raise ValueError(f'Ambiguous column detection, one header matched several roles: {found_roles}')

df = raw.rename(columns=rename_map).copy()

# NOTE(review): utc=True interprets timezone-naive stamps as UTC before the
# conversion to Asia/Kolkata. If the CSV already holds local (IST) wall-clock
# times this shifts every reading by +05:30 — confirm against the data source.
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce', utc=True).dt.tz_convert('Asia/Kolkata')
df = df.dropna(subset=['timestamp']).sort_values('timestamp')

# Coerce measurement columns to numbers; unparseable values become NaN.
for c in ['load_15m','energy_supplied','energy_billed']:
    if c in df.columns: df[c] = pd.to_numeric(df[c], errors='coerce')

# Basic checks
print('Rows:', len(df), '| Date range:', df['timestamp'].min(), '→', df['timestamp'].max())
print('Locations:', df['location'].nunique(), df['location'].unique())
df.head()


Rows: 525600 | Date range: 2021-01-01 05:30:00+05:30 → 2024-01-01 05:15:00+05:30
Locations: 5 ['Chennai' 'Delhi' 'Mumbai' 'Bengaluru' 'Kolkata']


Unnamed: 0,timestamp,msn,location,load_15m,energy_supplied,energy_billed
0,2021-01-01 05:30:00+05:30,ACE43B7D,Chennai,2.59,2.78,2.57
105120,2021-01-01 05:30:00+05:30,7F6ACD62,Delhi,2.96,3.14,2.93
420480,2021-01-01 05:30:00+05:30,3F42A75F,Mumbai,4.83,4.82,4.81
210240,2021-01-01 05:30:00+05:30,0653DAA9,Bengaluru,1.55,1.66,1.53
315360,2021-01-01 05:30:00+05:30,344684B5,Kolkata,2.32,2.4,2.37


In [14]:

# === 2) KPI engineering ===
# Both energy columns are optional in the input; every use below is guarded
# by this single flag so the cell also runs when they are absent.
has_energy = {'energy_supplied', 'energy_billed'} <= set(df.columns)

if has_energy:
    # A zero supply would divide by zero; treat it as missing, then report 0 loss.
    denom = df['energy_supplied'].replace(0, np.nan)
    df['loss_ratio'] = (1.0 - (df['energy_billed'] / denom)).clip(lower=0).fillna(0.0)
    df['technical_loss_kwh'] = (df['energy_supplied'] - df['energy_billed']).clip(lower=0)
else:
    df['loss_ratio'] = np.nan
    df['technical_loss_kwh'] = np.nan


def _aggregate_grain(frame, freq, load_name):
    """Resample the 15-minute facts to `freq` per (location, msn).

    Loads, energies and technical loss are summed; `loss_ratio` is the plain
    (unweighted) mean of the 15-minute ratios. The summed load column is
    named `load_name`. Energy columns are included only if present in `frame`.
    """
    spec = {load_name: ('load_15m', 'sum')}
    for col in ('energy_supplied', 'energy_billed'):
        if col in frame.columns:
            spec[col] = (col, 'sum')
    spec['loss_ratio'] = ('loss_ratio', 'mean')
    spec['technical_loss_kwh'] = ('technical_loss_kwh', 'sum')

    # Select the value columns explicitly so the resampler does not operate
    # on the grouping columns (deprecated behaviour in recent pandas).
    value_cols = list(dict.fromkeys(src for src, _ in spec.values()))
    return (frame.set_index('timestamp')
                 .groupby(['location', 'msn'])[value_cols]
                 .resample(freq)
                 .agg(**spec)
                 .reset_index())


# Aggregations for other grains ('1h' replaces the deprecated '1H' alias)
df_hourly = _aggregate_grain(df, '1h', 'load_1h')
df_daily = _aggregate_grain(df, '1D', 'load_1d')

# System KPIs
kpis = {
    'rows': len(df),
    'avg_load_15m': df['load_15m'].mean(),
    'peak_load_15m': df['load_15m'].max(),
    'total_consumption_kwh': df['load_15m'].sum(),
    # max(1e-6, ...) keeps the ratio finite when nothing was supplied.
    'system_loss_ratio': float(1 - (df['energy_billed'].sum() / max(1e-6, df['energy_supplied'].sum()))) if has_energy else np.nan,
    'total_technical_loss_kwh': float(df['technical_loss_kwh'].sum()) if 'technical_loss_kwh' in df.columns else np.nan
}
pd.DataFrame([kpis])


  .groupby(['location','msn']).resample('1H')
  .agg(load_1h=('load_15m','sum'),
  .agg(load_1d=('load_15m','sum'),


Unnamed: 0,rows,avg_load_15m,peak_load_15m,total_consumption_kwh,system_loss_ratio,total_technical_loss_kwh
0,525600,2.542772,5.23,1336481.08,0.074915,108228.68


In [15]:

# === 3) EDA quick looks ===

# (a) Overall 15-min load trend (last 7 days only, to keep the chart light)
last_week = df['timestamp'].max() - pd.Timedelta(days=7)
sample = df[df['timestamp'] >= last_week]

fig, ax = plt.subplots()
for loc, sub in sample.groupby('location'):
    ax.plot(sub['timestamp'], sub['load_15m'], label=loc, linewidth=0.8)
ax.set_title('15-min Load — last 7 days by location')
ax.set_xlabel('Timestamp')
ax.set_ylabel('Load per 15-min interval')
ax.tick_params(axis='x', labelrotation=45)
ax.legend(ncol=3, fontsize=8)
fig.tight_layout()
fig.savefig(FIGDIR/'trend_15m_last7d.png')
plt.close(fig)  # figure is only needed on disk

# (b) Hour-of-day profile by location
# The hour must be a real column: grouping by an external Series together with
# as_index=False drops that key from the result, leaving the profile without
# its hour dimension.
hourly_profile = (df_hourly
                  .assign(hour=df_hourly['timestamp'].dt.hour)
                  .groupby(['location', 'hour'], as_index=False)['load_1h']
                  .mean())
hourly_profile.to_csv(OUTDIR/'hourly_profile_by_location.csv', index=False)

# (c) Loss ratio by location (daily mean)
loss_by_loc = df_daily.groupby('location', as_index=False)['loss_ratio'].mean()
loss_by_loc.to_csv(OUTDIR/'loss_ratio_by_location.csv', index=False)

print('Saved EDA tables and trend figure.')


Saved EDA tables and trend figure.


  hourly_profile = df_hourly.groupby(['location', df_hourly['timestamp'].dt.hour], as_index=False)['load_1h'].mean()


In [16]:

# === 4) Feature engineering for 15-min forecasting ===
# Features (and later the models) are built separately for each location, so
# lags and rolling windows never mix readings from different locations.
# 15-min steps: 4 per hour, 96 per day; 7 days = 672 steps.
# Lags, in steps: 15/30/45 min, 1 h, 2 h, 3 h, 4 h, 6 h, 12 h, 1 d, 3 d, 7 d.
LAGS = [1,2,3,4,8,12,16,24,48,96,288,672]
ROLLS = [96, 288, 672]  # rolling-mean windows in steps: 1d, 3d, 7d

def build_features(g, lags=None, rolls=None):
    """Add calendar, lag and rolling-mean features to one time series.

    Parameters
    ----------
    g : DataFrame with a datetime `timestamp` column and a `load_15m` column.
        It is sorted by timestamp on a copy; the caller's frame is not modified.
    lags : iterable of int, optional
        Row offsets for `lag_<n>` columns. Defaults to the notebook's LAGS.
    rolls : iterable of int, optional
        Window lengths (rows) for `roll_<n>` columns. Defaults to ROLLS.

    Returns
    -------
    DataFrame with the added columns. Leading rows carry NaN wherever a lag
    or window reaches before the start of the series.

    Lags are positional row shifts, so they equal time offsets only when the
    rows are evenly spaced with no gaps.
    """
    lags = LAGS if lags is None else lags
    rolls = ROLLS if rolls is None else rolls

    g = g.sort_values('timestamp').copy()
    stamp = g['timestamp'].dt
    g['minute_of_day'] = stamp.hour*60 + stamp.minute
    g['quarter'] = stamp.quarter          # calendar quarter of the year
    g['dayofweek'] = stamp.dayofweek
    g['is_weekend'] = (g['dayofweek']>=5).astype(int)

    for lag in lags:
        g[f'lag_{lag}'] = g['load_15m'].shift(lag)

    # Shift by one row first so a window never includes the value being predicted.
    previous = g['load_15m'].shift(1)
    for win in rolls:
        g[f'roll_{win}'] = previous.rolling(win).mean()
    return g

# Assemble one feature table per location, then stack them.
base_cols = ['timestamp','location','msn','load_15m']
feat_frames = [
    build_features(group[base_cols].copy())
    for _, group in df.groupby('location')
]
# Rows whose lags/windows reach before the start of a series contain NaN and are dropped.
features = pd.concat(feat_frames, ignore_index=True).dropna()
features.head()


Unnamed: 0,timestamp,location,msn,load_15m,minute_of_day,quarter,dayofweek,is_weekend,lag_1,lag_2,...,lag_12,lag_16,lag_24,lag_48,lag_96,lag_288,lag_672,roll_96,roll_288,roll_672
672,2021-01-08 05:30:00+05:30,Bengaluru,0653DAA9,3.54,330,1,4,0,1.84,3.43,...,4.36,3.59,3.49,1.63,1.57,1.88,1.55,2.338333,2.538958,2.574583
673,2021-01-08 05:45:00+05:30,Bengaluru,0653DAA9,3.44,345,1,4,0,3.54,1.84,...,0.73,1.34,3.19,3.74,3.33,4.37,4.5,2.358854,2.544722,2.577545
674,2021-01-08 06:00:00+05:30,Bengaluru,0653DAA9,2.02,360,1,4,0,3.44,3.54,...,2.56,1.53,3.55,4.12,0.49,4.19,1.79,2.36,2.541493,2.575967
675,2021-01-08 06:15:00+05:30,Bengaluru,0653DAA9,4.24,375,1,4,0,2.02,3.44,...,2.24,3.29,2.57,3.28,4.72,1.69,3.11,2.375938,2.533958,2.57631
676,2021-01-08 06:30:00+05:30,Bengaluru,0653DAA9,0.59,390,1,4,0,4.24,2.02,...,1.42,4.36,2.52,4.21,0.57,3.93,2.67,2.370937,2.542813,2.577991


In [17]:

# === 5) Train per-location models & evaluate on the most recent 7 days ===
# HistGradientBoostingRegressor enables early stopping automatically on large
# training sets and then draws a random validation split; a fixed seed keeps
# the fitted models and reported metrics reproducible across runs.
RANDOM_STATE = 42
NON_FEATURE_COLS = ['load_15m','timestamp','msn','location']  # target + identifiers

models = {}
metrics = []
predictions = []

for loc, sub in features.groupby('location'):
    sub = sub.sort_values('timestamp')
    # Chronological hold-out: the final 7 days are never seen during training.
    cutoff = sub['timestamp'].max() - pd.Timedelta(days=7)
    train = sub[sub['timestamp'] <= cutoff]
    test  = sub[sub['timestamp'] > cutoff]

    X_train, y_train = train.drop(columns=NON_FEATURE_COLS), train['load_15m']
    X_test,  y_test  = test.drop(columns=NON_FEATURE_COLS),  test['load_15m']

    model = HistGradientBoostingRegressor(max_depth=8, learning_rate=0.05, max_iter=500,
                                          random_state=RANDOM_STATE)
    model.fit(X_train, y_train)
    yhat = model.predict(X_test)

    mae = mean_absolute_error(y_test, yhat)
    rmse = np.sqrt(mean_squared_error(y_test, yhat))
    models[loc] = model
    metrics.append({'location': loc, 'MAE': mae, 'RMSE': rmse})

    p = pd.DataFrame({'timestamp': test['timestamp'], 'location': loc, 'actual': y_test.values, 'predicted': yhat})
    predictions.append(p)

metrics_df = pd.DataFrame(metrics).sort_values('RMSE')
pred_df = pd.concat(predictions, ignore_index=True)
metrics_df, pred_df.head()


(    location       MAE      RMSE
 2      Delhi  1.029511  1.196916
 1    Chennai  1.060541  1.218121
 0  Bengaluru  1.039606  1.223343
 4     Mumbai  1.071065  1.234117
 3    Kolkata  1.093073  1.253353,
                   timestamp   location  actual  predicted
 0 2023-12-25 05:30:00+05:30  Bengaluru    2.34   2.551508
 1 2023-12-25 05:45:00+05:30  Bengaluru    2.11   2.541508
 2 2023-12-25 06:00:00+05:30  Bengaluru    3.56   2.541423
 3 2023-12-25 06:15:00+05:30  Bengaluru    3.03   2.541508
 4 2023-12-25 06:30:00+05:30  Bengaluru    1.40   2.541508)

In [18]:

# === 6) 96-step (24h) recursive forecast at 15-min intervals ===
HORIZON_STEPS = 96                    # 96 x 15 min = next 24 hours
STEP = pd.Timedelta(minutes=15)
# Keep enough history to fill the longest lag and the longest rolling window.
HISTORY_ROWS = max(max(LAGS), max(ROLLS)) + 5

future_all = []

for loc, model in models.items():
    X_cols = list(model.feature_names_in_)   # feature order used at fit time

    # Most recent observed loads for this location, oldest first.
    loc_rows = df[df['location']==loc].sort_values('timestamp')
    history = loc_rows['load_15m'].iloc[-HISTORY_ROWS:].tolist()
    last_ts = loc_rows['timestamp'].max()

    for step in range(1, HORIZON_STEPS + 1):
        ts = last_ts + STEP * step
        recent = pd.Series(history, dtype=float)

        # Same features as training: lag_k is the value k steps before `ts`,
        # roll_w is the mean of the w values immediately preceding `ts`.
        row = {
            'minute_of_day': ts.hour*60 + ts.minute,
            'quarter': ts.quarter,
            'dayofweek': ts.dayofweek,
            'is_weekend': int(ts.dayofweek>=5)
        }
        for lag in LAGS:
            row[f'lag_{lag}'] = recent.iloc[-lag] if len(recent)>=lag else np.nan
        for win in ROLLS:
            row[f'roll_{win}'] = recent.iloc[-win:].mean() if len(recent)>=win else np.nan

        # Align to the training columns; any feature not built above is filled with 0.0.
        row_df = pd.DataFrame([row]).reindex(columns=X_cols, fill_value=0.0)

        yhat = float(model.predict(row_df)[0])
        future_all.append({'timestamp': ts, 'location': loc, 'forecast_15m': yhat})
        # Recursive step: the prediction becomes history for the next step.
        history.append(yhat)

forecast_24h = pd.DataFrame(future_all).sort_values(['location','timestamp'])
forecast_24h.head()


Unnamed: 0,timestamp,location,forecast_15m
0,2024-01-01 05:30:00+05:30,Bengaluru,2.5469
1,2024-01-01 05:45:00+05:30,Bengaluru,2.541508
2,2024-01-01 06:00:00+05:30,Bengaluru,2.541508
3,2024-01-01 06:15:00+05:30,Bengaluru,2.546116
4,2024-01-01 06:30:00+05:30,Bengaluru,2.541508


In [19]:

# === 7) Save artifacts for Power BI ===
# File name -> table, written in this order under OUTDIR without the index.
exports = {
    # Facts at three grains
    'fact_15min.csv': df,
    'fact_hourly.csv': df_hourly,
    'fact_daily.csv': df_daily,
    # Model outputs
    'model_metrics_by_location.csv': metrics_df,
    'test_predictions_15m_by_location.csv': pred_df,
    'next24h_forecast_15m_by_location.csv': forecast_24h,
    # EDA helper tables
    'hourly_profile_by_location.csv': hourly_profile,
    'loss_ratio_by_location.csv': loss_by_loc,
}
for filename, table in exports.items():
    table.to_csv(OUTDIR/filename, index=False)

print('All exports written to:', OUTDIR)


All exports written to: exports_15min
