# Smart Meter Analytics, Forecasting & Power BI Prep
This notebook cleans the smart-meter dataset, computes KPIs (including **Loss Ratio**), trains forecasting models per meter/location, exports **Power BI**-ready tables, and builds a **CXO 5-slide PPT**.

**Inputs:** `smart_meter_data.csv` or `transformed_smart_meter_data.csv`

**Outputs:** written to `/mnt/data/smartmeter_outputs/`

In [2]:

# === 0) Config & imports ===
import pandas as pd, numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from datetime import timedelta

# File paths (change if needed)
CANDIDATES = [
    Path("smart_meter_data.csv"),
    # Path("/mnt/data/transformed_smart_meter_data.csv")
]
DATA_PATH = next((p for p in CANDIDATES if p.exists()), None)
assert DATA_PATH is not None, "Place your CSV at /mnt/data/smart_meter_data.csv"
OUTDIR = Path("smartmeter_outputs"); OUTDIR.mkdir(exist_ok=True)


In [3]:

# === 1) Load & column mapping ===
df_raw = pd.read_csv(DATA_PATH)

# Try to map likely columns
def guess(colnames, keys):
    cands = [c for c in colnames if any(k in c.lower() for k in keys)]
    return cands[0] if cands else None

col_dt  = guess(df_raw.columns, ["datetime","date","time","timestamp"])
col_msn = guess(df_raw.columns, ["msn","meter","serial","meter_id","id"])
col_loc = guess(df_raw.columns, ["location","city","site","region"])
col_load= guess(df_raw.columns, ["daily_consumption","consumption","load","kwh","energy_use"])
col_sup = guess(df_raw.columns, ["energy_supplied","supplied","supply_input"])
col_bil = guess(df_raw.columns, ["energy_billed","billed","metered"])

mapping = {
    "datetime": col_dt,
    "msn": col_msn,
    "location": col_loc,
    "load": col_load,
    "energy_supplied": col_sup,
    "energy_billed": col_bil
}
mapping


{'datetime': 'datetime',
 'msn': 'msn',
 'location': 'location',
 'load': 'daily_consumption_load',
 'energy_supplied': 'energy_supplied',
 'energy_billed': 'energy_billed'}

In [4]:

# === 2) Clean & derive ===
df = df_raw.rename(columns={
    mapping["datetime"]: "timestamp",
    mapping["msn"]: "msn",
    mapping["location"]: "location",
    mapping["load"]: "daily_consumption_load",
    mapping["energy_supplied"]: "energy_supplied",
    mapping["energy_billed"]: "energy_billed",
}).copy()

# Parse time & sort
df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce", utc=True).dt.tz_convert("Asia/Kolkata")
df = df.dropna(subset=["timestamp"]).sort_values("timestamp")

# Basic sanity
for c in ["daily_consumption_load","energy_supplied","energy_billed"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# Compute Loss Ratio per row if possible: 1 - billed/supplied
if "energy_supplied" in df.columns and "energy_billed" in df.columns:
    denom = df["energy_supplied"].replace(0, np.nan)
    df["loss_ratio"] = 1.0 - (df["energy_billed"] / denom)
    df["loss_ratio"] = df["loss_ratio"].clip(lower=0).fillna(0.0)
    df["technical_loss_kwh"] = (df["energy_supplied"] - df["energy_billed"]).clip(lower=0)
else:
    df["loss_ratio"] = np.nan
    df["technical_loss_kwh"] = np.nan

# Load factor proxy: daily_consumption_load / peak (per meter-month) if we had max demand;
# here we approximate by comparing to the max daily load per meter in the month.
df["month"] = df["timestamp"].dt.to_period("M")
df["day"] = df["timestamp"].dt.date

# Ensure we have exactly 5 meters and locations if present
df.head()


  df["month"] = df["timestamp"].dt.to_period("M")


Unnamed: 0,timestamp,msn,location,daily_consumption_load,energy_supplied,energy_billed,loss_ratio,technical_loss_kwh,month,day
0,2021-01-01 05:30:00+05:30,ACE43B7D,Chennai,2.59,2.78,2.57,0.07554,0.21,2021-01,2021-01-01
105120,2021-01-01 05:30:00+05:30,7F6ACD62,Delhi,2.96,3.14,2.93,0.066879,0.21,2021-01,2021-01-01
420480,2021-01-01 05:30:00+05:30,3F42A75F,Mumbai,4.83,4.82,4.81,0.002075,0.01,2021-01,2021-01-01
210240,2021-01-01 05:30:00+05:30,0653DAA9,Bengaluru,1.55,1.66,1.53,0.078313,0.13,2021-01,2021-01-01
315360,2021-01-01 05:30:00+05:30,344684B5,Kolkata,2.32,2.4,2.37,0.0125,0.03,2021-01,2021-01-01


In [5]:

# === 3) Aggregate to daily series per meter/location ===
group_cols = ["day"]
if "msn" in df.columns: group_cols.append("msn")
if "location" in df.columns: group_cols.append("location")

daily = (df
         .groupby(group_cols, as_index=False)
         .agg(daily_consumption_load=("daily_consumption_load","sum"),
              energy_supplied=("energy_supplied","sum"),
              energy_billed=("energy_billed","sum"),
              technical_loss_kwh=("technical_loss_kwh","sum"),
              loss_ratio=("loss_ratio","mean"))
        )

# Reconstruct timestamp at midnight for Power BI compatibility
daily["timestamp"] = pd.to_datetime(daily["day"])
daily = daily.drop(columns=["day"]).sort_values("timestamp")
daily.head()


Unnamed: 0,msn,location,daily_consumption_load,energy_supplied,energy_billed,technical_loss_kwh,loss_ratio,timestamp
0,0653DAA9,Bengaluru,194.54,208.33,194.24,14.09,0.066929,2021-01-01
1,344684B5,Kolkata,187.21,202.14,188.18,13.96,0.071166,2021-01-01
2,3F42A75F,Mumbai,178.77,191.96,178.16,13.8,0.077742,2021-01-01
3,7F6ACD62,Delhi,198.92,212.91,198.42,14.49,0.06607,2021-01-01
4,ACE43B7D,Chennai,170.64,184.74,171.03,13.71,0.075629,2021-01-01


In [6]:

# === 4) KPIs & insights ===
# System-wide KPIs
kpis = {}
kpis["avg_daily_load"] = daily["daily_consumption_load"].mean()
kpis["peak_daily_load"] = daily["daily_consumption_load"].max()
kpis["total_consumption"] = daily["daily_consumption_load"].sum()

if "energy_supplied" in daily.columns and daily["energy_supplied"].sum() > 0:
    kpis["system_loss_ratio"] = 1.0 - (daily["energy_billed"].sum() / daily["energy_supplied"].sum())
    kpis["total_technical_loss_kwh"] = float((daily["energy_supplied"] - daily["energy_billed"]).clip(lower=0).sum())
else:
    kpis["system_loss_ratio"] = np.nan
    kpis["total_technical_loss_kwh"] = np.nan

# By-location current month consumption & loss
if "location" in daily.columns:
    current_month = daily["timestamp"].dt.to_period("M").max()
    by_loc = (daily[daily["timestamp"].dt.to_period("M") == current_month]
              .groupby("location", as_index=False)
              .agg(monthly_consumption=("daily_consumption_load","sum"),
                   avg_loss_ratio=("loss_ratio","mean")))
else:
    by_loc = pd.DataFrame()

kpis, by_loc.head() if not by_loc.empty else "No location"


({'avg_daily_load': np.float64(243.88340875912405),
  'peak_daily_load': np.float64(300.05),
  'total_consumption': np.float64(1336481.0799999998),
  'system_loss_ratio': np.float64(0.07491478987647215),
  'total_technical_loss_kwh': 108228.68000000001},
     location  monthly_consumption  avg_loss_ratio
 0  Bengaluru                47.86        0.055811
 1    Chennai                56.04        0.073358
 2      Delhi                53.09        0.086122
 3    Kolkata                41.50        0.093180
 4     Mumbai                54.12        0.087071)

In [7]:

# === 5) Feature engineering for forecasting ===
# We'll forecast daily_consumption_load per meter (or overall if single series).
def make_features(frame):
    f = frame.copy()
    f["dayofweek"] = f["timestamp"].dt.dayofweek
    f["month"] = f["timestamp"].dt.month
    f["is_weekend"] = (f["dayofweek"] >= 5).astype(int)
    # Lags
    for lag in [1,2,3,7,14,28]:
        f[f"lag_{lag}"] = f["daily_consumption_load"].shift(lag)
    # Rolling means
    f["roll_7"] = f["daily_consumption_load"].shift(1).rolling(7).mean()
    f["roll_28"] = f["daily_consumption_load"].shift(1).rolling(28).mean()
    return f

# Split utility
def time_split(f, test_days=14):
    f = f.dropna().sort_values("timestamp")
    if len(f) <= test_days*2:
        split = int(len(f)*0.8)
        return f.iloc[:split], f.iloc[split:]
    else:
        cutoff = f["timestamp"].max() - pd.Timedelta(days=test_days)
        return f[f["timestamp"] <= cutoff], f[f["timestamp"] > cutoff]


In [9]:

# === 6) Train per meter/location models & forecast next 14 days ===
series_cols = []
if "msn" in daily.columns: series_cols.append("msn")
if "location" in daily.columns: series_cols.append("location")
if not series_cols:
    daily["series_id"] = "all"
    series_cols = ["series_id"]

models_summary = []
predictions = []
forecasts = []

for keys, sub in daily.groupby(series_cols):
    sub = sub.sort_values("timestamp")
    f = make_features(sub)
    train, test = time_split(f, test_days=14)

    feature_cols = [c for c in f.columns if c not in ["timestamp","daily_consumption_load","energy_supplied","energy_billed","technical_loss_kwh","loss_ratio"] + series_cols]
    cat_cols = []  # dayofweek/month are numeric; no cats unless we add series id
    X_train, y_train = train[feature_cols], train["daily_consumption_load"]
    X_test, y_test = test[feature_cols], test["daily_consumption_load"]

    model = Pipeline([
        ("scaler", StandardScaler(with_mean=False)),
        ("rf", RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1))
    ])
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Store test preds
    df_pred = pd.DataFrame({
        "timestamp": test["timestamp"],
        "actual": y_test.values,
        "predicted": y_pred
    })
    for i, col in enumerate(series_cols):
        df_pred[col] = sub.iloc[0][col]
    predictions.append(df_pred)

    # Recursive 14-day forecast
    last_date = sub["timestamp"].max()
    history = sub[["timestamp","daily_consumption_load"]].copy().set_index("timestamp").sort_index()
    fut_rows = []
    for h in range(1, 14+1):
        t = last_date + pd.Timedelta(days=h)
        # build one-row features from history
        feat = {
            "dayofweek": t.dayofweek,
            "month": t.month,
            "is_weekend": 1 if t.dayofweek >=5 else 0,
            "lag_1": history["daily_consumption_load"].iloc[-1] if len(history)>=1 else np.nan,
            "lag_2": history["daily_consumption_load"].iloc[-2] if len(history)>=2 else np.nan,
            "lag_3": history["daily_consumption_load"].iloc[-3] if len(history)>=3 else np.nan,
            "lag_7": history["daily_consumption_load"].iloc[-7] if len(history)>=7 else np.nan,
            "lag_14": history["daily_consumption_load"].iloc[-14] if len(history)>=14 else np.nan,
            "lag_28": history["daily_consumption_load"].iloc[-28] if len(history)>=28 else np.nan,
            "roll_7": history["daily_consumption_load"].iloc[-7:].mean() if len(history)>=7 else np.nan,
            "roll_28": history["daily_consumption_load"].iloc[-28:].mean() if len(history)>=28 else np.nan,
        }
        row = pd.DataFrame([feat])
        # align columns
        for c in feature_cols:
            if c not in row.columns: row[c] = 0.0
        row = row[feature_cols]
        yhat = float(model.predict(row)[0])
        fut_rows.append((t, yhat))
        history.loc[t] = yhat

    df_fc = pd.DataFrame(fut_rows, columns=["timestamp","forecast"])
    for i, col in enumerate(series_cols):
        df_fc[col] = sub.iloc[0][col]
    forecasts.append(df_fc)

    models_summary.append({
        **({series_cols[i]: keys[i] for i in range(len(series_cols))} if isinstance(keys, tuple) else {series_cols[0]: keys}),
        "MAE": mae, "RMSE": rmse
    })

summary = pd.DataFrame(models_summary).sort_values("RMSE")
predictions_df = pd.concat(predictions, ignore_index=True)
forecasts_df = pd.concat(forecasts, ignore_index=True)

summary.head()


Unnamed: 0,msn,location,MAE,RMSE
4,ACE43B7D,Chennai,20.099898,49.98152
2,3F42A75F,Mumbai,23.070274,51.374346
3,7F6ACD62,Delhi,26.088817,53.007609
0,0653DAA9,Bengaluru,24.090117,53.174742
1,344684B5,Kolkata,25.536636,56.426312


In [10]:

# === 7) Export Power BI tables ===
# Fact table (daily)
fact_daily = daily.copy()
# Model metrics per series
metrics = summary.copy()

# Save
fact_daily.to_csv(OUTDIR/"fact_daily.csv", index=False)
metrics.to_csv(OUTDIR/"model_metrics_by_series.csv", index=False)
predictions_df.to_csv(OUTDIR/"test_predictions_by_series.csv", index=False)
forecasts_df.to_csv(OUTDIR/"next14d_forecast_by_series.csv", index=False)

# KPI table (single row)
kpis_df = pd.DataFrame([{
    "avg_daily_load": kpis["avg_daily_load"],
    "peak_daily_load": kpis["peak_daily_load"],
    "total_consumption": kpis["total_consumption"],
    "system_loss_ratio": kpis["system_loss_ratio"],
    "total_technical_loss_kwh": kpis["total_technical_loss_kwh"]
}])
kpis_df.to_csv(OUTDIR/"kpis.csv", index=False)

# By-location table
if isinstance(by_loc, pd.DataFrame) and not by_loc.empty:
    by_loc.to_csv(OUTDIR/"kpi_by_location.csv", index=False)

print("Exported to:", OUTDIR)


Exported to: smartmeter_outputs


In [11]:

# === 8) Optional: create charts saved as PNGs for PPT ===
import matplotlib.pyplot as plt

# Actual vs Predicted (pick one series as example)
if not predictions_df.empty:
    ex = predictions_df.copy()
    # choose first series
    keycols = [c for c in ["msn","location"] if c in ex.columns]
    if keycols:
        gb_key = ex[keycols].apply(lambda r: tuple(r), axis=1).iloc[0]
        # filter that group
        mask = (ex[keycols] == gb_key).all(axis=1)
        exs = ex[mask].sort_values("timestamp").tail(30)
    else:
        exs = ex.sort_values("timestamp").tail(30)

    plt.figure()
    plt.plot(exs["timestamp"], exs["actual"], label="Actual")
    plt.plot(exs["timestamp"], exs["predicted"], label="Predicted")
    plt.title("Actual vs Predicted (last ~30 days, sample series)")
    plt.xticks(rotation=45); plt.tight_layout()
    plt.legend()
    plt.savefig(OUTDIR/"actual_vs_pred_sample.png"); plt.close()

# Next 14d forecast (aggregate)
agg_fc = forecasts_df.groupby("timestamp", as_index=False)["forecast"].sum()
plt.figure()
plt.plot(agg_fc["timestamp"], agg_fc["forecast"])
plt.title("Next 14 Days â€” System Forecast (sum of all meters)")
plt.xticks(rotation=45); plt.tight_layout()
plt.savefig(OUTDIR/"next14d_system_forecast.png"); plt.close()
