# Daily Energy Usage Pattern Clustering (Household Power)

This notebook is an **unsupervised-learning project** on household energy usage, built on the
**Individual household electric power consumption** dataset.

We will:

1. Load and clean high-frequency household power data.
2. Aggregate to hourly kWh.
3. Build **daily profiles** (24h vectors) and aggregate features.
4. Cluster days into usage pattern segments using KMeans.
5. Interpret clusters (behavioural patterns) and visualise them.
6. Train a simple classifier to predict the cluster of a day from
   a small set of features.


## 0. How to run this notebook

1. Download the dataset **Individual household electric power consumption**
   from Kaggle or UCI.
2. Save it in your project under:

   ```text
   data/household_power.csv
   ```

3. Make sure the file has at least the original columns:

   - `Date`, `Time`
   - `Global_active_power`
   - `Global_reactive_power`, `Voltage`, `Global_intensity`
   - `Sub_metering_1`, `Sub_metering_2`, `Sub_metering_3`

4. Install Python dependencies in your environment:

   ```bash
   pip install numpy pandas matplotlib seaborn scikit-learn statsmodels
   ```

5. Open this notebook in Jupyter / VS Code and run all cells top to bottom.


## 1. Imports and data loading


In [ ]:
from __future__ import annotations

from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

# Plot styling shared by every figure in the notebook.
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (11, 5)

DATA_PATH = Path("data") / "household_power.csv"
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Fail early with download instructions instead of a bare pandas error.
if not DATA_PATH.exists():
    raise FileNotFoundError(
        f"Expected dataset at {DATA_PATH.resolve()}\n"
        "Download 'Individual household electric power consumption' and save as 'data/household_power.csv'."
    )

# The original UCI download is ';'-separated and writes '?' for missing
# readings, while re-exported copies are usually plain comma CSV. Sniff the
# delimiter from the header line so both layouts load into proper columns.
with DATA_PATH.open("r", encoding="utf-8", errors="replace") as header_file:
    header_line = header_file.readline()
delimiter = ";" if header_line.count(";") > header_line.count(",") else ","

raw = pd.read_csv(DATA_PATH, sep=delimiter, na_values=["?"])
raw.head()

## 2. Cleaning and hourly aggregation


In [ ]:
def clean_household_power(df: pd.DataFrame) -> pd.DataFrame:
    """Clean and resample the household power dataset to hourly kWh.

    Parameters
    ----------
    df : pd.DataFrame
        Raw dataframe with ``Date``, ``Time`` and ``Global_active_power``
        columns. The remaining measurement columns are optional. The input
        frame is not modified.

    Returns
    -------
    pd.DataFrame
        Hourly dataframe indexed by timestamp with:
        - global_active_power: mean kW in that hour
        - kwh: energy in that hour (mean kW * 1 hour)
        - Sub_metering_1/2/3: hourly sums (only for columns present in ``df``)
        Hours without any valid power reading are dropped.

    Raises
    ------
    ValueError
        If ``Date``, ``Time`` or ``Global_active_power`` is missing.
    """
    if not {"Date", "Time"}.issubset(df.columns):
        raise ValueError("Expected 'Date' and 'Time' columns in dataset.")
    if "Global_active_power" not in df.columns:
        raise ValueError("Expected 'Global_active_power' column in dataset.")

    df = df.copy()

    # Combine Date and Time into a timestamp. The dataset stores dates as
    # dd/mm/yyyy; letting pandas infer the format can swap day and month on
    # ambiguous dates (or drop rows), so try the explicit layout first and
    # only fall back to inference when nothing matches it.
    stamp_text = df["Date"].astype(str) + " " + df["Time"].astype(str)
    timestamps = pd.to_datetime(stamp_text, format="%d/%m/%Y %H:%M:%S", errors="coerce")
    if timestamps.isna().all():
        timestamps = pd.to_datetime(stamp_text, errors="coerce")
    df["timestamp"] = timestamps
    df = df.dropna(subset=["timestamp"])

    # Convert numeric columns; unparseable entries (e.g. '?') become NaN.
    num_cols = [
        "Global_active_power",
        "Global_reactive_power",
        "Voltage",
        "Global_intensity",
        "Sub_metering_1",
        "Sub_metering_2",
        "Sub_metering_3",
    ]
    for col in num_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    df = df.set_index("timestamp").sort_index()

    # Resample to hourly frequency. The lowercase "h" alias is used because
    # the uppercase "H" spelling is deprecated in recent pandas releases.
    hourly = pd.DataFrame(
        {"global_active_power": df["Global_active_power"].resample("h").mean()}
    )
    # Mean kW sustained over one hour equals the energy in kWh for that hour.
    hourly["kwh"] = hourly["global_active_power"]

    for col in ["Sub_metering_1", "Sub_metering_2", "Sub_metering_3"]:
        if col in df.columns:
            hourly[col] = df[col].resample("h").sum()

    # Hours with no valid power reading carry no information; drop them.
    hourly = hourly.dropna(subset=["kwh"])
    return hourly


# Run the cleaning step on the raw frame; the trailing expression previews
# the first rows of the hourly result in the notebook output.
hourly = clean_household_power(raw)
hourly.head()

### 2.1 Quick EDA

In [ ]:
# Full-history view of the hourly series.
hourly["kwh"].plot(alpha=0.7).set(
    title="Hourly energy consumption (kWh)",
    ylabel="kWh",
)
plt.show()

# Zoom in on a one-week window that starts seven days after the first record.
one_week = pd.Timedelta(days=7)
sample_start = hourly.index.min() + one_week
sample_end = sample_start + one_week
sample = hourly.loc[sample_start:sample_end]

sample["kwh"].plot().set(
    title="Sample week of hourly consumption",
    ylabel="kWh",
)
plt.show()

## 3. Daily profiles and features

In [ ]:
def build_daily_profile_frame(hourly_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Create daily features and 24h profiles from hourly kWh.

    Parameters
    ----------
    hourly_df : pd.DataFrame
        Frame with a datetime-like index and a ``kwh`` column. It is copied,
        not modified.

    Returns
    -------
    daily_features : pd.DataFrame
        One row per date with ``total_kwh``/``max_kwh``/``mean_kwh``, the
        day/night/evening window sums and their fractions of the daily
        total, joined with the 24 hourly profile columns.
    daily_profiles : pd.DataFrame
        One row per date, columns `h_00`..`h_23` with kWh at that hour.
        All 24 columns are always present; hours without data are NaN.
    """
    df = hourly_df.copy()
    df["date"] = df.index.date
    df["hour"] = df.index.hour

    # 24h wide profile: rows=dates, cols=hours
    profile = df.pivot_table(
        index="date",
        columns="hour",
        values="kwh",
        aggfunc="mean",
    )
    # The pivot only creates columns for hours that occur in the data; force
    # the full 0..23 range so downstream code can rely on 24 columns.
    profile = profile.reindex(columns=range(24))
    profile.columns = [f"h_{h:02d}" for h in profile.columns]

    # Aggregate stats by date
    daily = df.groupby("date").agg(
        total_kwh=("kwh", "sum"),
        max_kwh=("kwh", "max"),
        mean_kwh=("kwh", "mean"),
    )

    def _window_sum(mask: pd.Series) -> pd.Series:
        """Sum kWh per date over the rows selected by *mask*.

        Dates with no rows inside the window get 0.0 rather than NaN, so a
        partially recorded day does not inject missing values into the
        feature table.
        """
        per_date = df.loc[mask, :].groupby("date")["kwh"].sum()
        return per_date.reindex(daily.index, fill_value=0.0)

    # Day, night, evening kWh.
    # NOTE(review): the night (>= 22) and evening (< 23) windows both include
    # hour 22, and hours 6-7 belong to no window - confirm this is intended.
    day_mask = (df["hour"] >= 8) & (df["hour"] < 18)
    night_mask = (df["hour"] < 6) | (df["hour"] >= 22)
    evening_mask = (df["hour"] >= 18) & (df["hour"] < 23)

    daily["day_kwh"] = _window_sum(day_mask)
    daily["night_kwh"] = _window_sum(night_mask)
    daily["evening_kwh"] = _window_sum(evening_mask)

    # Share of the daily total consumed in each window.
    daily["day_frac"] = daily["day_kwh"] / daily["total_kwh"]
    daily["night_frac"] = daily["night_kwh"] / daily["total_kwh"]
    daily["evening_frac"] = daily["evening_kwh"] / daily["total_kwh"]

    features = daily.join(profile, how="inner")
    return features, profile


# Build the per-day feature table and the 24h profile table from the hourly
# frame; the trailing expression previews the first feature rows.
daily_features, daily_profiles = build_daily_profile_frame(hourly)
daily_features.head()

### 3.1 Daily totals and weekday patterns

In [ ]:
# Promote the date index to a DatetimeIndex so calendar accessors work,
# then tag every day with its weekday number (0 = Monday).
daily_features.index = pd.to_datetime(daily_features.index)
daily_features["weekday"] = daily_features.index.dayofweek

# Long-run trend of the daily totals.
daily_features["total_kwh"].plot(alpha=0.7).set(
    title="Total daily kWh over time",
    ylabel="kWh/day",
)
plt.show()

# Distribution of daily totals split by weekday.
sns.boxplot(x="weekday", y="total_kwh", data=daily_features).set(
    title="Daily energy use by weekday (0=Mon)",
    ylabel="kWh/day",
)
plt.show()

## 4. Clustering daily patterns with KMeans

In [ ]:
# Cluster on every numeric feature except bookkeeping columns. "cluster" is
# excluded as well so that re-running this cell after labels have been
# assigned does not feed the previous labels back into the clustering.
cluster_cols = [c for c in daily_features.columns if c not in ("weekday", "cluster")]
# Missing values (e.g. hours without readings) are treated as 0 kWh.
X = daily_features[cluster_cols].fillna(0.0).to_numpy()

# Standardise so that no single feature dominates the Euclidean distance.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Silhouette score for each candidate number of clusters.
sil_scores: Dict[int, float] = {}
for k in range(2, 8):
    kmeans = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init=20)
    labels = kmeans.fit_predict(X_scaled)
    sil_scores[k] = silhouette_score(X_scaled, labels)

sil_scores

In [ ]:
# Unpack the {K: score} mapping into parallel x/y sequences for plotting.
sil_k_values = [k for k in sil_scores]
sil_values = [sil_scores[k] for k in sil_k_values]

plt.plot(sil_k_values, sil_values, marker="o")
plt.title("Silhouette vs K – daily energy patterns")
plt.xlabel("K (number of clusters)")
plt.ylabel("Silhouette score")
plt.show()

Pick K based on silhouette and domain knowledge; we default to K=4.

In [ ]:
K = 4

# Fit the final model on the scaled features and read the labels off it.
kmeans_final = KMeans(n_clusters=K, random_state=RANDOM_STATE, n_init=50).fit(X_scaled)
cluster_labels = kmeans_final.labels_

# Attach the labels positionally: both tables hold one row per day in the
# same order, so the label array lines up with each of them.
daily_features["cluster"] = cluster_labels
daily_profiles_clustered = daily_profiles.assign(cluster=cluster_labels)

preview_cols = ["total_kwh", "day_frac", "night_frac", "evening_frac", "cluster"]
daily_features[preview_cols].head()

## 5. Cluster interpretation

In [ ]:
# Number of days assigned to each cluster, ordered by cluster id.
cluster_counts = daily_features["cluster"].value_counts().sort_index()
cluster_counts

In [ ]:
# Features averaged per cluster to characterise each usage pattern.
agg_cols = [
    "total_kwh",
    "day_frac",
    "night_frac",
    "evening_frac",
    "max_kwh",
    "mean_kwh",
]
cluster_summary = daily_features.groupby("cluster")[agg_cols].agg("mean")
cluster_summary

In [ ]:
# Mean hourly profile of the days in each cluster.
hour_cols = [c for c in daily_profiles.columns if c.startswith("h_")]
cluster_profiles = daily_profiles_clustered.groupby("cluster")[hour_cols].mean()

# Take the x positions from the column names ("h_07" -> 7) instead of
# assuming exactly 24 columns, so the x and y lengths always agree even when
# some hour never occurs in the data.
hour_positions = [int(c.split("_", 1)[1]) for c in hour_cols]

for cluster_id, row in cluster_profiles.iterrows():
    plt.plot(hour_positions, row.values, label=f"Cluster {cluster_id}")

plt.xticks(range(24))
plt.xlabel("Hour of day")
plt.ylabel("Average kWh")
plt.title("Average daily load shape per cluster")
plt.legend()
plt.show()

## 6. PCA visualisation

In [ ]:
# Project the scaled feature matrix onto its first two principal components.
pca = PCA(n_components=2, random_state=RANDOM_STATE)
X_pca = pca.fit_transform(X_scaled)

# One row per day: the 2-D coordinates plus the assigned cluster label.
pca_df = pd.DataFrame(
    {"pc1": X_pca[:, 0], "pc2": X_pca[:, 1]},
    index=daily_features.index,
)
pca_df["cluster"] = daily_features["cluster"].to_numpy()

sns.scatterplot(
    x="pc1",
    y="pc2",
    hue="cluster",
    data=pca_df,
    palette="tab10",
    alpha=0.7,
).set_title("Daily energy patterns – PCA projection")
plt.show()

## 7. Classifier to predict clusters

In [ ]:
# Small feature set used to predict a day's cluster.
clf_features = ["total_kwh", "day_frac", "night_frac", "evening_frac", "weekday"]
# Fill missing values with 0.0, matching what the clustering cell does before
# fitting, so the forest never receives NaN inputs.
X_clf = daily_features[clf_features].fillna(0.0).to_numpy()
y_clf = daily_features["cluster"].to_numpy()

# Hold out the last 20% of rows (no shuffling); the first 80% is for training.
n_days = len(daily_features)
split_idx = int(n_days * 0.8)

X_clf_train, X_clf_test = X_clf[:split_idx], X_clf[split_idx:]
y_clf_train, y_clf_test = y_clf[:split_idx], y_clf[split_idx:]

rf_clf = RandomForestClassifier(
    n_estimators=300,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
rf_clf.fit(X_clf_train, y_clf_train)

y_pred_clf = rf_clf.predict(X_clf_test)
# zero_division=0 reports 0 for a class that is never predicted in the
# held-out slice instead of emitting an undefined-metric warning.
print(classification_report(y_clf_test, y_pred_clf, zero_division=0))
print("Confusion matrix:\n", confusion_matrix(y_clf_test, y_pred_clf))

## 8. Export daily features with clusters

In [ ]:
# Write the per-day feature table (cluster labels included) next to the
# input data, creating the folder first when it does not exist yet.
export_path = Path("data", "daily_profiles_with_clusters.csv")
export_dir = export_path.parent
export_dir.mkdir(parents=True, exist_ok=True)

daily_features.to_csv(export_path, index_label="date")
print("Exported daily features with cluster labels to:", export_path.resolve())