# Model Flow

This notebook contains a step-by-step guide to the model registry flow. It is intended to be used as a reference to understand the operations performed to create each model, and as a starting point for future development.

In [None]:
# %pip install rich  # Uncomment and run this cell if you don't have rich installed

Import packages. Note that `tilly` is our internal name for Driftsoptimeringsmodellen. <br>
(Utilization model -> Utilization -> Tilly)

In [1]:
import pandas as pd
import numpy as np
import random
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
from typing import Generator
from loguru import logger

from tilly.config import SNOWFLAKE_URL
from tilly.database.data.models import TrainingTimeslots
from tilly.services.ml.transformations.preprocessing import Preprocessor
from tilly.services.ml.model import Model

In [None]:
# Route pandas' DataFrame.plot calls to Plotly instead of the default backend.
pd.options.plotting.backend = "plotly"

Define the Snowflake connection and retrieve data

In [None]:
# Engine built from the Snowflake URL exposed by the tilly config.
engine = create_engine(SNOWFLAKE_URL, future=True)

def get_session() -> Generator[Session, None, None]:
    """Yield a Session bound to ``engine``.

    The session is opened through its context manager, which exits when the
    generator is resumed past the ``yield`` or closed.
    """
    with Session(engine) as db_session:
        yield db_session

Retrieve a single room from the database

In [None]:
def retrieve_data(session: Session, table: object) -> dict[str, pd.DataFrame]:
    """Retrieve every timeslot of the first (school, room_id) pair in ``table``.

    The first pair is the smallest one when ordering by ``table.school`` and
    then ``table.room_id``. All rows for that pair are read into a DataFrame,
    which gets two derived columns (``SKOLE_ID`` and ``DATETIME``), is sorted by
    ``DATETIME`` and is split by ``SKOLE_ID``.

    Args:
        session: SQLAlchemy session used for both queries.
        table: ORM model exposing ``school`` and ``room_id`` attributes and a
            ``__tablename__``. The rows read back are expected to have the
            columns ``SKOLE``, ``ID``, ``DATE`` and ``TIME``.

    Returns:
        Mapping of ``"<SKOLE>_<ID>"`` to the DataFrame for that room. The
        mapping is empty when the table contains no rows.
    """
    logger.debug(f"Retrieving data from {table.__tablename__}")

    # Step 1: identify the first unique combination of school and room_id
    first_unique_combo = (
        session.query(table.school, table.room_id)
        .group_by(table.school, table.room_id)
        .order_by(table.school, table.room_id)
        .first()
    )

    # Nothing to read: return an empty mapping instead of failing on an
    # unbound local further down.
    if not first_unique_combo:
        logger.warning(f"No rows found in {table.__tablename__}")
        return {}

    # Step 2: retrieve all rows that match that combination
    skole_value, room_id_value = first_unique_combo
    query = session.query(table).filter_by(school=skole_value, room_id=room_id_value).statement

    timeslots = (
        pd.read_sql(query, session.bind)
        .assign(
            SKOLE_ID=lambda d: d.SKOLE + "_" + d.ID,
            DATETIME=lambda d: pd.to_datetime(d["DATE"].astype(str) + " " + d["TIME"].astype(str)),
        )
        .sort_values("DATETIME")
        .rename(str, axis="columns")  # make sure every column label is a plain str
    )

    return {school_room: df for school_room, df in timeslots.groupby("SKOLE_ID")}

# Use the Session as a context manager directly so it is closed once the data
# has been read. With `next(get_session())` the generator is discarded while
# suspended at its `yield`, so the `with` block inside it cannot scope the
# session to this query.
with Session(engine) as session:
    room = retrieve_data(session, table=TrainingTimeslots)

# Show which "<SKOLE>_<ID>" key(s) were loaded.
f"{room.keys()=}"

In [None]:
# Take the DataFrame stored under the first key of the `room` mapping.
data = room[list(room)[0]]

In [None]:
def plot_col(df, x = "DATETIME", y = "CO2", **kwargs):
    """Show a bar chart of column ``y`` against column ``x`` for one room.

    Args:
        df: DataFrame holding the ``x`` and ``y`` columns and a ``SKOLE_ID``
            column; the first ``SKOLE_ID`` value is used in the title.
        x: Name of the column plotted on the x axis.
        y: Name of the column plotted on the y axis.
        **kwargs: Extra keyword arguments forwarded to ``df.plot.bar``.

    Returns:
        None. The figure is displayed, or "No data to plot" is printed when
        ``df`` is empty.
    """
    if df.empty:
        print("No data to plot")
        return

    fig = df.plot.bar(
        # honour the caller's column choice instead of hard-coded names
        x=x,
        y=y,
        title=f"{y} values for {df.SKOLE_ID.iloc[0]}",
        **kwargs,
    )
    # Zero-width marker outlines (presumably so densely packed bars stay visible).
    fig.update_traces(dict(marker_line_width=0))
    fig.show()

In [None]:
# Plot the CO2 column of the selected room before any preprocessing.
plot_col(data) # Rønbækskolen_E.16 (presumably the room loaded when this was written; confirm)

## Run preprocessing flow

In [None]:
prep = Preprocessor()

# Each step receives the frame produced by the previous one.
processed = (
    data
    # 1. Fill missing timeslots between the first and last timeslot.
    .pipe(prep.add_missing_timeslots)
    # 2. Fill missing CO2 values using cubic spline interpolation (limit=4).
    .pipe(prep.interpolate_missing_islands, target_col="CO2", limit=4)
    # 3. Remove timeslots where 5 or more consecutive values are missing.
    #    NOTE(review): the method name suggests stagnant (unchanging) readings
    #    rather than missing ones; confirm which one the threshold refers to.
    .pipe(prep.remove_stagnate_intervals, target_col="CO2", threshold=5)
    # 4. Remove all rows where CO2 is still missing.
    .dropna(subset=["CO2"])
    # 5. Drop timeslots where CO2 is outside the bounds of 1 and 8000.
    .pipe(prep.drop_outliers, bounds={"CO2": (1, 8000)})
    # 6. Remove days where less than 25% of the timeslots are present.
    .pipe(prep.day_filter, min_ratio=0.25)
    # 7. Group timeslots into time-contiguous groups and run the listed
    #    (function, keyword-arguments) pairs on them.
    .pipe(
        prep.apply_time_group_funcs,
        funcs=[
            # Gaussian smoothing of the CO2 values.
            (prep.gaussian_smooth, {"metric": "CO2", "std_dev": 2}),
            # Derivatives of the smoothed CO2 values.
            (
                prep.calculate_kinematic_quantities,
                {"metric": "CO2_smoothed", "window": 4, "prefix": "CO2"},
            ),
        ],
    )
    # 8. Add time features.
    .pipe(prep.add_time_features, night_start=22, night_end=6)
)


## Modelling

In [None]:

# Columns of `processed` handed to the model for fitting, prediction and scoring.
# Commented-out entries are currently excluded from the feature set.
FEATURES = [
    # "CO2",
    "CO2_velocity",
    "CO2_acceleration",
    # "CO2_jerk",
    "CO2_smoothed",
    "is_night",
    "CO2_log",
]

In [None]:
# Configure the model, fit it on the selected feature columns, then store its
# output on `processed`.
model = Model(
    model_params=dict(n_estimators=300, random_state=123, verbose=0),
    estimated_usage=0.3,
)
model.fit(processed[FEATURES])

# `pred` and `score` are added to `processed` in place; the postprocessing step
# renames them to IN_USE / ANOMALY_SCORE.
processed["pred"] = model.predict(processed[FEATURES])
processed["score"] = model.score(processed[FEATURES])

## Visualize results

In [None]:
# Plotting using Plotly via Pandas.
# Bars are coloured by the predicted label; the raw 0/1 predictions are mapped
# to readable legend entries first.
(
    processed.assign(color=lambda d: d["pred"].map({0: 'Unused', 1: 'Used'})).plot.bar(
        x='DATETIME',
        y=['CO2'],
        color="color",
        hover_data=["CO2_velocity", "score"],
        barmode="group",
    )
    .update_layout(
        title='Usage detection',
        xaxis_title='Time',
        yaxis_title='CO2 level',
        legend_title="Usage",
    )
    # zero-width marker outlines on the bars
    .update_traces(dict(marker_line_width=0))
)

## Postprocessing

In [None]:
def heuristics(room: pd.DataFrame) -> pd.DataFrame:
    """Add heuristic rules to predicted data.

    The rules run in this order: night-time filter, stand-alone filter,
    low-CO2 filter, anomaly-score update. The order matters because each rule
    sees the "IN_USE" values left by the previous one.

    Args:
        room: Predictions for one room with the columns "DATETIME"
            (datetime-like), "CO2", "IN_USE" (0/1) and "ANOMALY_SCORE".

    Returns:
        A new DataFrame with adjusted "IN_USE" and "ANOMALY_SCORE" columns.
        The input frame is left untouched.
    """

    def apply_night_time_filter(df):
        """Filters out false positives during midnight to 6 AM."""
        hour = df["DATETIME"].dt.hour
        # only rows with a score of at most 0.7 are reset
        mask = (hour >= 0) & (hour < 6) & (df["ANOMALY_SCORE"] <= 0.7)
        df.loc[mask, "IN_USE"] = 0
        return df

    def apply_stand_alone_instances_filter(df):
        """Removes isolated instances of "IN_USE" being 1."""
        # neighbours beyond either end of the frame count as "not in use"
        prev_IN_USE = df["IN_USE"].shift(1, fill_value=0)
        next_IN_USE = df["IN_USE"].shift(-1, fill_value=0)
        mask = (prev_IN_USE == 0) & (df["IN_USE"] == 1) & (next_IN_USE == 0)
        df.loc[mask, "IN_USE"] = 0
        return df

    def apply_low_co2_filter(df):
        """Sets "IN_USE" to 0 if CO2 levels are low."""
        mask = df["CO2"] <= 325
        df.loc[mask, "IN_USE"] = 0
        return df

    def update_anomaly_score(df):
        """Updates the anomaly score based on the modified
        "IN_USE" values."""
        # a score on the wrong side of 0.5 for its label is mirrored around 0.5
        mask = ((df["IN_USE"] == 1) & (df["ANOMALY_SCORE"] < 0.5)) | (
            (df["IN_USE"] == 0) & (df["ANOMALY_SCORE"] > 0.5)
        )
        df.loc[mask, "ANOMALY_SCORE"] = 1 - df.loc[mask, "ANOMALY_SCORE"]
        return df

    return (
        # The filters assign through .loc, so work on a copy to keep the
        # caller's frame unchanged.
        room.copy()
        .pipe(apply_night_time_filter)
        .pipe(apply_stand_alone_instances_filter)
        .pipe(apply_low_co2_filter)
        .pipe(update_anomaly_score)
    )


In [None]:
# Rename the model output columns to the names the heuristics read, then apply the rules.
postprocessed = heuristics(processed.rename(columns={"pred": "IN_USE", "score": "ANOMALY_SCORE"}))

## Visualize results after postprocessing

In [None]:
# Same chart as the raw model output, now coloured by the heuristic-adjusted label.
# The 0/1 labels are mapped to readable legend entries, and the colours are
# pinned with `color_discrete_map`: plain strings such as 'blue'/'red' in the
# colour column are category labels, not literal colours.
(
    postprocessed.assign(color=lambda d: d["IN_USE"].map({0: 'Unused', 1: 'Used'})).plot.bar(
        x='DATETIME',
        y=['CO2'],
        color="color",
        color_discrete_map={'Unused': 'blue', 'Used': 'red'},
        hover_data=["CO2_velocity", "ANOMALY_SCORE"],
        barmode="group",
    )
    .update_layout(
        title='Usage detection - Heuristics',
        xaxis_title='Time',
        yaxis_title='CO2 level',
        legend_title="Usage",
    )
    # zero-width marker outlines on the bars
    .update_traces(dict(marker_line_width=0))
)