# Model Flow

This notebook contains a step-by-step guide to the model registry flow. It is intended to be used as a reference to understand the operations performed to create each moddel, and as a starting point for future development.

In [None]:
# !pip install rich # Run this cell if you don't have rich installed

Import packages. Note that `tilly` is our internal name for Driftsoptimeringsmodellen. <br>
(Utilization model -> Utilization -> Tilly)

In [25]:
import pandas as pd
from typing import Generator
from loguru import logger
from snowflake.snowpark import Session

from tilly.config import SNOWFLAKE_CREDENTIALS
from tilly.database.data.crud import retrieve_data
from tilly.database.data.models import TrainingTimeslots
from tilly.services.ml.transformations.postprocessing import Postprocessor


In [3]:
pd.options.plotting.backend = "plotly"

Define the snowflake connection and retrieve data

In [21]:
def get_session() -> Generator[Session, None, None]:
    """Get a Snowflake Session for Database Interactions."""
    return Session.builder.configs(SNOWFLAKE_CREDENTIALS).create()

In [22]:
data = retrieve_data(get_session(), table_name=TrainingTimeslots.__tablename__)

[32m2023-11-07 11:01:05.162[0m | [34m[1mDEBUG   [0m | [36mtilly.database.data.crud[0m:[36mretrieve_data[0m:[36m62[0m - [34m[1mRetrieving data from 4_FEATURIZ_DRIFTOPTIMERING_TRAINING_TEST[0m


### Data integrity

In [29]:
data

{'Rønbækskolen_A.03':            ID   KOMMUNE         SKOLE        DATE      TIME  DAYNAME  \
 303      A.03  Favrskov  Rønbækskolen  2023-09-29  17:00:00   Fredag   
 1955     A.03  Favrskov  Rønbækskolen  2023-09-27  14:15:00   Onsdag   
 4019     A.03  Favrskov  Rønbækskolen  2023-10-04  14:15:00   Onsdag   
 5646     A.03  Favrskov  Rønbækskolen  2023-09-25  04:15:00   Mandag   
 6781     A.03  Favrskov  Rønbækskolen  2023-10-05  22:15:00  Torsdag   
 ...       ...       ...           ...         ...       ...      ...   
 1838144  A.03  Favrskov  Rønbækskolen  2023-10-06  13:30:00   Fredag   
 1838882  A.03  Favrskov  Rønbækskolen  2023-10-08  06:00:00  Soendag   
 1840061  A.03  Favrskov  Rønbækskolen  2023-10-03  19:45:00  Tirsdag   
 1841111  A.03  Favrskov  Rønbækskolen  2023-09-28  21:30:00  Torsdag   
 1841501  A.03  Favrskov  Rønbækskolen  2023-10-04  18:15:00   Onsdag   
 
         TIDSPUNKT_TYPE        TYPE        NAVN         CO2       TEMP  MOTION  \
 303             Fr

In [30]:
dataf = pd.concat(data.values())

In [32]:
dataf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1842208 entries, 303 to 1842201
Data columns (total 16 columns):
 #   Column          Dtype  
---  ------          -----  
 0   ID              object 
 1   KOMMUNE         object 
 2   SKOLE           object 
 3   DATE            object 
 4   TIME            object 
 5   DAYNAME         object 
 6   TIDSPUNKT_TYPE  object 
 7   TYPE            object 
 8   NAVN            object 
 9   CO2             float64
 10  TEMP            float64
 11  MOTION          float64
 12  IAQ             float64
 13  BOOKET          float64
 14  SKEMALAGT       float64
 15  SKOLE_ID        object 
dtypes: float64(6), object(10)
memory usage: 238.9+ MB


In [34]:
dataf.SKOLE.value_counts()

SKOLE
Strandskolen       1039539
Thorsager Skole     694098
Rønbækskolen        108571
Name: count, dtype: int64

In [41]:
# Define a custom function to consider NaN, None, False, and 0 as missing
def custom_missing_ratio(series):
    return (series.isna() | (series == 0) | (series == False)).mean()

# Calculate the custom non-missing value ratio
custom_non_missing_ratio = dataf.groupby('SKOLE')[['CO2', 'TEMP', 'MOTION', 'IAQ']].apply(lambda x: 1 - x.apply(custom_missing_ratio)).reset_index()

# Melt the DataFrame to make it suitable for plotting with Plotly
melted = custom_non_missing_ratio.melt(id_vars='SKOLE', var_name='Sensordata', value_name='CustomNonMissingRatio')

# Create a bar plot using the Pandas plot method with Plotly
fig = melted.plot(kind='bar', x='SKOLE', y='CustomNonMissingRatio', color='Sensordata', barmode='group')

# Update layout if necessary
fig.update_layout(xaxis_title='SKOLE', yaxis_title='Integritet (%)', xaxis={'categoryorder':'total descending'})

# hide x-axis title
fig.update_xaxes(title=None)
# Show the figure
fig.show()

In [53]:
import pandas as pd
import plotly.express as px

# Assuming 'dataf' is your original DataFrame

# Define a custom function to consider NaN, None, False, and 0 as missing
def custom_missing_ratio(series):
    return (series.isna() | (series == 0) | (series == False)).mean()

# Calculate the custom non-missing value ratio
custom_non_missing_ratio = dataf.groupby(['SKOLE', 'ID'])[['CO2', 'TEMP', 'MOTION', 'IAQ']].apply(lambda x: 1 - x.apply(custom_missing_ratio)).reset_index()

# Melt the DataFrame to make it suitable for plotting with Plotly
melted = custom_non_missing_ratio.melt(id_vars=['SKOLE', 'ID'], var_name='Sensordata', value_name='CustomNonMissingRatio')

# Get the first three unique SKOLEs
unique_skoles = melted['SKOLE'].unique()[:3]

# Loop through each SKOLE and plot the top 8 IDs
for skole in unique_skoles:
    # Filter the DataFrame for the current SKOLE
    df_skole = melted[melted['SKOLE'] == skole]

    # Sort IDs by the minimum 'CustomNonMissingRatio' within each ID, prioritizing those with missing data
    df_skole = df_skole.assign(min_ratio=df_skole.groupby('ID')['CustomNonMissingRatio'].transform(min))
    df_skole_sorted = df_skole.sort_values(by=['min_ratio', 'ID'], ascending=[True, True]).drop_duplicates('ID')

    # Select top 8 IDs, prioritizing those with missing data
    top_ids = df_skole_sorted['ID'].unique()[:8]
    df_skole_top_ids = df_skole[df_skole['ID'].isin(top_ids)]

    # Create the bar plot for the current SKOLE with selected IDs
    fig = px.bar(
        df_skole_top_ids,
        x="ID",
        y="CustomNonMissingRatio",
        color="Sensordata",
        barmode="group",
        title=f"Custom Non-Missing Values Ratio for {skole} (Top 8 IDs)",
        # labels={"CustomNonMissingRatio": "Custom Non-Missing Ratio (%)", "ID": "ID"}
    )
    
    # Update layout if necessary
    fig.update_layout(
        xaxis={'categoryorder':'total descending'},
        yaxis_title='Integritet (%)',
    )
    
    # Show the figure
    fig.show()


In [70]:
import pandas as pd
import plotly.express as px

# Assuming 'dataf' is your original DataFrame

# Define a function to treat NaN, None, False, and 0 as missing, and everything else as non-missing
def is_valid(entry):
    if entry is None or entry is False or pd.isna(entry) or entry == 0:
        return False
    else:
        return True

# Apply the function to each sensor data column to create a boolean DataFrame where True represents a valid entry
valid_entries = dataf[['CO2', 'TEMP', 'MOTION', 'IAQ']].applymap(is_valid)

# Now, for each sensor data type, determine which IDs have only valid (non-missing) entries
valid_counts_per_sensor = valid_entries.groupby(dataf['ID']).all()

# Calculate the ratio of IDs with all valid entries for each sensor data type
ratios = valid_counts_per_sensor.sum() / len(valid_counts_per_sensor)

# Convert the Series to a DataFrame for plotting
ratios_df = ratios.reset_index()
ratios_df.columns = ['Sensordata', 'RatioOfNonMissingIDs']

# Plot the ratios using Plotly Express
fig = px.bar(
    ratios_df,
    x='Sensordata',
    y='RatioOfNonMissingIDs',
)

# Update the y-axis to show the ratio as a percentage
fig.update_yaxes(tickformat='.0%')
fig.update_layout(yaxis_title='Integritet (%)', xaxis_title='Sensordata')

# Show the plot
fig.show()


In [73]:
def merge_dt(df, date, time, name, sep=" "):
    return df.assign(
        **{
            name: lambda d: pd.to_datetime(
                d[date].astype(str) + sep + d[time].astype(str)
            )
        }
    )
dataf = dataf.pipe(merge_dt, date="DATE", time="TIME", name="DATETIME")

In [173]:
import pandas as pd

def add_missing_timeslots(df: pd.DataFrame, freq: str = "15T") -> pd.DataFrame:
    """Adds the rows that are missing from the DataFrame, by comparing
    it with a DataFrame containing all the timeslots."""

    results = []

    # Filter out rows beyond the specified date
    df_filtered = df[df["DATETIME"] < pd.to_datetime("2023-03-01", format="%Y-%m-%d")]

    # Iterate over each group by 'ID'
    for id, group in df_filtered.groupby("ID"):
        # Remove duplicate timestamps within each group
        group_unique = group.drop_duplicates(subset='DATETIME')

        # Create a complete DataFrame of timeslots from min to max of the group
        full_ts = pd.DataFrame(
            {
                "DATETIME": pd.date_range(
                    start=group["DATETIME"].min(), 
                    end=group["DATETIME"].max(), 
                    freq=freq
                )
            }
        )

        # Calculate the ratio of the length of unique timestamps to the full timeslot length
        ratio = len(group_unique) / len(full_ts)

        # Append the results to the list
        results.append({
            "full_len": len(full_ts), 
            "orig_len": len(group_unique), 
            "msg_ratio": ratio, 
            "id": id, 
            "skole": group["SKOLE"].iloc[0]
        })

    # Convert results to DataFrame before returning
    return pd.DataFrame(results)


In [174]:
ratios = add_missing_timeslots(dataf).sort_values("msg_ratio", ascending=True)

In [175]:
fig = (
    ratios
    .sort_values("msg_ratio")
    .groupby("skole")
    .agg({"msg_ratio": "mean"})
    .sort_values("msg_ratio")
    .reset_index()
    .assign(msg_ratio=lambda d: (1 - d["msg_ratio"]) * 100)
    .plot(
        kind="bar",
        x="skole",
        y="msg_ratio",
        # y range
        range_y=(0, 100),
    )
)
fig.update_layout(yaxis_title='Manglende tidsintervaller (%)', xaxis_title='Skole')
# # set percentage tick suffix
fig.update_yaxes(ticksuffix="%")

In [176]:
ratios.sort_values("msg_ratio", ascending=True)

Unnamed: 0,full_len,orig_len,msg_ratio,id,skole
67,3335,1668,0.50015,D.19,Rønbækskolen
68,3335,1668,0.50015,D.23,Rønbækskolen
71,3335,1668,0.50015,D.31,Rønbækskolen
69,3335,1668,0.50015,D.29,Rønbækskolen
64,3333,1667,0.50015,D.09,Rønbækskolen
...,...,...,...,...,...
45,17280,17280,1.00000,12.1.002,Thorsager Skole
52,17280,17280,1.00000,2.0.008,Thorsager Skole
53,17280,17280,1.00000,4.0.001,Thorsager Skole
54,17280,17280,1.00000,4.0.009,Thorsager Skole


In [123]:
[k for k in data.keys() if k.endswith("07.S.20")]

['Strandskolen_07.S.20']

In [179]:
from random import choice

In [180]:
rb = [k for k in data.keys() if k.startswith("Røn")]

In [198]:
k = choice(rb)
print(k)
fig = (
    data[k]# data["Rønbækskolen_D.09"]
    .pipe(merge_dt, date="DATE", time="TIME", name="DATETIME")
    # [lambda d: d["DATETIME"] < pd.to_datetime("2023-04-01", format="%Y-%m-%d")]
    .plot.bar(
        x="DATETIME",
        y="CO2",
        # title="CO2",
    
    )
)
fig.update_traces(dict(marker_line_width=0))
fig

Rønbækskolen_B.26


Retrieve a single room from the database

In [None]:
data = room[list(room.keys())[0]]

In [None]:
def plot_col(df, x = "DATETIME", y = "CO2", **kwargs):
    if not df.empty:
        fig = df.plot.bar(
            x="DATETIME",
            y="CO2",
            title=f"{y} values for {df.SKOLE_ID.iloc[0]}",
            **kwargs
            # prevent stacking CO2 values on same dates
        )
        fig.update_traces(dict(marker_line_width=0))
        fig.show()
    else:
        print("No data to plot")

In [None]:
plot_col(data) # Rønbækskolen_E.16

# Run preprocessing flow

In [None]:
prep = Preprocessor()

processed = (
    data

    # fill missing timeslots between first and last timeslot
    .pipe(prep.add_missing_timeslots)

    # fill missing values using cubic spline interpolation
    .pipe(prep.interpolate_missing_islands, target_col="CO2", limit=4)

    # remove timeslots where 5 or more consecutive values are missing
    .pipe(prep.remove_stagnate_intervals, target_col="CO2", threshold=5)

    # remove all rows where CO2 is missing
    .dropna(subset=["CO2"])

    # drop timeslots where CO2 is outside the bounds of 1 and 8000
    .pipe(prep.drop_outliers, bounds={"CO2": (1, 8000)})

    # remove days where less than 25% of the timeslots are present
    .pipe(prep.day_filter, min_ratio=0.25)

    # group timeslots into time-contiguous groups
    .pipe(
        prep.apply_time_group_funcs,
        funcs=[

            # Apply gaussian smoothing to CO2 values
            (prep.gaussian_smooth, dict(metric="CO2", std_dev=2)),
            (
                # calculate derivatives of CO2 values
                prep.calculate_kinematic_quantities,
                dict(metric="CO2_smoothed", window=4, prefix="CO2"),
            ),
        ],
    )
    # add time features
    .pipe(prep.add_time_features, night_start=22, night_end=6)
)


## Modelling

In [None]:

FEATURES = [
    # "CO2",
    "CO2_velocity",
    "CO2_acceleration",
    # "CO2_jerk",
    "CO2_smoothed",
    "is_night",
    "CO2_log",
]

In [None]:
model = Model(
    model_params={
        "n_estimators": 300,
        "random_state": 123,
        "verbose": 0,

    },
    estimated_usage=0.3
)
model.fit(processed[FEATURES])
processed["pred"] = model.predict(processed[FEATURES])
processed["score"] = model.score(processed[FEATURES])

## Visualize results

In [None]:
# Plotting using Plotly via Pandas
(
    processed.assign(color=lambda d: d["pred"].map({0: 'Unsued', 1: 'Used'})).plot.bar(
        x='DATETIME',
        y=['CO2'], 
        color="color", 
        hover_data=["CO2_velocity", "score"], 
        barmode="group", 
    )
    .update_layout(
        title='Usage detection',
        xaxis_title='Time',
        yaxis_title='CO2 level',
        legend_title="Usage",          
    )
    .update_traces(dict(marker_line_width=0))
)

## Postprocessing

In [None]:
def heuristics(room: pd.DataFrame) -> pd.DataFrame:
    """Add heuristic rules to predicted data"""
    

    def apply_night_time_filter(df):
        """Filters out false positives during midnight to 6 AM."""
        hour = df["DATETIME"].dt.hour
        mask = (hour >= 0) & (hour < 6) & (df["ANOMALY_SCORE"] <= 0.7)
        df.loc[mask, "IN_USE"] = 0
        return df

    def apply_stand_alone_instances_filter(df):
        """Removes isolated instances of "IN_USE" being 1."""
        prev_IN_USE = df["IN_USE"].shift(1, fill_value=0)
        next_IN_USE = df["IN_USE"].shift(-1, fill_value=0)
        mask = (prev_IN_USE == 0) & (df["IN_USE"] == 1) & (next_IN_USE == 0)
        df.loc[mask, "IN_USE"] = 0
        return df

    def apply_low_co2_filter(df):
        """Sets "IN_USE" to 0 if CO2 levels are low."""
        mask = df["CO2"] <= 325
        df.loc[mask, "IN_USE"] = 0
        return df

    def update_anomaly_score(df):
        """Updates the anomaly score based on the modified
        "IN_USE" values."""
        mask = ((df["IN_USE"] == 1) & (df["ANOMALY_SCORE"] < 0.5)) | (
            (df["IN_USE"] == 0) & (df["ANOMALY_SCORE"] > 0.5)
        )
        df.loc[mask, "ANOMALY_SCORE"] = 1 - df.loc[mask, "ANOMALY_SCORE"]
        return df

    return (
        room.pipe(apply_night_time_filter)
        .pipe(apply_stand_alone_instances_filter)
        .pipe(apply_low_co2_filter)
        .pipe(update_anomaly_score)
    )


In [None]:
postprocessed = heuristics(processed.rename(columns={"pred": "IN_USE", "score": "ANOMALY_SCORE"}))

## Visualize results after postprocessing

In [None]:
(
    postprocessed.assign(color=lambda d: d["IN_USE"].map({0: 'blue', 1: 'red'})).plot.bar(
        x='DATETIME',
        y=['CO2'], 
        color="color", 
        hover_data=["CO2_velocity", "ANOMALY_SCORE"], 
        barmode="group", 
    )
    .update_layout(
        title='Usage detection - Heuristics',
        xaxis_title='Time',
        yaxis_title='CO2 level',
        legend_title="Usage",          
    )
    .update_traces(dict(marker_line_width=0))
)