Notes

- IAQ = Indoor Air Quality
- SKEMALAGT = Room-time scheduled in school timetable

In [None]:
# base
import pandas as pd
import numpy as np
import plotly.express as px

# anomaly detection
from sklearn.ensemble import IsolationForest

# settings
# Route pandas `.plot` calls through plotly, so the plotting cells below return plotly figures.
pd.options.plotting.backend = "plotly"

In [None]:
# Columns inspected (together with SKEMALAGT and TYPE) by display_missing_values.
SENSOR_COLUMNS = ["CO2", "TEMP", "MOTION", "IAQ", "BOOKET"]
# Value of the KOMMUNE column to analyse; also used in printed messages and output file names.
KOMMUNE = "Aarhus"

In [None]:
def diagnose(df, col, dfunc="unique", **kwargs):
    """Print a summary of one column and hand the frame back untouched.

    Meant to be dropped into a `.pipe` chain as a debugging step.

    Args:
        df: DataFrame to inspect.
        col: Name of the column to summarise.
        dfunc: Name of the Series method to call on the column.
        **kwargs: Keyword arguments forwarded to that method.

    Returns:
        The same `df` object that was passed in.
    """
    summarise = getattr(df[col], dfunc)
    summary = summarise(**kwargs)
    print(summary)
    return df

def fill_na(df, cols, values, types):
    """Fill missing values column by column and cast each column to a given dtype.

    Args:
        df: Input DataFrame (not modified).
        cols: Column names to process.
        values: Fill value for each column, in the same order as `cols`.
        types: Target dtype for each column, in the same order as `cols`.

    Returns:
        A new DataFrame in which the listed columns are filled and cast;
        all other columns are carried over as they are.
    """
    replacements = {}
    for name, default, dtype in zip(cols, values, types):
        replacements[name] = df[name].fillna(default).astype(dtype)
    return df.assign(**replacements)

def display_missing_values(df):
    """Show the share of missing values per room, one styled table per KOMMUNE.

    For every KOMMUNE group, the rows are grouped by ID and the fraction of null
    values is computed for SENSOR_COLUMNS plus SKEMALAGT and TYPE. Each table is
    rendered with two decimals and a red gradient scaled from 0 to 1.

    Args:
        df: DataFrame containing KOMMUNE, ID and the inspected columns.

    Returns:
        The same `df`, so the function can be used inside a `.pipe` chain.
    """
    def null_share(room_rows):
        # Fraction of missing entries in each column of one room.
        return room_rows.isnull().sum() / len(room_rows)

    for kommune_name, kommune_rows in df.groupby('KOMMUNE'):
        print(f"Missing values for {kommune_name}")
        inspected = SENSOR_COLUMNS + ['SKEMALAGT', 'TYPE']
        per_room = kommune_rows.groupby('ID')[inspected].apply(null_share)
        styled = (
            per_room.style
            .format(precision=2)
            .background_gradient(cmap='Reds', axis=0, vmin=0, vmax=1)
        )
        display(styled)
    return df

    
def merge_dt(df, date, time, name, sep=" "):
    """Combine a date column and a time column into one datetime column.

    Args:
        df: Input DataFrame (not modified).
        date: Name of the column holding the date text.
        time: Name of the column holding the time text.
        name: Name of the datetime column to create.
        sep: Separator placed between date and time before parsing.

    Returns:
        A new DataFrame with the parsed column added under `name`.
    """
    stamp_text = df[date] + sep + df[time]
    parsed = pd.to_datetime(stamp_text)
    return df.assign(**{name: parsed})

def drop_cols(df, cols):
    """Return `df` without the columns listed in `cols` (pipe-friendly wrapper)."""
    return df.drop(labels=cols, axis="columns")

def filter_values(df, col, values):
    """Keep only the rows whose value in `col` is one of `values`."""
    keep = df[col].isin(values)
    return df[keep]
    

In [None]:
# Load the raw export and apply the cleaning helpers as a single pipeline.
full_data = (
    pd.read_csv("data/Skemaer.csv")
    .pipe(drop_cols, cols=["KOMMUNE_DATO_LOKALE_TIME"])
    # Prints the distinct KOMMUNE values as a sanity check; the frame passes through unchanged.
    .pipe(diagnose, col="KOMMUNE", dfunc="unique")
    # .pipe(display_missing_values)
    # Combine the DATE and TIME text columns into one parsed DATETIME column.
    .pipe(merge_dt, date="DATE", time="TIME", name="DATETIME")
    # Replace missing sensor readings with fixed constants and cast the columns to float.
    # NOTE(review): the fill constants are not explained in this notebook — confirm their origin.
    .pipe(fill_na, 
        cols=["CO2", "TEMP", "MOTION", "IAQ"],
        values=[487, 20.0, 0.0, .03],
        types=[float, float, float, float]
    )
)


Filter on 'KOMMUNE'

In [None]:
# Restrict the analysis to the rows whose KOMMUNE equals the configured constant.
print(f"Selecting data for {KOMMUNE}")
dataf = full_data.pipe(filter_values, col="KOMMUNE", values=[KOMMUNE])

### Distributions

In [None]:
# Bar chart of TEMP readings counted in 3 bins.
(
    dataf.TEMP.value_counts(bins=3)
    .reset_index()
    # Interval labels become text so they can serve as bar categories.
    .astype({"TEMP": "str"})
    # Order the bars by the left edge parsed out of each "(left, right]" label.
    .sort_values("TEMP", key=lambda d: d.str.extract(r"\((.+)\,", expand=False).astype(float))
    .plot(x="TEMP", y="count", kind="bar", title="TEMP distribution", text="count")
)

In [None]:
# Bar chart of CO2 readings counted in 10 bins.
(
    dataf.CO2.value_counts(bins=10)
    .reset_index()
    # Interval labels become text so they can serve as bar categories.
    .astype({"CO2": "str"})
    # Order the bars by the left edge parsed out of each "(left, right]" label.
    .sort_values("CO2", key=lambda d: d.str.extract(r"\((.+)\,", expand=False).astype(float))
    .plot(x="CO2", y="count", kind="bar", title="CO2 levels", text="count")
)

In [None]:
# Bar chart (log-scaled y axis) of how often each distinct MOTION value occurs.
(
    dataf.MOTION.value_counts()
    .reset_index()
    # MOTION is counted per distinct value, not per bin, so there is no
    # "(left, right]" label to parse: the interval regex used for the binned
    # charts never matches here and leaves the bars unsorted. Sort on the numeric
    # value first, then turn it into text for use as the bar category.
    .sort_values("MOTION")
    .astype({"MOTION": "str"})
    .plot(x="MOTION", y="count", kind="bar", log_y=True, title="MOTION triggers, log scale")
)

In [None]:
# Bar chart (log-scaled y axis) of IAQ readings counted in 10 bins.
(
    dataf.IAQ.value_counts(bins=10)
    .reset_index()
    # Interval labels become text so they can serve as bar categories.
    .astype({"IAQ": "str"})
    # Order the bars by the left edge parsed out of each "(left, right]" label.
    .sort_values("IAQ", key=lambda d: d.str.extract(r"\((.+)\,", expand=False).astype(float))
    .plot(x="IAQ", y="count", kind="bar", title="IAQ levels", text="count", log_y=True)
)

### Show variables over time

In [None]:
# Plot the four sensor variables over time, one facet row per variable and one colour per room.
print(f"Showing {dataf.ID.nunique()} rooms")
# for i, df in dataf.groupby("ID"):
(
    dataf
    # Long format: one row per (DATETIME, ID, variable) so the variable name can drive the facets.
    .melt(id_vars = ["DATETIME", "ID"], value_vars=["CO2", "TEMP", "MOTION", "IAQ"], var_name="Type")
    .plot(
        x="DATETIME",
        y="value",
        facet_row="Type",
        # title=f"V",
        color="ID",
    )
    # The variables have different scales, so the facets must not share one y-axis range.
    .update_yaxes(matches=None)
    .update_traces(connectgaps=False)
).show(config={
            # 'displayModeBar': False, 
            # "staticPlot": True
        })


Missing values after imputation

In [None]:
# Share of non-null values per column of the selected data (1.0 means nothing is missing).
(
    (
        dataf
        .count(0) / len(dataf)
    )
    .rename("purity")
    .plot
    .bar(
        title="Data purity",
        range_y=(0,1),
        height=300,
        text="value",
        labels={"variable": "col"},
    )
)

Preprocessing

For each room, drop sequences of rows where CO2 is constant, indicating that the sensor is not working.

In [None]:
# Identify intervals of 5+ rows of identical values (per room, in the row order of `dataf`).
MIN_STUCK_RUN = 5

# Number the runs of consecutive identical CO2 readings inside each room: the
# counter increases whenever a reading differs from the previous one.
co2_run_id = (
    dataf.groupby("ID")["CO2"]
    .transform(lambda s: s.ne(s.shift()).cumsum())
    .rename("CO2_RUN")
)
# Length of the run each row belongs to.
co2_run_length = dataf.groupby(["ID", co2_run_id])["CO2"].transform("size")

# Flag EVERY row of a long enough run. The earlier rolling(5) check only flagged
# the 5th row onwards, so the first four readings of each stuck-sensor interval
# stayed in the data; it also evaluated a Python lambda per window, which is slow.
to_drop = co2_run_length.ge(MIN_STUCK_RUN)
cdataf = dataf.drop(index=to_drop[to_drop].index)
f"Turns {dataf.shape[0]} rows into {cdataf.shape[0]} rows - Dropping {(dataf.shape[0] - cdataf.shape[0])/1000}K rows"

In [None]:
# Same time-series view as before, now on the data with constant-CO2 rows removed.
print(f"Showing {cdataf.ID.nunique()} rooms")
# for i, df in cdataf.groupby("ID"):
(
    cdataf
    # Long format: one row per (DATETIME, ID, variable) so the variable name can drive the facets.
    .melt(id_vars = ["DATETIME", "ID"], value_vars=["CO2", "TEMP", "MOTION", "IAQ"], var_name="Type")
    .plot(
        x="DATETIME",
        y="value",
        facet_row="Type",
        # title=f"V",
        color="ID",
    )
    # The variables have different scales, so the facets must not share one y-axis range.
    .update_yaxes(matches=None)
    .update_traces(connectgaps=False)
).show(config={
            # 'displayModeBar': False, 
            # "staticPlot": True
        })

For Aarhus, we don't have sufficient data for 02.S.09 and 12.S.20.

Feature engineering

In [None]:
def _sampling_run_id(timestamps, step_minutes=15):
    """Number the stretches of consecutive timestamps that are exactly `step_minutes` apart.

    The counter starts at 1 and increases at every row whose gap to the previous
    row differs from `step_minutes` (the first row always opens a new stretch).
    """
    gap_minutes = timestamps.diff().dt.total_seconds() / 60
    return gap_minutes.ne(step_minutes).cumsum()


def add_date_range_group(grp):
    """Return a copy of one room's rows with a DATE_RANGE_GROUP column added.

    Args:
        grp: Rows of a single room, already ordered by DATETIME.

    Returns:
        A new DataFrame; `grp` itself is left untouched.
    """
    return grp.assign(DATE_RANGE_GROUP=_sampling_run_id(grp["DATETIME"]))


def _pct_change_within_runs(d, col):
    """Relative change of `col` between consecutive rows of the same room and stretch.

    Missing values are forward-filled inside each (ID, DATE_RANGE_GROUP) group
    before the change is computed, and the undefined first change of every group
    is set to 0. The forward fill is done explicitly because the `fill_method`
    argument of `pct_change` is deprecated in pandas.
    """
    run_keys = [d["ID"], d["DATE_RANGE_GROUP"]]
    filled = d[col].groupby(run_keys).ffill()
    return filled.groupby(run_keys).pct_change(fill_method=None).fillna(0)


data = (
    cdataf
    # groupby silently skips rows without a room ID; drop them up front so the
    # row set matches what the per-room processing has always produced.
    .dropna(subset=["ID"])
    # Rows end up ordered by room and, within a room, by time.
    .sort_values(["ID", "DATETIME"])
    .reset_index(drop=True)
    # A transform keeps the ID column in place. The earlier groupby.apply wrote
    # into each group and relied on the grouping column being handed to the
    # applied function, which pandas has deprecated.
    .assign(
        DATE_RANGE_GROUP=lambda d: d.groupby("ID")["DATETIME"].transform(_sampling_run_id)
    )
    .assign(
        **{
            # `col=col` binds the current column name to each lambda.
            f"{col}_ACC": (lambda d, col=col: _pct_change_within_runs(d, col))
            for col in ["CO2", "TEMP", "MOTION", "IAQ"]
        }
    )
)



In [None]:
# Plot the four *_ACC (relative change) features over time, one facet row per feature.
(
    data
    .melt(
        id_vars = ["DATETIME", "ID"],
        value_vars=["CO2_ACC", "TEMP_ACC", "MOTION_ACC", "IAQ_ACC"],
        var_name="Type"
    )
    .plot(
        x="DATETIME",
        y="value",
        facet_row="Type",
        # title=f"V",
        color="ID",
    )
    # The features have different scales, so the facets must not share one y-axis range.
    .update_yaxes(matches=None)
    .update_traces(connectgaps=False)
    # Shorten facet labels to the text after the last "=".
    .for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
).show(config={
            # 'displayModeBar': False,
            # "staticPlot": True
        })

In [None]:
# Frequency of each SKEMALAGT value in the engineered data.
data.SKEMALAGT.value_counts()

In [None]:
# Build the modelling table: encode text columns as integer codes, derive calendar
# features from DATETIME, fill missing BOOKET values, and drop the columns that
# are no longer needed.
model_data = (
    data
    .assign(
        # Integer code per distinct TIDSPUNKT_TYPE value.
        AKTIVITET=lambda d: pd.factorize(d["TIDSPUNKT_TYPE"])[0],
        DOW=lambda d: d["DATETIME"].dt.dayofweek,
        HOUR=lambda d: d["DATETIME"].dt.hour,
        # Integer code per distinct TYPE value.
        DAY_TYPE=lambda d: pd.factorize(d["TYPE"])[0],
        # Missing BOOKET entries are replaced with 0.0.
        BOOKET=lambda d: d["BOOKET"].fillna(0.0),
        
    )
    .drop(columns=[
        "DATE",
        "TIDSPUNKT_TYPE",
        "TYPE",
        "DATE_RANGE_GROUP",
        "DAYNAME",
        "TIME",
        "SKOLE",
        "KOMMUNE",
        "NAVN"
        ]
    )

       # anomaly detection variables
    # .assign(EST_USE=lambda d: (d["SKEMALAGT"] | d["BOOKET"]).astype("category"))
    # .assign(CO2_ACC=lambda d: (d["CO2"] + 1).pct_change())
)

Pearson correlations

In [None]:
# Heatmap of the pairwise correlations (DataFrame.corr defaults) between the model columns.
(
    model_data
    .drop(columns=["DATETIME", "ID"])
    .corr()
    # Remove rows and columns of the matrix that contain only NaN.
    .dropna(axis=0, how='all')
    .dropna(axis=1, how='all')
    .pipe(px.imshow, text_auto='.3f', width=1000)
)

Preprocessing for modelling

In [None]:
# The three rooms with the most rows; the second entry (position 1) is used as the worked example.
examples = model_data.ID.value_counts().head(3)
EX1_ID = examples.index[1]
display(examples)
EX1_ID

In [None]:
# Share of rows flagged SKEMALAGT or BOOKET, scaled by 2.1.
# NOTE(review): the 2.1 factor is not explained in this notebook — confirm its origin.
# NOTE(review): astype(bool) turns NaN into True, so any missing SKEMALAGT value
# would count as scheduled — verify that none remain at this point.
est_usage = 2.1 * (model_data["SKEMALAGT"].astype(bool) | model_data["BOOKET"].astype(bool)).sum() / model_data.shape[0]
est_usage 

In [None]:
def fit_predict(df, usage=est_usage, n_estimators=500, random_state=42):
    """Fit an IsolationForest on `df` and score the very same rows.

    Args:
        df: Feature table (numeric columns only) to fit and score.
        usage: Accepted for call compatibility. NOTE(review): it is not
            forwarded to the model; the outlier share stays pinned at 0.4
            exactly as the cell was written. Confirm whether this argument
            should drive it (scikit-learn requires a value in (0, 0.5]).
        n_estimators: Number of trees in the forest.
        random_state: Seed that makes the fit reproducible.

    Returns:
        Tuple `(scores, predictions)`: the output of `decision_function`
        and the output of `predict` for the rows of `df`.
    """
    model_IF = IsolationForest(
        # IsolationForest has no `usage` keyword (passing one raises TypeError);
        # the expected share of outliers is configured through `contamination`.
        contamination=0.4,  # est_usage
        random_state=random_state,
        n_estimators=n_estimators,
        verbose=1
    )
    model_IF.fit(df)

    scores = model_IF.decision_function(df)
    predictions = model_IF.predict(df)
    return scores, predictions

In [None]:
def format_results(dataf, scores, predictions):
    """Put the model output next to the timestamp and CO2 reading of each row.

    Args:
        dataf: Frame the model was run on; must contain DATETIME and CO2.
        scores: One score per row of `dataf`.
        predictions: One predicted label per row of `dataf`.

    Returns:
        DataFrame with the columns DATETIME, usage_score, usage and CO2,
        where `usage` is stored as a categorical column.
    """
    columns = {
        'DATETIME': dataf['DATETIME'],
        'usage_score': scores,
        'usage': predictions,
        'CO2': dataf['CO2'],
    }
    table = pd.DataFrame(columns)
    return table.astype({"usage": "category"})

In [None]:
# `EX` was used below without being defined anywhere in the notebook, so a fresh
# kernel failed here with NameError. Define it as the rows of the example room
# chosen above (EX1_ID); the original index is kept so later index-aligned
# assignments against `EX` still line up.
# NOTE(review): confirm this matches the selection the deleted cell used.
EX = model_data[model_data["ID"].eq(EX1_ID)]

# Feature columns fed to the model for the example room; commented entries are
# candidates that are currently switched off.
EX_FEATURES = EX[[
    # 'ID',
    'SKEMALAGT',
    # 'CO2',
    # 'TEMP',
    # 'MOTION',
    # 'IAQ',
    # 'BOOKET',
    # 'DATETIME',
    'CO2_ACC',
    'TEMP_ACC',
    # 'MOTION_ACC',
    # 'IAQ_ACC',
    # 'AKTIVITET',
    # 'DOW',
    # 'HOUR',
    # 'DAY_TYPE'
]]


In [None]:
# Fit the model on the example room's features and collect scores and labels
# next to DATETIME and CO2.
scores, predictions = fit_predict(EX_FEATURES, n_estimators=100, usage=est_usage)

results = format_results(EX, scores, predictions)
# Number of rows per predicted label.
results.usage.value_counts()

In [None]:
# Bar chart of the model scores for the example room, counted in 10 bins.
(
    results.usage_score.value_counts(bins=10)
    .reset_index()
    # Interval labels become text so they can serve as bar categories.
    .astype({"usage_score": str})
    # Order the bars by the left edge parsed out of each "(left, right]" label.
    .sort_values("usage_score", key=lambda d: d.str.extract(r"\((.+)\,", expand=False).astype(float))
    .plot(x="usage_score", y="count", kind="bar", title="IF anomaly score distribution", text="count")
)

In [None]:
# CO2 bar chart for the example room, coloured by the usage label after two
# threshold overrides have been applied on top of the model output.
fig = (
    (
        results
        .assign(CO2_ACC=EX["CO2_ACC"])
        # Override 1: rows with CO2 below 600 are always labelled 1.
        .assign(
            usage=lambda d: np.where(
                d["CO2"].lt(600),
                1,
                d["usage"],
            ),
        )
        # Override 2: rows with CO2 above 1200, or with CO2 above 600 and a
        # relative CO2 change above 0.1, are always labelled -1.
        .assign(
            usage=lambda d: np.where(
                (d["CO2_ACC"].gt(.1) & d["CO2"].gt(600)) | (d["CO2"].gt(1200)),
                -1, 
                d["usage"]
            )
        )
        # [lambda d: (d["DATETIME"] > "2022-08-09") & (d["DATETIME"] <= "2022-11-01")]
    ).plot.bar(    
    x='DATETIME',
    y='CO2',
    color='usage',
    title='CO2 anomaly',
    width=3000,
    hover_data=EX[["CO2_ACC"]],
)
)
# fig["layout"]["xaxis"].update(range=["2022-08-09", "2023-01-01"]) 
# Remove the bar outlines.
fig.update_traces(dict(marker_line_width=0))
# Save a standalone HTML copy in the working directory, then render inline.
fig.write_html(f'anomaly-{KOMMUNE}-{EX1_ID}.html')
fig.show()

### Process for delivery

In [None]:
# Preview the first rows of the modelling table.
model_data.head()

### Prediction flow

******

In [None]:
def fit_predict(df, usage=0.19, n_estimators=500, random_state=42):
    """Fit an IsolationForest on `df` and score the very same rows.

    Args:
        df: Feature table (numeric columns only) to fit and score.
        usage: Accepted for call compatibility. NOTE(review): it is not
            forwarded to the model; the outlier share stays pinned at 0.4
            exactly as the cell was written. Confirm whether this argument
            should drive it (scikit-learn requires a value in (0, 0.5]).
        n_estimators: Number of trees in the forest.
        random_state: Seed that makes the fit reproducible.

    Returns:
        Tuple `(scores, predictions)`: the output of `decision_function`
        and the output of `predict` for the rows of `df`.
    """
    model_IF = IsolationForest(
        # IsolationForest has no `usage` keyword (passing one raises TypeError);
        # the expected share of outliers is configured through `contamination`.
        contamination=0.4,  # est_usage
        random_state=random_state,
        n_estimators=n_estimators,
        verbose=1
    )
    model_IF.fit(df)

    scores = model_IF.decision_function(df)
    predictions = model_IF.predict(df)
    return scores, predictions
    

In [None]:
# Share of rows flagged SKEMALAGT or BOOKET, scaled by 2.1.
# NOTE(review): the 2.1 factor is not explained in this notebook — confirm its origin.
# NOTE(review): astype(bool) turns NaN into True, so any missing SKEMALAGT value
# would count as scheduled — verify that none remain at this point.
est_usage = 2.1 * (model_data["SKEMALAGT"].astype(bool) | model_data["BOOKET"].astype(bool)).sum() / model_data.shape[0]
est_usage 

In [None]:
def run_model(data, features):
    """Fit the model on one room's rows and attach its output as new columns.

    Args:
        data: Rows of a single room.
        features: Names of the columns fed to the model.

    Returns:
        A copy of `data` with the columns `usage_score` and `usage` added.
    """
    scores, predictions = fit_predict(
        data[features],
        n_estimators=100,
        usage=est_usage,
    )
    # Return an augmented copy instead of writing into `data`: the frame is
    # handed in by groupby, and pandas does not support mutating a group from
    # inside the applied function.
    return data.assign(usage_score=scores, usage=predictions)

In [None]:
def add_heuristics(data):
    """Override the model's `usage` label with two CO2 threshold rules.

    Rules, applied in this order:
      1. CO2 below 600 -> label 1.
      2. CO2 above 1200, or CO2 above 600 together with CO2_ACC above 0.1
         -> label -1.
    Rows matched by neither rule keep their incoming label.

    Args:
        data: Frame with the columns CO2, CO2_ACC and usage.

    Returns:
        A new DataFrame with the adjusted `usage` column.
    """
    co2 = data["CO2"]

    low_co2 = co2.lt(600)
    after_low_rule = np.where(low_co2, 1, data["usage"])

    rising_or_high = (data["CO2_ACC"].gt(.1) & co2.gt(600)) | co2.gt(1200)
    after_both_rules = np.where(rising_or_high, -1, after_low_rule)

    return data.assign(usage=after_both_rules)

In [None]:
def export_plots(data, kommune):
    """Write one CO2 bar chart per room to `result_plots/<kommune>/` as HTML.

    Args:
        data: Frame with the columns ID, DATETIME, CO2, CO2_ACC and usage.
        kommune: Municipality name; used in the chart title, the output
            folder and (lower-cased) the file names.

    Returns:
        `data` unchanged, so the function can close a `.pipe` chain.
    """
    from pathlib import Path

    # write_html does not create missing folders, so make sure the target exists.
    out_dir = Path("result_plots") / kommune
    out_dir.mkdir(parents=True, exist_ok=True)

    for room_id, room_rows in data.groupby("ID"):
        fig = room_rows.plot.bar(
            x='DATETIME',
            y='CO2',
            color='usage',
            title=f'Anvendelsesmodel - Lokale {room_id} - {kommune} Kommune',
            width=3000,
            hover_data=room_rows[["CO2_ACC"]],
        )
        # Remove the bar outlines.
        fig.update_traces(dict(marker_line_width=0))
        fig.write_html(str(out_dir / f'anomaly-{kommune.lower()}-{room_id}.html'))

    return data

In [None]:
# Columns passed to run_model as the model features for every room.
room_features = ["SKEMALAGT", "CO2_ACC", "TEMP_ACC"]

In [None]:
# Run the model room by room, apply the threshold overrides and export the plots.
# The rooms are processed in an explicit loop and concatenated (in sorted ID
# order) instead of groupby.apply + reset_index(drop=True): that combination
# only kept the ID column because the applied function was handed the grouping
# column, which pandas has deprecated.
# NOTE(review): this rebinds `dataf`, which earlier held the kommune-filtered raw data.
dataf = (
    pd.concat(
        [
            run_model(room_rows, features=room_features)
            for _, room_rows in model_data.sort_values(["DATETIME", "ID"]).groupby("ID")
        ],
        ignore_index=True,
    )
    .pipe(add_heuristics)
    .pipe(export_plots, kommune=KOMMUNE)
)