In [76]:
# base
import pandas as pd
import numpy as np
from datetime import datetime

# anomaly detection
from sklearn.ensemble import IsolationForest

# settings
pd.options.plotting.backend = "plotly"

In [3]:
SENSOR_COLUMNS = ["CO2", "TEMP", "MOTION", "IAQ", "BOOKET"]

### IMPORT

In [4]:

def diagnose(df, col, dfunc="unique", **kwargs):
    print(getattr(df[col], dfunc)(**kwargs))
    return df

def fill_na(df, cols, values, types):
    return df.assign(
        **{col: df[col].fillna(value).astype(_type) for col, value, _type in zip(cols, values, types)}
        # .fillna(value).astype(_type
        # fillna(method="ffill", limit=2)
    )

def display_missing_values(df):
    for i, munc in df.groupby('KOMMUNE'):
        print(f"Missing values for {i}")
        display(
            munc
            # assign null if False else 1
            .groupby('ID')
            [SENSOR_COLUMNS + ['SKEMALAGT', 'TYPE']]
            .apply(lambda x: x.isnull().sum()/len(x))
            .style.format(precision=2)
            .background_gradient(cmap='Reds', axis=0, vmin=0, vmax=1)
        )
    return df

    
def merge_dt(df, date, time, name, sep=" "):
    return df.assign(
        **{name: lambda d: pd.to_datetime(
                d[date] + sep + d[time]
            )
        }
    )

def drop_cols(df, cols):
    return df.drop(columns=cols)

def filter_values(df, col, values):
    return df[lambda d: d[col].isin(values)]


In [5]:

full_data = (
    pd.read_csv("data/Skemaer.csv")
    .pipe(drop_cols, cols=["KOMMUNE_DATO_LOKALE_TIME"])
    .pipe(diagnose, col="KOMMUNE", dfunc="unique")
    .pipe(display_missing_values)
    .pipe(merge_dt, date="DATE", time="TIME", name="DATETIME")
    .pipe(fill_na, 
        cols=["CO2", "TEMP", "MOTION", "IAQ"],
        values=[487, 20.0, 0.0, .03],
        types=[float, float, float, float]
    )
)


['Syddjurs' 'Favrskov' 'Aarhus']
Missing values for Aarhus


Unnamed: 0_level_0,CO2,TEMP,MOTION,IAQ,BOOKET,SKEMALAGT,TYPE
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
02.S.08,0.14,0.14,0.0,0.14,1.0,0.0,0.0
02.S.09,1.0,1.0,0.0,1.0,1.0,0.0,0.0
02.S.10,0.14,0.14,0.0,0.14,1.0,0.0,0.0
02.S.11,0.14,0.14,0.0,0.14,1.0,0.0,0.0
02.S.13,0.24,0.24,0.0,0.24,1.0,0.0,0.0
03.S.03,0.16,0.16,0.0,0.16,1.0,0.0,0.0
03.S.04,0.17,0.17,0.0,0.17,1.0,0.0,0.0
03.S.05,0.19,0.19,0.0,0.19,1.0,0.0,0.0
03.S.06,0.14,0.14,0.0,0.14,1.0,0.0,0.0
03.S.07,0.19,0.19,0.0,0.2,1.0,0.0,0.0


Missing values for Favrskov


Unnamed: 0_level_0,CO2,TEMP,MOTION,IAQ,BOOKET,SKEMALAGT,TYPE
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
D.03,0.94,0.94,0.94,0.94,0.7,0.0,0.0
D.05,0.94,0.94,0.94,0.94,0.8,0.0,0.0
D.06,0.94,0.94,0.94,0.94,0.8,0.0,0.0
D.08,0.94,0.94,0.94,0.94,0.79,0.0,0.0
D.30,0.94,0.94,0.94,0.94,0.8,0.0,0.0
D.31,0.94,0.94,0.94,0.94,0.8,0.0,0.0
D.32,0.94,0.94,0.94,0.94,0.79,0.0,0.0


Missing values for Syddjurs


Unnamed: 0_level_0,CO2,TEMP,MOTION,IAQ,BOOKET,SKEMALAGT,TYPE
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1.0.012,1.0,0.01,1.0,0.01,0.94,0.0,0.0
12.0.001,0.01,0.01,0.01,0.01,1.0,0.0,0.0
4.0.001,0.01,0.01,0.01,0.01,1.0,0.0,0.0
7.0.002,0.01,0.01,0.01,0.01,1.0,0.0,0.0


### PREPROCESSING

In [6]:
def drop_inactive_ranges(df):
    # Identify intervals of 5+ rows of identical values
    to_drop = (
        df.groupby("ID")
        ["CO2"].transform(
            lambda x: x.rolling(5).apply(
                lambda x: x.nunique() == 1
            )
        )
    )
    cdataf = df.drop(index=to_drop[lambda d: d.eq(1.0)].index)
    print(f"Turns {df.shape[0]} rows into {cdataf.shape[0]} rows - Dropping {(df.shape[0] - cdataf.shape[0])/1000}K rows")
    return cdataf

def add_date_range_group(grp):
    grp['DATE_RANGE_GROUP'] = grp['DATETIME'].transform(lambda x: (x.diff().dt.total_seconds()/ 60).ne(15).cumsum())
    return grp

def acceleration_features(df):

    return (
        df.sort_values("DATETIME").groupby("ID").apply(add_date_range_group)
        .reset_index(drop=True)
        .assign(
            
            CO2_ACC=lambda d: d.groupby(["ID", "DATE_RANGE_GROUP"])["CO2"].pct_change(fill_method="ffill").fillna(0),
            TEMP_ACC=lambda d: d.groupby(["ID", "DATE_RANGE_GROUP"])["TEMP"].pct_change(fill_method="ffill").fillna(0),
            MOTION_ACC=lambda d: d.groupby(["ID", "DATE_RANGE_GROUP"])["MOTION"].pct_change(fill_method="ffill").fillna(0),
            IAQ_ACC=lambda d: d.groupby(["ID", "DATE_RANGE_GROUP"])["IAQ"].pct_change(fill_method="ffill").fillna(0),
        )
    )


def preprocess_for_modelling(df):
    return (
        df
        .assign(
            AKTIVITET=lambda d: pd.factorize(d["TIDSPUNKT_TYPE"])[0],
            DOW=lambda d: d["DATETIME"].dt.dayofweek,
            HOUR=lambda d: d["DATETIME"].dt.hour,
            DAY_TYPE=lambda d: pd.factorize(d["TYPE"])[0],
            BOOKET=lambda d: d["BOOKET"].fillna(0.0),
            
        )
        .drop(columns=[
            "DATE",
            "TIDSPUNKT_TYPE",
            "TYPE",
            "DATE_RANGE_GROUP",
            "DAYNAME",
            "TIME",
            "SKOLE",
            "KOMMUNE",
            "NAVN"
            ]
        )
    )

### MODELLING

In [48]:
def fit_predict(df, contamination=0.19, n_estimators=500, random_state=42):
    model_IF = IsolationForest(
        contamination=contamination, #est_contamination,
        random_state=random_state,
        n_estimators=n_estimators,
        verbose=1
    )
    model_IF.fit(df)

    scores = model_IF.decision_function(df)
    predictions = model_IF.predict(df)
    return scores, predictions


def run_model(data, features, contamination=0.19):

    scores, predictions = fit_predict(
        data[features], 
        n_estimators=100, 
        contamination=contamination,
    )
    data["IF_anomaly_score"] = scores
    data["IF_anomaly"] = predictions
    return data

def add_heuristics(data):
    return(
        data
        .assign(
            IF_anomaly=lambda d: np.where(
                d["CO2"].lt(600),
                1,
                d["IF_anomaly"],
            ),
        )
        .assign(
            IF_anomaly=lambda d: np.where(
                (d["CO2_ACC"].gt(.1) & d["CO2"].gt(600)) | (d["CO2"].gt(1200)),
                -1, 
                d["IF_anomaly"]
            )
        )
        .assign( # Align score with anomaly label
            IF_anomaly_score=lambda d: np.where(
                (d["IF_anomaly"].eq(-1) & d["IF_anomaly_score"] > 0) | (d["IF_anomaly"].eq(1) & d["IF_anomaly_score"] < 0),
                -d["IF_anomaly_score"],
                d["IF_anomaly_score"]
            )
        )

    )

def export_plots(data, kommune):

    for i, dataf in data.groupby("ID"):
        fig = dataf.plot.bar(    
            x='DATETIME',
            y='CO2',
            color='IF_anomaly',
            title=f'Anvendelsesmodel - Lokale {i} - {kommune} Kommune',
            width=3000,
            hover_data=dataf[["CO2_ACC"]],
        )
        fig.update_traces(dict(marker_line_width=0))
        fig.write_html(f'result_plots/{kommune}/anomaly-{kommune.lower()}-{i}.html')
        
    return data

In [51]:
room_features = ["SKEMALAGT", "CO2_ACC", "TEMP_ACC"]

In [54]:
def calculate_contamination(data, kommune, coeff=2.1):
    df = full_data[lambda d: d["KOMMUNE"] == KOMMUNE]
    est_contamination = coeff * (df["SKEMALAGT"].astype(bool) | df["BOOKET"].fillna(0).astype(bool)).sum() / df.shape[0]
    print(f"Estimated contamination score {est_contamination} | coeff: {coeff}")
    return est_contamination

In [56]:
results = []
for kommune in full_data["KOMMUNE"].unique():
    print(f"Running flow for {kommune}")

    est_contamination = calculate_contamination(full_data, kommune, coeff=2.1)

    dataf = (
        # Processing
        full_data
        .pipe(filter_values, col="KOMMUNE", values=[kommune])
        .pipe(drop_inactive_ranges)
        .pipe(acceleration_features)
        .pipe(preprocess_for_modelling)

        # Modelling
        .groupby("ID").apply(run_model, features=room_features, contamination=est_contamination)
        .reset_index(drop=True)
        .pipe(add_heuristics)
        .assign(KOMMUNE=kommune)

        # Export visuals
        .pipe(export_plots, kommune=kommune)

        # Postprocess 
        [["DATETIME", "ID", "KOMMUNE", "IF_anomaly", "IF_anomaly_score"]]
    )
    results.append(dataf)


Running flow for Aarhus
Estimated contamination score 0.18832383751477735 | coeff: 2.1
Turns 115200 rows into 53526 rows - Dropping 61.674K rows
Running flow for Aarhus
Estimated contamination score 0.18832383751477735 | coeff: 2.1
Turns 201600 rows into 23410 rows - Dropping 178.19K rows
Running flow for Aarhus
Estimated contamination score 0.18832383751477735 | coeff: 2.1
Turns 730848 rows into 559222 rows - Dropping 171.626K rows


In [66]:
full_results = pd.concat(results).sort_values("DATETIME")
full_results.shape

(636158, 5)

In [65]:
original_data = (
    pd.read_csv("data/Skemaer.csv")
    .pipe(merge_dt, date="DATE", time="TIME", name="DATETIME")
)
original_data.shape

(1047648, 17)

In [69]:
combined = original_data.merge(
    full_results,
    on=["DATETIME", "ID", "KOMMUNE"],
    how="left",

    
)

In [72]:
combined.IF_anomaly.value_counts(dropna=False)

IF_anomaly
 1.0    529803
 NaN    411490
-1.0    106355
Name: count, dtype: int64

In [77]:
(
    combined
    [['DATE', 'TIME', 'DATETIME', 'ID', 'KOMMUNE', 'IF_anomaly', 'IF_anomaly_score']]
    .to_csv(f"data/results-{datetime.now().strftime('%Y-%m-%d-%H-%M')}.csv", index=False)
)