In [38]:
# base
import pandas as pd
import numpy as np
from datetime import datetime
from pathlib import Path
import types

# anomaly detection
from sklearn.ensemble import IsolationForest

# settings
pd.options.plotting.backend = "plotly"

### IMPORT

In [17]:
class DataLoader():
    """Collection of classmethod steps for loading and cleaning the sensor CSV.

    Every step receives a DataFrame as its first argument and returns a
    DataFrame, so the steps can be chained with ``DataFrame.pipe`` or
    selected by name through ``load``.
    """

    # Columns checked when reporting missing values. ``full_process`` fills
    # every one of them except the last ("BOOKET") with a default value.
    SENSOR_COLUMNS = ["CO2", "TEMP", "MOTION", "IAQ", "BOOKET"]

    @classmethod
    def diagnose(cls, df, col, dfunc="unique", **kwargs):
        """Print ``df[col].<dfunc>(**kwargs)`` and return ``df`` unchanged."""
        print(getattr(df[col], dfunc)(**kwargs))
        return df

    @classmethod
    def fill_na(cls, df, cols, values, types):
        """Fill missing values column by column and cast the result.

        Args:
            df: Input frame.
            cols: Column names to fill.
            values: Fill value for each entry of ``cols`` (same order).
            types: Target dtype for each entry of ``cols`` (same order).

        Returns:
            A new frame; ``df`` itself is not modified.
        """
        return df.assign(
            **{
                col: df[col].fillna(value).astype(_type)
                for col, value, _type in zip(cols, values, types)
            }
        )

    @classmethod
    def display_missing_values(cls, df):
        """Display, per municipality, the share of missing sensor values per room.

        Only rows between the first and the last timestamp at which at least
        one of ``SENSOR_COLUMNS`` holds a value are taken into account.
        Relies on ``display`` being available as a builtin, as it is inside
        an IPython/Jupyter session.

        Returns:
            ``df`` unchanged, so the step can sit inside a ``pipe`` chain.
        """
        for i, munc in df.groupby('KOMMUNE'):
            print(f"Missing values for {i}")
            ordered = munc.sort_values("DATETIME", ascending=True)

            # Timestamps of rows where at least one sensor column is populated.
            active = ordered.dropna(subset=cls.SENSOR_COLUMNS, how="all")["DATETIME"]
            if active.empty:
                # Without any sensor reading there is no activity window to
                # report on; skip instead of failing on ``iloc[[0, -1]]``.
                print("No sensor readings available")
                continue

            first_seen, last_seen = active.iloc[[0, -1]]
            in_window = ordered[
                ordered["DATETIME"].between(first_seen, last_seen, inclusive="both")
            ]

            display(
                in_window
                .groupby('ID')
                [cls.SENSOR_COLUMNS]
                .apply(lambda x: x.isnull().sum() / len(x))
                .style.format(precision=2)
                .background_gradient(cmap='Reds', axis=0, vmin=0, vmax=1)
            )
        return df

    @classmethod
    def merge_dt(cls, df, date, time, name, sep=" "):
        """Add column ``name`` by parsing ``df[date] + sep + df[time]`` as datetimes."""
        return df.assign(
            **{name: lambda d: pd.to_datetime(d[date] + sep + d[time])}
        )

    @classmethod
    def drop_cols(cls, df, cols):
        """Return ``df`` without the columns listed in ``cols``."""
        return df.drop(columns=cols)

    @classmethod
    def full_process(cls, df, **kwargs):
        """Run the complete cleaning sequence.

        Steps, in order: drop ``KOMMUNE_DATO_LOKALE_TIME``, print the unique
        ``KOMMUNE`` values, build ``DATETIME`` from ``DATE`` and ``TIME``,
        display the missing-value report and fill the remaining gaps in
        CO2 / TEMP / MOTION / IAQ with 487 / 20.0 / 0.0 / 0.03.

        ``**kwargs`` is accepted so ``load`` can forward extra arguments; it
        is currently not used.
        """
        return (
            df
            .pipe(cls.drop_cols, cols=["KOMMUNE_DATO_LOKALE_TIME"])
            .pipe(cls.diagnose, col="KOMMUNE", dfunc="unique")
            .pipe(cls.merge_dt, date="DATE", time="TIME", name="DATETIME")
            .pipe(cls.display_missing_values)
            .pipe(
                cls.fill_na,
                cols=cls.SENSOR_COLUMNS[:-1],
                values=[487, 20.0, 0.0, .03],
                types=[float, float, float, float],
            )
        )

    @classmethod
    def _load(cls, path):
        """Read the CSV at ``path`` into a DataFrame."""
        return pd.read_csv(path)

    @classmethod
    def load(cls, path="data/Skemaer.csv", steps=None, **kwargs):
        """Read ``path`` and apply the requested processing steps.

        Args:
            path: CSV file to read.
            steps: Either a mapping ``{step_name: step_kwargs}`` or a plain
                iterable of step names (each then runs without extra
                arguments). A single name may be given as a string. If
                ``"all"`` is among the steps, ``full_process`` is run and
                every other entry is ignored. ``None`` or an empty
                collection returns the raw frame.
            **kwargs: Forwarded to ``full_process`` when ``"all"`` is requested.

        Returns:
            The processed DataFrame.
        """
        dataf = cls._load(path)

        if not steps:
            return dataf

        # A bare string is one step name, not an iterable of characters.
        if isinstance(steps, str):
            steps = [steps]

        if "all" in steps:
            return cls.full_process(dataf, **kwargs)

        if isinstance(steps, dict):
            step_items = steps.items()
        else:
            step_items = ((step_name, {}) for step_name in steps)

        for func, func_kwargs in step_items:
            dataf = getattr(cls, func)(dataf, **(func_kwargs or {}))

        return dataf

In [None]:
# Load both inputs through the full cleaning pipeline.
# NOTE(review): both loads read "data/schemas.csv" - presumably `no_schemas`
# should come from a different file; confirm.
no_schemas = DataLoader.load(path="data/schemas.csv", steps={"full_process": {}})
schemas = DataLoader.load(path="data/schemas.csv", steps={"full_process": {}})

dataf = pd.concat([no_schemas, schemas])
(
    dataf
    # `list.remove` returns None, which made `subset` fall back to "all
    # columns"; build the list of comparison columns explicitly instead.
    # NOTE(review): the default keep="first" retains the `no_schemas` row of
    # each duplicate pair - confirm that is the intended side.
    .drop_duplicates(subset=[col for col in dataf.columns if col != "SKEMALAGT"])
    .sort_values(["DATETIME", "ID"])
    .to_csv("data/full.csv", index=False)
)

In [37]:
# Combine both frames, drop rows that are identical apart from SKEMALAGT and
# write the result ordered by time and room.
dataf = pd.concat([no_schemas, schemas])
(
    dataf
    # `list.remove` returns None, which made `subset` fall back to "all
    # columns"; build the list of comparison columns explicitly instead.
    # NOTE(review): the default keep="first" retains the `no_schemas` row of
    # each duplicate pair - confirm that is the intended side.
    .drop_duplicates(subset=[col for col in dataf.columns if col != "SKEMALAGT"])
    .sort_values(["DATETIME", "ID"])
    .to_csv("data/full.csv", index=False)
)

### PREPROCESSING

In [5]:
def filter_values(df, col, values):
    """Return the rows of ``df`` whose ``col`` entry is contained in ``values``."""
    keep_mask = df[col].isin(values)
    return df[keep_mask]


def drop_inactive_ranges(df, range_length=5):
    """Drop rows that close a run of identical CO2 readings within a room.

    For every ``ID`` a rolling window of ``range_length`` rows is moved over
    ``CO2``. A row is removed when the window ending on it holds a single
    distinct value; the first ``range_length - 1`` rows of such a run are
    therefore kept. Rows are removed by index label.

    Args:
        df: Frame with at least the columns ``ID`` and ``CO2``.
        range_length: Window size in rows.

    Returns:
        A new frame without the flagged rows. A summary line is printed.
    """
    def _window_is_flat(window):
        return window.nunique() == 1

    def _flag_flat_windows(co2):
        return co2.rolling(range_length).apply(_window_is_flat)

    flat_flags = df.groupby("ID")["CO2"].transform(_flag_flat_windows)
    # Incomplete windows yield NaN and therefore never compare equal to 1.0.
    stale_index = flat_flags[flat_flags.eq(1.0)].index

    cdataf = df.drop(index=stale_index)
    print(f"Turns {df.shape[0]} rows into {cdataf.shape[0]} rows - Dropping {(df.shape[0] - cdataf.shape[0])/1000}K rows")
    return cdataf

def add_date_range_group(grp, interval_minutes=15):
    """Label runs of consecutive, evenly spaced timestamps.

    A new run starts at the first row and at every row whose distance to the
    previous ``DATETIME`` differs from ``interval_minutes``. Runs are
    numbered from 1 in row order.

    Args:
        grp: Frame with a datetime column ``DATETIME``, already ordered in time.
        interval_minutes: Expected spacing between consecutive rows, in minutes.

    Returns:
        A copy of ``grp`` with the integer column ``DATE_RANGE_GROUP``; the
        input frame is left untouched.
    """
    gap_minutes = grp['DATETIME'].diff().dt.total_seconds() / 60
    # The first row has a NaN gap, which also counts as "not equal" and so
    # opens run number 1.
    return grp.assign(DATE_RANGE_GROUP=gap_minutes.ne(interval_minutes).cumsum())

def acceleration_features(df):
    """Add relative-change columns for the four sensor readings.

    The frame is ordered by ``DATETIME``, each room (``ID``) is split into
    segments by ``add_date_range_group``, and for CO2, TEMP, MOTION and IAQ
    the percentage change to the previous row of the same room and segment
    is stored as ``<SENSOR>_ACC``. Rows without a previous value get 0.

    Returns:
        A new frame with a fresh integer index, the column
        ``DATE_RANGE_GROUP`` and the four ``*_ACC`` columns.
    """
    segment_keys = ["ID", "DATE_RANGE_GROUP"]

    def _relative_change(sensor):
        # Factory, so every generated callable is bound to its own sensor.
        def _compute(d):
            return d.groupby(segment_keys)[sensor].pct_change(fill_method="ffill").fillna(0)
        return _compute

    segmented = (
        df.sort_values("DATETIME")
        .groupby("ID")
        .apply(add_date_range_group)
        .reset_index(drop=True)
    )

    acc_columns = {
        f"{sensor}_ACC": _relative_change(sensor)
        for sensor in ("CO2", "TEMP", "MOTION", "IAQ")
    }
    return segmented.assign(**acc_columns)


def preprocess_for_modelling(df):
    """Encode categorical and calendar information and drop unused columns.

    Adds ``AKTIVITET`` and ``DAY_TYPE`` (integer codes of ``TIDSPUNKT_TYPE``
    and ``TYPE``), ``DOW`` and ``HOUR`` (taken from ``DATETIME``), replaces
    missing ``BOOKET`` values with 0.0 and removes the descriptive columns
    listed below.

    Returns:
        A new frame; ``df`` is not modified.
    """
    unused_columns = [
        "DATE",
        "TIDSPUNKT_TYPE",
        "TYPE",
        "DATE_RANGE_GROUP",
        "DAYNAME",
        "TIME",
        "SKOLE",
        "KOMMUNE",
        "NAVN",
    ]
    timestamps = df["DATETIME"]

    encoded = df.assign(
        AKTIVITET=pd.factorize(df["TIDSPUNKT_TYPE"])[0],
        DOW=timestamps.dt.dayofweek,
        HOUR=timestamps.dt.hour,
        DAY_TYPE=pd.factorize(df["TYPE"])[0],
        BOOKET=df["BOOKET"].fillna(0.0),
    )
    return encoded.drop(columns=unused_columns)

### MODELLING

In [6]:
def fit_predict(df, **kwargs) -> tuple:
    """Fit an IsolationForest on ``df`` and evaluate it on the same rows.

    Args:
        df: Feature matrix used for both fitting and scoring.
        **kwargs: Passed to the ``IsolationForest`` constructor.

    Returns:
        ``(scores, predictions)`` - the output of ``decision_function`` and
        of ``predict`` for ``df``.
    """
    forest = IsolationForest(**kwargs).fit(df)
    return forest.decision_function(df), forest.predict(df)


def format_predictions(preds : list) -> list:
    """Turn model labels into 0/1 flags: ``-1`` becomes 1, anything else 0."""
    flags = []
    for label in preds:
        flags.append(1 if label == -1 else 0)
    return flags


def format_scores(scores : list) -> list:
    """Rescale scores linearly onto [0, 1].

    The smallest score maps to 0 and the largest to 1; the input order is
    preserved.

    Args:
        scores (list): Scores to rescale.

    Returns:
        Array of rescaled scores with the same length as ``scores``.
    """
    lowest, highest = min(scores), max(scores)
    return np.interp(scores, (lowest, highest), (0, 1))


def run_model(data : pd.DataFrame, features : list, **kwargs) -> pd.DataFrame:
    """Score the rows of ``data`` with an IsolationForest on ``features``.

    Args:
        data: Rows to score.
        features: Columns of ``data`` used as model input.
        **kwargs: Passed on to ``fit_predict``.

    Returns:
        ``data`` plus ``usage_score`` (one minus the rescaled decision
        score) and ``in_use`` (1 where the model labelled the row -1).
    """
    raw_scores, raw_labels = fit_predict(data[features], **kwargs)

    usage_score = 1 - format_scores(raw_scores)
    in_use = format_predictions(raw_labels)

    return data.assign(usage_score=usage_score, in_use=in_use)


### Heuristics

In [7]:

def add_heuristics(data, apply_rules, co2_low=600, co2_high=1400, co2_acc_threshold=0.01):
    """Post-process the model's ``in_use`` flags with CO2-based rules.

    The rules run in this order, each on the result of the previous one:

    1. ``CO2`` below ``co2_low`` -> not in use.
    2. ``CO2_ACC`` above ``co2_acc_threshold`` while ``CO2`` exceeds
       ``co2_low``, or ``CO2`` above ``co2_high`` -> in use.
    3. Fill a single gap: a 0 whose next row and two previous rows are 1
       becomes 1.
    4. Remove a single spike: a 1 whose previous and next rows are 0
       becomes 0.

    Neighbouring rows are looked up per room (``ID``), so the last row of one
    room is never compared with the first row of the next. Without an ``ID``
    column, neighbours are taken from the frame as a whole.

    Args:
        data: Frame with ``CO2``, ``CO2_ACC`` and ``in_use``, ordered in time
            within each room.
        apply_rules: When falsy, ``data`` is returned untouched.
        co2_low: CO2 level below which a room counts as unused.
        co2_high: CO2 level above which a room counts as used.
        co2_acc_threshold: Relative CO2 change above which a room counts as used.

    Returns:
        ``data`` itself when ``apply_rules`` is falsy, otherwise a new frame
        with an adjusted ``in_use`` column.
    """
    if not apply_rules:
        return data

    def _neighbour(frame, periods):
        # shift(1) is the previous row, shift(-1) the next one.
        if "ID" in frame.columns:
            return frame.groupby("ID", sort=False)["in_use"].shift(periods)
        return frame["in_use"].shift(periods)

    return (
        data
        .assign(  # Rule 1: low CO2 means the room is not in use
            in_use=lambda d: np.where(
                d["CO2"].lt(co2_low),
                0,
                d["in_use"],
            ),
        )
        .assign(  # Rule 2: rising CO2 above the low mark, or very high CO2
            in_use=lambda d: np.where(
                (d["CO2_ACC"].gt(co2_acc_threshold) & d["CO2"].gt(co2_low))
                | (d["CO2"].gt(co2_high)),
                1,
                d["in_use"],
            )
        )
        .assign(
            # Rule 3: close a one-row gap - 1 IF n-2 = 1, n-1 = 1, n = 0, n+1 = 1
            # NOTE(review): the earlier comment mentioned n+2, but the code has
            # always looked at n-2; confirm which one is intended.
            in_use=lambda d: np.where(
                _neighbour(d, -1).eq(1)
                & d["in_use"].eq(0)
                & _neighbour(d, 1).eq(1)
                & _neighbour(d, 2).eq(1),
                1,
                d["in_use"],
            )
        )
        .assign(
            # Rule 4: remove a one-row spike - 0 IF n-1 = 0, n = 1, n+1 = 0
            in_use=lambda d: np.where(
                _neighbour(d, -1).eq(0)
                & d["in_use"].eq(1)
                & _neighbour(d, 1).eq(0),
                0,
                d["in_use"],
            )
        )
    )


### Exports

In [8]:
def create_dir(run_id, _kommune) -> None:
    """Create ``results/<run_id>/<_kommune>``, including missing parents.

    Does nothing when the directory already exists.
    """
    target = Path(f"results/{run_id}/{_kommune}")
    target.mkdir(parents=True, exist_ok=True)


def export_plots(data, kommune, run_id):
    """Write one HTML bar chart per room and hand ``data`` back unchanged.

    For every ``ID`` in ``data`` a bar chart of ``CO2`` over ``DATETIME``,
    coloured by ``in_use``, is saved as
    ``results/<run_id>/<kommune lower-cased>/anomaly-<kommune lower-cased>-<ID>.html``.
    The figure methods used here assume that ``DataFrame.plot`` returns a
    Plotly figure (the notebook sets the pandas plotting backend to "plotly").

    Returns:
        ``data``, so the function can be used inside a ``pipe`` chain.
    """
    for i, room in data.groupby("ID"):

        _kommune = kommune.lower()
        create_dir(run_id, _kommune)

        bar_options = dict(
            x='DATETIME',
            y='CO2',
            color='in_use',
            title=f'Anvendelsesmodel - Lokale {i} - {kommune} Kommune',
            width=3000,
            # NOTE(review): this passes the CO2_ACC column of the whole
            # frame, not only of the current room - confirm this is intended.
            hover_data=data[["CO2_ACC"]],
        )
        fig = room.plot.bar(**bar_options)

        # Hide the bar outlines.
        fig.update_traces(dict(marker_line_width=0))
        fig.write_html(f'results/{run_id}/{_kommune}/anomaly-{_kommune}-{i}.html')

    return data

### Estimate usage

In [9]:
def estimate_usage(data, kommune, usage_coeff=2.1, usage_limit=.2):
    """Estimate the share of rows in which a municipality's rooms are used.

    The share of rows that are scheduled (``SKEMALAGT``) or booked
    (``BOOKET``) is multiplied by ``usage_coeff`` and capped at
    ``usage_limit``. Missing values in either column count as "not
    scheduled / not booked".

    Args:
        data: Frame with ``KOMMUNE``, ``SKEMALAGT`` and ``BOOKET``.
        kommune: Municipality to evaluate.
        usage_coeff: Multiplier applied to the observed share.
        usage_limit: Upper bound of the returned estimate.

    Returns:
        The capped estimate; 0.0 when ``data`` holds no row for ``kommune``.
    """
    df = data[data["KOMMUNE"] == kommune]

    # fillna(0) first: NaN.astype(bool) is True and would count as "in use".
    scheduled = df["SKEMALAGT"].fillna(0).astype(bool)
    booked = df["BOOKET"].fillna(0).astype(bool)

    n_rows = df.shape[0]
    occupied_share = (scheduled | booked).sum() / n_rows if n_rows else 0.0

    est_usage = min(usage_coeff * occupied_share, usage_limit)
    print(f"Est. usage score {est_usage:.2f} | usage coeff: {usage_coeff:.2f}")
    return est_usage

### Exit report

In [10]:
def exit_report(df, kommune, original_data):
    """Print a summary of a municipality's results and check room coverage.

    Args:
        df: Result frame with ``ID``, ``in_use`` and ``usage_score``.
        kommune: Municipality the results belong to.
        original_data: Unprocessed frame with ``KOMMUNE`` and ``ID``.

    Returns:
        ``df`` unchanged.

    Raises:
        AssertionError: If ``df`` holds a different number of distinct rooms
            than the rows of ``original_data`` for ``kommune``.
    """
    original = original_data[original_data["KOMMUNE"] == kommune]

    size_line = (
        f"EXIT REPORT - {kommune} | Size: {df.shape[0]} | "
        + f"Orig. size: {original.shape[0]}\n"
    )
    usage_line = (
        f"Mean usage rate: {df['in_use'].mean():.2f} | "
        + f"Mean usage score: {df['usage_score'].mean():.2f}\n"
    )
    print(size_line + usage_line + "\n\n")

    assert df['ID'].nunique() == original['ID'].nunique(), "Flow dropped rooms!"
    return df

*** 

### Pipeline

#### Set feature set

In [12]:
# Columns handed to the per-room model as features.
# Candidates that are currently switched off: "SKEMALAGT", "TEMP_ACC".
room_features = ["CO2_ACC"]

#### Define flow

In [13]:
def flow(
        data,
        run_id,
        usage_coeff,
        usage_limit,
        random_state,
        apply_rules,
    ):
    """Run the usage model municipality by municipality.

    This is a generator: for every distinct ``KOMMUNE`` in ``data`` it
    estimates the contamination rate, prepares the rows, fits one model per
    room (``ID``) on the module-level ``room_features``, optionally applies
    the heuristics, exports the plots and yields the result.

    Args:
        data: Cleaned input frame covering all municipalities.
        run_id: Identifier used for the export directory.
        usage_coeff: Forwarded to ``estimate_usage``.
        usage_limit: Forwarded to ``estimate_usage``.
        random_state: Forwarded to the model through ``run_model``.
        apply_rules: Whether ``add_heuristics`` adjusts the predictions.

    Yields:
        One frame per municipality with the columns ``DATETIME``, ``ID``,
        ``in_use``, ``usage_score`` and ``KOMMUNE``.
    """
    output_columns = ["DATETIME", "ID", "in_use", "usage_score"]

    for kommune in data["KOMMUNE"].unique():
        print(f"Running flow for {kommune}")

        est_contamination = estimate_usage(
            data,
            kommune,
            usage_coeff=usage_coeff,
            usage_limit=usage_limit
        )

        # Processing
        kommune_rows = filter_values(data, col="KOMMUNE", values=[kommune])
        active_rows = drop_inactive_ranges(kommune_rows)
        featured_rows = acceleration_features(active_rows)
        model_input = preprocess_for_modelling(featured_rows)

        # Modelling - one model per room
        scored = (
            model_input
            .groupby("ID")
            .apply(
                run_model,
                features=room_features,
                contamination=est_contamination,
                random_state=random_state,
            )
            .reset_index(drop=True)
        )

        # Heuristics
        adjusted = add_heuristics(scored, apply_rules=apply_rules)

        # Export plots
        plotted = export_plots(adjusted, kommune=kommune, run_id=run_id)

        # Postprocess - KOMMUNE was dropped during preprocessing, add it back
        report_frame = plotted[output_columns].assign(KOMMUNE=kommune)

        # Exit report
        yield exit_report(report_frame, kommune=kommune, original_data=data)

### Run flow

In [14]:
def run_flow(**kwargs):
    """Run ``flow`` under a fresh, timestamped run id.

    Args:
        **kwargs: Passed to ``flow`` (everything except ``run_id``).

    Returns:
        Dict with ``"run_id"`` (``RUN-<year>-<month>-<day>-<hour>-<minute>``)
        and ``"data"`` (the concatenation of all frames yielded by ``flow``).
    """
    timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M')
    run_id = f"RUN-{timestamp}"
    print(f"RUNNING FLOW '{run_id}'\n")

    per_kommune_frames = flow(run_id=run_id, **kwargs)
    combined = pd.concat(per_kommune_frames)

    return {"run_id": run_id, "data": combined}


In [15]:
# Run the whole pipeline on the CSV after the complete DataLoader cleaning
# pass ("all" selects DataLoader.full_process).
# NOTE(review): the output recorded for this cell ends in
# KeyError: 'SKEMALAGT'. estimate_usage reads data["SKEMALAGT"], so the file
# loaded here apparently lacks that column - presumably the combined
# "data/full.csv" was meant to be used; confirm.
results = run_flow(
    data=DataLoader.load(path="data/Skemaer.csv", steps=["all"]),
    usage_coeff=2.1,
    usage_limit=.2,
    random_state=42,
    apply_rules=True,
)

['Syddjurs' 'Aarhus' 'Favrskov']
Missing values for Aarhus


Unnamed: 0_level_0,CO2,TEMP,MOTION,IAQ,BOOKET
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
02.S.08,0.0,0.0,1.0,0.0,1.0
02.S.09,0.0,0.0,1.0,0.0,1.0
02.S.10,0.0,0.0,1.0,0.01,1.0
02.S.11,0.0,0.0,1.0,0.0,1.0
02.S.13,0.0,0.0,1.0,0.0,1.0
03.S.03,0.0,0.0,1.0,0.0,1.0
03.S.04,0.0,0.0,1.0,0.0,1.0
03.S.05,0.0,0.0,1.0,0.0,1.0
03.S.06,0.0,0.0,1.0,0.0,1.0
03.S.07,0.0,0.0,1.0,0.02,1.0


Missing values for Favrskov


Unnamed: 0_level_0,CO2,TEMP,MOTION,IAQ,BOOKET
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
D.03,0.0,0.0,1.0,1.0,0.65
D.05,0.0,0.0,1.0,1.0,0.8
D.06,0.0,0.0,1.0,1.0,0.8
D.08,0.0,0.0,1.0,1.0,0.74
D.09,0.0,0.0,1.0,1.0,0.8
D.14,0.0,0.0,1.0,1.0,0.8
D.15,1.0,0.0,1.0,1.0,1.0
D.17,0.0,0.0,1.0,1.0,0.83
D.19,0.0,0.0,1.0,1.0,0.74
D.23,0.0,0.0,1.0,1.0,0.71


Missing values for Syddjurs


Unnamed: 0_level_0,CO2,TEMP,MOTION,IAQ,BOOKET
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0.001,0.0,0.0,0.0,1.0,1.0
1.0.012,1.0,0.0,1.0,1.0,0.95
12.0.001,0.0,0.0,0.0,1.0,1.0
12.0.004,0.0,0.0,0.0,1.0,1.0
12.1.002,0.0,0.0,0.0,1.0,1.0
12.1.003,0.0,0.0,0.0,1.0,1.0
13.0.002,0.0,0.0,0.0,1.0,1.0
13.0.004,0.0,0.0,0.0,1.0,1.0
13.0.007,0.0,0.0,0.0,1.0,1.0
2.0.003,0.0,0.0,0.0,1.0,1.0


RUNNING FLOW 'RUN-2023-09-13-16-33'

Running flow for Syddjurs


KeyError: 'SKEMALAGT'

### Combine with original data

In [None]:
# Attach the predictions to every row of the original file and save them in
# the run's result directory.
(
    # Reload the default CSV with only the DATETIME column added.
    DataLoader.load(
        steps={
            "merge_dt": dict(date="DATE", time="TIME", name="DATETIME")
        }
    )
    # Left join: rows without a prediction are kept, with missing
    # in_use / usage_score.
    .merge(
        results["data"],
        on=["DATETIME", "ID", "KOMMUNE"],
        how="left"
    )
    [['DATE', 'TIME', 'DATETIME', 'ID', 'KOMMUNE', 'in_use', 'usage_score']]
    .to_csv(f"results/{results['run_id']}/results.csv", index=False)
)

### Diagnostics

In [None]:
# Number of rows per usage-score bucket (20 bins), ordered by bucket.
results["data"]["usage_score"].value_counts(bins=20).sort_index()

In [None]:
# Histogram of all usage scores, bar heights given as percentages.
# TODO: format the y-axis ticks (idea noted earlier: yaxis_tickformat=".1%/100").
(
    results["data"]["usage_score"]
    .hist(bins=100, title="Usage score distribution", histnorm='percent')
    .update_layout(showlegend=False, xaxis_title="Usage score")
)

In [None]:
# Per room: mean usage score, mean predicted usage rate and number of rows.
df = (
    results["data"]
    .groupby(["KOMMUNE", "ID"])
    .agg(
        MEAN_USAGE_SCORE=("usage_score", "mean"),
        MEAN_IN_USE_RATE=("in_use", "mean"),
        COUNT=("in_use", "count"),
    )
    .reset_index()
)

In [None]:
# Bar chart of the mean predicted usage rate per room, coloured by
# municipality; values are rounded to two decimals for display.
df.round(2).plot.bar(
    x="ID",
    y="MEAN_IN_USE_RATE",
    title="Mean predicted usage rate per room",
    color="KOMMUNE",
    width=600,
    # NOTE(review): the hover columns come from the unrounded frame - confirm
    # that mixing rounded bars with unrounded hover values is intended.
    hover_data=df[["MEAN_IN_USE_RATE", "COUNT"]],
)

In [None]:
# Per municipality and calendar day: mean usage score, mean predicted usage
# rate and number of rows.
df = (
    results["data"]
    .assign(DAY=lambda d: d["DATETIME"].dt.date)
    .groupby(["KOMMUNE", "DAY"])
    .agg(
        MEAN_USAGE_SCORE=("usage_score", "mean"),
        MEAN_USAGE_RATE=("in_use", "mean"),
        COUNT=("in_use", "count"),
    )
    .reset_index()
)

In [None]:
# Bar chart of the mean predicted usage rate per day, coloured by municipality.
fig = df.plot.bar(    
    x='DAY',
    y='MEAN_USAGE_RATE',
    color='KOMMUNE',
    width=800,
)
# Hide the bar outlines.
fig.update_traces(dict(marker_line_width=0))
# Last expression of the cell, so the figure is rendered as the cell output.
fig


In [None]:
# Shell escape: long listing of the current working directory.
!ls -lh