In [2]:
import pandas as pd
import numpy as np
import random
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from typing import Generator
from loguru import logger
import plotly.graph_objects as go
import pandas as pd
import numpy as np
from scipy.ndimage import gaussian_filter1d


from ai.tilly.config import SNOWFLAKE_CREDENTIALS
from ai.tilly.database.data.models import TrainingTimeslots

pd.options.plotting.backend = "plotly"

In [3]:
# Define your Snowflake connection parameters
SNOWFLAKE_URL = (
    "snowflake://{user}:{password}@{account}"
    "/{database}/{schema}?warehouse={warehouse}&role={role}"
).format(**SNOWFLAKE_CREDENTIALS)

# Create a synchronous engine
engine = create_engine(SNOWFLAKE_URL, future=True)
Session = sessionmaker(bind=engine)

def get_session() -> Generator[Session, None, None]:
    with Session() as session:
        yield session

In [4]:
def retrieve_data(session: Session, table: object) -> dict[str, pd.DataFrame]:
    """retrieve all timeslots using sqlalchemy"""
    logger.debug(f"Retrieving data from {table.__tablename__}")

    query = session.query(table).statement  # .limit(5000)

    return {
        school_room: df
        for school_room, df in (
            pd.read_sql(query, session.bind)
            .assign(SKOLE_ID=lambda d: d.SKOLE + "_" + d.ID)
            .rename(str, axis="columns")  # Fixes weird SA bug
            .groupby("SKOLE_ID")
        )
    }

data = retrieve_data(next(get_session()), table=TrainingTimeslots)

[32m2023-10-05 22:49:07.239[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mretrieve_data[0m:[36m3[0m - [34m[1mRetrieving data from 4_FEATURIZ_DRIFTOPTIMERING_TRAINING_TEST[0m


In [354]:
municipality = random.choice(list(data.keys()))
print(municipality)
test = (
    data[municipality] # "Rønbækskolen_E.16"
    .copy()
    .assign(DATETIME=lambda d: pd.to_datetime(d["DATE"].astype(str) + " " + d["TIME"].astype(str)))
    .sort_values("DATETIME")
)
test.shape

Rønbækskolen_C.27


(617, 18)

In [355]:
def plot_col(df, x = "DATETIME", y = "CO2", **kwargs):
    if not df.empty:
        fig = df.plot.bar(
            x="DATETIME",
            y="CO2",
            title=f"{y} values for {df.SKOLE_ID.iloc[0]}",
            **kwargs
            # prevent stacking CO2 values on same dates
        )
        fig.update_traces(dict(marker_line_width=0))
        fig.show()
    else:
        print("No data to plot")

In [356]:
plot_col(test) # Rønbækskolen_E.16

#### Preprocessing

#### Add missing timeslots

In [357]:
def add_missing_timeslots(df: pd.DataFrame, freq: str = '15T') -> pd.DataFrame:
    """ Adds the rows that are missing from the DataFrame, by merging
    it with a DataFrame containing all the timeslots."""

    static_values = (
        df.head(1)
        [["ID", "KOMMUNE", "SKOLE", "SKOLE_ID"]]
        .squeeze()
        .to_dict()
    )
    return (
        pd.DataFrame(
            {
                'DATETIME': pd.date_range(
                    start=df['DATETIME'].min(),
                    end=df['DATETIME'].max(), 
                    freq=freq
                )
            }
        )
        .assign(**static_values)
        .merge(
            df,
            on=['DATETIME', 'ID', 'KOMMUNE', 'SKOLE', 'SKOLE_ID'],
            how='left'
        )
    )

In [358]:
test.shape

(617, 18)

In [359]:
test = add_missing_timeslots(test)
test.shape

(1047, 18)

#### Interpolate islands

In [360]:
def interpolate_missing_islands(
        df : pd.DataFrame, *,
        target_col : str = "CO2",
        limit : int = 3, 
        direction : str = 'forward',
        method : str = "cubic", 
        **kwargs
    ) -> pd.DataFrame:

    """Interpolate missing values in a dataframe, but only for 
    islands of missing values, ie. rows where there 
    are no more than `limit` consecutive missing values
    in the `target_col` column."""

    return df.assign(
        CO2=lambda d: d[target_col].interpolate(
            method=method,
            limit=limit,
            limit_direction=direction,
            **kwargs,
        )
    )

test.shape

(1047, 18)

In [361]:
test = interpolate_missing_islands(test, target_col="CO2", limit=4)

In [362]:
test.shape, test.size

((1047, 18), 18846)

In [363]:
plot_col(test)

#### Remove stagnate intervals

In [364]:


def remove_stagnate_intervals(df, target_col: str ="CO2", threshold = 4) -> pd.DataFrame:
    """ Remove intervals where the CO2 value is the same for consecutive rows within time-contiguous blocks"""
    return (
        df
        .assign(
            time_diff=lambda d: d['DATETIME'].diff(),
            new_block=lambda d: (d['time_diff'] > pd.Timedelta(minutes=15)) | (d[target_col] != d[target_col].shift(1)),
            block_id=lambda d: d['new_block'].cumsum()
        )
        .assign(
            block_count=lambda d: d.groupby('block_id')['block_id'].transform('count')
        )
        [lambda d: d['block_count'].lt(threshold)]
        .drop(['time_diff', 'new_block', 'block_id', 'block_count'], axis=1)
    )



In [365]:
test.shape, test.size

((1047, 18), 18846)

In [366]:
test = remove_stagnate_intervals(test, target_col="CO2", threshold=5)

In [367]:
test.shape

(1047, 18)

In [368]:
plot_col(test)

#### Drop NaNs in CO2

In [369]:
test = test.dropna(subset=["CO2"])

In [370]:
test.shape

(800, 18)

In [371]:
plot_col(test)

#### Drop outliers

In [372]:


def drop_outliers(
    df, bounds: dict[str, tuple[float | None, float | None]]
) -> pd.DataFrame:
    """Drop rows where values are outside the given bounds.

    Args:
        df (pd.DataFrame): DataFrame to filter
        named_bounds (dict[str, tuple[float | None, float | None]]): Dictionary
            of column names and their lower and upper bounds. If a bound is None,
            then it is not applied."""

    mask = np.ones(df.shape[0], dtype=bool)

    for col_name, (lo, hi) in bounds.items():
        if lo is not None and hi is not None:
            mask &= (df[col_name].values >= lo) & (df[col_name].values <= hi)
        elif lo is not None:
            mask &= df[col_name].values >= lo
        elif hi is not None:
            mask &= df[col_name].values <= hi
        else:
            raise ValueError(f"Bounds for {col_name} are both None")

    return df[mask]

In [373]:
test.shape, test.size

((800, 18), 14400)

In [374]:
test = drop_outliers(
    test, 
    bounds={
        "CO2": (1, 8000),
        # "TEMP": (0, 40),
    }
)

In [375]:
test.shape

(800, 18)

In [376]:
plot_col(test)

#### Add velocity, acceleration and jerk

In [377]:



def calculate_kinematic_quantities(df, *, metric, window) -> pd.DataFrame:
    """Calculate the rolling velocity, acceleration and jerk for a given metric"""
    return (
        df
        .assign(rolling_velocity=lambda d: d[metric].rolling(window=window).apply(lambda x: np.gradient(x)[-1]))
        .assign(rolling_acceleration=lambda d: d['rolling_velocity'].rolling(window=window).apply(lambda x: np.gradient(x)[-1]))
        .assign(rolling_jerk=lambda d: d['rolling_acceleration'].rolling(window=window).apply(lambda x: np.gradient(x)[-1]))
        .assign(
            rolling_velocity=lambda d: d['rolling_velocity'].fillna(0),
            rolling_acceleration=lambda d: d['rolling_acceleration'].fillna(0),
            rolling_jerk=lambda d: d['rolling_jerk'].fillna(0),
        )
    )

def apply_time_group_funcs(df, funcs) -> pd.DataFrame:
    """Add the rolling velocity, acceleration and jerk for a given metric.
    The rolling quantities are calculated using the gradient of the metric and
    the given window size. The resulting null values are filled with zeros"""

    dataf = (
        df
        # Calculate time_diff and identify new blocks of time
        .assign(time_diff=lambda d: d['DATETIME'].diff())
        .assign(new_block=lambda d: d['time_diff'] > pd.Timedelta(minutes=15))
        .assign(block_id=lambda d: d['new_block'].cumsum())
    )
    for func, kwargs in funcs:
        dataf = (
            dataf.groupby('block_id').apply(func,**kwargs)
            .reset_index(drop=True)
        )
    return dataf.drop(columns=["block_id", "new_block", "time_diff"])

In [378]:
test.shape, test.size

((800, 18), 14400)

In [379]:
test = apply_time_group_funcs(
    test,
    funcs=[
        (calculate_kinematic_quantities, dict(metric="CO2", window=4)),
        
    ]
)
test.shape, test.size

((800, 21), 16800)

In [380]:
fig = go.Figure()

# Add traces for the kinematic quantities
fig.add_trace(go.Scatter(x=test['DATETIME'], y=test['rolling_velocity'], mode='lines', name='Velocity'))
fig.add_trace(go.Scatter(x=test['DATETIME'], y=test['rolling_acceleration'], mode='lines', name='Acceleration'))
fig.add_trace(go.Scatter(x=test['DATETIME'], y=test['rolling_jerk'], mode='lines', name='Jerk'))

# Add a trace for CO2 levels on a secondary y-axis
fig.add_trace(go.Scatter(x=test['DATETIME'], y=test['CO2'], mode='lines', name='CO2', yaxis='y2'))

# Update layout to support a secondary y-axis
fig.update_layout(
    title='Kinematic Quantities and CO2 over Time',
    xaxis=dict(title='Time'),
    yaxis=dict(title='Kinematic Quantities'),
    yaxis2=dict(title='CO2', overlaying='y', side='right')
)

# Show the figure
fig.show()

#### Apply smoothing

In [381]:


def gaussian_smooth(df, metric, *, std_dev):
    return (
        df
        .assign(
            **{
                f"{metric}_smoothed": 
                lambda d: gaussian_filter1d(d[metric], sigma=std_dev)
            }
        )
    )

In [382]:

test = apply_time_group_funcs(
    test,
    funcs=[
        (gaussian_smooth, dict(metric="CO2", std_dev=2)),
        
    ]
)

In [383]:
# Plotting using Plotly via Pandas
fig = test.plot(x='DATETIME', y=['CO2', 'CO2_smoothed'], kind='bar', barmode="group", backend="plotly")
fig.update_layout(title='Gausian Filter Smoothing of CO2 levels',
                  xaxis_title='Time',
                  yaxis_title='CO2 level',
                  
        )
fig.update_traces(dict(marker_line_width=0))
fig.show()


#### Anomaly detection

In [384]:
from pandas import DataFrame
from numpy import interp
from sklearn.ensemble import IsolationForest

from ai.tilly.config import MODEL_PARAMS


class Model:
    def __init__(self, estimated_usage: str | float = "auto", model_params = MODEL_PARAMS):
        self.model = IsolationForest(
            contamination=estimated_usage,
            **model_params,
            # n_jobs=-1,
        )

    def fit(self, X: DataFrame) -> "Model":
        self.model.fit(X)
        return self

    def predict(self, X: DataFrame) -> list[float]:
        """Predicts whether a given room is anomalous or not.
        Returns 1 if anomalous, 0 if not.
        """
        y_hats = self.model.predict(X)
        return [1 if y_hat == -1 else 0 for y_hat in y_hats]

    def score(self, X: DataFrame) -> list[float]:
        """Calculates the anomaly scores for each room and
        returns the normalized scores.
        """
        y_hat = self.model.decision_function(X)
        return 1 - interp(y_hat, (min(y_hat), max(y_hat)), (0, 1))


In [398]:

FEATURES = [
    # "CO2",
    "rolling_velocity",
    # "rolling_acceleration",
    # "rolling_jerk",
    "CO2_smoothed",
]

In [399]:
model = Model(
    model_params={
        "n_estimators": 300,
        # "max_samples": 0.25,
        # "max_features": 2,
        "random_state": 123,
        "verbose": 0,

    },
    estimated_usage=0.3
)
model.fit(test[FEATURES])

<__main__.Model at 0x7f12a0c63a00>

In [400]:
test["pred"] = model.predict(test[FEATURES])
test["score"] = model.score(test[FEATURES])

In [401]:
# Plotting using Plotly via Pandas
fig = test.assign(color=lambda d: d["pred"].map({0: 'blue', 1: 'red'})).plot.bar(
    x='DATETIME',
    y=['CO2'], 
    color="color", 
    hover_data=["rolling_velocity", "score"], 
    barmode="group", 
)
fig.update_layout(title='Usage detection',
                  xaxis_title='Time',
                  yaxis_title='CO2 level',
                  legend_title="Usage",
                  # since pred is not continuous the legend should not be either
                  
        )
fig.update_traces(dict(marker_line_width=0))
fig.show()

In [None]:
# Create a plot
plot = test.plot(
    x='DATETIME', 
    y=['CO2', 'Smoothed_CO2'], 
    kind='bar', 
    title='CO2 Levels Over Time',
    barmode='group'
)
plot.update_traces(dict(marker_line_width=0))
plot

#### Run flow

In [404]:
prep = Preprocessor()

processed = (
    test
    .pipe(prep.add_missing_timeslots)
    .pipe(prep.interpolate_missing_islands, target_col="CO2", limit=4)
    .pipe(prep.remove_stagnate_intervals, target_col="CO2", threshold=5)
    .dropna(subset=["CO2"])
    .pipe(prep.drop_outliers, bounds={"CO2": (1, 8000)})
    .pipe(
        prep.apply_time_group_funcs,
        funcs=[
            (
                prep.calculate_kinematic_quantities,
                dict(metric="CO2", window=4)
            ),
            (
                prep.gaussian_smooth,
                dict(metric="CO2", std_dev=2)
            )
        ]
    )
)

TODO:
- Find a way to fill single (or two contiguous) missing timeslots <br>
A: We will use cubic spline with a maximum of 2 missing timeslots

- Rethink how to fill larger gaps


In [None]:
import numpy as np
from scipy.interpolate import CubicSpline
import matplotlib.pyplot as plt

# Create a time series with missing values
x = np.linspace(0, 10, 11)
y = np.sin(x)
y_missing = y.copy()
y_missing[4:7] = np.nan  # Introduce missing values

# Indices for valid (non-missing) and missing data points
valid_idx = np.isfinite(y_missing)
missing_idx = np.isnan(y_missing)

# Create cubic spline interpolator
cs = CubicSpline(x[valid_idx], y_missing[valid_idx])

# Interpolate to fill in missing values
y_missing[missing_idx] = cs(x[missing_idx])

# Plotting the original data, missing values, and interpolated data
plt.scatter(x[valid_idx], y[valid_idx], label='Original Data', color='blue')
plt.scatter(x[missing_idx], cs(x[missing_idx]), label='Interpolated Values', color='red')
plt.plot(x, y_missing, label='Interpolated Time Series', color='green')
plt.legend()
plt.show()

In [None]:
import numpy as np
from scipy.optimize import minimize
import matplotlib.pyplot as plt

# Objective function to minimize for TV denoising
def tv_denoise_objective(x, y, alpha):
    return 0.5 * np.sum((x - y)**2) + alpha * np.sum(np.abs(np.diff(x)))

# Create synthetic time series data for CO2 levels
np.random.seed(0)
x = np.linspace(0, 10, 100)
y = 400 + 50 * np.sin(x) + np.random.normal(0, 10, 100)  # CO2 levels in ppm

# TV denoising
alpha = 3  # regularization parameter, adjust as needed
res = minimize(tv_denoise_objective, y, args=(y, alpha), method='L-BFGS-B')
y_denoised = res.x

# Plotting
plt.figure(figsize=(12, 6))
plt.plot(x, y_denoised, label='TV Denoised')
plt.plot(x, y, label='Original Data', linestyle='--')
plt.legend()
plt.xlabel('Time (in arbitrary units)')
plt.ylabel('CO2 level (in ppm)')
plt.title('CO2 levels: Original and TV Denoised')
plt.show()


In [None]:
import numpy as np
import pywt
import matplotlib.pyplot as plt

# Generate synthetic data
np.random.seed(0)
x = np.linspace(0, 10, 100)
y = 400 + 50 * np.sin(x) + np.random.normal(0, 10, 100)  # CO2 levels in ppm

# Perform a wavelet transform
coeffs = pywt.wavedec(y, 'haar')

# Threshold the wavelet coefficients (set small coefficients to 0)
threshold = 10.0  # adjust based on your specific needs
coeffs_thresholded = [np.where(np.abs(c) > threshold, c, 0) for c in coeffs]

# Reconstruct the signal
y_denoised = pywt.waverec(coeffs_thresholded, 'haar')

# Plot original and denoised signals
plt.figure(figsize=(12, 6))
plt.plot(x, y, label='Original Data', linestyle='--')
plt.plot(x, y_denoised, label='Wavelet Denoised')
plt.legend()
plt.xlabel('Time (in arbitrary units)')
plt.ylabel('CO2 level (in ppm)')
plt.title('CO2 levels: Original and Wavelet Denoised')
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d, median_filter
from scipy.stats import zscore

# Generate some example data
np.random.seed(0)
n_points = 1000
x = np.linspace(0, 4*np.pi, n_points)
y = 20 * np.sin(x) + 100 + np.random.normal(0, 5, n_points)  # Simulated CO2 levels

# Apply Gaussian filter
sigma = 10
y_smoothed_gaussian = gaussian_filter1d(y, sigma)

# Apply Median filter
size = 50  # size of the neighborhood
y_smoothed_median = median_filter(y, size=size)

# Calculate rolling Z-score on smoothed data for anomaly detection (simple example)
rolling_window = 10
z_scores = zscore(y_smoothed_gaussian)
rolling_z_scores = np.convolve(z_scores, np.ones(rolling_window)/rolling_window, mode='same')

# Threshold for anomaly detection
threshold = 2.0
anomalies = np.where(np.abs(rolling_z_scores) > threshold)

# Plotting
plt.figure(figsize=(15, 8))

plt.subplot(3, 1, 1)
plt.title("Original Data")
plt.plot(x, y)

plt.subplot(3, 1, 2)
plt.title("Original and Smoothed Data")
plt.plot(x, y, label='Original Data', alpha=0.5)
plt.plot(x, y_smoothed_gaussian, label='Smoothed by Gaussian Filter')
plt.plot(x, y_smoothed_median, label='Smoothed by Median Filter')
plt.legend()

plt.subplot(3, 1, 3)
plt.title("Anomaly Detection Based on Smoothed Data")
plt.plot(x, y_smoothed_gaussian, label='Smoothed Data')
plt.scatter(x[anomalies], y_smoothed_gaussian[anomalies], color='red', label='Anomalies')
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import pandas as pd
from scipy.interpolate import CubicSpline
import matplotlib.pyplot as plt

# Generate some example data
x = np.array([0, 1, 2, 3, 4, 5])
y = np.array([0, 1, 4, 9, 16, 25])

# Create a Cubic Spline
cs = CubicSpline(x, y)

# Points at which to evaluate the spline
x_new = np.linspace(0, 5, 50)

# Evaluate the spline at the new points
y_new = cs(x_new)

# Create DataFrame for better visualization
df = pd.DataFrame({'x': x_new, 'y': y_new})

# Plotting
plt.scatter(x, y, label='Data Points')
plt.plot(x_new, y_new, label='Cubic Spline')
plt.legend()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import OneClassSVM

# Generate synthetic data: normal data and outliers
X_normal = 0.3 * np.random.randn(100, 2)
X_anomaly = np.array([[5, 5], [6, 6], [-5, -5], [-6, -6]])

# Combine the two datasets
X = np.r_[X_normal, X_anomaly]

# Apply One-Class SVM
clf = OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(X_normal)
y_pred = clf.predict(X)

# Generate a mesh grid
xx, yy = np.meshgrid(np.linspace(-7, 7, 100), np.linspace(-7, 7, 100))
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Plot the decision boundary
plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu)
a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred')

# Plot data points
s = 40
b1 = plt.scatter(X_normal[:, 0], X_normal[:, 1], c='white', s=s, edgecolors='k')
b2 = plt.scatter(X_anomaly[:, 0], X_anomaly[:, 1], c='gold', s=s, edgecolors='k')
c = plt.scatter(X[y_pred == -1, 0], X[y_pred == -1, 1], c='red', s=s, edgecolors='k')

plt.axis('tight')
plt.legend(
    [a.collections[0], b1, b2, c],
    ['Learned frontier', 'Normal points', 'True anomalies', 'Detected anomalies'],
    loc="upper left",
)

plt.title('One-Class SVM Anomaly Detection')
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.semi_supervised import LabelSpreading

# Simulating 1000 time intervals (15 minutes each)
n_points = 1000

# Generate synthetic CO2 level data
np.random.seed(42)
X = np.sort(5 * np.random.rand(n_points, 1), axis=0)
y_true = np.sin(X).ravel()

# Add noise to simulate real-world data
y_true += 0.3 * np.random.randn(n_points)

# Label some data points
n_labeled_points = 100  # About 10% of the data is labeled
max_value = np.max(y_true)

# Normal CO2 levels (Label as 1)
y = np.ones(n_points)
y[y_true > max_value * 0.6] = 1

# Anomalous CO2 levels (Label as -1)
y[y_true < max_value * 0.4] = -1

# Unlabeled points (Label as 0)
y[n_labeled_points:] = 0

# Model
label_spread = LabelSpreading(kernel='knn', alpha=0.2)
label_spread.fit(X, y)

# Predict
y_pred = label_spread.predict(X)

# Visualizing the results
plt.figure(figsize=(12, 6))

plt.scatter(X[y == 0], y_true[y == 0], c='gray', label='Unlabeled')
plt.scatter(X[y == 1], y_true[y == 1], c='blue', label='Normal (True)')
plt.scatter(X[y == -1], y_true[y == -1], c='red', label='Anomaly (True)')
plt.scatter(X, y_pred, marker='x', c='lime', label='Predictions')

plt.title('Semi-supervised Anomaly Detection for CO2 Levels')
plt.xlabel('Time Interval (15 minutes)')
plt.ylabel('CO2')
