In [21]:
from mads_datasets.base import BaseDatastreamer
from mltrainer.preprocessors import BasePreprocessor
from pathlib import Path
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix
from torch import nn
import torch
import seaborn as sns
import numpy as np

from src import datasets, metrics

In [None]:
import tomllib

# Data directory and TOML configuration file, both given as relative paths.
datadir = Path('../data')
configfile = Path("config.toml")

# tomllib.load expects a binary file handle, hence mode 'rb'.
with configfile.open('rb') as f:
    config = tomllib.load(f)
print(config)

In [None]:
# Build the train/test parquet paths from the dataset name stored under
# the 'arrhythmia' key of the config.
trainfile = datadir / (config['arrhythmia'] + '_train.parq')
testfile = datadir / (config['arrhythmia'] + '_test.parq')
# Display both paths as the cell output.
trainfile, testfile

In [24]:
# Read the full train and test sets into DataFrames.
train_df = pd.read_parquet(trainfile)
test_df = pd.read_parquet(testfile)

In [25]:
# Split each frame on the 'target' column: rows with target == 0 go to the
# norm_* frame, all remaining rows go to the anom_* frame.
norm_train_df, anom_train_df = (
    train_df[train_df['target'] == 0],
    train_df[train_df['target'] != 0],
)
norm_test_df, anom_test_df = (
    test_df[test_df['target'] == 0],
    test_df[test_df['target'] != 0],
)

In [26]:
# create two separate datasets for normal and anomalous data
# (one-off export, kept commented out; the train files are written from the *_train_df frames)

# norm_train_df.to_parquet(datadir / 'heart_big_norm_train.parq')
# anom_train_df.to_parquet(datadir / 'heart_big_anom_train.parq')

# norm_test_df.to_parquet(datadir / 'heart_big_norm_test.parq')
# anom_test_df.to_parquet(datadir / 'heart_big_anom_test.parq')



In [None]:
# Re-point trainfile/testfile at the '_norm' parquet files; this rebinds the
# names that previously referred to the full train/test files.
trainfile = datadir / (config['arrhythmia'] + '_norm_train.parq')
testfile = datadir / (config['arrhythmia'] + '_norm_test.parq')
# Display both paths as the cell output.
trainfile, testfile

In [None]:
# Load both parquet files as HeartDataset1D objects (project-local src.datasets),
# passing "target" as the name of the target column.
traindataset = datasets.HeartDataset1D(trainfile, target="target")
testdataset = datasets.HeartDataset1D(testfile, target="target")
traindataset, testdataset

# moving to mps device crashes the jupyter kernel

In [None]:
# Wrap both datasets in batch streamers (batch size 32, BasePreprocessor).
trainstreamer = BaseDatastreamer(traindataset, preprocessor = BasePreprocessor(), batchsize=32)
teststreamer = BaseDatastreamer(testdataset, preprocessor = BasePreprocessor(), batchsize=32)
# Show the length reported by each streamer.
len(trainstreamer), len(teststreamer)

In [30]:
# shape = (16, 12)
# traindataset = datasets.HeartDataset2D(trainfile, target="target", shape=shape)
# testdataset = datasets.HeartDataset2D(testfile, target="target", shape=shape)
# traindataset, testdataset

In [31]:
# trainstreamer = BaseDatastreamer(traindataset, preprocessor = BasePreprocessor(), batchsize=32)
# teststreamer = BaseDatastreamer(testdataset, preprocessor = BasePreprocessor(), batchsize=32)
# len(trainstreamer), len(teststreamer)

In [None]:
# Draw a single batch from the training stream and show its shapes as a sanity check.
x, y = next(trainstreamer.stream())
x.shape, y.shape

In [33]:
# Metric objects from the project-local src.metrics module.
f1micro = metrics.F1Score(average='micro')
f1macro = metrics.F1Score(average='macro')
# NOTE(review): precision is built with 'micro' while recall is built with
# 'macro' — confirm this asymmetry is intended.
precision = metrics.Precision('micro')
recall = metrics.Recall('macro')
accuracy = metrics.Accuracy()

In [None]:
import mlflow
# Track runs in the SQLite database given by the URI, under the
# "Autoencoder model" experiment.
mlflow.set_tracking_uri("sqlite:///mads_exam.db")
mlflow.set_experiment("Autoencoder model")

In [35]:
from typing import Iterator
from pydantic import BaseModel
from mads_datasets.base import BaseDatastreamer
class VAEstreamer(BaseDatastreamer):
    """Datastreamer for autoencoder training.

    Instead of (input, label) pairs it yields (input, input) pairs, so the
    input doubles as the reconstruction target.
    """

    def stream(self) -> Iterator:
        """Yield ``(X, X)`` batches forever, with axes 1 and 2 of the stacked batch swapped."""
        while True:
            # Fewer items left than one full batch: start again from the beginning.
            needs_reset = self.index > (self.size - self.batchsize)
            if needs_reset:
                self.reset_index()
            samples = self.batchloop()
            # Keep only the inputs; the labels are discarded.
            inputs, _labels = zip(*samples)
            # Stack into one tensor and move axis 1 to position 2.
            stacked = torch.moveaxis(torch.stack(inputs), 1, 2)
            # The same tensor is both input and target.
            yield stacked, stacked

class VAESettings(BaseModel):
    """Default settings for the autoencoder experiments in this notebook."""

    data_dir: Path = Path("data")
    # h1 / h2: keys of the same name are read as hidden-layer widths by the
    # config-driven Encoder/Decoder defined later in this notebook.
    h1: int = 192
    h2: int = 100
    # NOTE(review): the autoencoder config used later in this notebook sets
    # insize to 192; confirm that a default of 2 is intended here.
    insize: int = 2
    latent: int = 2
    batchsize: int = 32
    epochs: int = 10
    # modeldir / modelname are joined into the model path when loading the model.
    modelname: Path = Path("vaemodel.pt")
    modeldir: Path = Path("models")
    imgpath: Path = Path("img")
    samplesize: int = 512

In [None]:
from mltrainer import ReportTypes, Trainer, TrainerSettings, vae
# Default autoencoder settings.
presets = VAESettings()
print(presets)
# Bind the streamer *objects* (not the generators returned by .stream()) to
# these names: later cells call both len(...) and .stream() on them, which a
# bare generator does not support. Both streamers use the configured batch size.
trainstreamer = VAEstreamer(traindataset, batchsize=presets.batchsize)
teststreamer = VAEstreamer(testdataset, batchsize=presets.batchsize)
# Pull one (input, target) batch to inspect the shapes.
X1, X2 = next(trainstreamer.stream())
X1.shape, X2.shape

In [None]:

# Scratch check of the Encoder/Decoder shipped in mltrainer's `vae` module.
# NOTE(review): this rebinds `config`, which earlier held the parsed TOML file.
config = presets.model_dump()
encoder = vae.Encoder(config)
print(encoder)
decoder = vae.Decoder(config)
print(decoder)
# `x` is the batch drawn from the first trainstreamer in an earlier cell.
latent = encoder(x)
# NOTE(review): view(32, 1, 16, 12) requires 32 * 192 elements, while
# presets.latent is 2 — confirm this reshape is what was intended.
latent = latent.view(32, 1, 16, 12)
print(latent.shape)
x = decoder(latent)
# NOTE(review): ReconstructionLoss is only defined in a later cell, so this
# line fails on a fresh Restart & Run All.
lossfn = ReconstructionLoss()
# NOTE(review): `y` comes from the earlier (x, y) batch, not from the input
# that was encoded — presumably the target should be the original input; verify.
loss = lossfn(x, y)

In [None]:
import torch
import torch.nn as nn

class ReconstructionLoss(nn.Module):
    """Sum-of-squared-errors reconstruction loss, averaged over the batch.

    For every sample the squared error is summed over all non-batch
    dimensions; the result is the mean of those per-sample sums. This works
    for any input rank >= 2 (e.g. ``(batch, features)``,
    ``(batch, channels, length)`` or ``(batch, channels, height, width)``).
    """

    def __init__(self):
        super().__init__()

    def forward(self, yhat, y):
        """Return the batch mean of the per-sample summed squared error.

        Args:
            yhat: reconstruction, first dimension is the batch.
            y: target with exactly the same shape as ``yhat``.

        Returns:
            A scalar tensor.

        Raises:
            AssertionError: if the shapes of ``yhat`` and ``y`` differ.
        """
        # Guard against silent broadcasting between mismatched shapes.
        assert yhat.shape == y.shape, f"Shape mismatch: yhat {yhat.shape}, y {y.shape}"

        sqe = (y - yhat) ** 2

        # Collapse everything except the batch dimension, then sum per sample.
        per_sample = sqe.flatten(start_dim=1).sum(dim=1)
        return per_sample.mean()


class Encoder(nn.Module):
    """Fully connected encoder: flatten -> Linear -> ReLU -> Linear.

    Args:
        insize: number of features per sample after flattening (default 192).
        hidden: width of the hidden layer (default 100).
        latent: size of the latent vector (default 2).
    """

    def __init__(self, insize=192, hidden=100, latent=2):
        super().__init__()
        # Collapse every non-batch dimension into one feature axis.
        self.flatten = nn.Flatten(start_dim=1, end_dim=-1)
        self.encode = nn.Sequential(
            nn.Linear(insize, hidden),
            nn.ReLU(),
            nn.Linear(hidden, latent),
        )

    def forward(self, x):
        """Map a batch ``(batch, ...)`` with ``insize`` features per sample to ``(batch, latent)``."""
        return self.encode(self.flatten(x))


class Decoder(nn.Module):
    """Fully connected decoder: Linear -> ReLU -> Linear -> ReLU -> Linear.

    Args:
        latent: size of the latent vector (default 2).
        hidden: width of the first hidden layer (default 100).
        outsize: number of reconstructed features per sample (default 192).
    """

    def __init__(self, latent=2, hidden=100, outsize=192):
        super().__init__()
        self.outsize = outsize
        self.decode = nn.Sequential(
            nn.Linear(latent, hidden),
            nn.ReLU(),
            nn.Linear(hidden, outsize),
            nn.ReLU(),
            nn.Linear(outsize, outsize),
        )

    def forward(self, x):
        """Map ``(batch, latent)`` to a reconstruction of shape ``(batch, 1, outsize)``."""
        x = self.decode(x)  # (batch, outsize)
        # Add a singleton axis at position 1.
        return x.view(-1, 1, self.outsize)


# Instantiate the encoder and decoder
encoder = Encoder()
decoder = Decoder()

# Pass through the encoder
latent = encoder(X1)  # one latent vector per sample

# Pass through the decoder
reconstructed = decoder(latent)  # the decoder reshapes its output to (-1, 1, 192)
print(reconstructed.shape)  # assumes a batch of 32, i.e. torch.Size([32, 1, 192]) — TODO confirm
lossfn = ReconstructionLoss()
print(X1.shape)
# NOTE(review): forward is declared as (yhat, y) but is called here as
# (X1, reconstructed); the squared error is symmetric, so the value is unaffected.
loss = lossfn(X1, reconstructed)
print(loss)

In [39]:
from typing import Dict
class Encoder(nn.Module):
    """Config-driven encoder: flatten, then three Linear layers with ReLU in between.

    The layer widths are read from ``config`` under the keys "insize", "h1",
    "h2" and "latent".
    """

    def __init__(self, config: Dict) -> None:
        super().__init__()
        widths = [config["insize"], config["h1"], config["h2"], config["latent"]]
        self.flatten = nn.Flatten()
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # No activation after the final (latent) projection.
        layers.pop()
        self.encode = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Flatten ``x`` and project it to the latent space."""
        return self.encode(self.flatten(x))


class Decoder(nn.Module):
    """Config-driven decoder mirroring the encoder.

    The layer widths are read from ``config`` under the keys "latent", "h2",
    "h1" and "insize". The per-sample output shape defaults to
    ``(1, insize)``; an optional ``config["outshape"]`` overrides it (for
    example ``(28, 28, 1)`` for 28x28 single-channel images).
    """

    def __init__(self, config: Dict) -> None:
        super().__init__()
        self.decode = nn.Sequential(
            nn.Linear(config["latent"], config["h2"]),
            nn.ReLU(),
            nn.Linear(config["h2"], config["h1"]),
            nn.ReLU(),
            nn.Linear(config["h1"], config["insize"]),
        )
        # The previous hard-coded (28, 28, 1) reshape only fits 784-feature
        # inputs and fails for other sizes such as insize=192.
        self.outshape = tuple(config.get("outshape", (1, config["insize"])))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Decode ``(batch, latent)`` into ``(batch, *outshape)``."""
        x = self.decode(x)
        return x.reshape((-1, *self.outshape))


class AutoEncoder(nn.Module):
    """Encoder followed by decoder, both built from the same ``config`` dict."""

    def __init__(self, config: Dict) -> None:
        super().__init__()
        self.encoder = Encoder(config)
        self.decoder = Decoder(config)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Encode ``x`` and return the decoder's output for the resulting latent."""
        return self.decoder(self.encoder(x))

In [40]:
#from src import models
# Autoencoder layer sizes, passed as a plain dict to AutoEncoder.
config = dict(insize=192, h1=100, h2=40, latent=2)
autoencoder = AutoEncoder(config)

In [None]:
from mltrainer import Trainer, TrainerSettings, ReportTypes
# An autoencoder is trained on a reconstruction objective; cross-entropy and
# the classification metrics do not apply to (X, X) batches.
loss_fn = ReconstructionLoss()

# Dedicated streamer objects for this run. They must be the objects, not the
# generators returned by .stream(): len() is needed for the step counts, and
# calling list() on an endless stream would never return.
ae_trainstreamer = VAEstreamer(traindataset, batchsize=presets.batchsize)
ae_validstreamer = VAEstreamer(testdataset, batchsize=presets.batchsize)

with mlflow.start_run():
    optimizer = torch.optim.Adam

    settings = TrainerSettings(
        epochs=2,
        # Track the reconstruction loss itself as the only metric.
        metrics=[loss_fn],
        logdir="logs/heartAE",
        # One fifth of the data per epoch: after 5 epochs the model has seen
        # the whole dataset once.
        train_steps=len(ae_trainstreamer) // 5,
        valid_steps=len(ae_validstreamer) // 5,
        reporttypes=[ReportTypes.TENSORBOARD, ReportTypes.MLFLOW],
        scheduler_kwargs=None,
        earlystop_kwargs=None
    )

    # modify the tags when you change them!
    mlflow.set_tag("model", "Autoencoder")
    mlflow.set_tag("dataset", "heart_big")
    mlflow.log_param("scheduler", "None")
    mlflow.log_param("earlystop", "None")

    mlflow.log_params(config)
    mlflow.log_param("epochs", settings.epochs)
    mlflow.log_param("optimizer", str(optimizer))
    mlflow.log_params(settings.optimizer_kwargs)

    trainer = Trainer(
        model=autoencoder,
        settings=settings,
        loss_fn=loss_fn,
        optimizer=optimizer,
        traindataloader=ae_trainstreamer.stream(),
        validdataloader=ae_validstreamer.stream(),
        scheduler=None,
        )
    trainer.loop()

    # use htop to watch the GPU
    # reduce the transformer dimension

In [None]:
# `modeldir` was never defined; build the path from the presets, the same way
# the loading cell does.
modelpath = presets.modeldir / presets.modelname
# torch.save does not create missing parent directories.
modelpath.parent.mkdir(parents=True, exist_ok=True)

torch.save(autoencoder, modelpath)

In [None]:
def predict(model, streamer, steps=None, device=None):
    """Run ``model`` over batches from ``streamer`` and collect per-sample L1 losses.

    Args:
        model: module mapping a batch tensor to a reconstruction of the same shape.
        streamer: either an object with a ``.stream()`` method that returns an
            iterator of ``(input, target)`` batches, or such an iterator /
            iterable itself.
        steps: number of batches to consume. Defaults to ``len(streamer)`` when
            the streamer has a length, otherwise to a single batch (streams
            may be endless).
        device: device to run on. Defaults to the device of the model's first
            parameter, or CPU when the model has no parameters.

    Returns:
        ``(predictions, losses)``: a list with one flattened numpy array per
        sample and a list with one summed absolute error (float) per sample.
    """
    from itertools import islice

    if device is None:
        first_param = next(iter(model.parameters()), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    # Accept both streamer objects and already-started streams.
    batches = streamer.stream() if hasattr(streamer, "stream") else iter(streamer)
    if steps is None:
        steps = len(streamer) if hasattr(streamer, "__len__") else 1

    predictions, losses = [], []
    model = model.eval()
    with torch.no_grad():
        for _, target in islice(batches, steps):
            seq_true = target.to(device)
            seq_pred = model(seq_true)

            # Summed absolute error per sample (L1 with reduction='sum', applied row-wise).
            per_sample = (seq_pred - seq_true).abs().flatten(start_dim=1).sum(dim=1)

            predictions.extend(p.cpu().numpy().flatten() for p in seq_pred)
            losses.extend(per_sample.cpu().tolist())
    return predictions, losses

In [None]:
modelpath = presets.modeldir / presets.modelname
# The checkpoint is a fully pickled nn.Module (written with torch.save(model, ...)),
# which recent torch versions refuse to load unless weights_only=False is passed.
# SECURITY: this unpickles arbitrary Python objects — only load files you created yourself.
model = torch.load(modelpath, weights_only=False)


In [None]:
# Reconstruction losses returned by predict; the predictions are not needed here.
_, losses = predict(model, trainstreamer)

# sns.distplot is deprecated; histplot with kde=True is its replacement.
sns.histplot(losses, bins=50, kde=True);

In [None]:
#from src import models
# GRU hyperparameters, passed as a plain dict to GRUmodel.
# NOTE(review): this rebinds both `config` and `model`.
config = {
    "input": 1,
    "hidden": 256,  # updated key
    "dropout": 0.1,
    "output": 5,
    "num_layers": 2,
}
# NOTE(review): GRUmodel is not defined or imported in any visible cell (the
# `src.models` import above is commented out) — confirm where it comes from.
model = GRUmodel(config)

- Model GRU:

Hyperparameters:
- 256 hidden units
- 2 layers (`num_layers` in the config above)

In [None]:
from mltrainer import Trainer, TrainerSettings, ReportTypes
# Classification objective for the GRU model.
# NOTE(review): this cell needs trainstreamer/teststreamer to be streamer
# objects (supporting len() and .stream()), not generators — verify what they
# are bound to at this point of the notebook.
loss_fn = torch.nn.CrossEntropyLoss()

with mlflow.start_run():
    optimizer = torch.optim.Adam

    settings = TrainerSettings(
        epochs=5,
        metrics=[accuracy, f1micro, f1macro, precision, recall],
        logdir="logs/heart2D",
        train_steps=len(trainstreamer) // 5, # with 5 epochs the model has seen the whole dataset once
        valid_steps=len(teststreamer) // 5,
        reporttypes=[ReportTypes.TENSORBOARD, ReportTypes.MLFLOW],
        scheduler_kwargs=None,
        earlystop_kwargs=None
    )

    # modify the tags when you change them!
    mlflow.set_tag("model", "GRU")
    mlflow.set_tag("dataset", "heart_big")
    mlflow.log_param("scheduler", "None")
    mlflow.log_param("earlystop", "None")

    mlflow.log_params(config)
    mlflow.log_param("epochs", settings.epochs)
    mlflow.log_param("optimizer", str(optimizer))
    mlflow.log_params(settings.optimizer_kwargs)

    trainer = Trainer(
        model=model,
        settings=settings,
        loss_fn=loss_fn,
        optimizer=optimizer,
        traindataloader=trainstreamer.stream(),
        validdataloader=teststreamer.stream(),
        scheduler=None,
        )
    trainer.loop()

    # use htop to watch the GPU
    # reduce the transformer dimension

In [None]:
# Collect predictions and targets over one pass of the test streamer.
y_true = []
y_pred = []

testdata = teststreamer.stream()
model.eval()  # switch off dropout so the predictions are deterministic
with torch.no_grad():  # inference only: no autograd graph needed
    for _ in range(len(teststreamer)):
        X, y = next(testdata)
        yhat = model(X)
        yhat = yhat.argmax(dim=1)  # index of the highest-scoring class
        y_pred.append(yhat.cpu().tolist())
        y_true.append(y.cpu().tolist())

# Flatten the per-batch lists into one flat list each.
yhat = [label for batch in y_pred for label in batch]
y = [label for batch in y_true for label in batch]

# Row-normalised confusion matrix (each row sums to 1); normalize="true"
# leaves rows without any true samples at 0 instead of producing NaN.
cfm = confusion_matrix(y, yhat, normalize="true")

plot = sns.heatmap(cfm, annot=cfm, fmt=".3f")
plot.set(xlabel="Predicted", ylabel="Target")

In [None]:
# NOTE(review): this cell repeats the confusion-matrix computation of an
# earlier cell; consider factoring it into a helper function.
# Collect predictions and targets over one pass of the test streamer.
y_true = []
y_pred = []

testdata = teststreamer.stream()
model.eval()  # switch off dropout so the predictions are deterministic
with torch.no_grad():  # inference only: no autograd graph needed
    for _ in range(len(teststreamer)):
        X, y = next(testdata)
        yhat = model(X)
        yhat = yhat.argmax(dim=1)  # index of the highest-scoring class
        y_pred.append(yhat.cpu().tolist())
        y_true.append(y.cpu().tolist())

# Flatten the per-batch lists into one flat list each.
yhat = [label for batch in y_pred for label in batch]
y = [label for batch in y_true for label in batch]

# Row-normalised confusion matrix (each row sums to 1); normalize="true"
# leaves rows without any true samples at 0 instead of producing NaN.
cfm = confusion_matrix(y, yhat, normalize="true")

plot = sns.heatmap(cfm, annot=cfm, fmt=".3f")
plot.set(xlabel="Predicted", ylabel="Target")

In [None]:
import seaborn as sns
import numpy as np

# NOTE(review): this cell repeats the confusion-matrix computation of an
# earlier cell; consider factoring it into a helper function.
# Collect predictions and targets over one pass of the test streamer.
y_true = []
y_pred = []

testdata = teststreamer.stream()
model.eval()  # switch off dropout so the predictions are deterministic
with torch.no_grad():  # inference only: no autograd graph needed
    for _ in range(len(teststreamer)):
        X, y = next(testdata)
        yhat = model(X)
        yhat = yhat.argmax(dim=1)  # index of the highest-scoring class
        y_pred.append(yhat.cpu().tolist())
        y_true.append(y.cpu().tolist())

# Flatten the per-batch lists into one flat list each.
yhat = [label for batch in y_pred for label in batch]
y = [label for batch in y_true for label in batch]

# Row-normalised confusion matrix (each row sums to 1); normalize="true"
# leaves rows without any true samples at 0 instead of producing NaN.
cfm = confusion_matrix(y, yhat, normalize="true")

plot = sns.heatmap(cfm, annot=cfm, fmt=".3f")
plot.set(xlabel="Predicted", ylabel="Target")

In [None]:
import seaborn as sns
import numpy as np

# NOTE(review): this cell repeats the confusion-matrix computation of an
# earlier cell; consider factoring it into a helper function.
# Collect predictions and targets over one pass of the test streamer.
y_true = []
y_pred = []

testdata = teststreamer.stream()
model.eval()  # switch off dropout so the predictions are deterministic
with torch.no_grad():  # inference only: no autograd graph needed
    for _ in range(len(teststreamer)):
        X, y = next(testdata)
        yhat = model(X)
        yhat = yhat.argmax(dim=1)  # index of the highest-scoring class
        y_pred.append(yhat.cpu().tolist())
        y_true.append(y.cpu().tolist())

# Flatten the per-batch lists into one flat list each.
yhat = [label for batch in y_pred for label in batch]
y = [label for batch in y_true for label in batch]

# Row-normalised confusion matrix (each row sums to 1); normalize="true"
# leaves rows without any true samples at 0 instead of producing NaN.
cfm = confusion_matrix(y, yhat, normalize="true")

plot = sns.heatmap(cfm, annot=cfm, fmt=".3f")
plot.set(xlabel="Predicted", ylabel="Target")