In [1]:
import pandas as pd
import numpy as np
import os
import sys
import torch as torch

# The notebook is executed from notebooks/, so the repository root is the
# parent of the current working directory.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Expose the root on sys.path (once, even when the cell is re-run) so the
# project's `src` package can be imported below.
if not any(entry == PROJECT_ROOT for entry in sys.path):
    sys.path.append(PROJECT_ROOT)

# Visual confirmation of the resolved location.
print("[✓] Project at:", PROJECT_ROOT)

from src.models.model_1 import ECGNet
from src.models.hyperparamter_tunning import hyperparameter_search


[✓] Project at: c:\Users\toby_\Documents\TU_Berlin\Semestre 3\AMLS\AMLS_packed


In [2]:
# Project-local data loading, splitting and training helpers (the project root was added to sys.path in the first cell)




from src.data.load_data import load_train_data, EDGCDataset, load_test_data, EDGCTestDataset
from src.data.stratified_split import stratified_split_pad_torch, pad_test_torch
from src.models.model_trainer import Trainer
from torch.nn.utils.rnn import pad_sequence

### This part imports the dataset through a lossy compression technique. Afterwards, we re-evaluate the same model we had on this new data.

Most of the functions necessary for this compression are stored with the rest of the source code. 

In [3]:
from src.data.lossless_compression import write_compressed_file, read_compressed_file
from src.data.lossy_compression import read_custom_compressed, compress_and_save, CONFIG

def lossy_import(use_lossy=True, batch_size=32):
    """Build the train/validation loaders, optionally training on lossy-reconstructed signals.

    The raw training sequences come from ``load_train_data`` and are split and
    padded by ``stratified_split_pad_torch``. When ``use_lossy`` is true, the
    raw sequences are also passed through ``compress_and_save`` and read back
    with ``read_custom_compressed``; the training set is then built from those
    reconstructed sequences. The validation set always comes from the
    uncompressed split.

    Args:
        use_lossy: Train on the lossy round-tripped sequences (the behaviour
            that used to be hard-coded) instead of the uncompressed split.
        batch_size: Batch size of both loaders (previously fixed at 32).

    Returns:
        ``(train_loader, val_loader, signal_length)``, where ``signal_length``
        is ``X_train.shape[1]`` of the padded training split.
    """
    print("Loading data...")
    X_train, y_train = load_train_data()

    if use_lossy:
        # NOTE(review): compress_and_save is given no output path; the file is
        # assumed to be written to data/processed/compressed_data.bin — confirm
        # against CONFIG.
        compress_and_save(X_train, CONFIG)
        compressed_path = os.path.join(PROJECT_ROOT, "data", "processed", "compressed_data.bin")
        X_train_custom = read_custom_compressed(compressed_path)

        # NOTE(review): every loaded sequence is reconstructed here, including
        # the ones that land in the validation split below, so the lossy
        # training set overlaps with the validation set.
        X_tensor_list = [torch.tensor(x, dtype=torch.float32) for x in X_train_custom]
        X_train_pad = pad_sequence(X_tensor_list, batch_first=True)
        lengths_train_final = torch.tensor([len(x) for x in X_train_custom])
        # Labels must be read before y_train is rebound by the split below.
        y_train_tensor = torch.tensor(y_train.iloc[:, 0].values, dtype=torch.long)

    print("Splitting and padding...")

    X_train, X_val, lengths_train, lengths_val, y_train, y_val = (
        stratified_split_pad_torch(X_train, y_train)
    )

    if use_lossy:
        train_dataset = EDGCDataset(X_train_pad, lengths_train_final, y_train_tensor)
    else:
        train_dataset = EDGCDataset(X_train, lengths_train, y_train)
    val_dataset = EDGCDataset(X_val, lengths_val, y_val)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True
    )
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)

    # Show the shapes of one batch as a quick sanity check, then stop iterating.
    for X_batch, lengths_batch, y_batch in train_loader:
        print(
            f"Batch shapes → X: {X_batch.shape}, lengths: {lengths_batch.shape}, y: {y_batch.shape}"
        )
        break

    print("Data preparation completed.")

    return train_loader, val_loader, X_train.shape[1]

In [5]:
# Build the loaders for the lossy experiment; the third value is the padded
# width (X_train.shape[1]) reported by lossy_import.
train_loader, val_loader, signal_size = lossy_import()


Loading data...
[✓] Loaded X_train with 6179 sequences
[✓] Loaded y_train with shape (6179, 1)
Splitting and padding...
Batch shapes → X: torch.Size([32, 18286]), lengths: torch.Size([32]), y: torch.Size([32])
Data preparation completed.


## The next steps are needed to evaluate the model: the model takes the number of columns of the padded X_train matrix as a parameter, so the data is loaded and split here. Note that the compressed data was already generated previously.

In [4]:
from src.models.model_1 import ECGNet
from src.models.hyperparamter_tunning import hyperparameter_search
# Test set: load the raw sequences, pad them, and wrap them in a loader that
# the prediction loops below iterate over.
X_test, lengths_test = pad_test_torch(load_test_data())
test_dataset = EDGCTestDataset(X_test, lengths_test)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32)

# Training data, split and padded again at notebook level so that
# X_train.shape[1] is available when the model is constructed.
X_train, y_train = load_train_data()
(
    X_train,
    X_val,
    lengths_train,
    lengths_val,
    y_train,
    y_val,
) = stratified_split_pad_torch(X_train, y_train)

[✓] Loaded X_test with 2649 sequences
[✓] Loaded X_train with 6179 sequences
[✓] Loaded y_train with shape (6179, 1)


## Retraining and evaluating model with lossy compressed data. 

In [7]:
# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# ECGNet hyperparameters gathered in one place; signal_length is the padded
# width of the X_train tensor produced by the notebook-level split.
ecg_config = dict(
    num_classes=4,
    n_fft=512,
    hop_length=256,
    conv1_padding=1,
    conv2_padding=1,
    conv1_kernel=3,
    conv2_kernel=3,
    lstm_num_layers=1,
    conv1_channels=32,
    conv2_channels=32,
    lst_hidden_size=128,
    dropout=0.1,
    signal_length=X_train.shape[1],
)
model = ECGNet(**ecg_config, device=device).to(device)

# Loss and optimiser for the classifier.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

trainer = Trainer(model, optimizer, criterion, augment_data=True, device=device)

# Train, then score both splits with the fitted model.
history = trainer.fit(train_loader, val_loader, epochs=50)
train_loss, train_f1 = trainer.evaluate(train_loader)
val_loss, val_f1 = trainer.evaluate(val_loader)

# Per-class report on the validation split.
class_names = ["class_0", "class_1", "class_2", "class_3"]
cm, report = trainer.detailed_metrics(val_loader, class_names=class_names)
print(report)

# Predict the unlabeled test set with gradients disabled.
model.eval()  # evaluation mode
all_preds = []
with torch.no_grad():
    for X_batch, lengths_batch in test_loader:
        outputs = model(X_batch.to(device), lengths_batch.to(device))
        # Index of the largest output per row is the predicted class.
        all_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())

# One predicted label per test sequence, written without the index column.
df = pd.DataFrame({"predicted_label": all_preds})
df.to_csv("reduced.csv", index=False)


Epoch 1/50 - Train Loss: 1.0416 - Train F1: 0.2069 - Val Loss: 0.9790 - Val F1: 0.1856
Epoch 2/50 - Train Loss: 0.9987 - Train F1: 0.1865 - Val Loss: 0.9890 - Val F1: 0.1856
Epoch 3/50 - Train Loss: 1.0053 - Train F1: 0.1858 - Val Loss: 1.0004 - Val F1: 0.1856
Epoch 4/50 - Train Loss: 1.0077 - Train F1: 0.1859 - Val Loss: 1.0009 - Val F1: 0.1856
Epoch 5/50 - Train Loss: 1.0056 - Train F1: 0.1865 - Val Loss: 0.9960 - Val F1: 0.1856
Epoch 6/50 - Train Loss: 1.0045 - Train F1: 0.1853 - Val Loss: 0.9988 - Val F1: 0.1856

Early stopping triggered at epoch 6


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

     class_0     0.5902    1.0000    0.7423       363
     class_1     0.0000    0.0000    0.0000        54
     class_2     0.0000    0.0000    0.0000       176
     class_3     0.0000    0.0000    0.0000        22

    accuracy                         0.5902       615
   macro avg     0.1476    0.2500    0.1856       615
weighted avg     0.3484    0.5902    0.4382       615



## These results do not justify pursuing this approach further, even though the data was compressed, because the F1 score dropped from .75 to .18 with the same model architecture.

## Now lossless compression will be implemented and the same model retrained and evaluated.

In [7]:
def lossless_import(use_compressed=False, batch_size=32):
    """Build the train/validation loaders, optionally from a lossless-compressed round trip.

    The raw training sequences come from ``load_train_data`` and are split and
    padded by ``stratified_split_pad_torch``. When ``use_compressed`` is true,
    the raw sequences are written with ``write_compressed_file``, read back
    with ``read_compressed_file`` from that same path, and the training set is
    built from the sequences that were read back. The validation set always
    comes from the uncompressed split.

    Args:
        use_compressed: Train on the sequences read back from the compressed
            file. Defaults to ``False``, the value that used to be hard-coded,
            so by default the uncompressed training split is used.
        batch_size: Batch size of both loaders (previously fixed at 32).

    Returns:
        ``(train_loader, val_loader, signal_length)``, where ``signal_length``
        is ``X_train.shape[1]`` of the padded training split.
    """
    print("Loading data...")
    X_train, y_train = load_train_data()

    if use_compressed:
        compressed_path = os.path.join(PROJECT_ROOT, "data", "processed", "compressed_data.bin")
        write_compressed_file(compressed_path, X_train)

        # Read back the file that was just written. The previous code read the
        # bare relative name "compressed_data.bin", which resolves against the
        # working directory rather than data/processed/.
        X_train_compressed = read_compressed_file(compressed_path)

        # NOTE(review): every loaded sequence is used here, including the ones
        # that land in the validation split below, so this training set
        # overlaps with the validation set.
        X_tensor_list = [torch.tensor(x, dtype=torch.float32) for x in X_train_compressed]
        X_train_pad = pad_sequence(X_tensor_list, batch_first=True)
        lengths_train_final = torch.tensor([len(x) for x in X_train_compressed])
        # Labels must be read before y_train is rebound by the split below.
        y_train_tensor = torch.tensor(y_train.iloc[:, 0].values, dtype=torch.long)

    print("Splitting and padding...")

    X_train, X_val, lengths_train, lengths_val, y_train, y_val = (
        stratified_split_pad_torch(X_train, y_train)
    )

    if use_compressed:
        train_dataset = EDGCDataset(X_train_pad, lengths_train_final, y_train_tensor)
    else:
        train_dataset = EDGCDataset(X_train, lengths_train, y_train)
    val_dataset = EDGCDataset(X_val, lengths_val, y_val)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True
    )
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)

    # Show the shapes of one batch as a quick sanity check, then stop iterating.
    for X_batch, lengths_batch, y_batch in train_loader:
        print(
            f"Batch shapes → X: {X_batch.shape}, lengths: {lengths_batch.shape}, y: {y_batch.shape}"
        )
        break

    print("Data preparation completed.")

    return train_loader, val_loader, X_train.shape[1]

# Rebuild the loaders through lossless_import; the third value is the padded
# width (X_train.shape[1]) it reports.
train_loader, val_loader, signal_size = lossless_import()


Loading data...
[✓] Loaded X_train with 6179 sequences
[✓] Loaded y_train with shape (6179, 1)
Splitting and padding...
Batch shapes → X: torch.Size([32, 18286]), lengths: torch.Size([32]), y: torch.Size([32, 1])
Data preparation completed.


In [None]:
# Use CUDA when available; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Same ECGNet configuration as the previous run, grouped by component.
model = ECGNet(
    # classifier head
    num_classes=4,
    dropout=0.1,
    # STFT front end
    n_fft=512,
    hop_length=256,
    signal_length=X_train.shape[1],
    # convolutional stack
    conv1_channels=32,
    conv1_kernel=3,
    conv1_padding=1,
    conv2_channels=32,
    conv2_kernel=3,
    conv2_padding=1,
    # recurrent layer
    lstm_num_layers=1,
    lst_hidden_size=128,
    device=device,
)
model = model.to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
trainer = Trainer(model, optimizer, criterion, augment_data=True, device=device)

# Fit, then evaluate on the training and validation loaders.
history = trainer.fit(train_loader, val_loader, epochs=50)
train_loss, train_f1 = trainer.evaluate(train_loader)
val_loss, val_f1 = trainer.evaluate(val_loader)

# Confusion matrix and per-class report for the validation loader.
cm, report = trainer.detailed_metrics(
    val_loader,
    class_names=["class_0", "class_1", "class_2", "class_3"],
)
print(report)

# Inference on the test loader; no gradients are needed.
model.eval()  # evaluation mode
all_preds = []
with torch.no_grad():
    for X_batch, lengths_batch in test_loader:
        X_batch, lengths_batch = X_batch.to(device), lengths_batch.to(device)
        outputs = model(X_batch, lengths_batch)
        # The predicted class is the position of the largest output in each row.
        preds = outputs.argmax(dim=1)
        all_preds.extend(preds.cpu().numpy())

# Save the predictions as a single-column CSV.
df = pd.DataFrame({"predicted_label": all_preds})
df.to_csv("reduced.csv", index=False)


Epoch 1/50 - Train Loss: 0.9704 - Train F1: 0.2264 - Val Loss: 0.9144 - Val F1: 0.2458
Epoch 2/50 - Train Loss: 0.8822 - Train F1: 0.2819 - Val Loss: 0.8260 - Val F1: 0.3067
Epoch 3/50 - Train Loss: 0.8013 - Train F1: 0.3451 - Val Loss: 0.7681 - Val F1: 0.4003
Epoch 4/50 - Train Loss: 0.7516 - Train F1: 0.4584 - Val Loss: 0.7370 - Val F1: 0.4514
Epoch 5/50 - Train Loss: 0.7097 - Train F1: 0.5024 - Val Loss: 0.6628 - Val F1: 0.5381
Epoch 6/50 - Train Loss: 0.6634 - Train F1: 0.5848 - Val Loss: 0.6593 - Val F1: 0.5166
Epoch 7/50 - Train Loss: 0.6330 - Train F1: 0.6234 - Val Loss: 0.6426 - Val F1: 0.5377
Epoch 8/50 - Train Loss: 0.6113 - Train F1: 0.6418 - Val Loss: 0.5795 - Val F1: 0.6326
Epoch 9/50 - Train Loss: 0.5816 - Train F1: 0.6545 - Val Loss: 0.5632 - Val F1: 0.6827
Epoch 10/50 - Train Loss: 0.5780 - Train F1: 0.6680 - Val Loss: 0.5808 - Val F1: 0.6562
Epoch 11/50 - Train Loss: 0.5610 - Train F1: 0.6800 - Val Loss: 0.5593 - Val F1: 0.6680
Epoch 12/50 - Train Loss: 0.5412 - Train 

## The same model we used with the augmented data, which reached an F1 score of .75, now gives .69, but with a dataset half the size.

In [None]:
# Reload the raw training sequences and store them with the lossless writer
# under data/processed/ in the project tree.
X_train, y_train = load_train_data()
compressed_path = os.path.join(
    os.path.join(PROJECT_ROOT, "data", "processed"),
    "lossless_data.bin",
)
write_compressed_file(compressed_path, X_train)

[✓] Loaded X_train with 6179 sequences
[✓] Loaded y_train with shape (6179, 1)


## Let's use augmented data to improve the model training, now with the TCN classifier.
##### This is the loop for choosing the best combination of model parameters with augmented data. The function `hyperparameter_search` has the parameter `augmented_data`, which initializes a different data pipeline for loading and processing the data. When this parameter is False, it only takes the raw matrix X_train. When `augmented_data = True`, the pipeline applies time stretch, time shift, added noise, amplitude scaling and random crop.

In [14]:
from src.models.model_2 import TCN_STFT_Classifier


# Search space for the TCN classifier. The original note asked for
# configurations where len(hidden_channels) == num_levels, but every
# hidden_channels candidate has four entries while num_levels also tries 3.
param_grid = {
    "hidden_channels": [[64, 128, 128, 128], [128, 128, 128, 128]],
    "dropout": [0.1, 0.2, 0.3],
    "kernel_size": [3, 5],
    "num_levels": [3, 4],
}

# Settings shared by every candidate. kernel_size is deliberately not listed
# here: it is searched in param_grid, and defining it in both dictionaries
# made it ambiguous which value hyperparameter_search would apply.
fixed = {
    "num_classes": 4,
    "n_fft": 256,
    "hop_length": 128,
    "learning_rate": 0.001,
}

# Short runs (5 epochs per configuration) on the loaders built earlier in the
# notebook, using the augmented-data pipeline.
results = hyperparameter_search(
    TCN_STFT_Classifier,
    param_grid,
    fixed,
    device=device,
    epochs=5,
    train_loader=train_loader,
    val_loader=val_loader,
    augmented_data=True,
)


🔧 Training with config: {'num_classes': 4, 'n_fft': 256, 'hop_length': 128, 'kernel_size': 3, 'learning_rate': 0.001, 'hidden_channels': [64, 128, 128, 128], 'dropout': 0.1, 'num_levels': 3}
Epoch 1/5 | Train Loss: 0.9341 | Train F1: 0.2667 | Val Loss: 0.8246 | Val F1: 0.3874
Epoch 2/5 | Train Loss: 0.7934 | Train F1: 0.3967 | Val Loss: 0.7065 | Val F1: 0.4543
Epoch 3/5 | Train Loss: 0.7096 | Train F1: 0.4864 | Val Loss: 0.6723 | Val F1: 0.5496
Epoch 4/5 | Train Loss: 0.6553 | Train F1: 0.5339 | Val Loss: 0.6016 | Val F1: 0.6278
Epoch 5/5 | Train Loss: 0.6234 | Train F1: 0.5856 | Val Loss: 0.6080 | Val F1: 0.6162

🔧 Training with config: {'num_classes': 4, 'n_fft': 256, 'hop_length': 128, 'kernel_size': 3, 'learning_rate': 0.001, 'hidden_channels': [64, 128, 128, 128], 'dropout': 0.1, 'num_levels': 4}
Epoch 1/5 | Train Loss: 0.9239 | Train F1: 0.2720 | Val Loss: 0.8040 | Val F1: 0.3220
Epoch 2/5 | Train Loss: 0.7996 | Train F1: 0.4021 | Val Loss: 0.7239 | Val F1: 0.4130
Epoch 3/5 | Tr

## We take the best parameter selection found for the TCN classifier trained with data augmentation

In [7]:
from src.models.model_2 import TCN_STFT_Classifier
from src.models.model_trainer import Trainer
# Pick the compute device: CUDA when torch reports it, CPU otherwise.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# TCN classifier built with the parameter combination selected above.
tcn_config = {
    "num_classes": 4,
    "hop_length": 128,
    "n_fft": 256,
    "kernel_size": 5,
    "hidden_channels": [128, 128, 128, 128],
    "dropout": 0.1,
    "num_levels": 3,
}
model = TCN_STFT_Classifier(**tcn_config, device=device).to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
trainer = Trainer(model, optimizer, criterion, device=device, augment_data=True)

# Train, then score the training and validation loaders.
history = trainer.fit(train_loader, val_loader, epochs=50)
train_loss, train_f1 = trainer.evaluate(train_loader)
val_loss, val_f1 = trainer.evaluate(val_loader)

# Per-class report on the validation loader.
class_names = ["class_0", "class_1", "class_2", "class_3"]
cm, report = trainer.detailed_metrics(val_loader, class_names=class_names)
print(report)

# Predict the test loader batch by batch with gradients switched off.
model.eval()  # evaluation mode
all_preds = []
with torch.no_grad():
    for X_batch, lengths_batch in test_loader:
        outputs = model(X_batch.to(device), lengths_batch.to(device))
        # The predicted class is the index of the largest output per row.
        batch_preds = torch.argmax(outputs, dim=1).cpu().numpy()
        all_preds.extend(batch_preds)

# Store the predictions of the augmented-data model.
df = pd.DataFrame({"predicted_label": all_preds})
df.to_csv("augment.csv", index=False)

  WeightNorm.apply(module, name, dim)


Epoch 1/50 - Train Loss: 0.9650 - Train F1: 0.2736 - Val Loss: 0.8357 - Val F1: 0.4054
Epoch 2/50 - Train Loss: 0.8224 - Train F1: 0.3944 - Val Loss: 0.7376 - Val F1: 0.4935
Epoch 3/50 - Train Loss: 0.7503 - Train F1: 0.4549 - Val Loss: 0.6875 - Val F1: 0.4449
Epoch 4/50 - Train Loss: 0.6898 - Train F1: 0.5139 - Val Loss: 0.6970 - Val F1: 0.5146
Epoch 5/50 - Train Loss: 0.6438 - Train F1: 0.5835 - Val Loss: 0.6304 - Val F1: 0.5737
Epoch 6/50 - Train Loss: 0.6245 - Train F1: 0.5828 - Val Loss: 0.6476 - Val F1: 0.4823
Epoch 7/50 - Train Loss: 0.5934 - Train F1: 0.6272 - Val Loss: 0.5826 - Val F1: 0.6639
Epoch 8/50 - Train Loss: 0.5788 - Train F1: 0.6337 - Val Loss: 0.6112 - Val F1: 0.6324
Epoch 9/50 - Train Loss: 0.5585 - Train F1: 0.6468 - Val Loss: 0.5903 - Val F1: 0.6286
Epoch 10/50 - Train Loss: 0.5346 - Train F1: 0.6806 - Val Loss: 0.5894 - Val F1: 0.6740
Epoch 11/50 - Train Loss: 0.5312 - Train F1: 0.6820 - Val Loss: 0.5735 - Val F1: 0.6734
Epoch 12/50 - Train Loss: 0.5180 - Train 