## Comparison of different datasets using custom model architecture

In [1]:
from torch.utils.data import DataLoader
import torch

In [2]:
from src.data.make_dataset import AudioDataset

In [3]:
from src.models.first_cnn_lstm import First_CNN_LSTM
from src.models.train import train_model
from src.models.evaluate import evaluate_model

In [4]:
# Select the compute device: use the GPU when PyTorch can see one, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print('Device:', device)

Device: cuda


#### Experiment 1: CQT dataset with frequency bins = 12

In [5]:
# Experiment 1 data: CQT features with 12 frequency bins.
data_dir = '../Data/processed/cqt/bins_12/'
train_dataset = AudioDataset(data_dir, 'train.h5')
test_dataset = AudioDataset(data_dir, 'test.h5')

# The test split is read in dataset order (no shuffling) for evaluation.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=512,
    shuffle=False,
    num_workers=16,
)


In [6]:
# Build a new network instance for this experiment, then move it to the selected device.
model = First_CNN_LSTM()
model = model.to(device)

In [7]:
# Train on the 12-bin CQT training set for 2 epochs. val_ratio=0.2 holds out
# a fifth of the data for validation; the best model state and the latest
# checkpoint are written under save_dir.
train_model(
    model=model,
    dataset=train_dataset,
    device=device,
    val_ratio=0.2,
    num_epochs=2,
    batch_size=512,
    learning_rate=0.001,
    save_dir='../Models/custom_note_transcription/cqt/bins_12/',
)

Epoch 1/2 - Training, Loss: 0.023: 100%|██████████| 23020/23020 [13:52<00:00, 27.66it/s]
Epoch 1/2 - Validation, Loss: 0.017: 100%|██████████| 5755/5755 [03:35<00:00, 26.70it/s]


Epoch [1/2] - Train Loss: 0.023, Validation Loss: 0.017
Saved best model state to ../Models/custom_note_transcription/cqt/bins_12/best_model_state.pth
Saved latest model checkpoint to ../Models/custom_note_transcription/cqt/bins_12/latest_model_checkpoint.pth


Epoch 2/2 - Training, Loss: 0.016: 100%|██████████| 23020/23020 [13:09<00:00, 29.16it/s]
Epoch 2/2 - Validation, Loss: 0.015: 100%|██████████| 5755/5755 [03:33<00:00, 26.89it/s]

Epoch [2/2] - Train Loss: 0.016, Validation Loss: 0.015
Saved best model state to ../Models/custom_note_transcription/cqt/bins_12/best_model_state.pth
Saved latest model checkpoint to ../Models/custom_note_transcription/cqt/bins_12/latest_model_checkpoint.pth
Finished Training





In [8]:
# Score the trained model on the test loader (passed through the val_loader
# parameter) and keep the returned metrics under an experiment-specific name.
cqt_bins_12_metrics = evaluate_model(
    model=model,
    val_loader=test_loader,
    device=device,
    return_metrics=True,
)

Test Accuracy: 0.9008861859252824
Test Recall: 0.5073715333056845
Test Precision: 0.7554029486851561
Test F1: 0.6035058273273617


#### Experiment 2: CQT dataset with frequency bins = 24

In [5]:
# Experiment 2 data: CQT features with 24 frequency bins.
data_dir = '../Data/processed/cqt/bins_24/'
train_dataset = AudioDataset(data_dir, 'train.h5')
test_dataset = AudioDataset(data_dir, 'test.h5')

# The test split is read in dataset order (no shuffling) for evaluation.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=512,
    shuffle=False,
    num_workers=16,
)

In [6]:
# Build a new network instance for this experiment, then move it to the selected device.
model = First_CNN_LSTM()
model = model.to(device)

In [7]:
# Train on the 24-bin CQT training set for 2 epochs. val_ratio=0.2 holds out
# a fifth of the data for validation; the best model state and the latest
# checkpoint are written under save_dir.
train_model(
    model=model,
    dataset=train_dataset,
    device=device,
    val_ratio=0.2,
    num_epochs=2,
    batch_size=512,
    learning_rate=0.001,
    save_dir='../Models/custom_note_transcription/cqt/bins_24/',
)

Epoch 1/2 - Training, Loss: 0.021: 100%|██████████| 23020/23020 [30:07<00:00, 12.73it/s]
Epoch 1/2 - Validation, Loss: 0.015: 100%|██████████| 5755/5755 [03:57<00:00, 24.24it/s]


Epoch [1/2] - Train Loss: 0.021, Validation Loss: 0.015
Saved best model state to ../Models/custom_note_transcription/cqt/bins_24/best_model_state.pth
Saved latest model checkpoint to ../Models/custom_note_transcription/cqt/bins_24/latest_model_checkpoint.pth


Epoch 2/2 - Training, Loss: 0.014: 100%|██████████| 23020/23020 [29:35<00:00, 12.96it/s] 
Epoch 2/2 - Validation, Loss: 0.012: 100%|██████████| 5755/5755 [04:03<00:00, 23.64it/s]


Epoch [2/2] - Train Loss: 0.014, Validation Loss: 0.012
Saved best model state to ../Models/custom_note_transcription/cqt/bins_24/best_model_state.pth
Saved latest model checkpoint to ../Models/custom_note_transcription/cqt/bins_24/latest_model_checkpoint.pth
Finished Training


In [8]:
# Score the trained model on the test loader (passed through the val_loader
# parameter) and keep the returned metrics under an experiment-specific name.
cqt_bins_24_metrics = evaluate_model(
    model=model,
    val_loader=test_loader,
    device=device,
    return_metrics=True,
)

Test Accuracy: 0.9035556038227628
Test Recall: 0.5417140487243636
Test Precision: 0.7513831136037811
Test F1: 0.6252571215278818


#### Experiment 3: CQT dataset with frequency bins = 36

In [5]:
# Experiment 3 data: CQT features with 36 frequency bins.
data_dir = '../Data/processed/cqt/bins_36/'
train_dataset = AudioDataset(data_dir, 'train.h5')
test_dataset = AudioDataset(data_dir, 'test.h5')

# The test split is read in dataset order (no shuffling) for evaluation.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=512,
    shuffle=False,
    num_workers=16,
)

In [6]:
# Build a new network instance for this experiment, then move it to the selected device.
model = First_CNN_LSTM()
model = model.to(device)

In [7]:
# Train on the 36-bin CQT training set for 2 epochs. val_ratio=0.2 holds out
# a fifth of the data for validation; the best model state and the latest
# checkpoint are written under save_dir.
train_model(
    model=model,
    dataset=train_dataset,
    device=device,
    val_ratio=0.2,
    num_epochs=2,
    batch_size=512,
    learning_rate=0.001,
    save_dir='../Models/custom_note_transcription/cqt/bins_36/',
)

Epoch 1/2 - Training, Loss: 0.019: 100%|██████████| 23020/23020 [20:24<00:00, 18.80it/s]
Epoch 1/2 - Validation, Loss: 0.014: 100%|██████████| 5755/5755 [04:35<00:00, 20.87it/s]


Epoch [1/2] - Train Loss: 0.019, Validation Loss: 0.014
Saved best model state to ../Models/custom_note_transcription/cqt/bins_36/best_model_state.pth
Saved latest model checkpoint to ../Models/custom_note_transcription/cqt/bins_36/latest_model_checkpoint.pth


Epoch 2/2 - Training, Loss: 0.012: 100%|██████████| 23020/23020 [20:18<00:00, 18.89it/s]
Epoch 2/2 - Validation, Loss: 0.011: 100%|██████████| 5755/5755 [04:30<00:00, 21.31it/s]


Epoch [2/2] - Train Loss: 0.012, Validation Loss: 0.011
Saved best model state to ../Models/custom_note_transcription/cqt/bins_36/best_model_state.pth
Saved latest model checkpoint to ../Models/custom_note_transcription/cqt/bins_36/latest_model_checkpoint.pth
Finished Training


In [8]:
# Score the trained model on the test loader (passed through the val_loader
# parameter) and keep the returned metrics under an experiment-specific name.
cqt_bins_36_metrics = evaluate_model(
    model=model,
    val_loader=test_loader,
    device=device,
    return_metrics=True,
)

Test Accuracy: 0.9018462206776716
Test Recall: 0.5425903380737689
Test Precision: 0.7473143231765956
Test F1: 0.6235831627910539


#### Experiment 4: STFT dataset with frame size = 1024

In [8]:
# Experiment 4 data: STFT features with a frame size of 1024.
data_dir = '../Data/processed/stft/frame_size_1024/'
train_dataset = AudioDataset(data_dir, 'train.h5')
test_dataset = AudioDataset(data_dir, 'test.h5')

# The test loader must not shuffle: evaluation gains nothing from a random
# order, and shuffle=False keeps the batch order reproducible and consistent
# with the test loaders of the other experiments in this notebook.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=512,
    shuffle=False,
    num_workers=16,
)

In [9]:
# Build a new network instance for this experiment, then move it to the selected device.
model = First_CNN_LSTM()
model = model.to(device)

In [10]:
# Train on the STFT (frame size 1024) training set for 2 epochs. val_ratio=0.2
# holds out a fifth of the data for validation; the best model state and the
# latest checkpoint are written under save_dir.
train_model(
    model=model,
    dataset=train_dataset,
    device=device,
    val_ratio=0.2,
    num_epochs=2,
    batch_size=512,
    learning_rate=0.001,
    save_dir='../Models/custom_note_transcription/stft/frame_size_1024/',
)

Epoch 1/2 - Training, Loss: 0.028: 100%|██████████| 46040/46040 [1:05:20<00:00, 11.74it/s]
Epoch 1/2 - Validation, Loss: 0.024: 100%|██████████| 11510/11510 [13:36<00:00, 14.09it/s]


Epoch [1/2] - Train Loss: 0.028, Validation Loss: 0.024
Saved best model state to ../Models/custom_note_transcription/stft/frame_size_1024/best_model_state.pth
Saved latest model checkpoint to ../Models/custom_note_transcription/stft/frame_size_1024/latest_model_checkpoint.pth


Epoch 2/2 - Training, Loss: 0.023: 100%|██████████| 46040/46040 [1:02:12<00:00, 12.33it/s]
Epoch 2/2 - Validation, Loss: 0.022: 100%|██████████| 11510/11510 [12:59<00:00, 14.76it/s]


Epoch [2/2] - Train Loss: 0.023, Validation Loss: 0.022
Saved best model state to ../Models/custom_note_transcription/stft/frame_size_1024/best_model_state.pth
Saved latest model checkpoint to ../Models/custom_note_transcription/stft/frame_size_1024/latest_model_checkpoint.pth
Finished Training


In [11]:
# Score the trained model on the test loader (passed through the val_loader
# parameter) and keep the returned metrics under an experiment-specific name.
stft_frame_size_1024_metrics = evaluate_model(
    model=model,
    val_loader=test_loader,
    device=device,
    return_metrics=True,
)

Test Accuracy: 0.8852300741754363
Test Recall: 0.31849398698713766
Test Precision: 0.6677639661274494
Test F1: 0.40786269477312037


#### Experiment 5: STFT dataset with frame size = 2048

In [5]:
# Experiment 5 data: STFT features with a frame size of 2048.
data_dir = '../Data/processed/stft/frame_size_2048/'
train_dataset = AudioDataset(data_dir, 'train.h5')
test_dataset = AudioDataset(data_dir, 'test.h5')

# The test split is read in dataset order (no shuffling) for evaluation.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=512,
    shuffle=False,
    num_workers=16,
)

In [6]:
# Build a new network instance for this experiment, then move it to the selected device.
model = First_CNN_LSTM()
model = model.to(device)

In [7]:
# Train on the STFT (frame size 2048) training set for 2 epochs. val_ratio=0.2
# holds out a fifth of the data for validation; the best model state and the
# latest checkpoint are written under save_dir.
train_model(
    model=model,
    dataset=train_dataset,
    device=device,
    val_ratio=0.2,
    num_epochs=2,
    batch_size=512,
    learning_rate=0.001,
    save_dir='../Models/custom_note_transcription/stft/frame_size_2048/',
)

Epoch 1/2 - Training, Loss: 0.023: 100%|██████████| 23020/23020 [20:13<00:00, 18.97it/s]
Epoch 1/2 - Validation, Loss: 0.017: 100%|██████████| 5755/5755 [04:04<00:00, 23.57it/s]


Epoch [1/2] - Train Loss: 0.023, Validation Loss: 0.017
Saved best model state to ../Models/custom_note_transcription/stft/frame_size_2048/best_model_state.pth
Saved latest model checkpoint to ../Models/custom_note_transcription/stft/frame_size_2048/latest_model_checkpoint.pth


Epoch 2/2 - Training, Loss: 0.017: 100%|██████████| 23020/23020 [20:17<00:00, 18.91it/s]
Epoch 2/2 - Validation, Loss: 0.016: 100%|██████████| 5755/5755 [03:57<00:00, 24.18it/s]

Epoch [2/2] - Train Loss: 0.017, Validation Loss: 0.016
Saved best model state to ../Models/custom_note_transcription/stft/frame_size_2048/best_model_state.pth
Saved latest model checkpoint to ../Models/custom_note_transcription/stft/frame_size_2048/latest_model_checkpoint.pth
Finished Training





In [8]:
# Score the trained model on the test loader (passed through the val_loader
# parameter) and keep the returned metrics under an experiment-specific name.
stft_frame_size_2048_metrics = evaluate_model(
    model=model,
    val_loader=test_loader,
    device=device,
    return_metrics=True,
)

Test Accuracy: 0.8962684622067767
Test Recall: 0.46930311584411416
Test Precision: 0.7335218756431129
Test F1: 0.5615569907803696


#### Results

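Best results are obtained with the CQT dataset using 24 frequency bins (test F1 ≈ 0.625, accuracy ≈ 0.904), closely followed by the CQT dataset with 36 bins (F1 ≈ 0.624, with the highest recall at ≈ 0.543). Both STFT datasets perform worse: F1 ≈ 0.562 for frame size 2048 and F1 ≈ 0.408 for frame size 1024.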