In [1]:
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

PREPROCESSED_ROOT = PROJECT_ROOT / "preprocessed_dataset"

print("PREPROCESSED_ROOT:", PREPROCESSED_ROOT)
print("PROJECT_ROOT:", PROJECT_ROOT)

PREPROCESSED_ROOT: E:\DL_audiotomidi\preprocessed_dataset
PROJECT_ROOT: E:\DL_audiotomidi


In [2]:
import math
import random
import numpy as np

import torch
from torch import nn
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR

from models.rnn_amt import CompactBiGRU
from scripts.dataset_helpers import create_dataloaders
from scripts.evaluate import evaluate, frame_level_f1

import wandb

import time

In [3]:
# SPEC_TYPE="mel" for Mel-Spectrogram
SPEC_TYPE = "cqt"
# N_FREQ_BINS=229 for Mel-Spectrogram
N_FREQ_BINS = 252
N_PITCHES = 88

BATCH_SIZE = 16
CHUNK_LEN = 512
NUM_WORKERS = 4

NUM_EPOCHS = 100

BASE_LR = 3e-4
WEIGHT_DECAY = 5e-5
# WARMUP for Learning Rate
WARMUP_RATIO = 0.1

SEED = 42
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

HOP_LENGTH = 512
SR = 22050
# Binarization threshold
THRESHOLD = 0.5

In [4]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
set_seed(SEED)

In [5]:
train_loader, val_loader = create_dataloaders(
    root_dir=str(PREPROCESSED_ROOT),
    spec_type=SPEC_TYPE,
    batch_size=BATCH_SIZE,
    chunk_len=CHUNK_LEN,
    num_workers=NUM_WORKERS,
)

print("train batches:", len(train_loader))
print("val batches:", len(val_loader))

Train batches: 61
Val batches: 9


In [6]:
num_training_steps = NUM_EPOCHS * len(train_loader)
print("training steps:", num_training_steps)
num_warmup_steps = int(WARMUP_RATIO * num_training_steps)
print("warmup steps:",num_warmup_steps)

6100
610


In [7]:
def lr_lambda(current_step: int):
    # warmup
    if current_step < num_warmup_steps:
        return float(current_step) / max(1, num_warmup_steps)

    # cosine decay
    progress = float(current_step - num_warmup_steps) / max(
        1, num_training_steps - num_warmup_steps
    )
    return 0.5 * (1.0 + math.cos(math.pi * progress))

In [8]:
model = CompactBiGRU(
    input_dim=N_FREQ_BINS,
    n_pitches=N_PITCHES,
    proj_dim=96,
    hidden_dim=160,
    num_blocks=2,
    dropout=0.2,
).to(DEVICE)

optimizer = AdamW(
    model.parameters(),
    lr=BASE_LR,
    weight_decay=WEIGHT_DECAY
)

scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda)

model

CompactBiGRU(
  (input_proj): Linear(in_features=252, out_features=96, bias=True)
  (blocks): Sequential(
    (0): ResidualBiGRUBlock(
      (gru): GRU(96, 160, batch_first=True, bidirectional=True)
      (dropout): Dropout(p=0.2, inplace=False)
      (proj): Linear(in_features=96, out_features=320, bias=True)
      (norm): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
    )
    (1): ResidualBiGRUBlock(
      (gru): GRU(320, 160, batch_first=True, bidirectional=True)
      (dropout): Dropout(p=0.2, inplace=False)
      (norm): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
    )
  )
  (head): Linear(in_features=320, out_features=88, bias=True)
)

In [9]:
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {trainable:,}")

Trainable parameters: 795,256


In [33]:
#loss function weighted with a positive weight value
pos_weight_value = 5.0

criterion = nn.BCEWithLogitsLoss(
    pos_weight=torch.full((N_PITCHES,), pos_weight_value, device=DEVICE)
)

In [25]:
def train_one_epoch(
    model: nn.Module,
    dataloader,
    criterion,
    optimizer,
    scheduler,
    device: torch.device,
    epoch: int,
    log_interval: int = 50
):
    
    model.train()
    running_loss = 0.0
    running_f1 = 0.0
    num_batches = 0

    for batch_idx, (spec, target) in enumerate(dataloader):

        spec = spec.to(device, non_blocking=True)
        target = target.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)

        logits = model(spec)

        # BCEWithLogitsLoss by all (frame, pitch)
        loss = criterion(
            logits.reshape(-1, logits.shape[-1]),
            target.reshape(-1, target.shape[-1])
        )

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        batch_f1 = frame_level_f1(logits.detach(), target.detach(), threshold=THRESHOLD)

        running_loss += loss.item()
        running_f1 += batch_f1
        num_batches += 1

    epoch_loss = running_loss / max(1, num_batches)
    epoch_f1 = running_f1 / max(1, num_batches)

    return epoch_loss, epoch_f1

In [26]:
def train(model,
          train_dataloader,
          val_dataloader,
          criterion,
          optimizer,
          scheduler,
          device,
          n_epochs,
          run=None,
          checkpoint_path=None,
          threshold=0.5,
          hop_length=512,
          sr=22050,
          log_interval=50,
):
    
    global_step = 0
    best_val_frame_f1 = 0.0

    total_start = time.time()
    
    for epoch in range(1, n_epochs + 1):

        epoch_start = time.time()

        # training phase
        train_loss, train_frame_f1 = train_one_epoch(
            model=model,
            dataloader=train_dataloader,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            device=device,
            epoch=epoch,
            log_interval=log_interval
        )
        
        train_time_sec = time.time() - epoch_start

        global_step = epoch * len(train_dataloader)

        # validation phase
        val_start = time.time()
        val_loss, val_frame_f1, _ = evaluate(
            model=model,
            dataloader=val_dataloader,
            criterion=criterion,
            device=device,
            threshold=threshold,
            hop_length=hop_length,
            sr=sr,
            onset_tolerance=0.05,
            compute_note_f1=False
        )

        val_time = time.time() - val_start
        epoch_time = time.time() - epoch_start

        print(
            f"[Epoch {epoch}/{n_epochs}] "
            f"train_loss={train_loss:.4f}, train_F1={train_frame_f1:.4f} | "
            f"val_loss={val_loss:.4f}, val_F1={val_frame_f1:.4f} | "
            f"train_time={train_time_sec:.1f}s"
        )

        current_lr = optimizer.param_groups[0]["lr"]

        #wandb logging
        if run is not None:
            run.log(
                {
                    "epoch": epoch,
                    "train/loss": train_loss,
                    "train/frame_F1": train_frame_f1,
                    "val/loss": val_loss,
                    "val/frame_F1": val_frame_f1,
                    "lr": current_lr,
                    "time/train_epoch_sec": epoch_time,
                },
                step=global_step
            )

        #saving the best checkpoint by val_frame metric
        if checkpoint_path is not None and val_frame_f1 > best_val_frame_f1:
            best_val_frame_f1 = val_frame_f1
            torch.save(
                {
                    "model_state_dict": model.state_dict()
                },
                checkpoint_path
            )
            print(f"*** New best frame-F1={best_val_frame_f1:.4f}, saved to {checkpoint_path} ***")

    total_time = time.time() - total_start
    print(f"\n=== TOTAL TRAINING TIME: {total_time/3600:.2f} hours ===")

    if run is not None:
        run.log({"time/total_hours": total_time / 3600.0}, step=global_step)

    return best_val_frame_f1

In [36]:
CHECKPOINT_DIR = PROJECT_ROOT / "checkpoints"
print(CHECKPOINT_DIR)

best_ckpt_path = CHECKPOINT_DIR / f"compactgru_{SPEC_TYPE}_best_cqt.pt"

E:\DL_audiotomidi\checkpoints


In [37]:
wandb_config = {
    "model": "BiGRU",
    "spec_type": SPEC_TYPE,
    "n_freq_bins": N_FREQ_BINS,
    "n_pitches": N_PITCHES,
    "batch_size": BATCH_SIZE,
    "chunk_len": CHUNK_LEN,
    "num_workers": NUM_WORKERS,
    "num_epochs": NUM_EPOCHS,
    "base_lr": BASE_LR,
    "weight_decay": WEIGHT_DECAY,
    "warmup_ratio": WARMUP_RATIO,
    "seed": SEED,
    "hop_length": HOP_LENGTH,
    "sr": SR,
    "threshold": THRESHOLD,
}

In [38]:
with wandb.init(
    project="DL_audiotomidi",
    name="CompactBiGRU_cqt",
    config=wandb_config
) as run:

    best_val_frame_f1 = train(
        model=model,
        train_dataloader=train_loader,
        val_dataloader=val_loader,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        device=DEVICE,
        n_epochs=NUM_EPOCHS,
        run=run,
        checkpoint_path=best_ckpt_path,
        threshold=THRESHOLD,
        hop_length=HOP_LENGTH,
        sr=SR,
        log_interval=50
    )

print("Best validation frame-F1:", best_val_frame_f1)


=== Epoch 1/100 ===
[Epoch 1/100] train_loss=0.8370, train_F1=0.0956 | val_loss=0.6614, val_F1=0.0768 | train_time=47.4s
*** New best frame-F1=0.0768, saved to E:\DL_audiotomidi\checkpoints\compactgru_cqt_best_cqt.pt ***

=== Epoch 2/100 ===
[Epoch 2/100] train_loss=0.6477, train_F1=0.0761 | val_loss=0.4958, val_F1=0.0087 | train_time=51.8s

=== Epoch 3/100 ===
[Epoch 3/100] train_loss=0.5928, train_F1=0.0490 | val_loss=0.4604, val_F1=0.0362 | train_time=42.6s

=== Epoch 4/100 ===
[Epoch 4/100] train_loss=0.5496, train_F1=0.1143 | val_loss=0.4218, val_F1=0.2062 | train_time=45.0s
*** New best frame-F1=0.2062, saved to E:\DL_audiotomidi\checkpoints\compactgru_cqt_best_cqt.pt ***

=== Epoch 5/100 ===
[Epoch 5/100] train_loss=0.5082, train_F1=0.3151 | val_loss=0.3697, val_F1=0.3864 | train_time=39.3s
*** New best frame-F1=0.3864, saved to E:\DL_audiotomidi\checkpoints\compactgru_cqt_best_cqt.pt ***

=== Epoch 6/100 ===
[Epoch 6/100] train_loss=0.4488, train_F1=0.4164 | val_loss=0.3229, v

0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇██
lr,▂▃▄██████▇▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▁▁▁▁▁▁▁
time/total_hours,▁
time/train_epoch_sec,▃▅▁▃█▄▃▇▄▂▄▂▄▄▃▃▂▅▂▄▄▄▄▃▃▂▂▄▃▆▆▅▂▃▂▄▃▄▃▃
train/frame_F1,▁▂▃▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇████████████████
train/loss,█▆▅▄▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val/frame_F1,▁▂▂▁▃▅▆▆▆▅▄▆▆▅▇▆▅▆▇▇▆▆▆▇▇▆█▇███▇████████
val/loss,█▆▅▅▄▃▂▂▂▂▂▁▁▂▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,100.0
lr,0.0
time/total_hours,1.57479
time/train_epoch_sec,45.21986
train/frame_F1,0.65999
train/loss,0.20542
val/frame_F1,0.6545
val/loss,0.13929


Best validation frame-F1: 0.6651157952048441


In [13]:
ckpt = torch.load(best_ckpt_path, map_location=DEVICE)
model.load_state_dict(ckpt["model_state_dict"])
model.to(DEVICE)
model.eval()

CompactBiGRU(
  (input_proj): Linear(in_features=252, out_features=96, bias=True)
  (blocks): Sequential(
    (0): ResidualBiGRUBlock(
      (gru): GRU(96, 160, batch_first=True, bidirectional=True)
      (dropout): Dropout(p=0.2, inplace=False)
      (proj): Linear(in_features=96, out_features=320, bias=True)
      (norm): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
    )
    (1): ResidualBiGRUBlock(
      (gru): GRU(320, 160, batch_first=True, bidirectional=True)
      (dropout): Dropout(p=0.2, inplace=False)
      (norm): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
    )
  )
  (head): Linear(in_features=320, out_features=88, bias=True)
)

In [14]:
from scripts.dataset_helpers import PianoRollDataset
from torch.utils.data import DataLoader

TEST_CHUNK_LEN = 512  # for CNN we don't need to chop the spectrogram into chunks, 
                      # but for RNN and CRNN we have to
TEST_BATCH_SIZE = 8

test_ds = PianoRollDataset(
    root_dir=str(PREPROCESSED_ROOT),
    split="test",
    spec_type=SPEC_TYPE,
    chunk_len=TEST_CHUNK_LEN,
    random_crop=False
)

test_loader = DataLoader(
    test_ds,
    batch_size=TEST_BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=True
)

In [15]:
from scripts.evaluate import compute_frame_micro_metrics

frame_metrics = compute_frame_micro_metrics(
    model,
    test_loader,
    DEVICE,
    threshold=0.6
)

print("=== Test frame-level metrics ===")
for k, v in frame_metrics.items():
    if k in ("tp", "fp", "fn"):
        continue
    print(f"{k}: {v * 100:.2f}%")

=== Test frame-level metrics ===
accuracy: 97.52%
precision: 65.92%
recall: 74.15%
frame_f1: 69.79%


In [24]:
from scripts.evaluate import compute_note_micro_metrics

note_metrics = compute_note_micro_metrics(
    model,
    test_loader,
    DEVICE,
    threshold=0.7,
    hop_length=HOP_LENGTH,
    sr=SR,
    onset_tolerance=0.05
)

print("=== Test note-level metrics ===")
for k, v in note_metrics.items():
    if k in ("tp", "fp", "fn"):
        continue
    print(f"{k}: {v * 100:.2f}%")

=== Test note-level metrics ===
accuracy: 32.60%
precision: 39.51%
recall: 65.11%
note_f1: 49.17%


In [None]:
from scripts.inference.py import get_piano_roll, predict, plot_comparison, measure_efficiency

# Test track with corresponding MIDI
AUDIO_PATH = "test_long.wav"
MIDI_PATH = "test_long.midi"




audio, _ = librosa.load(AUDIO_PATH, sr=SR, mono=True)

pred_probs, spec = predict(model, audio, DEVICE)

gt_roll = get_piano_roll(MIDI_PATH, SR, HOP_LENGTH)

plot_comparison(spec, pred_probs, gt_roll, SR, HOP_LENGTH)

In [None]:
stats = measure_efficiency(model, device='cpu', duration_sec=60)