In [1]:
%pip install tqdm
%pip install torchinfo
%pip install torchcrepe
from torchinfo import summary

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import joblib
from pickle import dump
import torchcrepe
import scipy
import random
from utils import *
import soundfile as sf
import librosa
#from cloudpickle import dump
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Some helper functions to get the output length of a conv1d layer.
from functools import reduce
def get_conv1d_len_func(model):
    return lambda i: torch.floor((i + 2 * model.padding[0] - model.dilation[0] * (model.kernel_size[0] - 1) - 1) / model.stride[0] + 1)
        
def get_transpose_conv1d_len_func(model):
    return lambda i: (i - 1) * model.stride[0] - 2 * model.padding[0] + model.dilation[0] * (model.kernel_size[0] - 1) + model.output_padding[0] + 1

def get_unfold_len(length, window_ks, window_stride):
    return (length - window_ks) // window_stride + 1

# A simple conv + batch norm + relu model. 
# Note that the padding is set to 0.
class Block(nn.Module):
    def __init__(self, num_ins, num_outs, kernel_size=3,stride=1, dilation=1):
        super().__init__()
        
        # Note that the padding is 0 here. 
        self.conv = nn.Conv1d(num_ins, num_outs, kernel_size, padding=0, stride=stride,dilation=dilation)
        self.bn = nn.BatchNorm1d(num_outs) 

    def forward(self, x):
        x = F.relu(self.bn(self.conv(x)))
        return x
    
    # This helps you judge the output length of the model given some input length. 
    def get_output_lengths(self, length):
        return self.len_funcs(length)

# A more complex CNN blosk with residual path. aka ResBlock.
# You can chain several Resblock or Block together. 
class ResBlock(nn.Module):
    '''
    Gaddy and Klein, 2021, https://arxiv.org/pdf/2106.01933.pdf 
    Original code:
        https://github.com/dgaddy/silent_speech/blob/master/transformer.py
    '''
    def __init__(self, num_ins, num_outs, kernel_size = 3, padding = 1, stride=1):
        super().__init__()

        self.conv1 = nn.Conv1d(num_ins, num_outs, kernel_size, padding=padding, stride=stride)
        self.bn1 = nn.BatchNorm1d(num_outs)
        self.conv2 = nn.Conv1d(num_outs, num_outs, kernel_size, padding=padding)
        self.bn2 = nn.BatchNorm1d(num_outs)

        # This helps whenever in_channels != out_channels or stride != 1.
        # E.g. the first input Resblock layer, or you want to downsample the input.
        if stride != 1 or num_ins != num_outs:
            # With kernel size of 1, this is essentially a linear layer but with stride. 
            self.residual_path = nn.Conv1d(num_ins, num_outs, 1, stride=stride)
            self.res_norm = nn.BatchNorm1d(num_outs)
        else:
            self.residual_path = None
        
        # This helps you judge the output length of the model given some input length.
        # len_funcs is a list of functions that takes in a length and returns the output length.
        len_funcs = []
        len_funcs.append(get_conv1d_len_func(self.conv1))
        len_funcs.append(get_conv1d_len_func(self.conv2))
        self.len_funcs = len_funcs
        
        
        # Helps you run a sanity check that if the residual path is activated, 
        # it should be configured such that the output length is the same as the output length of the main path.
        if self.residual_path is not None:
            residual_len_funcs = get_conv1d_len_func(self.residual_path)
            for data_length in torch.arange(10, 100, 1):
                main_len = reduce(lambda x, func: func(x), self.len_funcs, data_length).int()
                res_len = residual_len_funcs(data_length)
                assert main_len == res_len, f"Residual path length {res_len} is not the same as the main path length {main_len}. Please check the configuration or reach out to me."
    
    def get_output_lengths(self, length):
        return reduce(lambda x, func: func(x), self.len_funcs, length).int()
    
    def forward(self, x):
        input_value = x

        x = F.relu(self.bn1(self.conv1(x)))
        x = self.bn2(self.conv2(x))

        if self.residual_path is not None:
            res = self.res_norm(self.residual_path(input_value))
        else:
            res = input_value

        return F.relu(x + res)
    
# An example of CNN that chains several ResBlocks together.
# Generally, this is more powerful than the simple Block model, but is also more prune to overfitting.
# Original conv blocks: no dropout, 2 hidden-hidden blocks.
class Original(nn.Module):
    def __init__(self, in_channels, hidden_dim):
        super().__init__()

    
        self.conv_blocks = nn.Sequential(
            ResBlock(in_channels, hidden_dim, kernel_size = 3, stride = 1),
            ResBlock(hidden_dim, hidden_dim, kernel_size = 3, stride = 1),
            ResBlock(hidden_dim, hidden_dim, kernel_size=3, stride=1),
        )

        def get_model_len_func(model):
            len_funcs = []
            for i in model.conv_blocks:
                for j in range(2):
                    len_funcs.append(get_conv1d_len_func(eval("i.conv" + str(j + 1))))
            return len_funcs

        self.len_funcs = get_model_len_func(self)

    def get_output_lengths(self, length):
        return reduce(lambda x, func: func(x), self.len_funcs, length)
    def forward(self, x):
        """
        Args:
            x: shape (batchsize, num_in_feats, seq_len).
        
        Return:
            out: shape (batchsize, num_out_feats, seq_len).
        """
        return self.conv_blocks(x)
    

In [4]:
def mkdir(dir):
    if not os.path.exists(dir):
        os.mkdir(dir)

def save_pkl(obj, save_path):
    """Save a Pyleecan object in a pkl file using cloudpickle

    Parameters
    ----------
    obj: Pyleecan object
        object to save
    save_path: str
        file path
    """

    with open(save_path, "wb") as save_file:
        dump(obj, save_file)

In [215]:
# TODO: copy this block for MIR 1k dataset and dataloader. 
# Create a pytorch dataset for MIR 1k.


CENTS_PER_BIN = 20  # cents
MAX_FMAX = 2006.  # hz
PITCH_BINS = 360
SAMPLE_RATE = 16000  # hz
WINDOW_SIZE = 1024

def frequency_to_cents(frequency):
    """Convert frequency in Hz to cents"""
    return 1200 * torch.log2(frequency / 10.)

# We can use this function to quantize the pitch to the nearest bin.
# but 1997 actually corresponds to 31.7hz instead of 32.7 as noted in the paper. 
def cents_to_bins(cents, quantize_fn=torch.floor):
    """Converts cents to pitch bins"""
    bins = (cents - 1997.3794084376191) / CENTS_PER_BIN
    return quantize_fn(bins).int()

def frequency_to_bins(frequency, quantize_fn=torch.floor):
    """Convert frequency in Hz to pitch bins"""
    mask = torch.isclose(frequency, torch.tensor([0.0], dtype = torch.float32))
    result = cents_to_bins(frequency_to_cents(frequency), quantize_fn)
    
    # Pitch label: 0 - 359 
    # 360: unvoiced.
    # Total of 361 bins.
    result[mask] = 360
    return result 


class MIR1kDataset(Dataset):
    def __init__(self, mel_dir, pitch_dir, fids):
        self.mel_dir = mel_dir
        self.pitch_dir = pitch_dir
        
        # File ids.
        fids = read(fids).strip().split('\n')
        
        # Check if all features files exist. 
        self.fids = fids
        for i in fids:
            assert os.path.exists(os.path.join(mel_dir, f"{i}.npy"))
            assert os.path.exists(os.path.join(pitch_dir, f"{i}.npy"))
    def __len__(self):
        return len(self.fids)

    def __getitem__(self, idx):
        fid = self.fids[idx]
        mel = np.load(os.path.join(self.mel_dir, f"{fid}.npy"))
        pitch = np.load(os.path.join(self.pitch_dir, f"{fid}.npy"))
        return mel, pitch

# Create a collate fn that randomly crop the waveform and pitch.
# min_len: 1 sec of mel. max_len: 3sec of mel.
def collate_fn(batch, min_len = 50, max_len = 150):
    mel = [i[0] for i in batch]
    pitch = [i[1] for i in batch]
    
    # Randomly crop the mel and pitch.
    crop_len = random.randint(min_len, max_len)
    min_mel_len = min([i.shape[1] for i in mel])
    
    random_start = random.randint(0, min_mel_len - crop_len)
    
    # (B, C, crop_len)
    mel = torch.stack([torch.tensor(i[:, random_start:random_start + crop_len]) for i in mel]).float()
    
    # (B, crop_len)
    pitch = torch.stack([torch.tensor(i[random_start:random_start + crop_len]) for i in pitch]).float()
    orig_pitch = pitch.clone()
    
    # Convert pitch to cents.
    pitch = frequency_to_bins(pitch)
    return mel, pitch, orig_pitch

# Train dataset and dataloader.
train_dataset = MIR1kDataset(mel_dir='/data/pitch_estimation/dataset/MIR-1K/mel_spec_both', 
                             pitch_dir='/data/pitch_estimation/dataset/MIR-1K/pitch_50', 
                             fids='/data/pitch_estimation/dataset/MIR-1K/train.txt')
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)



# Val dataset and dataloader.
val_dataset = MIR1kDataset(mel_dir='/data/pitch_estimation/dataset/MIR-1K/mel_spec_both',
                           pitch_dir='/data/pitch_estimation/dataset/MIR-1K/pitch_50',
                           fids='/data/pitch_estimation/dataset/MIR-1K/val.txt')
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)


# Test dataset and dataloader.
test_dataset = MIR1kDataset(mel_dir='/data/pitch_estimation/dataset/MIR-1K/mel_spec_both',
                            pitch_dir='/data/pitch_estimation/dataset/MIR-1K/pitch_50',
                            fids='/data/pitch_estimation/dataset/MIR-1K/test.txt')
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)


In [245]:
import torch
import torch.nn as nn
import torch
import torch.nn as nn
import torch.nn.functional as F

class CREPE_ResNet_Model(nn.Module):
    def __init__(self, in_channels, hidden_dim, num_bins=361, dropout_prob=0.5):
        super(CREPE_ResNet_Model, self).__init__()

        self.conv_blocks = nn.Sequential(
            ResBlock(in_channels, 64, kernel_size=5, stride=1, padding=2),
            ResBlock(64, 128, kernel_size=5, stride=1, padding=2),
            ResBlock(128, 256, kernel_size=5, stride=1, padding=2),
            ResBlock(256, 512, kernel_size=5, stride=1, padding=2),
            ResBlock(512, hidden_dim, kernel_size=5, stride=1, padding=2),
        )

        self.mlp = nn.Sequential(
            nn.Linear(hidden_dim, 512),
            nn.ReLU(),
            nn.Dropout(dropout_prob),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(dropout_prob),
            nn.Linear(256, num_bins)
        )

    def forward(self, x):
        x = self.conv_blocks(x)  # Pass through the ResBlocks
        x = x.permute(0, 2, 1)  # Permute to (batch_size, sequence_length, hidden_dim)
        
        batch_size, seq_length, hidden_dim = x.size()
        x = x.reshape(-1, hidden_dim)  # Flatten to (batch_size * sequence_length, hidden_dim)
        x = self.mlp(x)  # Pass through MLP
        
        x = x.reshape(batch_size, seq_length, -1)  # Reshape back to (batch_size, seq_length, num_bins)
        x = x.permute(0, 2, 1)  # Permute back to (batch_size, num_bins, sequence_length)
        return x  # Final output shape: (batch_size, num_bins, sequence_length)


class CNN(nn.Module):
    def __init__(self, in_channels = 256, hidden_dim = 512, num_bins=361, dropout_prob=0.25):
        super(CNN, self).__init__()

        self.conv_blocks = nn.Sequential(
            ResBlock(in_channels, hidden_dim),
            ResBlock(hidden_dim, 512),
            ResBlock(512, 256),
            ResBlock(256, 128),
            ResBlock(128, 64),
            ResBlock(64, num_bins),
        )

    def forward(self, x):
        x = self.conv_blocks(x)  # Pass through the ResBlocks
        x = x.permute(0, 2, 1)  # Permute to (batch_size, sequence_length, hidden_dim)
        
        batch_size, seq_length, hidden_dim = x.size()
        x = x.reshape(-1, hidden_dim)  # Flatten to (batch_size * sequence_length, hidden_dim)
        #x = self.mlp(x)  # Pass through MLP
        
        x = x.reshape(batch_size, seq_length, -1)  # Reshape back to (batch_size, seq_length, num_bins)
        x = x.permute(0, 2, 1)  # Permute back to (batch_size, num_bins, sequence_length)
        return x   # Final output shape should be (batch_size, num_bins, sequence_length)


In [258]:
model = CREPE_ResNet_Model(in_channels=256, hidden_dim = 1024)
print(summary(model))
optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-5)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

Layer (type:depth-idx)                   Param #
CREPE_ResNet_Model                       --
├─Sequential: 1-1                        --
│    └─ResBlock: 2-1                     --
│    │    └─Conv1d: 3-1                  81,984
│    │    └─BatchNorm1d: 3-2             128
│    │    └─Conv1d: 3-3                  20,544
│    │    └─BatchNorm1d: 3-4             128
│    │    └─Conv1d: 3-5                  16,448
│    │    └─BatchNorm1d: 3-6             128
│    └─ResBlock: 2-2                     --
│    │    └─Conv1d: 3-7                  41,088
│    │    └─BatchNorm1d: 3-8             256
│    │    └─Conv1d: 3-9                  82,048
│    │    └─BatchNorm1d: 3-10            256
│    │    └─Conv1d: 3-11                 8,320
│    │    └─BatchNorm1d: 3-12            256
│    └─ResBlock: 2-3                     --
│    │    └─Conv1d: 3-13                 164,096
│    │    └─BatchNorm1d: 3-14            512
│    │    └─Conv1d: 3-15                 327,936
│    │    └─BatchNorm1d: 3-16  

In [259]:
def train_model(model, train_loader, val_loader, num_epochs, num_eval_epoch, patience,
                criterion=None, optimizer=None, scheduler=None, save_dir="", gpu_number=6):
    mkdir(save_dir)
    
    if criterion is None:
        criterion = nn.CrossEntropyLoss()
    
    device = torch.device(f'cuda:{gpu_number}' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    model.to(device)
    
    if optimizer is None:
        optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    train_loss = []
    val_loss = []
    val_acc = []
    
    best_val_loss = float('inf')
    patience_counter = 0
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        
        for _, (inputs, labels, _)  in tqdm(enumerate(train_loader), total=len(train_loader)):
            #print(f"inputs:{inputs}")
            #print(f"labels: {labels}")
            inputs = torch.tensor(inputs, dtype = torch.float)
            inputs = inputs.to(device)
            #labels = torch.tensor(labels)
            labels = labels.to(device).long()
            
            optimizer.zero_grad()
            
            outputs = model(inputs)
            
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
        
        train_loss_epoch = running_loss / len(train_loader)
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {train_loss_epoch}')
        train_loss.append(train_loss_epoch)
        
        if scheduler is not None:
            scheduler.step()
        
        if (epoch + 1) % num_eval_epoch == 0:
            result = evaluate_model(model, val_loader, criterion, device)
            print(f'Epoch: {epoch} Validation Loss: {result["val_loss"]}, Validation Accuracy: {result["val_acc"]}')
            val_loss.append(result["val_loss"])
            val_acc.append(result["val_acc"])
            
            if result["val_loss"] < best_val_loss:
                best_val_loss = result["val_loss"]
                torch.save({'model_ckpt': model.state_dict(),
                            "optimizer": optimizer.state_dict(),
                            "epoch": epoch,
                            "best_val_loss": best_val_loss,
                            }, os.path.join(save_dir, 'best_val_ckpt.pth'))
                print(f"Best model saved at epoch {epoch}, val loss: {best_val_loss}")
            else:
                patience_counter += 1
                print(f"No improvement in validation loss for {patience_counter} consecutive evaluations.")
            
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break
    stats = {'train_loss': train_loss, 'val_loss': val_loss, 'val_acc': val_acc}
    save_pkl(stats, os.path.join(save_dir, 'stats.pkl'))



def evaluate_model(model, dataloader, criterion, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        val_loss = 0.0
        for _, (inputs, labels, _) in tqdm(enumerate(dataloader), total=len(dataloader)):
            inputs = torch.tensor(inputs, dtype = torch.float)
            inputs = inputs.to(device)
            #labels = torch.tensor(labels)
            labels = labels.to(device).long()
            outputs = model(inputs)
            
            val_loss += criterion(outputs, labels).item()
            
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    accuracy =  correct / total
    val_loss /= len(dataloader)
    return {'val_loss': val_loss, 'val_acc': accuracy}


def test_model(model, dataloader, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for _, (inputs, labels, _) in tqdm(enumerate(dataloader), total=len(dataloader)):
            inputs = torch.tensor(inputs, dtype = torch.float)
            inputs = inputs.to(device)
            #labels = torch.tensor(labels)
            labels = labels.to(device).long()
            outputs = model(inputs)
            
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    accuracy = correct / total
    return accuracy


In [260]:
train_model(model, train_dataloader, val_dataloader, num_epochs=150, num_eval_epoch=10, patience = 5,optimizer=optimizer, scheduler=scheduler, save_dir="/home/blastaistudent/proj-pitch/")

Using device: cuda:6


  inputs = torch.tensor(inputs, dtype = torch.float)
 15%|█▍        | 8/54 [00:00<00:01, 33.02it/s]

100%|██████████| 54/54 [00:01<00:00, 34.18it/s]


Epoch 1/150, Loss: 4.225996211723045


100%|██████████| 54/54 [00:01<00:00, 33.66it/s]


Epoch 2/150, Loss: 2.6089278724458485


100%|██████████| 54/54 [00:01<00:00, 35.03it/s]


Epoch 3/150, Loss: 2.155066384209527


100%|██████████| 54/54 [00:01<00:00, 34.21it/s]


Epoch 4/150, Loss: 1.975778720996998


100%|██████████| 54/54 [00:01<00:00, 34.86it/s]


Epoch 5/150, Loss: 1.8287992080052693


100%|██████████| 54/54 [00:01<00:00, 34.42it/s]


Epoch 6/150, Loss: 1.7050888096844707


100%|██████████| 54/54 [00:01<00:00, 35.00it/s]


Epoch 7/150, Loss: 1.6773735108198944


100%|██████████| 54/54 [00:01<00:00, 33.91it/s]


Epoch 8/150, Loss: 1.6689927379290264


100%|██████████| 54/54 [00:01<00:00, 34.09it/s]


Epoch 9/150, Loss: 1.647288775002515


100%|██████████| 54/54 [00:01<00:00, 34.65it/s]


Epoch 10/150, Loss: 1.6435350025141682


  inputs = torch.tensor(inputs, dtype = torch.float)
100%|██████████| 4/4 [00:00<00:00, 64.34it/s]


Epoch: 9 Validation Loss: 1.4117673635482788, Validation Accuracy: 63.2
Best model saved at epoch 9, val loss: 1.4117673635482788


100%|██████████| 54/54 [00:01<00:00, 35.13it/s]


Epoch 11/150, Loss: 1.617627101915854


100%|██████████| 54/54 [00:01<00:00, 34.59it/s]


Epoch 12/150, Loss: 1.6104532502315663


100%|██████████| 54/54 [00:01<00:00, 35.35it/s]


Epoch 13/150, Loss: 1.6389148434003193


100%|██████████| 54/54 [00:01<00:00, 34.14it/s]


Epoch 14/150, Loss: 1.6056412701253537


100%|██████████| 54/54 [00:01<00:00, 34.58it/s]


Epoch 15/150, Loss: 1.6217947513968856


100%|██████████| 54/54 [00:01<00:00, 34.17it/s]


Epoch 16/150, Loss: 1.6058288017908733


100%|██████████| 54/54 [00:01<00:00, 34.62it/s]


Epoch 17/150, Loss: 1.6148053738805983


100%|██████████| 54/54 [00:01<00:00, 33.26it/s]


Epoch 18/150, Loss: 1.6064375793492351


100%|██████████| 54/54 [00:01<00:00, 34.00it/s]


Epoch 19/150, Loss: 1.6037602821985881


100%|██████████| 54/54 [00:01<00:00, 35.16it/s]


Epoch 20/150, Loss: 1.6333535291530468


100%|██████████| 4/4 [00:00<00:00, 86.52it/s]


Epoch: 19 Validation Loss: 1.5873027443885803, Validation Accuracy: 46.5
No improvement in validation loss for 1 consecutive evaluations.


100%|██████████| 54/54 [00:01<00:00, 33.51it/s]


Epoch 21/150, Loss: 1.6005880015867728


100%|██████████| 54/54 [00:01<00:00, 33.62it/s]


Epoch 22/150, Loss: 1.5955284591074343


100%|██████████| 54/54 [00:01<00:00, 34.56it/s]


Epoch 23/150, Loss: 1.6077108140344973


100%|██████████| 54/54 [00:01<00:00, 32.94it/s]


Epoch 24/150, Loss: 1.6422210600641038


100%|██████████| 54/54 [00:01<00:00, 34.00it/s]


Epoch 25/150, Loss: 1.5799416414013616


100%|██████████| 54/54 [00:01<00:00, 33.65it/s]


Epoch 26/150, Loss: 1.6114070327193648


100%|██████████| 54/54 [00:01<00:00, 32.77it/s]


Epoch 27/150, Loss: 1.6047050069879603


100%|██████████| 54/54 [00:01<00:00, 29.91it/s]


Epoch 28/150, Loss: 1.579202980906875


100%|██████████| 54/54 [00:01<00:00, 32.00it/s]


Epoch 29/150, Loss: 1.620387092784599


100%|██████████| 54/54 [00:01<00:00, 30.25it/s]


Epoch 30/150, Loss: 1.5961303490179557


100%|██████████| 4/4 [00:00<00:00, 60.18it/s]


Epoch: 29 Validation Loss: 1.3205126225948334, Validation Accuracy: 69.98
Best model saved at epoch 29, val loss: 1.3205126225948334


100%|██████████| 54/54 [00:01<00:00, 27.57it/s]


Epoch 31/150, Loss: 1.6078079055856775


100%|██████████| 54/54 [00:01<00:00, 32.98it/s]


Epoch 32/150, Loss: 1.6214459030716508


100%|██████████| 54/54 [00:01<00:00, 31.34it/s]


Epoch 33/150, Loss: 1.6128426922692194


100%|██████████| 54/54 [00:01<00:00, 31.56it/s]


Epoch 34/150, Loss: 1.6107340852419536


100%|██████████| 54/54 [00:01<00:00, 31.79it/s]


Epoch 35/150, Loss: 1.6235127912627325


100%|██████████| 54/54 [00:01<00:00, 32.26it/s]


Epoch 36/150, Loss: 1.6159026975984927


100%|██████████| 54/54 [00:01<00:00, 30.79it/s]


Epoch 37/150, Loss: 1.6409323745303683


100%|██████████| 54/54 [00:01<00:00, 31.27it/s]


Epoch 38/150, Loss: 1.6324332930423595


100%|██████████| 54/54 [00:01<00:00, 30.85it/s]


Epoch 39/150, Loss: 1.6109015433876603


100%|██████████| 54/54 [00:01<00:00, 31.77it/s]


Epoch 40/150, Loss: 1.6469728416866727


100%|██████████| 4/4 [00:00<00:00, 62.38it/s]


Epoch: 39 Validation Loss: 1.3477917611598969, Validation Accuracy: 67.44
No improvement in validation loss for 2 consecutive evaluations.


100%|██████████| 54/54 [00:01<00:00, 31.39it/s]


Epoch 41/150, Loss: 1.625124829786795


100%|██████████| 54/54 [00:01<00:00, 31.52it/s]


Epoch 42/150, Loss: 1.63656304942237


100%|██████████| 54/54 [00:01<00:00, 31.27it/s]


Epoch 43/150, Loss: 1.6168052355448406


100%|██████████| 54/54 [00:01<00:00, 30.08it/s]


Epoch 44/150, Loss: 1.596388041973114


100%|██████████| 54/54 [00:01<00:00, 30.90it/s]


Epoch 45/150, Loss: 1.6345467766125996


100%|██████████| 54/54 [00:01<00:00, 30.78it/s]


Epoch 46/150, Loss: 1.6105043093363445


100%|██████████| 54/54 [00:01<00:00, 31.00it/s]


Epoch 47/150, Loss: 1.5949187411202326


100%|██████████| 54/54 [00:01<00:00, 31.20it/s]


Epoch 48/150, Loss: 1.6060739623175726


100%|██████████| 54/54 [00:01<00:00, 30.30it/s]


Epoch 49/150, Loss: 1.5937906746511106


100%|██████████| 54/54 [00:01<00:00, 30.28it/s]


Epoch 50/150, Loss: 1.6149051608862701


100%|██████████| 4/4 [00:00<00:00, 67.95it/s]


Epoch: 49 Validation Loss: 1.3187527060508728, Validation Accuracy: 49.74
Best model saved at epoch 49, val loss: 1.3187527060508728


100%|██████████| 54/54 [00:01<00:00, 30.70it/s]


Epoch 51/150, Loss: 1.6210478632538408


100%|██████████| 54/54 [00:01<00:00, 30.06it/s]


Epoch 52/150, Loss: 1.6129687936217696


100%|██████████| 54/54 [00:01<00:00, 28.80it/s]


Epoch 53/150, Loss: 1.5887055330806308


100%|██████████| 54/54 [00:01<00:00, 29.82it/s]


Epoch 54/150, Loss: 1.6108477490919608


100%|██████████| 54/54 [00:01<00:00, 28.77it/s]


Epoch 55/150, Loss: 1.6206931626355205


100%|██████████| 54/54 [00:01<00:00, 29.91it/s]


Epoch 56/150, Loss: 1.636325423364286


100%|██████████| 54/54 [00:01<00:00, 28.56it/s]


Epoch 57/150, Loss: 1.5809076141428064


100%|██████████| 54/54 [00:01<00:00, 28.69it/s]


Epoch 58/150, Loss: 1.6284739220583881


100%|██████████| 54/54 [00:01<00:00, 30.26it/s]


Epoch 59/150, Loss: 1.6287124752998352


100%|██████████| 54/54 [00:01<00:00, 29.12it/s]


Epoch 60/150, Loss: 1.609823015001085


100%|██████████| 4/4 [00:00<00:00, 60.48it/s]


Epoch: 59 Validation Loss: 1.4062286615371704, Validation Accuracy: 67.16
No improvement in validation loss for 3 consecutive evaluations.


100%|██████████| 54/54 [00:01<00:00, 29.60it/s]


Epoch 61/150, Loss: 1.6302253228646737


100%|██████████| 54/54 [00:01<00:00, 29.17it/s]


Epoch 62/150, Loss: 1.590943455696106


100%|██████████| 54/54 [00:01<00:00, 28.90it/s]


Epoch 63/150, Loss: 1.6209275170608803


100%|██████████| 54/54 [00:01<00:00, 29.11it/s]


Epoch 64/150, Loss: 1.6110777656237285


100%|██████████| 54/54 [00:01<00:00, 29.42it/s]


Epoch 65/150, Loss: 1.6127399184085704


100%|██████████| 54/54 [00:01<00:00, 28.88it/s]


Epoch 66/150, Loss: 1.6000090175204806


100%|██████████| 54/54 [00:01<00:00, 29.17it/s]


Epoch 67/150, Loss: 1.628499726454417


100%|██████████| 54/54 [00:01<00:00, 29.86it/s]


Epoch 68/150, Loss: 1.6112713791705944


100%|██████████| 54/54 [00:01<00:00, 31.11it/s]


Epoch 69/150, Loss: 1.5949715971946716


100%|██████████| 54/54 [00:01<00:00, 32.30it/s]


Epoch 70/150, Loss: 1.6350932805626481


100%|██████████| 4/4 [00:00<00:00, 56.25it/s]


Epoch: 69 Validation Loss: 1.4743273556232452, Validation Accuracy: 63.98
No improvement in validation loss for 4 consecutive evaluations.


100%|██████████| 54/54 [00:01<00:00, 31.16it/s]


Epoch 71/150, Loss: 1.582997041719931


100%|██████████| 54/54 [00:01<00:00, 31.18it/s]


Epoch 72/150, Loss: 1.622135058597282


100%|██████████| 54/54 [00:01<00:00, 32.27it/s]


Epoch 73/150, Loss: 1.6316734508231834


100%|██████████| 54/54 [00:01<00:00, 31.41it/s]


Epoch 74/150, Loss: 1.6050028249069497


100%|██████████| 54/54 [00:02<00:00, 26.90it/s]


Epoch 75/150, Loss: 1.6178984001830772


100%|██████████| 54/54 [00:01<00:00, 31.06it/s]


Epoch 76/150, Loss: 1.5787855210127655


100%|██████████| 54/54 [00:01<00:00, 29.53it/s]


Epoch 77/150, Loss: 1.6051572804097776


100%|██████████| 54/54 [00:01<00:00, 29.63it/s]


Epoch 78/150, Loss: 1.6754335871449224


100%|██████████| 54/54 [00:01<00:00, 30.27it/s]


Epoch 79/150, Loss: 1.5827732174484819


100%|██████████| 54/54 [00:01<00:00, 31.84it/s]


Epoch 80/150, Loss: 1.638515869776408


100%|██████████| 4/4 [00:00<00:00, 69.95it/s]

Epoch: 79 Validation Loss: 1.3728472590446472, Validation Accuracy: 86.44
No improvement in validation loss for 5 consecutive evaluations.
Early stopping triggered.





In [267]:
class CNN_LSTM_Classifier(nn.Module):
    def __init__(self, in_channels=8, hidden_dim=1024, num_classes=361, num_lstm_layers=2):
        super(CNN_LSTM_Classifier, self).__init__()

        self.cnn = CNN(in_channels=in_channels, hidden_dim=hidden_dim, num_bins=num_classes)
        self.lstm = nn.LSTM(num_classes, hidden_dim, num_layers=num_lstm_layers, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(hidden_dim * 2, num_classes)  # *2 for bidirectional LSTM output

    def forward(self, x):
        # (B, C, T)
        cnn_out = self.cnn(x)  # Output shape: (B, 1024, T)

        # Permute to (B, T, 1024) for LSTM
        lstm_input = cnn_out.permute(0, 2, 1)  # (B, T, 1024)

        # LSTM output
        lstm_out, _ = self.lstm(lstm_input)  # Shape: (B, T, hidden_dim * 2)

        # Take the last time step's output for classification
        lstm_out_last = lstm_out[:, -1, :]  # Shape: (B, hidden_dim * 2)

        # Linear layer to get class scores
        out = self.linear(lstm_out_last)  # Shape: (B, num_classes)
        return out


In [268]:
model = CNN_LSTM_Classifier(in_channels=256, hidden_dim=1024, num_classes=361, num_lstm_layers=2)
print(summary(model))

Layer (type:depth-idx)                   Param #
CNN_LSTM_Classifier                      --
├─CNN: 1-1                               --
│    └─Sequential: 2-1                   --
│    │    └─ResBlock: 3-1                4,203,520
│    │    └─ResBlock: 3-2                2,888,192
│    │    └─ResBlock: 3-3                723,200
│    │    └─ResBlock: 3-4                181,376
│    │    └─ResBlock: 3-5                45,632
│    │    └─ResBlock: 3-6                486,628
├─LSTM: 1-2                              36,544,512
├─Linear: 1-3                            739,689
Total params: 45,812,749
Trainable params: 45,812,749
Non-trainable params: 0


In [269]:
train_model(model, train_dataloader, val_dataloader, num_epochs=150, num_eval_epoch=10, patience = 5,optimizer=optimizer, scheduler=scheduler, save_dir="/home/blastaistudent/proj-pitch/")

Using device: cuda:6


  inputs = torch.tensor(inputs, dtype = torch.float)
  0%|          | 0/54 [00:00<?, ?it/s]


RuntimeError: 0D or 1D target tensor expected, multi-target not supported