In [1]:
import torch
import torch.nn as nn
import logging
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import sys
sys.path.append('/kaggle/input/pitch-estimation-files')
sys.path.append('/opt/conda/lib/python3.10/site-packages')
from conv_blocks import *
from pytorch_layers import *
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
import yaml
import glob
import os
import torch.optim as optim

2024-07-29 10:58:09.712197: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-29 10:58:09.712329: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-29 10:58:09.840374: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
configs = {
    "model_params": {
        "in_channels": 8,
        "elayers": 6,
        "hidden_dim": 768,
        "dropout": 0.2,
        "use_ar": False,
        "ar_input": 512,
        "ar_hidden": 256,
        "ar_output": 128,
        "use_tanh": False,
        "dim_feedforward": 3072,
        "use_spk_emb": False,
        "spk_emb_size": 32,
        "spk_emb_hidden": 32,
        "num_ph": None,
        "ph_emb_size": 8,
        "layer_type": "default",
        "use_emb": False,
        "num_emb": 512,
        "emb_dim": 1024,
        "emb_p": None,
        "relative_positional_distance": 100,
        "conv_block_type": "Original",
        "conv_block_params": {}
    },
    "optimizer_params": {
        "lr": 0.001,
        "weight_decay": 0.0001
    },
    "scheduler_params": {
        "step_size": 10,
        "gamma": 0.1
    },
    "logging_params": {
        "log_dir": "./logs"
    },
    "training_params": {
        "epochs": 20
    }
}

# Save the configuration dictionary to a YAML file
yaml_file_path = "/kaggle/working/configs.yaml"
with open(yaml_file_path, 'w') as file:
    yaml.dump(configs, file)

In [3]:

# CNN + transformer.
# No heads.
class Base_Transformer(nn.Module):
    def __init__(self, in_channels=8, elayers=6, hidden_dim=768, dropout=0.2,
                 use_ar=False, ar_input=512, ar_hidden=256, ar_output=128, use_tanh=False, dim_feedforward=3072,
                 use_spk_emb=False, spk_emb_size=32, spk_emb_hidden=32,
                 num_ph=None, ph_emb_size=8, layer_type='default',
                 use_emb=False, num_emb=512, emb_dim=1024, emb_p=None,
                 relative_positional_distance=100,
                 conv_block_type="Original", conv_block_params={}):
        super().__init__()

        # Store some params.
        self.hidden_dim = hidden_dim

        # Conv block initialization.
        self.conv_blocks = eval(conv_block_type)
        self.conv_blocks = self.conv_blocks(in_channels, **conv_block_params)
        self.conv_block_type = conv_block_type
        if conv_block_type == "Pool" or conv_block_type == "Baseline":
            self.pool = nn.AvgPool1d(5)
        self.w_raw_in = nn.Linear(getattr(self.conv_blocks, "hidden_dim", hidden_dim), hidden_dim)

        # Transformer layers.
        if layer_type == 'default':
            encoder_layer = TransformerEncoderLayer(d_model=hidden_dim, nhead=8, relative_positional=True,
                                                    relative_positional_distance=relative_positional_distance,
                                                    dim_feedforward=dim_feedforward, dropout=dropout)
            logging.info(f"Using relative positional distance of {relative_positional_distance}")
        else:
            logging.error('layer_type %s not supported' % layer_type)
            exit()
        self.transformer = nn.TransformerEncoder(encoder_layer, elayers)

        # Input embeddings.
        if num_ph is not None:  # NOTE assuming ph is the input
            self.in_emb_mat = torch.nn.Embedding(num_ph, ph_emb_size)
        else:
            self.in_emb_mat = None

        # Auxiliary features.
        self.use_ar = use_ar
        if use_ar:
            self.ar_model = PastFCEncoder(input_len=ar_input, hidden_dim=ar_hidden, output_dim=ar_output)

        # Speaker embedding.
        self.use_spk_emb = use_spk_emb
        if use_spk_emb:
            self.spk_fc = torch.nn.Linear(spk_emb_size, spk_emb_hidden)

        # Input embeddings.
        self.use_emb = use_emb
        if use_emb:
            if emb_p is None:
                self.emb_mat = torch.nn.Embedding(num_emb, emb_dim)
            else:
                init_array = np.load(emb_p)
                self.emb_mat = torch.nn.Embedding.from_pretrained(torch.tensor(init_array), freeze=False)

    def forward(self, x, after_len=None, spk_id=None, spk=None, ar=None, ph=None, **kwargs):
        """
        Args:
            x: shape (batchsize, num_in_feats, seq_len).
            spk: shape (batchsize, spk_emb_dim).

        Return:
            out: shape (batchsize, num_out_feats, seq_len).
        """
        ###### Input Embeddings.
        if self.use_emb:
            x = self.emb_mat(x)  # (batchsize, seq_len, emb_dim)
            x = x.transpose(1, 2)  # (batchsize, emb_dim, seq_len)
        if self.use_ar:
            ar_feats = self.ar_model(ar)  # (batchsize, ar_output)
            ar_feats = ar_feats.unsqueeze(2).repeat(1, 1, x.shape[2])  # (batchsize, ar_output, length)
            x = torch.cat((x, ar_feats), dim=1)
        if self.use_spk_emb:
            cspk = self.spk_fc(spk)
            cspk = cspk.unsqueeze(2).repeat(1, 1, x.shape[2])
            x = torch.cat((x, cspk), dim=1)
        if self.in_emb_mat is not None:
            # x (batchsize, length)
            x = self.in_emb_mat(x)  # (batchsize, seq_len, ph_emb_size)
            x = x.transpose(1, 2)

        ##### Conv blocks.
        x = self.conv_blocks(x)
        x = x.transpose(1, 2)  # (batchsize, seq_len, num_feats)
        x = self.w_raw_in(x)
        x = x.transpose(0, 1)  # (seq_len, batchsize, num_feats)

        # Transformer.
        # (T, B, C)
        x = self.transformer(x)

        # (T, B, C) => (B, C, T)
        return x.permute(1, 2, 0)

    def get_output_lengths(self, length):
        return self.conv_blocks.get_output_lengths(length)

# Prepare dataset

class NPYDataset(Dataset):
    def __init__(self, npy_files, sample_length, add_white_noise=False, noise_level=0.0):
        self.npy_files = npy_files
        self.sample_length = sample_length
        self.add_white_noise = add_white_noise
        self.noise_level = noise_level
        self.data = [np.load(file) for file in npy_files]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        
        if sample.shape[1] < self.sample_length:
            # Pad to sample_length if necessary
            padding = self.sample_length - sample.shape[1]
            sample = np.pad(sample, ((0, 0), (0, padding)), 'constant')
        elif sample.shape[1] > self.sample_length:
            # Truncate to sample_length if necessary
            sample = sample[:, :self.sample_length]
        
        if self.add_white_noise:
            noise = np.random.normal(0, self.noise_level, sample.shape)
            sample += noise
        
        return torch.tensor(sample, dtype=torch.float32), torch.tensor(sample, dtype=torch.float32)  # Replace with actual target if different

In [4]:
npy_dir = '/kaggle/input/pitch-estimation-files/dataset/dataset/train'

# Get list of all .npy files in the directory
npy_files = glob.glob(os.path.join(npy_dir, '*.npy'))
dataset = NPYDataset(npy_files, sample_length=256,  
                     add_white_noise=True, 
                     noise_level=0.01) 
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=None)  

In [5]:
# Load configurations
with open('/kaggle/working/configs.yaml', 'r') as file:
    configs = yaml.safe_load(file)

# Initialize model
model_params = configs['model_params']
model = Base_Transformer(**configs['model_params'])
# Criterion, optimizer, scheduler, and logger
optimizer_params = configs['optimizer_params']
scheduler_params = configs['scheduler_params']
logging_params = configs['logging_params']
training_params = configs['training_params']

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=optimizer_params['lr'], weight_decay=optimizer_params['weight_decay'])
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=scheduler_params['step_size'], gamma=scheduler_params['gamma'])
writer = SummaryWriter(logging_params['log_dir'])

def train_model(model, dataloader, criterion, optimizer, scheduler, num_epochs):
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, targets in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)
        
        scheduler.step()
        epoch_loss = running_loss / len(dataloader.dataset)
        print(f'Epoch {epoch}/{num_epochs - 1}, Loss: {epoch_loss:.4f}')
        writer.add_scalar("Loss/train", epoch_loss, epoch)



In [6]:
if __name__ == "__main__":
    train_model(model, dataloader, criterion, optimizer, scheduler, training_params['epochs'])
    writer.close()

RuntimeError: Given groups=1, weight of size [1024, 8, 3], expected input[4, 128, 256] to have 8 channels, but got 128 channels instead