In [2]:
import os
import librosa
import numpy as np

def preprocess_to_spectrograms(root_dir, output_dir, sample_rate=22050, n_mels=128, duration=5,
                               splits=('train', 'val', 'test')):
    """Convert every .wav file of each dataset split into a fixed-length log-mel spectrogram.

    For each split, the audio files found directly inside ``root_dir/<split>`` are
    loaded at ``sample_rate``, trimmed or zero-padded to ``duration`` seconds,
    converted to a mel spectrogram in dB (relative to the clip's own maximum) and
    written to ``output_dir/<split>/<stem>.npy``.

    Args:
        root_dir: Directory containing one sub-directory per split.
        output_dir: Destination directory; created if it does not exist.
        sample_rate: Sampling rate the audio is resampled to when loading.
        n_mels: Number of mel bands in the spectrogram.
        duration: Clip length in seconds that every file is trimmed/padded to.
        splits: Names of the split sub-directories to process.

    Returns:
        None. Progress is reported on stdout, one line per saved file.
    """
    os.makedirs(output_dir, exist_ok=True)
    max_len = int(sample_rate * duration)

    for split in splits:
        input_split_dir = os.path.join(root_dir, split)
        # A dataset without e.g. a 'test' split should not abort the whole run.
        if not os.path.isdir(input_split_dir):
            print(f"Skipping missing split directory {input_split_dir}")
            continue

        output_split_dir = os.path.join(output_dir, split)
        os.makedirs(output_split_dir, exist_ok=True)

        # Sorted so the processing order (and the log) is the same on every run.
        for file_name in sorted(os.listdir(input_split_dir)):
            stem, ext = os.path.splitext(file_name)
            # Case-insensitive so '.WAV' files are not silently ignored.
            if ext.lower() != '.wav':
                continue

            file_path = os.path.join(input_split_dir, file_name)

            # Load audio
            y, sr = librosa.load(file_path, sr=sample_rate)

            # Trim or pad to fixed length
            y = y[:max_len]
            if len(y) < max_len:
                y = np.pad(y, (0, max_len - len(y)))

            # Convert to mel spectrogram
            mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
            mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

            # Save as .npy; only the extension is swapped, so a '.wav' that
            # happens to appear inside the stem is left untouched.
            output_path = os.path.join(output_split_dir, stem + '.npy')
            np.save(output_path, mel_spec_db)
            print(f"Saved {output_path}")


# Turn the processed VocalSet audio into 5-second, 128-band mel spectrograms
# (22.05 kHz), written as .npy files under VocalSet_npy/<split>/.
preprocess_to_spectrograms(
    "VocalSet_processed",
    "VocalSet_npy",
    sample_rate=22050,
    n_mels=128,
    duration=5,
)


Saved VocalSet_npy\train\arpeggios_straight_a.npy
Saved VocalSet_npy\train\arpeggios_straight_i.npy
Saved VocalSet_npy\train\arpeggios_straight_o.npy
Saved VocalSet_npy\train\arpeggios_straight_u.npy
Saved VocalSet_npy\train\arps_c_fast_piano.npy
Saved VocalSet_npy\train\arps_fast_piano_c.npy
Saved VocalSet_npy\train\dona_vibrato.npy
Saved VocalSet_npy\train\f1_arpeggios_belt_c_a.npy
Saved VocalSet_npy\train\f1_arpeggios_belt_c_e.npy
Saved VocalSet_npy\train\f1_arpeggios_belt_c_i.npy
Saved VocalSet_npy\train\f1_arpeggios_belt_c_o.npy
Saved VocalSet_npy\train\f1_arpeggios_belt_c_u.npy
Saved VocalSet_npy\train\f1_arpeggios_breathy_a.npy
Saved VocalSet_npy\train\f1_arpeggios_breathy_e.npy
Saved VocalSet_npy\train\f1_arpeggios_breathy_i.npy
Saved VocalSet_npy\train\f1_arpeggios_breathy_o.npy
Saved VocalSet_npy\train\f1_arpeggios_breathy_u.npy
Saved VocalSet_npy\train\f1_arpeggios_c_fast_forte_a.npy
Saved VocalSet_npy\train\f1_arpeggios_c_fast_forte_e.npy
Saved VocalSet_npy\train\f1_arpeggi

In [3]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# ---- Dataset Loader ----
class SpectrogramDataset(Dataset):
    """Dataset of (noisy, clean) spectrogram pairs read from .npy files.

    Each item is loaded from disk, min-max normalised to [0, 1], and paired with
    a copy corrupted by additive Gaussian noise (clamped back to [0, 1]).

    Args:
        data_dir: Directory whose ``.npy`` files are the samples.
        noise_std: Standard deviation of the Gaussian noise added to the input.
    """

    def __init__(self, data_dir, noise_std=0.05):
        # Sorted so that sample indices are stable across runs and platforms.
        self.files = sorted(
            os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.npy')
        )
        self.noise_std = noise_std

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        """Return ``(noisy_spec, spec)``, each a float32 tensor of shape [1, H, W]."""
        spec = np.load(self.files[idx])

        # Min-max normalise. A constant spectrogram (e.g. a silent clip) has zero
        # range; map it to all zeros instead of dividing by zero and emitting NaNs.
        low, high = spec.min(), spec.max()
        value_range = high - low
        if value_range > 0:
            spec = (spec - low) / value_range
        else:
            spec = np.zeros_like(spec)

        spec = torch.tensor(spec, dtype=torch.float32).unsqueeze(0)  # [1, H, W]
        noisy_spec = spec + self.noise_std * torch.randn_like(spec)
        noisy_spec = torch.clamp(noisy_spec, 0., 1.)
        return noisy_spec, spec

# ---- Model Definition ----
class DenoisingAutoencoder(nn.Module):
    """Convolutional autoencoder mapping a noisy spectrogram to a denoised one.

    The encoder halves the spatial size three times (1 -> 32 -> 64 -> 128
    channels); the decoder mirrors it with transposed convolutions and ends in a
    sigmoid, so outputs lie in [0, 1].
    """

    def __init__(self):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 32, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(32, 64, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(64, 128, 3, stride=2, padding=1), nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(128, 64, 3, stride=2, padding=1, output_padding=1), nn.ReLU(),
            nn.ConvTranspose2d(64, 32, 3, stride=2, padding=1, output_padding=1), nn.ReLU(),
            nn.ConvTranspose2d(32, 1, 3, stride=2, padding=1, output_padding=1), nn.Sigmoid()
        )

    def forward(self, x):
        """Denoise ``x`` of shape [..., 1, H, W]; the result has the same shape."""
        height, width = x.shape[-2:]
        reconstruction = self.decoder(self.encoder(x))
        # Each stride-2 conv rounds the size up, so the decoder yields the next
        # multiple of 8 in each spatial dim. Crop back to the input size so the
        # loss can be computed for any H/W (a no-op when both are multiples of 8).
        return reconstruction[..., :height, :width]

# ---- Configurations ----
batch_size = 16
epochs = 20
lr = 1e-3
seed = 42
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed the RNGs so weight initialisation, batch shuffling and the noise injected
# by the dataset are repeatable after a kernel restart.
torch.manual_seed(seed)
np.random.seed(seed)

train_dataset = SpectrogramDataset("VocalSet_npy/train")
val_dataset = SpectrogramDataset("VocalSet_npy/val")

# Fail early with a clear message instead of a ZeroDivisionError after an epoch.
if len(train_dataset) == 0 or len(val_dataset) == 0:
    raise RuntimeError(
        "No .npy spectrograms found in VocalSet_npy/train or VocalSet_npy/val; "
        "run the preprocessing cell first."
    )

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

model = DenoisingAutoencoder().to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

os.makedirs("saved_models", exist_ok=True)

# ---- Training + Validation ----
for epoch in range(1, epochs + 1):
    model.train()
    train_loss = 0.0
    for noisy, clean in train_loader:
        noisy, clean = noisy.to(device), clean.to(device)
        output = model(noisy)
        loss = criterion(output, clean)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Weight by batch size so a smaller final batch does not skew the mean.
        train_loss += loss.item() * noisy.size(0)

    avg_train_loss = train_loss / len(train_dataset)

    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for noisy, clean in val_loader:
            noisy, clean = noisy.to(device), clean.to(device)
            output = model(noisy)
            val_loss += criterion(output, clean).item() * noisy.size(0)

    avg_val_loss = val_loss / len(val_dataset)
    print(f"Epoch {epoch}/{epochs} - Train Loss: {avg_train_loss:.4f} - Val Loss: {avg_val_loss:.4f}")

# ---- Save only final model ----
torch.save(model.state_dict(), "saved_models/denoising_autoencoder_final.pt")
print("✅ Final model saved to saved_models/denoising_autoencoder_final.pt")

Epoch 1/20 - Train Loss: 0.0150 - Val Loss: 0.0046
Epoch 2/20 - Train Loss: 0.0035 - Val Loss: 0.0032
Epoch 3/20 - Train Loss: 0.0026 - Val Loss: 0.0024
Epoch 4/20 - Train Loss: 0.0022 - Val Loss: 0.0022
Epoch 5/20 - Train Loss: 0.0020 - Val Loss: 0.0019
Epoch 6/20 - Train Loss: 0.0017 - Val Loss: 0.0016
Epoch 7/20 - Train Loss: 0.0015 - Val Loss: 0.0015
Epoch 8/20 - Train Loss: 0.0014 - Val Loss: 0.0015
Epoch 9/20 - Train Loss: 0.0013 - Val Loss: 0.0013
Epoch 10/20 - Train Loss: 0.0013 - Val Loss: 0.0012
Epoch 11/20 - Train Loss: 0.0012 - Val Loss: 0.0012
Epoch 12/20 - Train Loss: 0.0011 - Val Loss: 0.0011
Epoch 13/20 - Train Loss: 0.0011 - Val Loss: 0.0011
Epoch 14/20 - Train Loss: 0.0011 - Val Loss: 0.0011
Epoch 15/20 - Train Loss: 0.0011 - Val Loss: 0.0012
Epoch 16/20 - Train Loss: 0.0010 - Val Loss: 0.0010
Epoch 17/20 - Train Loss: 0.0010 - Val Loss: 0.0010
Epoch 18/20 - Train Loss: 0.0010 - Val Loss: 0.0010
Epoch 19/20 - Train Loss: 0.0010 - Val Loss: 0.0009
Epoch 20/20 - Train L

In [4]:
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader
import os

# ---- Create Folder to Save Samples ----
os.makedirs("denoised_samples", exist_ok=True)

# ---- Load Model ----
model = DenoisingAutoencoder().to(device)
# map_location lets a checkpoint saved on a GPU machine load on a CPU-only one
# (and vice versa) instead of failing while deserialising CUDA tensors.
model.load_state_dict(
    torch.load("saved_models/denoising_autoencoder_final.pt", map_location=device)
)
model.eval()  # inference mode

# ---- Prepare Test Data ----
test_dataset = SpectrogramDataset("VocalSet_npy/test")
# batch_size=1 so each batch is a single spectrogram; shuffled so every run
# draws a different handful of test clips.
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=True)

# ---- Inference + Save Function ----
def run_and_save_inference(n_samples=5):
    """Denoise up to ``n_samples`` test spectrograms and save comparison figures.

    For each sample drawn from the module-level ``test_loader``, a three-panel
    figure (clean target, noisy input, model output) is written to
    ``denoised_samples/sample_<k>.png`` with ``k`` starting at 1.

    Args:
        n_samples: Maximum number of figures to produce. Values <= 0 produce
            none; fewer are produced if the loader runs out of data.
    """
    with torch.no_grad():
        # zip() with the range stops before pulling an extra batch and makes
        # n_samples <= 0 a no-op (previously one figure was always saved).
        for sample_idx, (noisy, clean) in zip(range(1, n_samples + 1), test_loader):
            noisy, clean = noisy.to(device), clean.to(device)
            output = model(noisy)

            panels = (
                ("Clean Spectrogram", clean),
                ("Noisy Input", noisy),
                ("Denoised Output", output),
            )

            fig, axs = plt.subplots(1, 3, figsize=(15, 4))
            for ax, (title, tensor) in zip(axs, panels):
                # squeeze() drops the batch and channel dims -> 2-D image.
                image = tensor.squeeze().cpu().numpy()
                ax.imshow(image, origin='lower', aspect='auto', cmap='inferno')
                ax.set_title(title)
                ax.axis('off')
            fig.tight_layout()

            save_path = f"denoised_samples/sample_{sample_idx}.png"
            fig.savefig(save_path)
            # Close this specific figure so figures do not pile up in memory.
            plt.close(fig)

# ---- Write five clean / noisy / denoised comparison figures ----
run_and_save_inference(5)


In [5]:
import torch
import torchaudio
import os

# ---- Create Output Folder ----
os.makedirs("denoised_audio", exist_ok=True)

# ---- Inversion Parameters ----
# These must mirror the analysis settings used when the .npy spectrograms were
# produced. The preprocessing cell calls librosa.feature.melspectrogram with
# only `sr` and `n_mels`, so librosa's defaults apply: n_fft=2048,
# hop_length=512 and a Slaney-scale, Slaney-normalised mel filterbank.
n_fft = 2048
hop_length = 512
n_mels = 128
sample_rate = 22050

# ---- Mel Inverse Transformation ----
# torchaudio defaults to an un-normalised HTK filterbank; request the Slaney
# variant explicitly so the pseudo-inverse undoes the filterbank actually used.
mel_basis = torchaudio.transforms.MelScale(
    n_mels=n_mels,
    sample_rate=sample_rate,
    n_stft=n_fft // 2 + 1,
    norm="slaney",
    mel_scale="slaney",
)
# fb has shape (n_stft, n_mels); its pseudo-inverse has shape (n_mels, n_stft).
inverse_mel_basis = torch.pinverse(mel_basis.fb).to(device)

def mel_to_audio(mel_spec, file_name, top_db=80.0):
    """Reconstruct a waveform from a normalised log-mel spectrogram and save it.

    The spectrograms in this notebook are dB-scaled mel power spectrograms
    (``power_to_db(..., ref=np.max)``) that were min-max normalised to [0, 1].
    This function approximately undoes both steps, maps mel bands back to linear
    frequency bins with the pseudo-inverse filterbank, estimates phase with
    Griffin-Lim and writes ``denoised_audio/<file_name>.wav``.

    Args:
        mel_spec: Array-like of shape (n_mels, T) with values in [0, 1].
        file_name: Output file name without extension.
        top_db: Dynamic range in dB assumed for the normalised input. 80 is
            librosa's ``power_to_db`` default; clips with a smaller range are
            only approximately restored. The absolute level is not recoverable
            because the dB values were referenced to each clip's own maximum.
    """
    mel_norm = torch.as_tensor(mel_spec, dtype=torch.float32, device=device)  # (n_mels, T)

    # Undo the [0, 1] scaling and the dB conversion: [0, 1] -> [-top_db, 0] dB -> power.
    mel_db = mel_norm.clamp(0.0, 1.0) * top_db - top_db
    mel_power = 10.0 ** (mel_db / 10.0)

    # (n_stft, n_mels) @ (n_mels, T) -> (n_stft, T). The pseudo-inverse can give
    # slightly negative bins, which would turn into NaNs inside Griffin-Lim.
    linear_spec = torch.matmul(inverse_mel_basis.T, mel_power).clamp(min=0.0)

    # GriffinLim's default power=2.0 matches the power spectrogram built above.
    griffin_lim = torchaudio.transforms.GriffinLim(n_fft=n_fft, hop_length=hop_length).to(device)
    audio = griffin_lim(linear_spec)

    # Peak-normalise with a little headroom so the saved file does not clip.
    peak = audio.abs().max()
    if peak > 0:
        audio = 0.99 * audio / peak

    torchaudio.save(f"denoised_audio/{file_name}.wav", audio.unsqueeze(0).cpu(), sample_rate)


# ---- Denoise five test spectrograms and write each one out as audio ----
count = 0
with torch.no_grad():
    for noisy, clean in test_loader:
        noisy = noisy.to(device)
        clean = clean.to(device)

        # Drop batch/channel dims to get the (n_mels, T) array expected below.
        output = model(noisy)
        output_np = output.squeeze().cpu().numpy()

        count += 1
        mel_to_audio(output_np, f"denoised_{count}")
        if count >= 5:
            break
