In [1]:
!pip install wandb



In [2]:
import pandas as pd
import numpy as np
import random

import wandb
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR, CosineAnnealingLR

import torchaudio
import torchaudio.transforms as T
from torchaudio.transforms import AmplitudeToDB, Vol

device = torch.device("cuda:0")

In [3]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mdarya-dare[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

### Custom Dataset

In [4]:
class AudioDataset(Dataset):

    def __init__(self, path_speech, path_noise, eps_value=1e-6,\
                 mel_nframes=300, sample_rate=16000, n_fft=480,\
                 hop_length=160, n_mels=40):

        self.speech = pd.read_csv(path_speech)
        self.noise = pd.read_csv(path_noise)
        self.eps_value = eps_value
        self.mel_nframes = mel_nframes
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels
        self.mel_spectrogram = T.MelSpectrogram(sample_rate=self.sample_rate,
                                                n_fft=self.n_fft,
                                                hop_length=self.hop_length,
                                                center=True,
                                                pad_mode="reflect",
                                                power=2.0,
                                                norm='slaney',
                                                n_mels=self.n_mels)

    def __len__(self):

        return len(self.noise)

    def get_same_shape(self, waveform, w_type):

        melsp = torch.log(self.mel_spectrogram(waveform) + self.eps_value)
        has_mel_nframes = melsp.size(2) if melsp.size(2) < self.mel_nframes\
         else self.mel_nframes

        image_tens = torch.full((1, self.n_mels, self.mel_nframes), melsp.min())
        image_tens[0, :, :has_mel_nframes] = melsp[:, :, :has_mel_nframes]

        return image_tens/255

    def load_item(self, df, idx, w_type):

        sample_wav_file = df['filename'][idx]
        wave, _ = torchaudio.load(sample_wav_file)
        wave = wave.float()

        if w_type==1:
            wave_noise = wave.numpy()
            wave_noise_no_silence = [float(el) for el in wave_noise[0] if abs(el) >= 0.001]
            wave = torch.tensor([wave_noise_no_silence])

        wave_melsp = self.get_same_shape(wave, w_type)

        return wave_melsp

    def __getitem__(self, idx):

        wave_speech = self.load_item(self.speech, idx, w_type=0)

        luck = random.random()
        if luck > 0.5:
            wave_noise = self.load_item(self.noise, idx, w_type=1)

            wave_out = wave_speech + wave_noise
            label = 1

        else:
            wave_out = wave_speech
            label = 0

        return wave_out, torch.tensor(label)

In [5]:
audio_dataset_train = AudioDataset('./CleanSpeechTrain.csv',
                                   './NoiseTrain.csv')

audio_dataset_val = AudioDataset('./CleanSpeechVal.csv',
                                   './NoiseVal.csv')

audio_dataset_test = AudioDataset('./CleanSpeechTest.csv',
                                   './NoiseTest.csv')

### Data Loader

In [6]:
train_loader = DataLoader(dataset=audio_dataset_train,
                          batch_size=32, drop_last=True)

val_loader = DataLoader(dataset=audio_dataset_val,
                        batch_size=32, drop_last=True)

test_loader = DataLoader(dataset=audio_dataset_test,
                         batch_size=32, drop_last=True)

### Train functions

In [7]:
def train(model, train_loader, val_loader, loss,
          optimizer, scheduler, num_epochs):

    wandb.init(
        project='Noise-Estimation',
        config={'learning_rate': 1e-1,
                'model': 'vgg',
                'loss': 'BCEWithLogitsLoss',
                'optimizer': 'SGD',
                'scheduler': 'CosineAnnealing',
                'epochs': 3}
    )

    loss_history = []
    train_history = []
    val_history = []

    for epoch in range(num_epochs):

        model.train()

        loss_accum = 0
        correct_samples = 0
        total_samples = 0

        with tqdm(total=len(train_loader),
                  desc=f'Epoch {epoch+1}',
                  leave=True) as pb:

            for i, (x, y) in enumerate(train_loader):

                x_gpu = x.to(device)
                y_gpu = y.to(device)

                prediction = model(x_gpu)
                value, indices = torch.max(prediction, 1)

                loss_value = loss(value, y_gpu.float())
                optimizer.zero_grad()
                loss_value.backward()
                optimizer.step()

                correct_samples += torch.sum(indices==y_gpu)
                total_samples += y.size(0)
                loss_accum += loss_value

                pb.update()
                pb.set_description( f'Epoch {epoch+1}:'
                                    f' Average loss: {loss_value:.3f},'
                                    f' Train accuracy:'
                                    f' {correct_samples/total_samples:.2f},'
                                    )

            ave_loss = loss_accum / epoch+1
            train_accuracy = float(correct_samples) / total_samples
            val_accuracy = compute_accuracy(model, val_loader)

            scheduler.step()

            loss_history.append(float(ave_loss))
            train_history.append(train_accuracy)
            val_history.append(val_accuracy)

            wandb.log({'Epoch': epoch+1, 'loss': ave_loss,
                       'train_acc': train_accuracy,
                       'val_acc': val_accuracy})

            pb.set_description( f'Epoch {epoch+1}:'
                                f' Average loss: {ave_loss:.3f},'
                                f' Train accuracy: {train_accuracy:.2f},'
                                f' Val accuracy: {val_accuracy:.2f}'
                                )

    wandb.finish()
    return loss_history, train_history, val_history


def compute_accuracy(model, loader):

    model.eval()

    correct_samples = 0
    total_samples = 0

    for i, (x, y) in enumerate(loader):

        x_gpu = x.to(device)
        y_gpu = y.to(device)

        prediction = model(x_gpu)
        indices = torch.argmax(prediction, 1)
        correct_samples += torch.sum(indices==y_gpu)
        total_samples += y_gpu.size(0)

    return float(correct_samples) / total_samples

### MHAttKWS

In [8]:
from BCResNet import BCResNet, MHAttKWS

In [20]:
model_mhat = MHAttKWS(num_classes=2)
model_mhat.type(torch.FloatTensor)
model_mhat.to(device)

MHAttKWS(
  (cnn_extractor): Sequential(
    (0): Conv2d(1, 10, kernel_size=(5, 1), stride=(1, 1))
    (1): BatchNorm2d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(10, 1, kernel_size=(5, 1), stride=(1, 1))
    (4): BatchNorm2d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): AvgPool2d(kernel_size=2, stride=2, padding=0)
  )
  (rnn): LSTM(1, 128, num_layers=2, batch_first=True, bidirectional=True)
  (q_emb): Linear(in_features=256, out_features=1024, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (fc): Sequential(
    (0): Linear(in_features=1024, out_features=64, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): Linear(in_features=32, out_features=2, bias=True)
  )
)

In [21]:
loss = nn.BCEWithLogitsLoss().type(torch.FloatTensor)

optimizer = optim.SGD(model_mhat.parameters(),
                       lr=1e-1,
                       weight_decay=1e-1)

scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.3)

In [22]:
loss_history, train_history, val_history = train(model_mhat, train_loader,
                                                 val_loader, loss,
                                                 optimizer, scheduler,
                                                 num_epochs=20)

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Epoch,▁▂▃▄▅▆▇█
loss,█▄▃▂▁▁▁
train_acc,▄▄█▆▆█▁▄
val_acc,▁▂▅▄█▅▅▂

0,1
Epoch,8.0
loss,7.03907
train_acc,0.49016
val_acc,0.488


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016669120016740636, max=1.0…

Epoch 1: Average loss: inf, Train accuracy: 0.50, Val accuracy: 0.49: 100%|██████████| 96/96 [03:31<00:00,  2.20s/it]
Epoch 2: Average loss: 67.551, Train accuracy: 0.51, Val accuracy: 0.52: 100%|██████████| 96/96 [03:32<00:00,  2.22s/it]
Epoch 3: Average loss: 34.277, Train accuracy: 0.49, Val accuracy: 0.51: 100%|██████████| 96/96 [03:26<00:00,  2.16s/it]
Epoch 4: Average loss: 23.180, Train accuracy: 0.51, Val accuracy: 0.51: 100%|██████████| 96/96 [03:33<00:00,  2.22s/it]
Epoch 5: Average loss: 17.636, Train accuracy: 0.50, Val accuracy: 0.52: 100%|██████████| 96/96 [03:33<00:00,  2.22s/it]
Epoch 6: Average loss: 14.309, Train accuracy: 0.49, Val accuracy: 0.49: 100%|██████████| 96/96 [03:27<00:00,  2.16s/it]
Epoch 7: Average loss: 12.090, Train accuracy: 0.50, Val accuracy: 0.52: 100%|██████████| 96/96 [03:32<00:00,  2.21s/it]
Epoch 8: Average loss: 10.506, Train accuracy: 0.51, Val accuracy: 0.50: 100%|██████████| 96/96 [03:32<00:00,  2.21s/it]
Epoch 9: Average loss: 9.318, Train

VBox(children=(Label(value='0.007 MB of 0.033 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.211148…

0,1
Epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
loss,█▄▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
train_acc,▅▇▂▆▅▂▅▆▄▇▆▄▁▄▃▁▅█▅▅
val_acc,▃▆▅▅▆▃▆▄▁▂▃▇▆▃▄▁▄▂█▃

0,1
Epoch,20.0
loss,4.50223
train_acc,0.5
val_acc,0.49707


### BCResNet

In [68]:
model_bcrn = BCResNet(num_labels=2)
model_bcrn.type(torch.FloatTensor)
model_bcrn.to(device)

BCResNet(
  (conv1): Conv2d(1, 16, kernel_size=(5, 5), stride=(2, 1), padding=(2, 2))
  (block1_1): TransitionBlock(
    (freq_dw_conv): Conv2d(8, 8, kernel_size=(3, 1), stride=(1, 1), padding=(1, 0), groups=8, bias=False)
    (ssn): SubSpectralNorm(
      (bn): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (temp_dw_conv): Conv2d(8, 8, kernel_size=(1, 3), stride=(1, 1), padding=(0, 1), groups=8, bias=False)
    (bn1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (bn2): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
    (channel_drop): Dropout2d(p=0.5, inplace=False)
    (swish): SiLU()
    (conv1x1_1): Conv2d(16, 8, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (conv1x1_2): Conv2d(8, 8, kernel_size=(1, 1), stride=(1, 1), bias=False)
  )
  (block1_2): BroadcastedBlock(
    (freq_dw_conv): Conv2d(8, 8, kernel_size=(3, 1), stride=(1, 1), padding=(1, 0),

In [69]:
loss = nn.BCEWithLogitsLoss()

optimizer = optim.SGD(model_bcrn.parameters(),
                       lr=1e-3,
                       weight_decay=0)

scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=15, verbose=True)

Adjusting learning rate of group 0 to 1.0000e-03.


In [71]:
loss_history, train_history, val_history = train(model_bcrn, train_loader,
                                    val_loader, loss,
                                    optimizer, scheduler,
                                    num_epochs=1)

### VGG

In [9]:
from vggNet import VGGModel

In [10]:
model_vgg = VGGModel(2, 'vgg11')
model_vgg.type(torch.FloatTensor)
model_vgg.to(device)

VGGModel(
  (model): VGG(
    (features): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (4): ReLU(inplace=True)
      (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (7): ReLU(inplace=True)
      (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (9): ReLU(inplace=True)
      (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (12): ReLU(inplace=True)
      (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (14): ReLU(inplace=True)
      (15): MaxPool2d(kernel_size

In [11]:
loss = nn.BCEWithLogitsLoss().type(torch.FloatTensor)

optimizer = optim.SGD(model_vgg.parameters(), lr=1e-3)

scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=3, verbose=True)

Adjusting learning rate of group 0 to 1.0000e-03.


In [12]:
loss_history, train_history, val_history = train(model_vgg, train_loader,
                                                 val_loader, loss,
                                                 optimizer, scheduler,
                                                 num_epochs=3)

Epoch 1: Average loss: inf, Train accuracy: 0.50, Val accuracy: 0.49: 100%|██████████| 96/96 [03:16<00:00,  2.05s/it]


Adjusting learning rate of group 0 to 7.5000e-04.


Epoch 2: Average loss: 67.515, Train accuracy: 0.50, Val accuracy: 0.52: 100%|██████████| 96/96 [03:16<00:00,  2.05s/it]


Adjusting learning rate of group 0 to 2.5000e-04.


Epoch 3: Average loss: 34.256, Train accuracy: 0.51, Val accuracy: 0.51: 100%|██████████| 96/96 [03:13<00:00,  2.02s/it]

Adjusting learning rate of group 0 to 0.0000e+00.





0,1
Epoch,▁▅█
loss,█▁
train_acc,▆▁█
val_acc,▁█▇

0,1
Epoch,3.0
loss,34.2556
train_acc,0.50684
val_acc,0.51367
