In [1]:
!git clone https://gitlab.com/1byxero/deep_learning_systems.git
!pip install librosa
from IPython import display

Cloning into 'deep_learning_systems'...
remote: Enumerating objects: 10, done.[K
remote: Counting objects: 100% (10/10), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 10 (delta 0), reused 0 (delta 0)[K
Unpacking objects: 100% (10/10), done.


In [0]:
from os.path import join

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch import optim

import librosa

# Every audio file used by this notebook sits in one folder of the cloned repo.
common_path = join('deep_learning_systems', 'Assignment 1', 'data')
train_clean_path, train_dirty_path, test_x_01_path, test_x_02_path = (
    join(common_path, wav_name)
    for wav_name in (
        'train_clean_male.wav',
        'train_dirty_male.wav',
        'test_x_01.wav',
        'test_x_02.wav',
    )
)

# Load the clean and the noisy training recordings (sr=None asks librosa not to
# resample), then take the STFT of each with a 1024-point FFT and a hop of 512.
s, sr = librosa.load(train_clean_path, sr=None)
sn, sr = librosa.load(train_dirty_path, sr=None)
S = librosa.stft(s, n_fft=1024, hop_length=512)
X = librosa.stft(sn, n_fft=1024, hop_length=512)

In [3]:
# Magnitude spectrograms as float tensors: S_abs comes from the clean STFT (S),
# X_abs from the noisy STFT (X).
S_abs, X_abs = (torch.tensor(np.abs(spectrum)).float() for spectrum in (S, X))


class AudioDataset(Dataset):
    """Pairs each input sample with its label so a ``DataLoader`` can batch them.

    Args:
        data: Indexable, sized collection of input samples (indexed along its
            first axis).
        label: Indexable, sized collection of targets with the same length as
            ``data``.

    Raises:
        ValueError: If ``data`` and ``label`` do not have the same length.
    """

    def __init__(self, data, label):
        # Validate up front so a mismatch is reported here with a clear message
        # instead of failing obscurely once the dataset is iterated.
        if len(data) != len(label):
            raise ValueError(
                "data and label must have the same length, got %d and %d"
                % (len(data), len(label))
            )
        self.sample = data
        self.label = label

    def __len__(self):
        """Return the number of (sample, label) pairs."""
        return len(self.sample)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at position ``idx``."""
        return self.sample[idx], self.label[idx]
# Transpose both spectrograms so the dataset is indexed along what was the
# second axis; the noisy magnitudes are the inputs, the clean ones the labels.
ds = AudioDataset(X_abs.t(), S_abs.t())
print((ds.sample.shape, ds.label.shape))

# Shuffled mini-batches of 50 samples, fetched by two worker processes.
train_loader = DataLoader(dataset=ds, batch_size=50, num_workers=2, shuffle=True)

class Net(nn.Module):
    """Fully connected network mapping 513 input features to 513 outputs.

    Five linear layers (513 -> 1024 -> 1024 -> 1024 -> 1024 -> 513). A ReLU
    follows every layer, including the last one, so outputs are non-negative.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(in_features=513, out_features=1024)
        self.fc2 = nn.Linear(in_features=1024, out_features=1024)
        self.fc3 = nn.Linear(in_features=1024, out_features=1024)
        self.fc4 = nn.Linear(in_features=1024, out_features=1024)
        self.fc5 = nn.Linear(in_features=1024, out_features=513)

    def forward(self, x):
        """Pass ``x`` through fc1..fc5 in order, applying ReLU after each layer."""
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5):
            x = F.relu(layer(x))
        return x
    
def init_weights(layer):
    """Initialise an ``nn.Linear`` layer in place; leave other modules untouched.

    The weight matrix is filled with Xavier (Glorot) normal values and the
    bias, when the layer has one, is set to zero. Written to be passed to
    ``module.apply(init_weights)``, which calls it on every sub-module.

    Args:
        layer: Any ``nn.Module``; only instances of ``nn.Linear`` are modified.
    """
    if not isinstance(layer, nn.Linear):
        return
    # The nn.init functions already run without gradient tracking, so the
    # parameters can be passed directly instead of going through `.data`.
    nn.init.xavier_normal_(layer.weight)
    # nn.Linear(..., bias=False) stores its bias as None; skip it in that case.
    if layer.bias is not None:
        nn.init.zeros_(layer.bias)

# Build the network and run init_weights over every sub-module
# (Module.apply returns the module itself, so the call can be chained).
net = Net().apply(init_weights)
print(net)

# Mean-squared-error loss, minimised with Adam.
learning_rate = 1e-4
optimizer = optim.Adam(params=net.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

(torch.Size([2459, 513]), torch.Size([2459, 513]))
Net(
  (fc1): Linear(in_features=513, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=1024, bias=True)
  (fc3): Linear(in_features=1024, out_features=1024, bias=True)
  (fc4): Linear(in_features=1024, out_features=1024, bias=True)
  (fc5): Linear(in_features=1024, out_features=513, bias=True)
)


In [4]:
epoch_count = 50
for epoch in range(epoch_count):
    running_loss = 0.0

    # `step` counts mini-batches from 1 within the current epoch.
    for step, (inputs, labels) in enumerate(train_loader, start=1):
        # Gradients accumulate across backward() calls, so they must be
        # cleared before computing the gradients of this batch.
        optimizer.zero_grad()

        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if step % 50 != 0:
            continue
        # Every 50 mini-batches: report the mean loss, then restart the tally.
        print('[%d, %5d] loss: %.3f' %
              (epoch + 1, step, running_loss / 50))
        running_loss = 0.0

print("Done training")

[1,    50] loss: 0.063
[2,    50] loss: 0.033
[3,    50] loss: 0.020
[4,    50] loss: 0.016
[5,    50] loss: 0.012
[6,    50] loss: 0.011
[7,    50] loss: 0.010
[8,    50] loss: 0.009
[9,    50] loss: 0.008
[10,    50] loss: 0.008
[11,    50] loss: 0.007
[12,    50] loss: 0.007
[13,    50] loss: 0.007
[14,    50] loss: 0.006
[15,    50] loss: 0.006
[16,    50] loss: 0.006
[17,    50] loss: 0.006
[18,    50] loss: 0.006
[19,    50] loss: 0.006
[20,    50] loss: 0.005
[21,    50] loss: 0.005
[22,    50] loss: 0.005
[23,    50] loss: 0.005
[24,    50] loss: 0.005
[25,    50] loss: 0.005
[26,    50] loss: 0.005
[27,    50] loss: 0.005
[28,    50] loss: 0.005
[29,    50] loss: 0.005
[30,    50] loss: 0.004
[31,    50] loss: 0.004
[32,    50] loss: 0.005
[33,    50] loss: 0.004
[34,    50] loss: 0.004
[35,    50] loss: 0.004
[36,    50] loss: 0.004
[37,    50] loss: 0.004
[38,    50] loss: 0.004
[39,    50] loss: 0.004
[40,    50] loss: 0.004
[41,    50] loss: 0.004
[42,    50] loss: 0.004
[

In [0]:
def _load_spectrogram(path):
    """Load the audio file at ``path`` and compute its STFT.

    Uses the same settings as the training data: no resampling (sr=None),
    n_fft=1024, hop_length=512.

    Returns:
        Tuple ``(waveform, sample_rate, stft, magnitude)`` where ``magnitude``
        is ``np.abs(stft)``.
    """
    waveform, sample_rate = librosa.load(path, sr=None)
    spectrum = librosa.stft(waveform, n_fft=1024, hop_length=512)
    return waveform, sample_rate, spectrum, np.abs(spectrum)


t, tr, T_one, T_one_mod = _load_spectrogram(test_x_01_path)
t2, tr2, T_two, T_two_mod = _load_spectrogram(test_x_02_path)

def get_cleaned_signal(signal, output):
    """Combine the phase of ``signal`` with the magnitudes in ``output``.

    Computes ``signal / |signal| * output`` element-wise. Where ``signal`` is
    exactly zero its phase is undefined; a unit phasor (angle 0) is used there
    so the result stays finite instead of becoming NaN.

    Args:
        signal: Complex NumPy array (e.g. an STFT matrix); only its phase is
            used.
        output: Magnitudes to apply, broadcastable against ``signal``. Either
            an object exposing ``.numpy()`` (such as a CPU torch tensor that
            does not require grad) or anything ``np.asarray`` accepts.

    Returns:
        NumPy array holding the phase of ``signal`` scaled by ``output``.
    """
    signal = np.asarray(signal)
    magnitudes = output.numpy() if hasattr(output, "numpy") else np.asarray(output)

    signal_magnitude = np.abs(signal)
    silent = signal_magnitude == 0
    # Divide by 1 instead of 0 at silent bins to avoid 0/0 -> NaN, then give
    # those bins a unit phasor so the predicted magnitude is kept.
    phase = signal / np.where(silent, 1, signal_magnitude)
    phase = np.where(silent, 1, phase)
    return np.multiply(phase, magnitudes)
    
def _predict_magnitudes(model, magnitudes, batch_size=50):
    """Run ``model`` over a magnitude spectrogram and return its predictions.

    Args:
        model: Network called on each batch of rows of ``magnitudes.transpose()``.
        magnitudes: 2-D NumPy array such as ``np.abs(stft)``; it is transposed
            so that the model is fed one column of the original array per
            sample.
        batch_size: Number of samples per forward pass.

    Returns:
        Tensor with the model outputs for all samples, concatenated in their
        original order (shuffling is disabled).
    """
    frames = magnitudes.transpose()
    # AudioDataset requires a label per sample; there is none at inference
    # time, so the inputs are passed again as placeholders and then ignored.
    loader = DataLoader(AudioDataset(frames, frames), batch_size=batch_size,
                        shuffle=False, num_workers=2)
    # No gradients are needed for inference.
    with torch.no_grad():
        return torch.cat([model(batch) for batch, _ in loader])


# First test file: predict magnitudes, re-attach the noisy phase, invert the STFT.
ops = _predict_magnitudes(net, T_one_mod)
clean_one = get_cleaned_signal(T_one, ops.t())
op_istft = librosa.core.istft(clean_one, hop_length=512)

# Second test file: identical pipeline.
ops2 = _predict_magnitudes(net, T_two_mod)
clean_one2 = get_cleaned_signal(T_two, ops2.t())
op_istft2 = librosa.core.istft(clean_one2, hop_length=512)


### Test input audio and synthesized audio for the first test file

In [9]:
display.display(
    # noisy input, played straight from the test file
    display.Audio(test_x_01_path,rate=tr),
    # synthesized (denoised) audio rebuilt from the network output
    display.Audio(data=op_istft,rate=tr),
)

### Test input audio and synthesized audio for the second test file

In [8]:
# tr2 is the sample rate returned when the second test file was loaded; use it
# here rather than tr, which belongs to the first test file.
#this is the input file
display.display(display.Audio(data=test_x_02_path,rate=tr2))
#this is the synthesized file
display.display(display.Audio(data=op_istft2,rate=tr2))


We can hear that the chip-eating sound is reduced in the synthesized audio for both test files. In the second file, the synthesized audio contains noticeably more residual noise than in the first. In my view, this is because the network was trained to remove the chip-eating sound using only a single audio sample, so it does not generalize to all voices.