In [1]:
import os
import torchaudio
import torch
from torchmetrics import SignalNoiseRatio
import matplotlib.pyplot as plt
from torch.nn import Module, Linear, Sigmoid, LSTM, BCELoss, MSELoss, Conv1d, Conv2d, MaxPool2d, Transformer, LayerNorm, PReLU
from torch.optim import Adam
import torch.nn.functional as F
from pytorch_model_summary import summary
from tqdm import tqdm
import numpy as np
import random
import speechbrain as sb
from speechbrain.nnet.losses import get_si_snr_with_pitwrapper
import pickle



In [29]:
import DataLoader

X,Y,speech,noise,mix = DataLoader.data_loader()

In [8]:
# MASK NET
NUMBER_OF_SPEAKERS = 2
HIDDEN_SIZE=1024 # 1024 (128 is too litte, just learns all 0 or 1)
SAMPLE_RATE = 16000

class TransformerMaskNet(Module):
    def __init__(self,noise=False):
        super(TransformerMaskNet, self).__init__()
        # ENCODER subnet
        self.tdnn = Conv1d(in_channels=1,out_channels=256,kernel_size=16,stride=8,padding=6)

        self.lnorm = LayerNorm(normalized_shape=(256,6250))
        self.prelu = PReLU()

        self.tf1 = Transformer(d_model = 256, nhead=8, dim_feedforward=1024)
        self.tf2 = Transformer(d_model = 256, nhead=8, dim_feedforward=1024)

        # self.fc = Linear(in_features=HIDDEN_SIZE*2 ,out_features=1024)
        # self.fc2 = Linear(in_features=1024 ,out_features=1024)
        # self.fc3 = Linear(in_features=1024 ,out_features=1)
        self.sigmoid = Sigmoid()

    def forward(self,x):
        # TDNN Encoder 1x5000 -> 256x6249
        x = self.tdnn(x)

        # NORMALIZATION and Overlapping
        x = self.lnorm(x)
        x = x.unfold(dimension=1, step=125, size=250).reshape(256,250,49)# Chunking and 50% Overlap
        print(x.shape)

        # SEPFORMER Block
        y = self.tf1(x)
        y = self.tf2(y)
        print(x.shape)
        x = y + x # Residual connection

        # PRELU and Linear
        x = self.prelu(x)

        x = F.relu(x)
        print(x.shape)

        speech_pred = self.sigmoid(x)
        return speech_pred

    

print(summary(TransformerMaskNet(),torch.zeros((1, 50000))))

torch.Size([256, 250, 49])


TypeError: forward() missing 1 required positional argument: 'tgt'

In [71]:
# GOAL NO OVERLAP: 256,250,25
# With OVERLAP: 256, 250, 49

x  = torch.zeros(256,6250)
def chunking(x):
    x = x.unfold(dimension=1, step=125, size=250).reshape(256,250,49)
    print(x.shape)
    #x = torch.stack(torch.tensor_split(input=x, sections=250, dim=1))
    #print(x.shape)
    #x = x.unfold(dimension=2,step=)
    return x
    # for filter in x:
    #     return torch.tensor_split(input=filter, sections=250,dim=-1)
    #     break

x = chunking(x)

torch.Size([256, 250, 49])


In [58]:
x.shape

torch.Size([1, 256, 25, 250])

In [45]:
torch.stack(x).shape

RuntimeError: stack expects each tensor to be equal size, but got [2, 6250] at entry 0 and [1, 6250] at entry 6

In [None]:
EPOCHS = 10
BATCH_SIZE = 1
REFERENCE_CHANNEL = 0
INIT_LR = 15e**(−5)
PICKLE_SAVE_PATH = '/project/data_asr/CHiME5/data/librenoise/models/params.pkl'
MODEL_SAVE_PATH = '/project/data_asr/CHiME5/data/librenoise/models/TF'

CUDA = True # if torch.cuda.is_available()
device =  torch.device("cuda") if torch.cuda.is_available() else torch.device('cpu')
print("Mounted on:", device)

lossBCE = BCELoss().to(device)

model = TransformerMaskNet().to(device)
model= torch.nn.DataParallel(model,device_ids=[0])
opt = Adam(model.parameters(), lr=INIT_LR)

H = {
    "train_loss":[],
    "train_acc":[],
    "val_loss":[],
    "val_acc":[]
}

def check_accuracy_training(speech_pred, y_s):
    speech_pred = (speech_pred>0.15).float()
    return float(torch.sum((speech_pred == y_s).float())/torch.sum(torch.ones(513,speech_pred.shape[1])))

def check_accuracy_validation(model):
    example_nr = int(np.random.random()*(len(speech)-len(trainX))+len(trainX))
    model.eval()
    pred = model(X[example_nr]).reshape(1,513,-1)
    val_loss = lossBCE(pred,Y[example_nr][0].unsqueeze(0))
    pred = (pred>0.15).float()
    model.train()
    return float(torch.sum((pred == Y[example_nr][0]).float())/torch.sum(torch.ones(513,X[example_nr].shape[2])).to(device)),val_loss

print("[INFO] training the network...")

for epoch in range(0, EPOCHS):
    print("Epoch:",str(epoch+1)+"/"+str(EPOCHS))
    # Train Mode
    model.train()
    
    # Initialize
    totalTrainLoss = 0
    totalValLoss = 0
    trainCorrect = 0
    valCorrect = 0

    X = X.to(device)
    Y = Y.to(device)
    trainX = X[:2000]
    trainY = Y
    for i in tqdm(range(0,len(trainX))): # Iterate over Training Examples
        (x, y) = (trainX[i],trainY[i][0].unsqueeze(0))
        speech_pred=model(x)
        loss = lossBCE(speech_pred,y)
        # zero out the gradients, perform the backpropagation step, and update the weights
        opt.zero_grad()
        loss.backward()
        opt.step()
        
        #H["train_acc"].append(check_accuracy_training(speech_pred,y))
        H["train_acc"].append(check_accuracy_training(speech_pred,y))
        H["train_loss"].append(float(loss))
        if i % 10 == 0:
            val_acc, val_loss = check_accuracy_validation(model)
            H["val_acc"].append(val_acc)
            H["val_loss"].append(float(val_loss))
        if i % 100 == 0:
            if i == 0:
                continue
            print("Average Training Accuracy at Iteration",str(i),":",np.mean(np.array(H["train_acc"])))
            print("Total Training Loss at Iteration",str(i),":",np.sum(np.array(H["train_loss"])))
            print("Average Validation Accuracy at Iteration",str(i),":",np.mean(np.array(H["val_acc"])))
            print("Total Validation Loss at Iteration",str(i),":",np.sum(np.array(H["val_loss"])))
    # Save
    torch.save(model.state_dict(), MODEL_SAVE_PATH + "epoch"+ str(epoch+1) + ".pt")

torch.save(model.state_dict(), MODEL_SAVE_PATH + "final" + ".pt")
with open(PICKLE_SAVE_PATH, 'wb') as f:
    pickle.dump(H, f)