### Authors

Anders Melander, s144277 <br>
Sophie Silberbrandt, s144246 <br>
Emil Strøm, s144259 <br>

In collaboration with Corti.ai. https://corti.ai

02456 - Deep Learning - DTU Compute Autumn 2018


### Disclaimer

Many of the following commands were written for and run on a Linux machine; adjust them as needed to work on your system.

### Getting the data
The data can be downloaded and extracted automatically using the cell below.

If you need to do it manually, download the data from https://drive.google.com/uc?id=1Hnlqn48mGNfEbSOFaRcTGkUOsU0SHL5V and extract it to `an4_dataset/` in your working directory.


In [None]:
# DOWNLOAD DATA
import os

downLoadAutomatically = True

if downloadAutomatically:
    if not (os.path.isdir("zippedData/") and os.path.isdir("an4_dataset/")):
        !mkdir zippedData/
        !wget 'https://drive.google.com/uc?id=1Hnlqn48mGNfEbSOFaRcTGkUOsU0SHL5V' \
        -O zippedData/an4_dataset.zip
        !unzip -q zippedData/an4_dataset.zip
        print("-------------DATA UNZIPPED TO an4_dataset/-------------")
    else:
        print("DATA ALREADY DOWNLOADED!")

### Getting the needed modules.
The modules required for the code to run are:<br>
- numpy
- torch
- librosa

Uncomment the lines as needed.

In [None]:
#!pip install numpy
#!pip install torch
#!pip install librosa

### PyTorch bindings for Warp-ctc
The loss function used is the warp-CTC implementation done by Sean Naren, see GitHub: <br>
https://github.com/SeanNaren/warp-ctc 


The commands below will install warp-ctc on Linux.
To install, set `build_warpCTC = True` and run the cell. After the installation, set it back to `False` and restart the kernel so that Python recognizes the installation.

In [None]:
# One-time build switch: set to True to clone, compile and install warp-ctc,
# then set it back to False and restart the kernel.
build_warpCTC = False
if build_warpCTC:
    # Fetch the warp-ctc sources into ./warp-ctc.
    !git clone https://github.com/SeanNaren/warp-ctc.git
    # Configure and compile the native library in warp-ctc/build.
    !cd warp-ctc; mkdir build; cd build; cmake ..; make
    # Install the PyTorch binding from the cloned repository.
    !cd warp-ctc/pytorch_binding; python setup.py install

In [None]:
# Imports
import torch
import librosa
import numpy as np
from torch import nn
from torch.nn import functional as F
from warpctc_pytorch import CTCLoss

In [None]:
def wav2spectrogram(path_to_file, sampling_ms=25, hop_ms=10, num_mels=13):
    """Load an audio file and return its normalized mel spectrogram.

    The file is read at its native sampling rate (``sr=None``); the window and
    hop durations are converted from milliseconds to sample counts using that
    rate.

    Args:
        path_to_file: Path of the audio file, handed to ``librosa.load``.
        sampling_ms: Length of the analysis window (used as ``n_fft``) in
            milliseconds.
        hop_ms: Distance between successive frames in milliseconds.
        num_mels: Number of mel bands to compute.

    Returns:
        The float32 array produced by ``librosa.feature.melspectrogram``,
        shifted to zero mean and scaled to unit standard deviation. Both
        statistics are taken over the whole array, not per band.
    """
    y, sr = librosa.load(path_to_file, sr=None)
    n_fft = int((sampling_ms / 1000) * sr)
    hop_length = int((hop_ms / 1000) * sr)
    spectrogram = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=num_mels
    )
    spectrogram = spectrogram.astype(np.float32)

    mean, std = np.mean(spectrogram), np.std(spectrogram)
    spectrogram = spectrogram - mean
    # A constant spectrogram (e.g. digital silence) has std == 0; dividing by
    # it would fill the output with NaN, so only the mean is removed then.
    if std > 0:
        spectrogram = spectrogram / std

    return spectrogram


def wav2npy(path_to_data, num_mels):
    """Convert every ``.wav`` file in a directory to a ``.npy`` spectrogram.

    Each spectrogram is written next to its source file. The output name is
    the part of the file name before its first dot plus ``.npy``.

    Args:
        path_to_data: Directory holding the ``.wav`` files. A trailing slash
            is optional.
        num_mels: Number of mel bands, forwarded to ``wav2spectrogram``.
    """
    for f in os.listdir(path_to_data):
        # Match on the extension only; a substring test would also pick up
        # names such as "take.wav.bak".
        if not f.endswith('.wav'):
            continue
        spec = wav2spectrogram(os.path.join(path_to_data, f), num_mels=num_mels)
        out_filename = os.path.join(path_to_data, f.split('.')[0] + '.npy')
        np.save(out_filename, spec)


def load_npy(path_to_data):
    """Load all (spectrogram, transcript) pairs stored in a directory.

    For every ``.txt`` transcript the matching spectrogram is read from the
    ``.npy`` file whose name is the part of the transcript's file name before
    its first dot.

    Args:
        path_to_data: Directory holding the ``.txt`` and ``.npy`` files. A
            trailing slash is optional.

    Returns:
        A tuple ``(data, num_mels)``. ``data`` is an object array of shape
        ``(n_samples, 2)`` whose rows are ``[spectrogram, transcript]``.
        ``num_mels`` is the length of the first spectrogram's first axis.

    Raises:
        ValueError: If the directory contains no ``.txt`` transcripts.
    """
    pairs = []
    for f in os.listdir(path_to_data):
        if not f.endswith('.txt'):
            continue
        stem = f.split('.')[0]
        with open(os.path.join(path_to_data, f), 'r') as transcript_file:
            txt = transcript_file.read()
        spec = np.load(os.path.join(path_to_data, stem + '.npy'))
        pairs.append((spec, txt))

    if not pairs:
        raise ValueError(f"no '.txt' transcripts found in {path_to_data!r}")

    # Fill an object array explicitly: the spectrograms differ in length, and
    # np.array() on such ragged input raises on current NumPy releases.
    data = np.empty((len(pairs), 2), dtype=object)
    for row, (spec, txt) in enumerate(pairs):
        data[row, 0] = spec
        data[row, 1] = txt

    num_mels = len(data[0][0])
    return data, num_mels


def get_batch(batch_data, max_time):
    """Assemble one zero-padded batch from (spectrogram, label) rows.

    Args:
        batch_data: Array of ``[spectrogram, label]`` rows. Every spectrogram
            has the same first-axis size and its frames along the second axis.
        max_time: Number of frames every spectrogram is zero-padded to.

    Returns:
        A tuple ``(padded, frame_counts, joined_labels, label_lengths)``:
        the float32 array of shape ``(n_rows, n_bands, max_time)``, each
        spectrogram's frame count halved (integer division), all labels
        concatenated into one string, and the length of each label.
    """
    n_rows = batch_data.shape[0]
    n_bands = batch_data[0][0].shape[0]
    padded = np.zeros(shape=(n_rows, n_bands, max_time), dtype=np.float32)

    frame_counts = []
    transcripts = []
    for row, (spec, text) in enumerate(batch_data):
        n_frames = spec.shape[1]
        # Copy the real frames; the tail of the row stays zero.
        padded[row, :, :n_frames] = spec
        frame_counts.append(n_frames // 2)
        transcripts.append(text)

    label_lengths = [len(text) for text in transcripts]
    return padded, frame_counts, "".join(transcripts), label_lengths

def WER(r, h):
    """Return the word error rate of hypothesis ``h`` against reference ``r``.

    Both strings are split on whitespace. The word-level edit distance
    (substitutions, insertions and deletions each cost 1) is divided by the
    number of reference words.

    Args:
        r: Reference transcript.
        h: Hypothesis (predicted) transcript.

    Returns:
        The word error rate as a float. If the reference has no words, the
        result is 0.0 when the hypothesis is empty too and ``inf`` otherwise.
    """
    ref_words = r.split()
    hyp_words = h.split()

    # Without reference words the ratio is undefined; avoid dividing by zero.
    if not ref_words:
        return 0.0 if not hyp_words else float('inf')

    # Two-row dynamic programme over plain Python ints. The previous uint8
    # table overflowed once a distance exceeded 255.
    prev_row = list(range(len(hyp_words) + 1))
    for i, ref_word in enumerate(ref_words, start=1):
        cur_row = [i]
        for j, hyp_word in enumerate(hyp_words, start=1):
            if ref_word == hyp_word:
                cur_row.append(prev_row[j - 1])
            else:
                substitution = prev_row[j - 1] + 1
                insertion = cur_row[j - 1] + 1
                deletion = prev_row[j] + 1
                cur_row.append(min(substitution, insertion, deletion))
        prev_row = cur_row

    return prev_row[-1] / len(ref_words)


def cleanPred(pred):
    """Collapse a raw per-frame prediction into its final text.

    Consecutive repeats of a symbol are merged into one, and every ``'#'``
    symbol is dropped. A ``'#'`` between two equal symbols keeps them as two
    separate characters.

    Args:
        pred: Sequence of predicted symbols, one per frame.

    Returns:
        The collapsed string.
    """
    kept = []
    previous = None
    for symbol in pred:
        repeated = symbol == previous
        previous = symbol
        if symbol == '#' or repeated:
            continue
        kept.append(symbol)
    return "".join(kept)


In [None]:
# Directories holding the training and validation utterances.
TRAIN_PATH = "an4_dataset/train/"
VALIDATION_PATH = "an4_dataset/validation/"
# Output symbols of the network. '#' is the symbol cleanPred removes from
# predictions; its index 0 presumably matches the CTC blank label -- TODO
# confirm against the warp-ctc default.
ALPHABET = '#ABCDEFGHIJKLMNOPQRSTUVWXYZ '


# Create the spectrograms
NUM_MELS = 13
# Number of frames every training batch is zero-padded to (passed to get_batch).
MAX_SEQ_LEN = 700
# Set to True to (re)generate the .npy spectrograms from the .wav files.
PREPROCESS = False
if PREPROCESS:
    wav2npy(TRAIN_PATH, NUM_MELS)
    wav2npy(VALIDATION_PATH, NUM_MELS)

# Load the stored spectrograms together with their transcripts.
train_data, num_mels = load_npy(TRAIN_PATH)
validation_data, _ = load_npy(VALIDATION_PATH)



In [None]:

class Net(nn.Module):
    """Fully convolutional network mapping spectrograms to per-frame symbol scores.

    The layers are grouped into a "red", a "blue" and a "gray" part. Every
    convolution except the last is followed by ReLU and dropout.
    """

    def __init__(self, alphabet, num_mels):
        """Build the layers.

        Args:
            alphabet: Sequence of output symbols; its length sets the number
                of output channels.
            num_mels: Number of input channels (mel bands per frame).
        """
        super().__init__()

        narrow = 250   # channels of the red layer and of blue layers 1-7
        wide = 2000    # channels of blue layer 8 and of the first gray layer
        drop_p = 0.5
        n_symbols = len(alphabet)

        # Red part: the only strided convolution (stride 2).
        self.convr1 = nn.Conv1d(num_mels, narrow, kernel_size=48, stride=2, padding=23)
        self.dropr1 = nn.Dropout(drop_p)

        # Blue part, layers 1-7: identical length-preserving convolutions.
        # Registered under the names convb1..convb7 / dropb1..dropb7.
        for idx in range(1, 8):
            self.add_module(f'convb{idx}', nn.Conv1d(narrow, narrow, kernel_size=7, padding=3))
            self.add_module(f'dropb{idx}', nn.Dropout(drop_p))

        # Blue part, layer 8: the even kernel width needs asymmetric padding
        # (15 left, 16 right) to keep the sequence length unchanged.
        self.padb8 = nn.ZeroPad2d((15, 16, 0, 0))
        self.convb8 = nn.Conv1d(narrow, wide, kernel_size=32)
        self.dropb8 = nn.Dropout(drop_p)

        # Gray part: two width-1 convolutions; the last one emits one score
        # per alphabet symbol.
        self.convg1 = nn.Conv1d(wide, wide, kernel_size=1)
        self.dropg1 = nn.Dropout(drop_p)

        self.convg2 = nn.Conv1d(wide, n_symbols, kernel_size=1)

    def forward(self, x):
        """Run the network on ``x`` and return the unnormalized symbol scores."""
        x = self.dropr1(F.relu(self.convr1(x)))

        for idx in range(1, 8):
            conv = getattr(self, f'convb{idx}')
            drop = getattr(self, f'dropb{idx}')
            x = drop(F.relu(conv(x)))

        x = self.dropb8(F.relu(self.convb8(self.padb8(x))))
        x = self.dropg1(F.relu(self.convg1(x)))
        return self.convg2(x)

    
# Instantiate the model, move it to the GPU (a CUDA device is required) and
# print its layer structure.
net = Net(ALPHABET, num_mels).cuda()
print(net)




# Adam optimizer over all model parameters.
LEARNING_RATE = 1e-4
optimizer = torch.optim.Adam(net.parameters(), lr=LEARNING_RATE)

In [None]:
# Training loop
#
# Each epoch shuffles the training set, runs one optimizer step per full batch
# and then reports the mean CTC loss and word error rate on the validation set.

BATCH_SIZE = 8
NUM_EPOCHS = 300

# Build the loss once instead of once per batch / validation sample.
ctc_loss = CTCLoss(size_average=True)

for epoch in range(NUM_EPOCHS):
    np.random.shuffle(train_data)
    training_loss = []
    net.train()
    # NOTE: a trailing partial batch (len(train_data) % BATCH_SIZE samples) is
    # skipped in every epoch.
    for batch in range(len(train_data) // BATCH_SIZE):
        optimizer.zero_grad()
        batch_data = train_data[batch * BATCH_SIZE: (batch + 1) * BATCH_SIZE]
        specs, n_timesteps, labels, labelLens = get_batch(batch_data, MAX_SEQ_LEN)

        specInput = torch.from_numpy(specs).cuda()
        # All labels of the batch as one flat sequence of alphabet indices.
        targets = torch.IntTensor(list(map(ALPHABET.index, labels)))
        input_lengths = torch.IntTensor(n_timesteps)
        label_lengths = torch.IntTensor(labelLens)

        # Permute (batch, symbols, time) -> (time, batch, symbols) for the CTC loss.
        logits = net(specInput).permute([2, 0, 1])
        loss = ctc_loss(logits, targets, input_lengths, label_lengths)

        loss.backward()
        optimizer.step()
        training_loss.append(loss.item())
    print(f'Epoch: {epoch + 1}, CTC loss: {np.mean(training_loss):1.4f}')

    # Validation checks CTC loss and WER on elements not in training set
    net.eval()
    valLoss = 0
    Val_WER = 0
    # No gradients are needed here; disabling autograd avoids building a graph
    # for every validation utterance.
    with torch.no_grad():
        for val in validation_data:
            spectrogram, transcript = val[0], val[1]

            specInput = torch.from_numpy(spectrogram).unsqueeze(0).float().cuda()
            target = torch.IntTensor(list(map(ALPHABET.index, transcript)))

            output = net(specInput)

            # Greedy decoding: best-scoring symbol of every frame in one call.
            best_symbols = output.argmax(dim=1)[0].tolist()
            outString = "".join(ALPHABET[idx] for idx in best_symbols)
            Val_WER = Val_WER + WER(transcript, cleanPred(outString))

            output = output.permute([2, 0, 1])
            input_length = torch.IntTensor([output.shape[0]])
            target_length = torch.IntTensor([len(transcript)])

            Loss = ctc_loss(output, target, input_length, target_length)
            valLoss += float(Loss.item())
    print(f'Epoch: {epoch + 1}, Validation loss: {valLoss/len(validation_data):1.4f}')
    print(f'Epoch: {epoch + 1}, WER: {Val_WER/len(validation_data):1.4f}')

In [None]:
# Print the true transcript, the raw and the cleaned prediction, and the WER
# for the first N validation samples.
N = 10

# Evaluation mode disables dropout; setting it once is enough.
net.eval()

# Inference only, so autograd is switched off.
with torch.no_grad():
    # Slicing (instead of indexing 0..N-1) copes with fewer than N samples.
    for data in validation_data[:N]:
        spec2 = torch.from_numpy(data[0]).unsqueeze(0).float().cuda()
        output = net(spec2)

        # Greedy decoding: best-scoring symbol of every frame.
        best_symbols = output.argmax(dim=1)[0].tolist()
        outString = "".join(ALPHABET[idx] for idx in best_symbols)

        print('True:',data[1])
        print('Pred:', outString)
        print('Clean:', cleanPred(outString))
        print('WER:', WER(data[1],cleanPred(outString)))
        print('----------------------------')