In [9]:
!pip install librosa

[33mYou are using pip version 18.0, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [1]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy
import os
import torch
from warpctc_pytorch import CTCLoss

In [2]:
#Functions for file manipulation
def load_samples(file_path):
    ys, srs = [[]],[[]]
    i = 0
    #loads .wav files
    for filename in os.listdir(file_path):
        if filename.endswith(".wav"):
            y, sr = librosa.load(path+filename, sr=16000)
            ys[i].append(y)
            srs[i].append(sr)
            i = i + 1
            ys.append([])
            srs.append([])  
    ys = ys[0: len(ys) - 1]
    srs = srs[0: len(srs) - 1]
    return (ys, srs)
def load_labels(file_path):
    i = 0
    labels = [[]]
    
    for filename in os.listdir(file_path):
        if filename.endswith(".txt"):
            file = open(file_path+filename, "r") 
            labels[i].append(file.read())
            labels.append([])
            i = i + 1
            
    labels=labels[0: len(labels) - 1]
    return labels

In [3]:
#Functions for feature extraction
def find_max(ys):
    maximum =0
    for y1 in ys:
        for y2 in y1:
            dim = y2.shape[0]
            if dim > maximum:
                maximum = dim
    return maximum

def pad_signal(ys):
    max_length = find_max(ys)
    
    ys_new = ys
    for i, y1 in enumerate(ys_new):
        for j, y2 in enumerate(y1):
            if len(y2) < max_length:
                z = numpy.zeros((max_length - len(y2)))
                pad_signal = numpy.append(y2, z) # Pad Signal to make sure that all frames have equal number of samples without truncating any samples from the original signal
                ys_new[i][j] = pad_signal
    return ys_new
def pre_emphasize(ys, pre_emphasis):
    for i, y in enumerate(ys):
        signal=y[0]
        y[0] = numpy.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])
    return ys
def fourier_transform(ys, N_FFT=512, window='hamming', hop_size=256):

    Ds = [[]]

    for i, y in enumerate(ys):
        Ds[i].append(librosa.core.stft(y=y[0], n_fft=N_FFT, window=window, hop_length=hop_size))
        Ds.append([])
    return Ds

def mfccs(ys, N_FFT=512, sr=16000, n_mfcc=40, hop_size=256):
    mels = [[]]
    
    for i, y in enumerate(ys):
        mels[i].append(librosa.feature.mfcc(y[0], sr=sr, n_mfcc=n_mfcc, n_fft=N_FFT, hop_length=hop_size))
        mels.append([])
    return mels

# Step 1
* we open the samples and labels

In [4]:
path = "an4_dataset/train/"#path to the dataset

ys, srs = load_samples(path)
labels = load_labels(path)

# Step 2
* loaded data is preprocessed
* we perform stft using librosa
* then we add melspectrogram
* and MFCCs, all are prepared, but we can use each of them, so we get different kinds of features

**Note**: For stft we have a window size, typically 512 or 256 and hop size. On each iteration we start at 
$$
n_1 = N_f x H
$$
and we finish at
$$
n_2 = n_1 + M - 1
$$

https://dsp.stackexchange.com/questions/38491/time-position-in-stft-output

H is a hop size (length) and M is a window size.

In [5]:
ys_new = pad_signal(ys)

ys_emphasized=pre_emphasize(ys_new, 0.97)


In [6]:
N_FFT = 400 #window size
window = 'hamming'
hop_size = 160
N_MFCC = 40

Ds=fourier_transform(ys=ys_emphasized, N_FFT=N_FFT, window=window, hop_size=hop_size)
Ms=mfccs(ys=ys_emphasized, n_mfcc=N_MFCC, N_FFT=N_FFT, hop_size=hop_size)

In [7]:
import torch
from torch.autograd import Variable
from torch.nn.parameter import Parameter
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.init as init

from torch.nn import Linear, Conv2d, BatchNorm2d, MaxPool2d, Dropout2d
from torch.nn.functional import relu, elu, relu6, sigmoid, tanh, softmax

In [8]:
# hyperameters of the model
num_classes = 27
channels = 1
height = 1
width = 40
num_filters_conv1 = 16
kernel_size_conv1 = 1 # [height, width]
stride_conv1 = 1 # [stride_height, stride_width]
kernel_size_pool1 = 1
stride_pool1 = 1
num_l1 = 100
padding_conv1 = 0
   
def compute_conv_dim(dim_size):
    return int((dim_size - kernel_size_conv1 + 2 * padding_conv1) / stride_conv1 + 1)

def compute_maxPool_dim(dim_size):
    return int((dim_size - kernel_size_pool1 + 2 * padding_conv1) / stride_pool1 + 1)

# define network
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        #out_dim = (input_dim - filter_dim + 2 * padding) / stride + 1
        self.conv_1 = Conv2d(in_channels=channels,
                             out_channels=num_filters_conv1,
                             kernel_size=kernel_size_conv1,
                             stride=stride_conv1)
        
       # self.maxPool_1 = MaxPool2d(2, stride=2)
        
        self.conv_out_height = compute_conv_dim(height)
        self.conv_out_width = compute_conv_dim(width)
      #  self.conv_out_height = compute_maxPool_dim(self.conv_out_height)
      #  self.conv_out_width = compute_maxPool_dim(self.conv_out_width)
        
        # add dropout to network
        #self.dropout = Dropout2d(p=0.5)
        self.l1_in_features = num_filters_conv1 * self.conv_out_height * self.conv_out_width
        #self.l1_in_features = channels * height * width
        
        self.l_1 = Linear(in_features=self.l1_in_features, 
                          out_features=num_l1,
                          bias=True)
        self.l_out = Linear(in_features=num_l1, 
                            out_features=num_classes,
                            bias=False)
    
    def forward(self, x): # x.size() = [batch, channel, height, width]
        x = relu(self.conv_1(x))
        #x = self.maxPool_1(x)
        # torch.Tensor.view: http://pytorch.org/docs/master/tensors.html?highlight=view#torch.Tensor.view
        #   Returns a new tensor with the same data as the self tensor,
        #   but of a different size.
        # the size -1 is inferred from other dimensions 
        x = x.view(-1, self.l1_in_features)
        #x = self.dropout(relu(self.l_1(x)))
        x = relu(self.l_1(x))
        return softmax(self.l_out(x), dim=1)


net = Net()
print(net)

Net(
  (conv_1): Conv2d(1, 16, kernel_size=(1, 1), stride=(1, 1))
  (l_1): Linear(in_features=640, out_features=100, bias=True)
  (l_out): Linear(in_features=100, out_features=27, bias=False)
)


In [9]:
criterion = CTCLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)


In [10]:
print(len(Ms))
print(len(Ms[0]))
print(len(Ms[0][0].T))

949
1
641


In [11]:
batch = []

index = 0
print(len(Ms[0][0].T))
    
for index in Ms[0][0].T:
    x = numpy.array([[index]])
    batch.append(x)
         
batch = numpy.stack(batch, axis=0)
print(batch.shape)

641
(641, 1, 1, 40)


In [12]:
out = net(Variable(torch.from_numpy(batch).float()))
string = ''.join(labels[0])
string = string.strip()
string = string.replace(" ", "")

output = []
for character in string:
    number = ord(character) - 65
    output.append(number)
print output

labels_crit = torch.IntTensor(output)

label_sizes = torch.IntTensor([len(labels_crit)])
probs_sizes = torch.IntTensor([641])

arr = out.detach().numpy()
arr2 = numpy.empty([1, arr.shape[0], arr.shape[1]])
arr2[0]=arr
print(arr2)

probs = torch.FloatTensor(arr2).transpose(0, 1).contiguous()
probs.requires_grad_(True)

[18, 4, 21, 4, 13, 5, 14, 20, 17, 14, 13, 4, 14, 7, 5, 8, 21, 4, 13, 8, 13, 4, 19, 22, 14]
[[[1.18548087e-19 3.69547226e-10 9.28313237e-22 ... 1.17465248e-15
   1.58453426e-19 1.30159242e-21]
  [7.10899956e-19 1.12313636e-09 3.02870404e-21 ... 8.61047213e-16
   9.73347046e-19 9.31324186e-21]
  [7.24357978e-19 4.76523321e-10 3.04038676e-21 ... 9.67024482e-16
   9.73151211e-19 1.14767958e-20]
  ...
  [1.17063550e-23 5.91776845e-12 6.33478341e-27 ... 3.21320248e-20
   1.93243746e-24 5.45376583e-26]
  [1.17063550e-23 5.91776845e-12 6.33478341e-27 ... 3.21320248e-20
   1.93243746e-24 5.45376583e-26]
  [1.17063550e-23 5.91776845e-12 6.33480729e-27 ... 3.21320248e-20
   1.93243746e-24 5.45376583e-26]]]


tensor([[[1.1855e-19, 3.6955e-10, 9.2831e-22,  ..., 1.1747e-15,
          1.5845e-19, 1.3016e-21]],

        [[7.1090e-19, 1.1231e-09, 3.0287e-21,  ..., 8.6105e-16,
          9.7335e-19, 9.3132e-21]],

        [[7.2436e-19, 4.7652e-10, 3.0404e-21,  ..., 9.6702e-16,
          9.7315e-19, 1.1477e-20]],

        ...,

        [[1.1706e-23, 5.9178e-12, 6.3348e-27,  ..., 3.2132e-20,
          1.9324e-24, 5.4538e-26]],

        [[1.1706e-23, 5.9178e-12, 6.3348e-27,  ..., 3.2132e-20,
          1.9324e-24, 5.4538e-26]],

        [[1.1706e-23, 5.9178e-12, 6.3348e-27,  ..., 3.2132e-20,
          1.9324e-24, 5.4538e-26]]], requires_grad=True)

In [13]:
cost=criterion(probs, labels_crit, probs_sizes, label_sizes)

In [14]:
cost.backward()

In [15]:
print(cost)

tensor([1977.4662], grad_fn=<_CTCBackward>)


In [18]:
i = 0
num_epochs = 5

j = 0

while i < num_epochs:
    j=0
    
    net.train()
    while j < 25:
        batch = []

        index = 0
        #print(len(Ms[j][0].T))
    
        for index in Ms[j][0].T:
            x = numpy.array([[index]])
            batch.append(x)
         
        batch = numpy.stack(batch, axis=0)
        #print(batch.shape)
        
        out = net(Variable(torch.from_numpy(batch).float()))
        string = ''.join(labels[j])
        string = string.strip()
        string = string.replace(" ", "")
        print(string)
        output = []
        for character in string:
            number = ord(character) - 65
            output.append(number)
        #print output

        labels_crit = torch.IntTensor(output)

        label_sizes = torch.IntTensor([len(labels_crit)])
        probs_sizes = torch.IntTensor([641])

        arr = out.detach().numpy()
        arr2 = numpy.empty([1, arr.shape[0], arr.shape[1]])
        arr2[0]=arr
        #print(arr2)

        probs = torch.FloatTensor(arr2).transpose(0, 1).contiguous()
        probs.requires_grad_(True)
        #labels_crit.requires_grad_(True)
        
        cost=criterion(probs, labels_crit, probs_sizes, label_sizes)
        optimizer.zero_grad()
        cost.backward()
        print("Cost: ")
        print(cost)
        print(i)
        print(j)
        j = j + 1
        optimizer.step()
    i = i + 1

SEVENFOURONEOHFIVENINETWO
Cost: 
tensor([1977.4662], grad_fn=<_CTCBackward>)
0
0
VERKETHIRTYFIVETHIRTY
Cost: 
tensor([1998.4708], grad_fn=<_CTCBackward>)
0
1
ENTERSIXTYONE
Cost: 
tensor([2045.3849], grad_fn=<_CTCBackward>)
0
2
SIXTHREEEIGHT
Cost: 
tensor([2045.4657], grad_fn=<_CTCBackward>)
0
3
SUNNYVALE
Cost: 
tensor([2072.2852], grad_fn=<_CTCBackward>)
0
4
BRONXVILLE
Cost: 
tensor([2065.2715], grad_fn=<_CTCBackward>)
0
5
TWELVETWENTYONEFIFTYEIGHT
Cost: 
tensor([1977.4788], grad_fn=<_CTCBackward>)
0
6
ONEFIVETWOONESEVEN
Cost: 
tensor([2015.2604], grad_fn=<_CTCBackward>)
0
7
FOURTWOONEOHONENINESIX
Cost: 
tensor([1993.1598], grad_fn=<_CTCBackward>)
0
8
TWOFOURTHREEEIGHTTHREETWOEIGHT
Cost: 
tensor([1953.3746], grad_fn=<_CTCBackward>)
0
9
STEVEN
Cost: 
tensor([2094.5964], grad_fn=<_CTCBackward>)
0
10
ENTERFIVE
Cost: 
tensor([2072.2295], grad_fn=<_CTCBackward>)
0
11
RUBOUTFXCWNFOUR
Cost: 
tensor([2032.9286], grad_fn=<_CTCBackward>)
0
12
ATTGKEIGHTYFOUR
Cost: 
tensor([2032.9745], grad_fn=<_