In [97]:
from __future__ import unicode_literals
import matplotlib.pyplot as plt
from scipy.io import wavfile
import numpy as np
import wave
import sys
import librosa
import librosa.display
import os
import torch.nn as nn
import torch.nn.functional as F
import torch
import torchvision
import torchvision.transforms as transforms
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

### Obtaining Data

In [111]:
mfccs = []
deltas = []
delta_deltas = []
label = []
X = []
path_hyena = 'data/processed data/hyenas/'
path_lion = 'data/processed data/lions/'
min_shape = 1000

for file in os.listdir(path_hyena):
    hyena = np.load(path_hyena + file)
    if min_shape > hyena[0].shape[1]:
        min_shape = hyena[0].shape[1]
    mfcc = hyena[0][:, :300]
    delta = hyena[1][:, :300]
    delta_delta = hyena[2][:, :300]
    mfccs.append(mfcc)
    deltas.append(delta)
    delta_deltas.append(delta_delta)
    X.append([mfcc, delta, delta_delta])
    label.append(0)
    
for file in os.listdir(path_lion):
    lion = np.load(path_lion + file)
    if min_shape > lion[0].shape[1]:
        min_shape = lion[0].shape[1]
    mfcc = lion[0][:, :300]
    delta = lion[1][:, :300]
    delta_delta = lion[2][:, :300]
    mfccs.append(mfcc)
    deltas.append(delta)
    delta_deltas.append(delta_delta)
    X.append([mfcc, delta, delta_delta])
    label.append(1)

X = np.asarray(X)
print('Minimum length all of our tensors is: %i' % (min_shape))

Minimum length all of our tensors is: 300


So now we have obtained the following for our input data:
1. **mfccs[i]**: the mel-frequency cepstrum coefficients for a single audio file indexed at *i*
2. **deltas[i]**: the first derivative of the mfccs 
3. **delta_deltas[i]**: the second derivative of the mfccs

and our associated label:
1. **label[i]**: the audio file indexed at *i* will be 1 if it is a lion and 0 if it is a hyena

Now splitting our data and randomizing it:

In [112]:
np.asarray(X)

array([[[[-6.54673035e+02, -6.54822021e+02, -6.55042603e+02, ...,
          -4.78476074e+02, -4.72487579e+02, -4.74044006e+02],
         [ 9.02260780e+00,  8.81174850e+00,  8.43821716e+00, ...,
           1.55321533e+02,  1.60563843e+02,  1.50154709e+02],
         [ 8.97599697e+00,  8.76464844e+00,  8.25950050e+00, ...,
           3.65668564e+01,  3.84954224e+01,  3.07052498e+01],
         ...,
         [ 7.86868668e+00,  7.63117075e+00,  7.64832592e+00, ...,
          -2.87141562e+00, -9.56643867e+00, -1.58929396e+00],
         [ 7.70555782e+00,  7.46252918e+00,  7.55868530e+00, ...,
           7.62727213e+00,  1.39502888e+01,  3.31496596e-01],
         [ 7.55107927e+00,  7.30320263e+00,  7.59654760e+00, ...,
          -2.63591266e+00,  4.34623766e+00, -7.76382828e+00]],

        [[ 2.01921959e+01,  2.01921959e+01,  2.01921959e+01, ...,
           2.85432887e+00,  2.85432887e+00,  2.85432887e+00],
         [ 2.13666630e+01,  2.13666630e+01,  2.13666630e+01, ...,
           5.67944586e

In [114]:
split_test = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state=0)
split_val = split_1 = StratifiedShuffleSplit(n_splits = 1, test_size = 0.3, random_state=0)



for train_index, test_index in split_test.split(X, label):
 
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = label[train_index], label[test_index]
        

for train_index, test_index in split_val.split(X_train, y_train):
    X_train, X_val = X_train[train_index], X_train[test_index]
    y_train, y_val = y_train[train_index], y_train[test_index]
    
# printing out number of classes in each
print('Training has %d of class 0 and %d of class 1' % (np.unique(y_train, return_counts=True)[1][0], np.unique(y_train, return_counts=True)[1][1]))
print('Validation has %d of class 0 and %d of class 1' % (np.unique(y_val, return_counts=True)[1][0], np.unique(y_val, return_counts=True)[1][1]))
print('Testing has %d of class 0 and %d of class 1' % (np.unique(y_test, return_counts=True)[1][0], np.unique(y_test, return_counts=True)[1][1]))

TypeError: only integer scalar arrays can be converted to a scalar index

In [91]:
X_train[0][2].size()

torch.Size([13, 300])

### Network

In [None]:
class Net(nn.Module):
    def __init__(self, mfcc_total):
        super(Net, self).__init__()
        self.conv_mfcc = nn.Conv2d(in_channels = 1, out_channels = 1, kernel_size = (mfcc_total,1), stride = 1)
        self.conv_delta = nn.Conv2d(in_channels = 1, out_channels = 1, kernel_size = (mfcc_total,1), stride = 1)
        self.conv_delta_delta = nn.Conv2d(in_channels = 1, out_channels = 1, kernel_size = (mfcc_total,1), stride = 1)
        
        ###### DECLARE NETWORK COMPONENTS HERE ######
        
        
    def forward(self, mfcc, delta, delta_delta):
        mfcc_features = self.conv_mfcc(mfcc)
        delta_features = self.conv_delta(delta)
        delta_delta_features = self.conv_delta_delta(delta_delta)
        features = torch.cat((mfcc_features, delta_features, delta_delta_features), 2)
        ###### REST OF THE FORWARD FUNCTION HERE ######
        
        
        return 0
    
def load_model(lr, seed, mfcc_total):
    torch.manual_seed(seed)
    model = Net(mfcc_total)
    
    ###### DECLARE LOSS FUNCTION & OPTIMIZER ######
    #loss_function = 
    #optimizer = 
    
    return model#, loss_function, optimizer

### Training

In [None]:
###### SET HYPERPARAMETERS HERE ######
lr = 0.1
seed = 42
mfcc_total = 13
epochs = 1

In [None]:
def main_test():
    torch.manual_seed(seed)
    model = load_model(lr, seed, mfcc_total)
    for epoch in range(epochs):
        for i in range(len(mfccs)):
            mfcc = mfccs[i][1: mfcc_total + 1][:, :65].unsqueeze(0).unsqueeze(0)
            delta = deltas[i][1: mfcc_total + 1][:, :65].unsqueeze(0).unsqueeze(0)
            delta_delta = delta_deltas[i][1: mfcc_total + 1][:, :65].unsqueeze(0).unsqueeze(0)
            model(mfcc, delta, delta_delta)
            ###### FILL THIS OUT ######
            
        
main_test()