In [1]:
from __future__ import unicode_literals
import matplotlib.pyplot as plt
from scipy.io import wavfile
import numpy as np
import wave
import sys
import librosa
import librosa.display
import os
import torch.nn as nn
import torch.nn.functional as F
import torch
import torchvision
import torchvision.transforms as transforms
import numpy as np

In [2]:
def wav_to_mfcc(files):
    """Load one clip from ``test/`` and compute its MFCC-based features.

    Parameters
    ----------
    files : str
        Name of a single audio file, relative to the ``test/`` directory.

    Returns
    -------
    tuple
        ``(mfccs, delta, delta_delta)``: the MFCC matrix computed with
        librosa's default settings on the peak-normalized waveform, its
        first-order delta, and its second-order delta.
    """
    audio, sample_rate = librosa.load('test/' + files)
    audio = normalize_audio(audio)
    # Pass the waveform and rate by keyword: librosa >= 0.10 made these
    # arguments keyword-only, and older releases accept the keywords as well.
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate)
    delta = librosa.feature.delta(mfccs)
    delta_delta = librosa.feature.delta(mfccs, order=2)
    return mfccs, delta, delta_delta

def normalize_audio(audio):
    """Scale a waveform so that its largest absolute sample becomes 1.

    Parameters
    ----------
    audio : np.ndarray
        Waveform samples.

    Returns
    -------
    np.ndarray
        ``audio`` divided by its peak absolute value. Empty input and
        all-zero (silent) input are returned unchanged, because there is no
        peak to scale by and dividing by zero would fill the clip with NaN.
    """
    if np.size(audio) == 0:
        return audio
    peak = np.max(np.abs(audio))
    if peak == 0:
        return audio
    return audio / peak

def plot_mfcc(mfccs):
    """Draw a feature matrix as an image with a time axis and a colorbar.

    Parameters
    ----------
    mfccs : array-like
        Matrix handed to ``librosa.display.specshow``.

    Returns
    -------
    int
        Always 0.
    """
    # specshow returns the image it drew; hand that same image to the colorbar.
    image = librosa.display.specshow(mfccs, x_axis='time')
    plt.colorbar(image)
    plt.title('Normalized MFCC')
    plt.tight_layout()
    plt.show()
    plt.close()
    return 0

def plot_wav(files):
    """Plot the peak-normalized waveform of one clip from ``test/``.

    Parameters
    ----------
    files : str
        Name of a single audio file, relative to the ``test/`` directory.

    Returns
    -------
    int
        Always 0.
    """
    audio, sample_rate = librosa.load('test/' + files)
    audio = normalize_audio(audio)
    # Sample k is taken at k / sample_rate seconds. An endpoint-inclusive
    # linspace over [0, n / sample_rate] would stretch every timestamp by
    # n / (n - 1), so build the axis from the sample indices instead.
    times = np.arange(len(audio)) / sample_rate
    plt.figure(figsize=(15, 4))
    plt.plot(times, audio)
    plt.title(files)
    plt.xlabel('Time (s)')
    plt.ylabel('Normalized amplitude')
    plt.grid(True)
    plt.show()
    plt.close()
    return 0

In [3]:
# Build the dataset: one (mfcc, delta, delta_delta) tensor triple and one
# label per audio clip found in test/.
# Labels: 0 = hyenas    1 = lions
mfccs = []
deltas = []
delta_deltas = []
label = []
# os.listdir returns entries in arbitrary order; sorting makes index i refer
# to the same clip on every run and on every machine.
for file in sorted(os.listdir('test/')):
    # Skip sub-directories (e.g. a notebook checkpoint folder), which cannot
    # be loaded as audio.
    if not os.path.isfile(os.path.join('test/', file)):
        continue
    Mfcc, Delta, Delta_delta = wav_to_mfcc(file)
    mfccs.append(torch.from_numpy(Mfcc))
    deltas.append(torch.from_numpy(Delta))
    delta_deltas.append(torch.from_numpy(Delta_delta))
    # Any file whose name does not contain 'lion' is labelled as a hyena.
    label.append(1 if 'lion' in file else 0)

So now we have obtained the following for our input data:
1. **mfccs[i]**: the mel-frequency cepstral coefficients (MFCCs) of the audio file at index *i*
2. **deltas[i]**: the first-order delta (first time derivative) of those MFCCs
3. **delta_deltas[i]**: the second-order delta (second time derivative) of those MFCCs

and our associated label:
1. **label[i]**: 1 if the audio file at index *i* is a lion and 0 if it is a hyena

## Network

In [54]:
class Net(nn.Module):
    """Binary classifier over MFCC, delta and delta-delta feature maps.

    Each of the three inputs is collapsed along the coefficient axis by its
    own convolution, the three resulting rows are stacked, and two further
    convolutions followed by two fully connected layers reduce them to a
    single sigmoid output.

    Parameters
    ----------
    mfcc_total : int
        Number of coefficient rows in every input; used as the kernel height
        of the three input convolutions.
    n_frames : int, optional
        Number of time frames (last dimension) in every input. It fixes the
        input size of the first fully connected layer. The default of 65
        yields the previously hard-coded size of 14.

    Raises
    ------
    ValueError
        If ``n_frames`` is too small to survive the three convolutions.
    """

    def __init__(self, mfcc_total, n_frames=65):
        super(Net, self).__init__()
        self.conv_mfcc = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(mfcc_total, 3), stride=1)
        self.conv_delta = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(mfcc_total, 3), stride=1)
        self.conv_delta_delta = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(mfcc_total, 3), stride=1)

        self.conv_1 = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(3, 10), stride=3)
        self.conv_2 = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(1, 5), stride=1)

        # Width bookkeeping for unpadded convolutions:
        #     out = (in - kernel) // stride + 1
        width = n_frames - 3 + 1            # conv_mfcc / conv_delta / conv_delta_delta
        width = (width - 10) // 3 + 1       # conv_1
        width = width - 5 + 1               # conv_2
        if width < 1:
            raise ValueError(
                "n_frames=%d is too short for the convolution stack" % n_frames
            )
        self.flat_features = width

        self.fc1 = nn.Linear(self.flat_features, 10)
        self.fc2 = nn.Linear(10, 1)

    def forward(self, mfcc, delta, delta_delta):
        """Classify one batch of clips.

        The three inputs are expected to share the shape
        ``(batch, 1, mfcc_total, n_frames)``.

        Returns the sigmoid-activated output of the last layer, i.e. a
        probability rather than a logit. For a batch holding a single clip
        the batch dimension is squeezed away and the result has shape
        ``(1,)``.
        """
        # Collapse each feature map into one row and stack the rows into a
        # 3 x (n_frames - 2) map.
        mfcc_features = self.conv_mfcc(mfcc)
        delta_features = self.conv_delta(delta)
        delta_delta_features = self.conv_delta_delta(delta_delta)
        features = torch.cat((mfcc_features, delta_features, delta_delta_features), 2)
        features = F.relu(self.conv_1(features))
        features = F.relu(self.conv_2(features))
        # Keep the batch dimension and flatten the rest, so an input of the
        # wrong size fails in fc1 instead of being silently re-batched.
        features = features.flatten(start_dim=1)
        # The ReLU between the two linear layers is required: without it
        # fc1 and fc2 collapse into a single linear map.
        features = F.relu(self.fc1(features).squeeze(0))
        features = torch.sigmoid(self.fc2(features))

        return features
    
def load_model(lr, seed, mfcc_total, with_training_objects=False):
    """Create a freshly seeded ``Net``, optionally with its loss and optimizer.

    Parameters
    ----------
    lr : float
        Learning rate for the Adam optimizer.
    seed : int
        Seed passed to ``torch.manual_seed`` before the model is built, so
        the initial weights are reproducible.
    mfcc_total : int
        Forwarded to ``Net``.
    with_training_objects : bool, optional
        When False (default) only the model is returned, as before. When
        True the loss function and optimizer are returned as well.

    Returns
    -------
    Net or tuple
        ``model``, or ``(model, loss_function, optimizer)`` when
        ``with_training_objects`` is True.
    """
    torch.manual_seed(seed)
    model = Net(mfcc_total)
    if not with_training_objects:
        return model
    # Net.forward already ends in a sigmoid, so the loss has to accept
    # probabilities. BCEWithLogitsLoss would apply a second sigmoid.
    loss_function = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    return model, loss_function, optimizer

In [18]:
# Shape of the first clip's MFCC tensor.
mfccs[0].shape

torch.Size([20, 66])

In [26]:
###### SET HYPERPARAMETERS HERE ######
# Reproducibility
seed = 42          # passed to torch.manual_seed

# Optimisation
epochs = 1         # number of passes over the loaded clips
lr = 0.1           # learning rate handed to load_model

# Input features
mfcc_total = 13    # coefficient rows kept per clip / kernel height of the input convolutions

In [55]:
def _as_network_input(feature):
    """Crop one feature matrix and add the two leading singleton dimensions.

    Keeps rows 1..mfcc_total (row 0 is dropped) and the first 65 columns.
    """
    return feature[1:mfcc_total + 1, :65][None, None]


def main_test():
    """Push every loaded clip through a freshly built model.

    The model outputs are currently discarded; the loop body is still to be
    completed.
    """
    torch.manual_seed(seed)
    model = load_model(lr, seed, mfcc_total)
    for _ in range(epochs):
        for idx, clip_mfcc in enumerate(mfccs):
            model(
                _as_network_input(clip_mfcc),
                _as_network_input(deltas[idx]),
                _as_network_input(delta_deltas[idx]),
            )
            ###### FILL THIS OUT ######


main_test()

## Training