In [1]:
from __future__ import unicode_literals
import matplotlib.pyplot as plt
from scipy.io import wavfile
import numpy as np
import wave
import sys
import librosa
import librosa.display
import os
import torch.nn as nn
import torch.nn.functional as F
import torch
import torchvision
import torchvision.transforms as transforms
import numpy as np

In [2]:
def wav_to_mfcc(audio,sample_rate, mfcc_total,**kwargs):
    audio = normalize_audio(audio)
    mfccs = librosa.feature.mfcc(audio, sample_rate,n_mfcc = mfcc_total,**kwargs)
    delta = librosa.feature.delta(mfccs)
    delta_delta = librosa.feature.delta(mfccs, order = 2)
    return np.array([mfccs, delta, delta_delta])

def normalize_audio(audio):
    audio = audio / np.max(np.abs(audio))
    return audio

def plot_mfcc(mfccs):
    librosa.display.specshow(mfccs, x_axis = 'time')
    plt.colorbar()
    plt.title('Normalized MFCC')
    plt.tight_layout()
    plt.show()
    plt.close()
    return 0

def plot_wav(files):
    audio, sample_rate = librosa.load('test/'+files)
    audio = normalize_audio(audio)
    plt.figure(figsize=(15,4))
    plt.plot(np.linspace(0, len(audio) / sample_rate, num=len(audio)), audio)
    plt.grid(True)
    plt.show()
    plt.close()
    return 0

In [None]:
# 0 = hyenas    1 = lions
mfccs = []
deltas = []
delta_deltas = []
label = []
for file in os.listdir('data/processed data/lions/'):
    features = np.load('data/processed data/lions/'+file)
    mfccs.append(torch.from_numpy(features[0]))
    deltas.append(torch.from_numpy(features[1]))
    delta_deltas.append(torch.from_numpy(features[2]))
    label.append(1)
print('lions done')
for file in os.listdir('data/processed data/hyenas/'):
    features = np.load('data/processed data/hyenas/'+file)
    mfccs.append(torch.from_numpy(features[0]))
    deltas.append(torch.from_numpy(features[1]))
    delta_deltas.append(torch.from_numpy(features[2]))
    label.append(0)
print('hyenas done')

So now we have obtained the following for our input data:
1. **mfccs[i]**: the mel-frequency cepstrum coefficients for a single audio file indexed at *i*
2. **deltas[i]**: the first derivative of the mfccs 
3. **delta_deltas[i]**: the second derivative of the mfccs

and our associated label:
1. **label[i]**: the audio file indexed at *i* will be 1 if it is a lion and 0 if it is a hyena

## Building the Model to Compress the MFCCs and their Gradients

In [17]:
class Net(nn.Module):
    def __init__(self, mfcc_total):
        super(Net, self).__init__()
        self.conv_mfcc = nn.Conv2d(in_channels = 1, out_channels = 1, kernel_size = (mfcc_total,1), stride = 1)
        self.conv_delta = nn.Conv2d(in_channels = 1, out_channels = 1, kernel_size = (mfcc_total,1), stride = 1)
        self.conv_delta_delta = nn.Conv2d(in_channels = 1, out_channels = 1, kernel_size = (mfcc_total,1), stride = 1)
        
        ###### DECLARE NETWORK COMPONENTS HERE ######
        
        
    def forward(self, mfcc, delta, delta_delta):
        mfcc_features = self.conv_mfcc(mfcc)
        delta_features = self.conv_delta(delta)
        delta_delta_features = self.conv_delta_delta(delta_delta)
        features = torch.cat((mfcc_features, delta_features, delta_delta_features), 2).size()
        
        ###### REST OF THE FORWARD FUNCTION HERE ######
        
        
        return 0
    
def load_model(lr, seed, mfcc_total):
    torch.manual_seed(seed)
    model = Net(mfcc_total)
    
    ###### DECLARE LOSS FUNCTION & OPTIMIZER ######
    loss_function = 
    optimizer = 
    
    return model, loss_function, optimizer

In [5]:
###### SET HYPERPARAMETERS HERE ######
lr = 0.1
seed = 42
mfcc_total = 13
epochs = 1

In [18]:
def main_test():
    torch.manual_seed(seed)
    model = load_model(lr, seed, mfcc_total)
    for epoch in epochs:
        for i in range(len(mfccs)):
            mfcc = mfccs[i][1: mfcc_total + 1][:, :65].unsqueeze(0).unsqueeze(0)
            delta = deltas[i][1: mfcc_total + 1][:, :65].unsqueeze(0).unsqueeze(0)
            delta_delta = delta_deltas[i][1: mfcc_total + 1][:, :65].unsqueeze(0).unsqueeze(0)
            model(mfcc, delta, delta_delta)
            ###### FILL THIS OUT ######
            
        
main_test()

torch.Size([1, 1, 3, 66])
tensor([[[[ 4.4374e+01,  4.3826e+01,  3.3012e+01,  3.3431e+01,  3.4678e+01,
            3.5562e+01,  3.5181e+01,  3.4412e+01,  3.1669e+01,  3.3911e+01,
            3.4997e+01,  3.4262e+01,  3.5298e+01,  3.4562e+01,  3.3271e+01,
            3.4524e+01,  3.3667e+01,  3.1729e+01,  3.2050e+01,  3.1409e+01,
            3.1635e+01,  3.2727e+01,  3.4980e+01,  2.9375e+01,  2.3864e+01,
            2.2500e+01,  2.1185e+01,  2.0635e+01,  2.3102e+01,  3.0774e+01,
            3.1253e+01,  3.0775e+01,  3.4563e+01,  3.5907e+01,  3.0187e+01,
            3.1119e+01,  2.9665e+01,  2.6491e+01,  2.4682e+01,  2.4168e+01,
            2.2292e+01,  2.2468e+01,  2.1570e+01,  1.9592e+01,  1.6495e+01,
            1.5363e+01,  1.2851e+01,  1.1953e+01,  9.8541e+00,  6.5279e+00,
            6.2432e+00,  5.2996e+00,  5.2389e+00,  5.3460e+00,  5.4316e+00,
            5.5053e+00,  5.3035e+00,  5.2968e+00,  5.2657e+00,  5.2324e+00,
            5.1943e+00,  5.2974e+00,  5.1813e+00,  5.4773e+00,