# Testing Model Reproducibility
Test whether the results are reproducible or not using the same dataset and trained model weights

In [16]:
using DSP,WAV
using PyCall
using PyPlot
using MFCC
using FFTW
using Flux
using Printf,BSON
using Flux: onehotbatch, onecold, crossentropy, throttle, Conv,relu
using Base.Iterators: partition
using StatsBase
using MLLabelUtils,MLDataPattern
IpY = pyimport("IPython")
using Images,MLBase


## Loading the data
We will be testing our model on the same dataset. Thouh different audio data can also be tried 

In [2]:
cd("./Downloads")   # Or the path of folder containing the dataset Folder(Spoken_Digit) after extraction
A = readdir("./Spoken_Digit/recordings")
cd("./Spoken_Digit/recordings")
X = []
X_fs = []
Y = []
for i in 1:length(A)
    s,fs = wavread(A[i])
    push!(X,s)
    push!(X_fs,fs)
    push!(Y,Int(A[i][1]-'0'))
end
cd("./../../")

## Data Preprocessing
Similar to training data we will change audio files to spectrogram, normalise the data and then use that as test set. 

In [3]:
imgs = []
for i in 1:length(X)
    b = spectrogram(X[i][:,1])
    push!(imgs,b.power)
end
labels = Y;
for i in 1:length(imgs)
    imgs[i] = Flux.normalise(imgs[i],dims=2)
end

imgs_,labels_ = shuffleobs((imgs,labels));

Loading the complete data as a single minibatch to test the model performance on the data as a whole

In [4]:
img_size = (256,32)
m,n = img_size

function make_minibatch(X,Y,idxs)
    X_batch = Array{Float32}(undef,(img_size)..., 1, length(idxs))
    for i in 1:length(idxs)
        img = Float32.(imresize((X[idxs[i]]),(img_size)...))
        X_batch[:, :, :, i] = img
    end
    Y_batch = onehotbatch(Y[idxs], 0:9)
    return (X_batch, Y_batch)
end

make_minibatch (generic function with 1 method)

Take any random small chunk from the dataset for testing

In [9]:
mb_indices = [] 
ind = []
for i in 500:800
    push!(ind,i)
end
test_set = [make_minibatch(imgs_,labels_,ind)];

In [10]:
accuracy(x, y) = mean(onecold(model(x)) .== onecold(y))

accuracy (generic function with 1 method)

## Model Performance Testing of Trained Model

In [7]:
#Model evaluation on Training Data
BSON.@load "./MNIST_Speech.bson" model


In [11]:
accuracy(test_set[1]...)

0.9833887043189369

In [12]:
y_label = model(test_set[1][1])

10×301 Array{Float32,2}:
 3.8437e-13   1.91665e-9   1.57188e-7   …  1.43238e-6   2.79504e-8 
 4.82582e-9   0.999623     0.000236523     2.06655e-8   0.000121791
 4.39358e-7   5.5426e-9    0.000238135     0.000882037  0.00011656 
 3.30992e-11  7.50751e-9   6.39884e-6      2.86941e-5   3.69492e-5 
 0.999999     1.28694e-6   0.995579        0.000281947  0.999325   
 9.67511e-7   8.0464e-5    0.00353203   …  8.95394e-9   0.000340723
 3.90226e-10  1.40138e-9   0.000303763     0.998564     5.52701e-5 
 4.46966e-11  7.38176e-8   2.74527e-5      8.30553e-8   2.76701e-6 
 2.86833e-11  1.60256e-9   6.0581e-5       0.000241771  8.11351e-8 
 2.36902e-11  0.000294738  1.61166e-5      3.64062e-10  6.61961e-7 

Confusion Matrix:

In [29]:
y_true = onecold(test_set[1][2])
y_pred = onecold(y_label)

MLBase.confusmat(10,y_true,y_pred)

10×10 Array{Int64,2}:
 28   0   0   0   0   0   0   0   0   0
  0  32   0   0   0   0   0   0   0   0
  0   0  24   0   0   0   0   0   0   0
  0   0   0  27   0   0   0   0   0   0
  0   0   0   0  35   0   0   0   0   0
  0   0   0   0   0  28   0   0   0   1
  0   0   0   0   0   0  26   0   0   0
  0   0   0   0   0   0   0  27   0   0
  0   0   0   1   1   0   1   0  32   0
  0   0   0   0   0   0   0   1   0  37