# Spoken Digit Classification

In [24]:
import numpy as np
import librosa
import os
import matplotlib.pyplot as plt
import sklearn.svm
import IPython.display as ipd
import scipy as sp
%matplotlib inline
from sklearn.mixture import BayesianGaussianMixture
# import seaborn as sns


# Define features

## Zero crossing rate

In [28]:
def compute_zcr(win, Fs):
    """Estimate the zero-crossing rate of a window, scaled by the sampling rate.

    Every pair of consecutive samples whose signs differ counts as one
    sign change. A sample that is exactly zero has its own sign (0), so a
    transition into zero and a transition out of zero are counted
    separately.

    Parameters
    ----------
    win : numpy.ndarray
        Samples of the analysis window; the first axis is the one that is
        differenced.
    Fs : int or float
        Sampling rate used to scale the per-sample rate.

    Returns
    -------
    float
        ``(number of sign changes) * Fs / N`` where ``N`` is the length of
        ``win`` along its first axis, or ``0.0`` for an empty window.
    """
    # length calculated across input dimension (in this case 0)
    N = win.shape[0]
    if N == 0:
        # nothing to analyse; avoid dividing by zero
        return 0.0

    # np.sign maps every sample to -1 (negative), 0 (zero) or +1 (positive)
    win_sign = np.sign(win)
    # compare each sample with its successor
    # (win_sign[:-1] excludes the last sample, win_sign[1:] excludes the first)
    sign_changes = win_sign[:-1] != win_sign[1:]

    return np.count_nonzero(sign_changes) * Fs / N

## Spectral decrease

In [29]:
def compute_specdec(spec):
    """Compute the spectral decrease of a one-sided spectrum.

    Implements

        SD = sum_{k=1}^{K-1} (|X(k)| - |X(0)|) / k  /  sum_{k=1}^{K-1} |X(k)|

    where ``K = len(spec)``, i.e. the magnitude difference of every bin
    with respect to bin 0 is weighted by the inverse of its bin index.

    Parameters
    ----------
    spec : array_like
        Spectrum values (real or complex); only magnitudes are used.
        The code treats it as one-dimensional.

    Returns
    -------
    float
        The spectral decrease, or ``0.0`` when the magnitudes of bins
        ``1..K-1`` sum to zero (including spectra with fewer than two bins),
        where the ratio is undefined.
    """
    magnitudes = np.abs(spec)
    upper_energy = np.sum(magnitudes[1:])
    if upper_energy == 0:
        # silent / degenerate spectrum: avoid dividing by zero
        return 0.0

    # bin indices 1 .. K-1; starting at 1 means no zero ever appears in
    # the denominator, so no patching of the weights is required
    bin_index = np.arange(1, len(magnitudes))
    weighted_diff = (magnitudes[1:] - magnitudes[0]) / bin_index
    return np.sum(weighted_diff) / upper_energy

## Spectral centroid

In [30]:
def compute_speccentr(spec, Fs):
    """Return the magnitude-weighted mean frequency of a spectrum.

    The bins of ``spec`` are mapped onto a linear frequency axis that runs
    from 0 to ``Fs // 2`` (floor division) with ``len(spec)`` points. The
    centroid is the dot product of that axis with the bin magnitudes,
    divided by the sum of the magnitudes along the first axis.

    Parameters
    ----------
    spec : array_like
        Spectrum values (real or complex); only magnitudes are used.
    Fs : int or float
        Sampling rate; its floor-halved value is the top of the axis.

    Returns
    -------
    Spectral centroid in the same unit as ``Fs``.
    """
    magnitudes = np.abs(spec)
    bin_freqs = np.linspace(0, Fs // 2, len(spec))
    weighted_total = np.dot(bin_freqs, magnitudes)
    magnitude_total = np.sum(magnitudes, axis=0)
    return weighted_total / magnitude_total

## Import recordings

In [34]:
# define the 10 classes
classes = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']

# initialize train features: one (initially empty) list per class
train_features = {name: [] for name in classes}

train_root = '../free-spoken-digit-dataset/recordings'

# Split the recordings by the parity of the last character of the file name
# before the extension. Despite the "class_" prefix, both lists hold the
# recordings of ALL classes: the split does not depend on the class, so the
# directory is scanned once instead of once per class.
# train array (for now it will contain all even recordings)
class_train_files = []
# test array (for now it will contain all odd recordings)
class_test_files = []
# sorted() makes the order (and so class_train_files[0]) reproducible,
# since os.listdir returns entries in arbitrary order
for f in sorted(os.listdir(train_root)):
    stem, ext = os.path.splitext(f)
    # skip anything that is not a recording ending in a digit
    # (e.g. hidden files), which would otherwise make int() fail
    if ext.lower() != '.wav' or not stem[-1:].isdigit():
        continue
    if int(stem[-1]) % 2 == 0:
        class_train_files.append(f)
    else:
        class_test_files.append(f)

# check length of train and test arrays
print("number of train files: {}".format(len(class_train_files)))
print("number of test files: {}".format(len(class_test_files)))

number of train files: 1500
number of test files: 1500


In [36]:
# listen to an audio file: pick the first entry of the train list
sample_path = os.path.join(train_root, class_train_files[0])
# load the file (sr=None is passed so librosa keeps the file's own sampling rate)
audio, fs = librosa.load(sample_path, sr=None)
# last expression of the cell: the notebook renders it as an audio player
ipd.Audio(audio, rate=fs)