# Spoken Digit Classification

In [15]:
import numpy as np
import librosa
import os
import matplotlib.pyplot as plt
import sklearn.svm
import IPython.display as ipd
import scipy as sp
%matplotlib inline
from sklearn.mixture import BayesianGaussianMixture
# import seaborn as sns


# Define features

## Zero crossing rate

In [16]:
def compute_zcr(win, Fs):
    # returns an array of 1 (value > 0) and 0 (value < 0)
    win_sign = np.sign(win)
    # length calculated across input dimension (in this case 0)
    N = win.shape[0]
    # difference inside the formula 
    # (NB win_sign[:-1] means from first to the second last, so the last is excluded)
    sign_diff = np.abs(win_sign[:-1] - win_sign[1:])
    
    zcr = np.sum(sign_diff) * Fs / (2 * N)
    zcr = len(sign_diff[sign_diff != 0]) * Fs / N
    return zcr

## Spectral decrease

In [17]:
def compute_specdec(spec):
    mul_fact = 1 / (np.sum(np.abs(spec[1:])))
    num = np.abs(spec[1:]) - np.abs(spec[0])
    den = np.arange(1, len(spec)) - 1
    den[den == 0] = 1
    spectral_decrease = mul_fact * np.sum(num / den)
    return spectral_decrease

## Spectral centroid

In [18]:
def compute_speccentr(spec, Fs):
    freq_axis = np.linspace(0, Fs//2, len(spec))
    centr = np.dot(freq_axis, np.abs(spec)) / np.sum(np.abs(spec), axis = 0)
    return centr

## MFCC

In [19]:
def compute_mfcc(audio, fs, n_mfcc):
    # Compute the spectrogram of the audio signal
    X = np.abs(librosa.stft(
        audio,
        window='hamming',
        n_fft=1024,
        hop_length=512,)
        )
    
    # Find the weights of the mel filters
    mel = librosa.filters.mel(
        sr=fs,
        n_fft=1024,
        n_mels=40,
        fmin=133.33,
        fmax=6853.8,
    )
    
    # Apply the filters to spectrogram
    melspectrogram = np.dot(mel, X)
    # Take the logarithm
    log_melspectrogram = np.log10(melspectrogram + 1e-16)
    
    # Apply the DCT to log melspectrogram to obtain the coefficients
    mfcc = sp.fftpack.dct(log_melspectrogram, axis=0, norm='ortho')[1:n_mfcc+1]
    return mfcc

## import recordings

In [22]:
# define the 10 classes
classes = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']

root = '../free-spoken-digit-dataset/recordings'
for c in classes:
    # train array (for now it will contain all even recordings)
    class_train_files = []
    # test array (for now it will contain all odd recordings)
    class_test_files = []
    
    for f in os.listdir(root):
        if int(f[-5]) % 2 == 0:
            #print("{} è pari".format(f[-5]))
            class_train_files.append(f)
        else:
            #print("{} è dispari".format(f[-5]))
            class_test_files.append(f)

# check length of train and test arrays
n_train_samples = len(class_train_files)
n_test_samples = len(class_test_files)

print("number of train files: {}".format(n_train_samples))
print("number of test files: {}".format(n_test_samples))

number of train files: 1500
number of test files: 1500


In [24]:
# listen to an audio file
audio, fs = librosa.load(os.path.join(root, class_train_files[0]), sr=None)
ipd.Audio(audio, rate=fs) # load the file

## compute MFCC

In [25]:
# initialize dictionaries
dict_train_features = {'zero': [], 'one': [], 'two': [], 'three': [], 'four': [], 'five': [], 'six': [], 'seven': [], 'eight': [], 'nine': []}
dict_test_features = {'zero': [], 'one': [], 'two': [], 'three': [], 'four': [], 'five': [], 'six': [], 'seven': [], 'eight': [], 'nine': []}


# why 13? try different values. check warning
n_mfcc = 13

for c in classes:
    train_features = np.zeros((n_train_samples), n_mfcc))
    for index, f in enumerate(class_train_files):
        audio, fs = librosa.load(os.path.join(root, f), sr=None)
        mfcc = compute_mfcc(audio, fs, n_mfcc)
        train_features[index, :] = np.mean(mfcc, axis=1)
    dict_train_features[c] = train_features
    
    test_features = np.zeros((n_test_samples), n_mfcc))
    for index, f in enumerate(class_test_files):
        audio, fs = librosa.load(os.path.join(root, f), sr=None)
        mfcc = compute_mfcc(audio, fs, n_mfcc)
        test_features[index, :] = np.mean(mfcc, axis=1)
    dict_test_features[c] = test_features
    


  "Empty filters detected in mel frequency basis. "


## compute other features

In [26]:
features_names = ['Zero Crossing Rate', 'Spectral Decrease', 'Spectral Centroid']

# check parameters
win_length = int(np.floor(0.01 * fs))
hop_size = int(np.floor(0.0075 * fs))

window = sp.signal.get_window(window='hanning', Nx=win_length)
n_features = len(features_names)

In [35]:
train_basic_features = []
# for each class
for c in classes:
    
    # for each train audio file
    for index, f in enumerate(class_train_files):
        audio, fs = librosa.load(os.path.join(root, f), sr=None)
        train_win_number = int(np.floor(audio.shape[0] - win_length) / hop_size)
        
        audio_features = np.zeros((train_win_number, n_features))
        
        for i in np.arange(train_win_number):
            frame = audio[i * hop_size : i * hop_size + win_length]
            frame_wind = frame * window

            spec = np.fft.fft(frame_wind)
            nyquist = int(np.floor(spec.shape[0] / 2))
            spec = spec[1:nyquist]

            audio_features[i, 0] = compute_zcr(frame_wind, fs)
            audio_features[i, 1] = compute_specdec(spec)
            audio_features[i, 2] = compute_speccentr(spec, fs)
        
        train_basic_features.append(audio_features)

print("finished")


finished


In [32]:
print(train_basic_features[0].shape)
print(train_win_number)
print(train_basic_features[0])

(38, 3)
51
[[ 1.10000000e+03 -1.52378090e-01  8.77889275e+02]
 [ 1.20000000e+03 -1.04533151e-01  7.09824130e+02]
 [ 9.00000000e+02  9.49024304e-02  1.45867690e+03]
 [ 1.70000000e+03  1.72979950e-01  1.51325997e+03]
 [ 2.30000000e+03  2.29357236e-01  1.37032009e+03]
 [ 2.30000000e+03  1.46944005e-01  1.46838952e+03]
 [ 2.00000000e+03  6.37848605e-02  1.78550520e+03]
 [ 1.90000000e+03  1.32156388e-01  1.73454973e+03]
 [ 1.70000000e+03  2.86262286e-01  1.35183094e+03]
 [ 2.10000000e+03  1.74912524e-01  1.41377249e+03]
 [ 1.40000000e+03  8.02855520e-02  1.64502617e+03]
 [ 1.50000000e+03  2.02296296e-01  1.62178907e+03]
 [ 1.50000000e+03  3.22887721e-01  1.31904195e+03]
 [ 1.40000000e+03  3.42168129e-01  1.17515540e+03]
 [ 1.40000000e+03  2.39485640e-01  1.43487403e+03]
 [ 1.30000000e+03  2.11733534e-01  1.62275740e+03]
 [ 1.90000000e+03  2.43860539e-01  1.46424856e+03]
 [ 2.10000000e+03  2.62709266e-01  1.37931412e+03]
 [ 1.10000000e+03  3.53844672e-01  9.33581634e+02]
 [ 1.10000000e+03  4

## SVM multiclass

In [70]:
X_train = np.array(np.zeros(4))
for i,f in enumerate(X_train):
    print(i,f)

print(dict_train_features[classes[0]].shape)


0 0.0
1 0.0
2 0.0
3 0.0
(1500, 13)


In [78]:
X_train = []
X_test = []
X_train_max = np.zeros(10)
X_train_min = np.zeros(10)

for i in np.arange(10):
    X_train.append(dict_train_features[classes[i]])
    X_test.append(dict_test_features[classes[i]])
    X_train_max[i] = np.max(dict_train_features[classes[i]], axis=0)
    X_train_min[i] = np.min(dict_train_features[classes[i]], axis=0)
    
feat_max = np.max(X_train_max, axis=0)
feat_min = np.min(X_train_min, axis=0)

X_train_normalized = []
X_test_normalized = []

for i in np.arange(10):
    X_train_normalized.append((X_train[i] - feat_min) / (feat_max - feat_min))
    X_test_normalized.append((X_test[i] - feat_min) / (feat_max - feat_min))

X_test_mc_normalized = X_test_normalized[0]
for i in np.arange(1,10):
    X_test_mc_normalized = np.concatenate((X_test_mc_normalized, X_test_normalized[i]), axis=0)

ValueError: setting an array element with a sequence.

### Notes

In [None]:
# maybe check if volume is constant in each recording
# also duration seems to change, is this a problem?