# Spoken Digit Classification

In [None]:
import numpy as np
import librosa
import os
import matplotlib.pyplot as plt
import sklearn.svm
import IPython.display as ipd
import scipy as sp
%matplotlib inline
from sklearn.mixture import BayesianGaussianMixture
# import seaborn as sns


# Define features

## Zero crossing rate

In [None]:
def compute_zcr(win, Fs):
    # returns an array of 1 (value > 0) and 0 (value < 0)
    win_sign = np.sign(win)
    # length calculated across input dimension (in this case 0)
    N = win.shape[0]
    # difference inside the formula 
    # (NB win_sign[:-1] means from first to the second last, so the last is excluded)
    sign_diff = np.abs(win_sign[:-1] - win_sign[1:])
    
    zcr = np.sum(sign_diff) * Fs / (2 * N)
    zcr = len(sign_diff[sign_diff != 0]) * Fs / N
    return zcr

## Spectral decrease

In [None]:
def compute_specdec(spec):
    mul_fact = 1 / (np.sum(np.abs(spec[1:])))
    num = np.abs(spec[1:]) - np.abs(spec[0])
    den = np.arange(1, len(spec)) - 1
    den[den == 0] = 1
    spectral_decrease = mul_fact * np.sum(num / den)
    return spectral_decrease

## Spectral centroid

In [None]:
def compute_speccentr(spec, Fs):
    freq_axis = np.linspace(0, Fs//2, len(spec))
    centr = np.dot(freq_axis, np.abs(spec)) / np.sum(np.abs(spec), axis = 0)
    return centr

## MFCC

In [None]:
def compute_mfcc(audio, fs, n_mfcc):
    # Compute the spectrogram of the audio signal
    X = np.abs(librosa.stft(
        audio,
        window='hamming',
        n_fft=1024,
        hop_length=512,)
        )
    
    # Find the weights of the mel filters
    mel = librosa.filters.mel(
        sr=fs,
        n_fft=1024,
        n_mels=40,
        fmin=133.33,
        fmax=6853.8,
    )
    
    # Apply the filters to spectrogram
    melspectrogram = np.dot(mel, X)
    # Take the logarithm
    log_melspectrogram = np.log10(melspectrogram + 1e-16)
    
    # Apply the DCT to log melspectrogram to obtain the coefficients
    mfcc = sp.fftpack.dct(log_melspectrogram, axis=0, norm='ortho')[1:n_mfcc+1]
    return mfcc

## import recordings

In [None]:
# define the 10 classes
classes = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']

root = '../free-spoken-digit-dataset/recordings'
for c in classes:
    # train array (for now it will contain all even recordings)
    class_train_files = []
    # test array (for now it will contain all odd recordings)
    class_test_files = []
    
    for f in os.listdir(root):
        if int(f[-5]) % 2 == 0:
            #print("{} è pari".format(f[-5]))
            class_train_files.append(f)
        else:
            #print("{} è dispari".format(f[-5]))
            class_test_files.append(f)

# check length of train and test arrays
n_train_samples = len(class_train_files)
n_test_samples = len(class_test_files)

print("number of train files: {}".format(n_train_samples))
print("number of test files: {}".format(n_test_samples))

In [None]:
# listen to an audio file
audio, fs = librosa.load(os.path.join(train_root, class_train_files[0]), sr=None)
ipd.Audio(audio, rate=fs) # load the file

## compute MFCC

In [None]:
# initialize dictionaries
dict_train_features = {'zero': [], 'one': [], 'two': [], 'three': [], 'four': [], 'five': [], 'six': [], 'seven': [], 'eight': [], 'nine': []}
dict_test_features = {'zero': [], 'one': [], 'two': [], 'three': [], 'four': [], 'five': [], 'six': [], 'seven': [], 'eight': [], 'nine': []}


# why 13? try different values. check warning
n_mfcc = 13

for c in classes:
    train_features = np.zeros((n_train_samples, n_mfcc))
    for index, f in enumerate(class_train_files):
        audio, fs = librosa.load(os.path.join(root, f), sr=None)
        mfcc = compute_mfcc(audio, fs, n_mfcc)
        train_features[index, :] = np.mean(mfcc, axis=1)
    dict_train_features[c] = train_features
    
    test_features = np.zeros((n_test_samples, n_mfcc))
    for index, f in enumerate(class_test_files):
        audio, fs = librosa.load(os.path.join(root, f), sr=None)
        mfcc = compute_mfcc(audio, fs, n_mfcc)
        test_features[index, :] = np.mean(mfcc, axis=1)
    dict_test_features[c] = test_features
    


## compute other features

In [None]:
features_names = ['Zero Crossing Rate', 'Spectral Decrease', 'Spectral Centroid']

# check parameters
win_length = int(np.floor(0.01 * fs))
hop_size = int(np.floor(0.0075 * fs))

window = sp.signal.get_window(window='hanning', Nx=win_length)
n_features = len(features_names)

In [52]:

train_basic_features = np.array([])
# for each class
for c in classes:
    
    # for each train audio file
    for index, f in enumerate(class_train_files):
        audio, fs = librosa.load(os.path.join(root, f), sr=None)
        train_win_number = int(np.floor(audio.shape[0] - win_length) / hop_size)
        
        audio_features = np.zeros((train_win_number, n_features))
        
        for i in np.arange(train_win_number):
            frame = audio[i * hop_size : i * hop_size + win_length]
            frame_wind = frame * window

            spec = np.fft.fft(frame_wind)
            nyquist = int(np.floor(spec.shape[0] / 2))
            spec = spec[1:nyquist]

            audio_features[i, 0] = compute_zcr(frame_wind, fs)
            audio_features[i, 1] = compute_specdec(spec)
            audio_features[i, 2] = compute_speccentr(spec, fs)
        
        train_basic_features.append(audio_features)

print("finished")

AttributeError: 'numpy.ndarray' object has no attribute 'append'

### Notes

In [None]:
# maybe check if volume is constant in each recording
# also duration seems to change, is this a problem?