In [48]:
import numpy
import scipy.io.wavfile
from scipy.fftpack import dct

# Feature Extraction
per ora estrae le features di 3.5 secondi di un audio scemo

da migliorare aggiungendo le delta e le delta-delta

In [49]:
# Read the recording (expected next to the notebook) and keep only its first 3.5 seconds.
wav_contents = scipy.io.wavfile.read('OSR_us_000_0010_8k.wav')
sample_rate, signal = wav_contents
num_keep = int(3.5 * sample_rate)  # 3.5 seconds expressed in samples
signal = signal[:num_keep]

In [39]:
# Pre-emphasis filter: y[0] = x[0], y[t] = x[t] - pre_emphasis * x[t - 1] for t >= 1.
pre_emphasis = 0.97
first_sample = signal[0]
filtered_tail = signal[1:] - pre_emphasis * signal[:-1]
emphasized_signal = numpy.append(first_sample, filtered_tail)

In [40]:
# Split the emphasized signal into overlapping, fixed-length frames.
frame_size = 0.025   # frame duration in seconds
frame_stride = 0.01  # hop between consecutive frame starts, in seconds

# Convert from seconds to whole numbers of samples.
frame_length = int(round(frame_size * sample_rate))
frame_step = int(round(frame_stride * sample_rate))
signal_length = len(emphasized_signal)

# NOTE(review): without a "+ 1" here the last frame ends before the end of the signal,
# so the trailing samples (and all of the zero padding below) are never indexed.
# Adding it would change the number of frames, which later cells hardcode - confirm before changing.
num_frames = int(numpy.ceil(float(numpy.abs(signal_length - frame_length)) / frame_step))

# Zero-pad the signal up to num_frames * frame_step + frame_length samples.
pad_signal_length = num_frames * frame_step + frame_length
z = numpy.zeros((pad_signal_length - signal_length))
pad_signal = numpy.append(emphasized_signal, z)

# indices[i, j] = i * frame_step + j, i.e. the sample index of element j of frame i.
offsets_in_frame = numpy.arange(frame_length)
frame_starts = frame_step * numpy.arange(num_frames)
indices = offsets_in_frame[numpy.newaxis, :] + frame_starts[:, numpy.newaxis]
frames = pad_signal[indices.astype(numpy.int32, copy=False)]

In [41]:
# Apply a Hamming window to every frame.
# The windowed frames are rebuilt from pad_signal / indices instead of being scaled in place,
# so re-running this cell does not apply the window a second time.
# numpy.hamming(N) evaluates w[n] = 0.54 - 0.46 * cos(2 * pi * n / (N - 1)) for n = 0 .. N - 1.
frames = pad_signal[indices.astype(numpy.int32, copy=False)] * numpy.hamming(frame_length)

In [42]:
# NFFT-point real FFT of each frame, then the power spectrum |FFT|^2 / NFFT.
NFFT = 512
spectrum = numpy.fft.rfft(frames, NFFT)
mag_frames = numpy.absolute(spectrum)  # magnitude of the FFT
power_scale = 1.0 / NFFT
pow_frames = power_scale * (mag_frames ** 2)

In [43]:
# Build nfilt triangular filters spaced evenly on the Mel scale and apply them to the power spectrum.
nfilt = 40

# nfilt + 2 band edges between 0 Hz and sample_rate / 2, equally spaced in Mel.
low_freq_mel = 0
high_freq_mel = 2595 * numpy.log10(1 + (sample_rate / 2) / 700)  # Hz -> Mel
mel_points = numpy.linspace(low_freq_mel, high_freq_mel, nfilt + 2)
hz_points = 700 * (10 ** (mel_points / 2595) - 1)                 # Mel -> Hz
# FFT bin number of every band edge (note: this name shadows the built-in bin()).
bin = numpy.floor((NFFT + 1) * hz_points / sample_rate)

# One row per filter, one column per rfft bin.
fbank = numpy.zeros((nfilt, int(numpy.floor(NFFT / 2 + 1))))
for row, (left, center, right) in enumerate(zip(bin[:-2], bin[1:-1], bin[2:])):
    # Rising edge: from the left band edge up to (not including) the center.
    for k in range(int(left), int(center)):
        fbank[row, k] = (k - left) / (center - left)
    # Falling edge: from the center down to the right band edge.
    for k in range(int(center), int(right)):
        fbank[row, k] = (right - k) / (right - center)

filter_banks = numpy.dot(pow_frames, fbank.T)
# Replace exact zeros with machine epsilon so the logarithm below stays finite.
filter_banks = numpy.where(filter_banks == 0, numpy.finfo(float).eps, filter_banks)
filter_banks = 20 * numpy.log10(filter_banks)  # dB

In [44]:
# Skip this cell if only the filter bank features are wanted.
# Orthonormal type-2 DCT along the filter axis; keep coefficients 2-13 (index 0 is dropped).
num_ceps = 12
cepstra = dct(filter_banks, type=2, axis=1, norm='ortho')
mfcc = cepstra[:, 1:num_ceps + 1]

In [45]:
# Skip this cell if only the filter bank features are wanted.
# Sinusoidal liftering: coefficient n is scaled by 1 + (cep_lifter / 2) * sin(pi * n / cep_lifter).
cep_lifter = 22

nframes, ncoeff = mfcc.shape
n = numpy.arange(ncoeff)
half_lifter = cep_lifter / 2
lift = 1 + half_lifter * numpy.sin(numpy.pi * n / cep_lifter)
# In-place scaling: re-running this cell lifts the already-lifted coefficients again.
mfcc *= lift

In [46]:
# Subtract each filter's mean over all frames (axis 0), plus a 1e-8 offset, in place.
fbank_offset = filter_banks.mean(axis=0) + 1e-8
filter_banks -= fbank_offset

In [32]:
# Subtract each coefficient's mean over all frames (axis 0), plus a 1e-8 offset, in place.
mfcc_offset = mfcc.mean(axis=0) + 1e-8
mfcc -= mfcc_offset

In [73]:
# Sanity check: display the feature matrix shape as (number of frames, number of kept coefficients).
mfcc.shape

(348, 12)

### CNN

In [97]:
import keras 
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout
from keras.optimizers import Adam
from keras.callbacks import TensorBoard

In [100]:
# CNN input configuration.
feat_rows = 348   # frames per example
feat_cols = 12    # cepstral coefficients per frame
batch_size = 512
# One single-channel 2-D feature map per example.
feats_shape = (feat_rows, feat_cols, 1)

# Reshape step for the data splits, left disabled because the splits are not loaded here:
# x_train = x_train.reshape(x_train.shape[0], *feats_shape)
# x_test = x_test.reshape(x_test.shape[0], *feats_shape)
# x_validate = x_validate.reshape(x_validate.shape[0], *feats_shape)
# print('x_train shape: {}'.format(x_train.shape))
# print('x_test shape: {}'.format(x_test.shape))
# print('x_validate shape: {}'.format(x_validate.shape))


In [114]:
# Small 2-D CNN classifier over feature maps of shape feats_shape.
n_kws = 5  # size of the softmax output layer (presumably the number of keywords)

# Settings shared by both convolution layers.
conv_kwargs = dict(
    strides=(1, 1),
    padding='valid',
    data_format='channels_last',
    dilation_rate=(1, 1),
    activation='relu',
    use_bias=True,
    kernel_initializer='glorot_uniform',
    bias_initializer='zeros',
)

model = Sequential([
    Conv2D(filters=32, kernel_size=3, input_shape=feats_shape, **conv_kwargs),
    MaxPooling2D(pool_size=2),
    Dropout(0.2),
    Conv2D(32, (3, 3), **conv_kwargs),
    # NOTE(review): a pool size of 1 leaves the feature map unchanged - confirm this is intended.
    MaxPooling2D(1),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(n_kws, activation='softmax'),
])


In [115]:
# TensorBoard logging for this run.
# Logs go to the relative directory "logs/<run name>". The previous value, r'\logs{}', expanded to
# "\logscnn_1layer": a path at the drive/filesystem root with no separator before the run name.
run_name = 'cnn_1layer'
tensorboard = TensorBoard(log_dir='logs/{}'.format(run_name),
                          write_graph=True, write_grads=True, histogram_freq=1, write_images=True)

# Sparse categorical cross-entropy pairs with the softmax output and integer class labels.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(lr=0.001), metrics=['accuracy'])


In [116]:
# Train for 10 epochs, validating on (x_validate, y_validate) and logging to TensorBoard.
# NOTE: x_train, y_train, x_validate and y_validate are not created in the visible part of this
# notebook; they must be loaded and reshaped to feats_shape before this cell can run.
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=10,
    verbose=1,
    validation_data=(x_validate, y_validate),
    callbacks=[tensorboard],
)

NameError: name 'x_train' is not defined

In [None]:
# Evaluate on the test split; score[0] is the loss and score[1] the accuracy metric.
score = model.evaluate(x_test, y_test, batch_size=16, verbose=0)
test_loss, test_acc = score[0], score[1]

print('test loss: {:.4f}'.format(test_loss))
print('test acc: {:.4f}'.format(test_acc))