# A second method for changing the data
****
We will attempt a second method of transforming the data, following the approach used in the Kaggle kernel https://www.kaggle.com/kcs93023/tf-speech-recognition-by-cnn.

This converts the audio clips directly into NumPy arrays (normalized log-spectrograms), so it will be possible to build the model directly in this notebook.

In [2]:
import os
from os.path import isdir, join
from pathlib import Path


#Scientific Library
import numpy as np
from scipy.fftpack import fft
from scipy import signal
from scipy.io import wavfile

# Visualization Library
import matplotlib.pyplot as plt
import tensorflow as tf
import IPython.display as ipd
#import plotly.offline as py
#py.init_notebook_mode(connected=True)
#import plotly.graph_objs as go
    
%matplotlib inline

In [3]:
# Define the training-audio location, then print the contents of the project
# root followed by the contents of the audio directory.
audio_path = '../Speech_Recognition/train/audio/'
for _listed_dir in ("../Speech_Recognition/", audio_path):
    print(os.listdir(_listed_dir))

['Preliminary_CNN.ipynb', '.DS_Store', 'test', 'Convert_to_Spec.ipynb', 'specs', 'specs_split', 'README.md', 'train', '.ipynb_checkpoints', '.git', 'Data_spec_pt_2.ipynb']
['right', 'eight', 'cat', 'tree', 'bed', 'happy', 'go', 'dog', 'no', 'wow', 'nine', 'left', 'stop', 'three', '_background_noise_', 'sheila', 'one', 'bird', 'zero', 'seven', 'up', 'marvin', 'two', 'house', 'down', 'six', 'yes', 'on', 'five', 'off', 'four']


In [4]:
def log_specgram(audio, sample_rate, window_size=20, step_size = 10,
                eps = 1e-10):
    """Compute the log-power spectrogram of an audio signal.

    Parameters
    ----------
    audio : array_like
        Audio samples, passed unchanged to ``scipy.signal.spectrogram``.
    sample_rate : int or float
        Sampling frequency in Hz.
    window_size : int or float, optional
        Length of each Hann analysis window, in milliseconds.
    step_size : int or float, optional
        Hop between the starts of consecutive windows, in milliseconds.
    eps : float, optional
        Constant added to the power before taking the logarithm so that
        zero-power bins do not produce ``-inf``.

    Returns
    -------
    freqs : ndarray
        Frequency bin centres returned by ``scipy.signal.spectrogram``.
    times : ndarray
        Segment times returned by ``scipy.signal.spectrogram``.
    log_spec : ndarray of float32
        ``log(power + eps)``, transposed so that time is the first axis.

    Raises
    ------
    ValueError
        If the hop is not positive or is longer than the window.
    """
    nperseg = int(round(window_size * sample_rate / 1e3))
    hop = int(round(step_size * sample_rate / 1e3))
    if not 0 < hop <= nperseg:
        raise ValueError(
            "step_size must be positive and no longer than window_size "
            "(got window_size={!r}, step_size={!r})".format(window_size, step_size))
    # SciPy expects the number of samples shared by neighbouring windows, not
    # the hop itself. The two only coincide when the hop is half the window
    # (as with the 20 ms / 10 ms defaults), so derive the overlap explicitly.
    noverlap = nperseg - hop
    freqs, times, spec = signal.spectrogram(audio,
                                           fs = sample_rate,
                                           window='hann',
                                           nperseg = nperseg,
                                           noverlap=noverlap,
                                           detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

In [5]:
# Collect every sub-directory of the audio folder, in alphabetical order.
dirs = sorted(
    entry for entry in os.listdir(audio_path)
    if isdir(join(audio_path, entry))
)

print(dirs)

['_background_noise_', 'bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 'four', 'go', 'happy', 'house', 'left', 'marvin', 'nine', 'no', 'off', 'on', 'one', 'right', 'seven', 'sheila', 'six', 'stop', 'three', 'tree', 'two', 'up', 'wow', 'yes', 'zero']


In [6]:
all_data = []       # [normalized spectrogram, folder name] pairs, one per kept clip
spec_all = []
target_all = []     # folder name of every kept clip, aligned with all_data
target_value = {}   # folder name -> index of that folder in `dirs`

# Only clips with exactly this many samples are kept, so that every
# spectrogram produced below has the same shape.
expected_num_samples = 16000

for i, direct in enumerate(dirs):
    target_value[direct] = i
    # read wave files of each directory
    class_dir = join(audio_path, direct)
    waves = [f for f in os.listdir(class_dir) if f.endswith('.wav')]
    for wav in waves:
        sample_rate, samples = wavfile.read(join(class_dir, wav))
        # Skip clips of any other length instead of resampling/padding them.
        if samples.shape[0] != expected_num_samples:
            continue
        # Record the label only once the clip is accepted; appending before
        # the length check left target_all longer than (and misaligned with)
        # all_data.
        target_all.append(direct)
        freqs, times, spec = log_specgram(samples, sample_rate)
        # min-max normalization to [0, 1]; a constant spectrogram (e.g. an
        # all-zero clip) has zero range and would otherwise become all NaN.
        spec_min = spec.min()
        spec_range = spec.max() - spec_min
        if spec_range > 0:
            spec = (spec - spec_min) / spec_range
        else:
            spec = np.zeros_like(spec)

        # Store the transposed spectrogram together with its folder name.
        all_data.append([spec.T, direct])



In [7]:
# shuffle data (in place; all_data is a plain Python list of [spectrogram, label])
np.random.shuffle(all_data)
# split data to Spectrogram and Label
# Build the outputs directly from the pairs. Passing all_data to np.delete
# forces NumPy to convert a ragged [2-D array, str] list into an ndarray,
# which raises ValueError on NumPy >= 1.24.
# spec_all stays a 1-D object array whose elements are the 2-D spectrograms;
# it is filled element by element because assigning a list of equally shaped
# arrays to a slice would be broadcast rather than stored per element.
spec_all = np.empty(len(all_data), dtype=object)
for clip_idx, clip_pair in enumerate(all_data):
    spec_all[clip_idx] = clip_pair[0]
# target_all stays a list of single-element lists, e.g. [['yes'], ['no'], ...].
target_all = [[clip_pair[1]] for clip_pair in all_data]

In [8]:
num_examples = len(target_all)

# get 80% train indices (sampled without replacement)
train_indices = np.random.choice(num_examples,
                                 round(num_examples * 0.8), replace=False)
# get 20% test indices without train indices
# setdiff1d returns a sorted integer array even when nothing is left over;
# converting an empty Python set produced a float64 array, which cannot be
# used as an index.
test_indices = np.setdiff1d(np.arange(num_examples), train_indices)

# Arrangement
# Going through a list makes np.array stack the per-clip arrays into one
# regular array; calling np.array on the object array directly would keep
# it as a 1-D object array.
spec_vals = np.array(list(spec_all))
target_vals = np.array(list(target_all))

# split data train and test
train_spec = spec_vals[train_indices]
train_target = target_vals[train_indices]
test_spec = spec_vals[test_indices]
test_target = target_vals[test_indices]

In [13]:
# Spot-check the first training spectrogram (NumPy abbreviates the array in the output).
train_spec[0]

array([[0.78473073, 0.6655199 , 0.7712438 , ..., 0.7938667 , 0.7821201 ,
        0.7791913 ],
       [0.78149277, 0.7461919 , 0.7597199 , ..., 0.77292025, 0.7490061 ,
        0.7811874 ],
       [0.6986473 , 0.6939601 , 0.7037581 , ..., 0.6874292 , 0.6938926 ,
        0.72256225],
       ...,
       [0.31088853, 0.3300051 , 0.24489383, ..., 0.26862344, 0.3622802 ,
        0.3643632 ],
       [0.28287166, 0.28495398, 0.26801312, ..., 0.2821961 , 0.22291708,
        0.33880827],
       [0.2537828 , 0.18082158, 0.29916936, ..., 0.3052732 , 0.25261372,
        0.3096    ]], dtype=float32)