## Mount Drive

In [None]:
# Mount Google Drive inside the Colab VM at /content/gdrive so the dataset
# and the saved model files under MyDrive are reachable as ordinary paths.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
!pip install spafe



## Importing Libraries


In [None]:
import tensorflow as tf
print(tf.__version__)

import librosa
import sys,os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow.keras.applications import VGG16
import sys,os
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv1D, Input, MaxPooling1D , Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.utils import to_categorical 

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

%matplotlib inline

2.4.0


# Defining Features

In [None]:
def mfcc_feature(audio, sample_rate):
    """Compute 40 Mel-frequency cepstral coefficients for an audio signal.

    Args:
        audio: audio time series, forwarded to librosa as ``y``.
        sample_rate: sampling rate of ``audio``, forwarded as ``sr``.

    Returns:
        np.ndarray of shape (40, n), where n is the number of audio frames.
    """
    return librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)

def melspectrogram_feature(audio, sample_rate):
    """Compute the mel-scaled spectrogram of an audio signal (FFT window 2048).

    Args:
        audio: audio time series, forwarded to librosa as ``y``.
        sample_rate: sampling rate of ``audio``, forwarded as ``sr``.

    Returns:
        np.ndarray of shape (128, n), where n is the number of audio frames.
    """
    return librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_fft=2048)

def poly_feature(audio, sample_rate):
    """Fit polynomial coefficients to each spectrogram frame (FFT window 2048).

    Args:
        audio: audio time series, forwarded to librosa as ``y``.
        sample_rate: sampling rate of ``audio``, forwarded as ``sr``.

    Returns:
        np.ndarray of shape (2, n), where n is the number of audio frames.
    """
    return librosa.feature.poly_features(y=audio, sr=sample_rate, n_fft=2048)

def zero_crossing_rate_features(audio):
    """Compute the per-frame zero-crossing rate of an audio signal.

    Args:
        audio: audio time series, forwarded to librosa as ``y``.

    Returns:
        np.ndarray of shape (1, n), where n is the number of audio frames.
    """
    return librosa.feature.zero_crossing_rate(y=audio)


## Normalize Functions

In [None]:
def normalize_2d(v):
  """Scale every entry along the first axis of ``v`` to unit L2 norm.

  Each ``v[i]`` is divided by its own Euclidean norm; entries whose norm is
  zero are left unchanged (all zeros), so no division by zero occurs.

  Unlike an in-place loop, this works on a floating-point copy: the caller's
  array is never modified, and integer input is promoted to float instead of
  being silently truncated on assignment.

  Args:
    v: array-like with at least one dimension (typically (rows, frames)).

  Returns:
    A new np.ndarray with the same shape as ``v``. Floating dtypes are
    preserved (float32 stays float32); integer dtypes become float64.
  """
  source = np.asarray(v)
  # astype() copies by default, so the input stays untouched.
  work = source.astype(np.result_type(source.dtype, np.float32))
  if work.size == 0:
    return work

  # Flatten everything after the first axis so one norm is computed per entry.
  rows = work.reshape(work.shape[0], -1)
  norms = np.linalg.norm(rows, axis=1)
  scalable = norms != 0
  rows[scalable] = rows[scalable] / norms[scalable][:, np.newaxis]
  return rows.reshape(work.shape)

def normalize_1d(v):
  """Scale ``v`` to unit L2 norm.

  Args:
    v: array whose Euclidean norm is used as the divisor.

  Returns:
    ``v / norm``; when the norm is zero, ``v`` itself is returned unchanged.
  """
  magnitude = np.linalg.norm(v)
  return v if magnitude == 0 else v / magnitude

# Audio Files to DataFrame

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Folder that holds one sub-folder of audio clips per word label.
root = "gdrive/MyDrive/word_data/kaggle_original_data/train/audio/"

max_len = 300        # frames kept for MFCC (40 x 300) and mel-spectrogram (128 x 300)
poly_len = 1536      # frames kept for poly features: 2 x 1536 = 32 * 32 * 3 values
zcr_len = 3072       # frames kept for ZCR: 1 x 3072 = 32 * 32 * 3 values
sample_rate = 16000
no_of_samples = 800  # maximum number of clips read per label
labels = ['bed', 'cat', 'down', 'left', 'no', 'right', 'seven', 'stop', 'yes', 'up']


def _fit_width(feature, width):
  """Zero-pad or trim a (rows, frames) array so it has exactly `width` frames.

  All clips must yield identically shaped features so they can be stacked
  into one array per feature for the network inputs.
  """
  missing = width - feature.shape[1]
  if missing > 0:
    feature = np.pad(feature, pad_width=((0, 0), (0, missing)), mode='constant')
  return feature[:, :width]


# Collect one record per clip and build the DataFrame once at the end;
# growing a DataFrame row by row re-allocates it on every insertion.
records = []

for label in labels:

  print(label)
  folders = os.path.join(root, label)
  # os.listdir returns entries in arbitrary order; sorting makes the
  # `no_of_samples` subset the same on every run.
  items = sorted(os.listdir(folders))

  for item in items[:no_of_samples]:

    path = os.path.join(folders, item)

    # Decode the audio file into a 1-D sample array at `sample_rate`.
    samples = librosa.load(path, sr=sample_rate)[0]

    # Extract features
    MFCC = mfcc_feature(samples, sample_rate)
    MSS = melspectrogram_feature(samples, sample_rate)
    poly = poly_feature(samples, sample_rate)
    ZCR = zero_crossing_rate_features(samples)

    # Only the MFCCs are normalised (each coefficient row to unit L2 norm).
    MFCC = normalize_2d(MFCC)

    records.append({
        'MFCC': _fit_width(MFCC, max_len),
        'Mel-scaled-spectrogram': _fit_width(MSS, max_len),
        'Poly': _fit_width(poly, poly_len),
        'ZCR': _fit_width(ZCR, zcr_len),
        'class': label,
    })

Featured_data = pd.DataFrame(
    records,
    columns=['MFCC', 'Mel-scaled-spectrogram', 'Poly', 'ZCR', 'class'])

bed
cat
down
left
no
right
seven
stop
yes
up


In [None]:
# Preview the first rows of the extracted-feature table.
Featured_data.head()

Unnamed: 0,MFCC,Mel-scaled-spectrogram,Poly,ZCR,pitch,LPCC,RPLP,class
0,"[[-0.19713012389135587, -0.1945905536383745, -...","[[0.0001326223119728629, 0.0003938418726812632...","[[-2.9107745546378306e-06, -2.276604681310361e...","[[0.11376953125, 0.14794921875, 0.20556640625,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[-2.8603629233984336, -1.4728316731866917, -0...","[[-0.6558703204441466, -0.3993365324161662, -0...",bed
1,"[[-0.2753804014193173, -0.2743249005224783, -0...","[[0.00018727586306803044, 4.8209101116233705e-...","[[-5.03670855138826e-06, -5.279823526322521e-0...","[[0.0869140625, 0.13623046875, 0.18701171875, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[-2.8603629368672423, -1.4728316801219266, -0...","[[-0.6558703204441466, -0.3993365324161662, -0...",bed
2,"[[-0.24450878886256341, -0.24450878886256341, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 9.383146488754954e-...","[[0.0, 0.0, 0.0, 0.0, 0.0, -4.111676586916924e...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0029296875, 0.012...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[-2.860362985281585, -1.472831705050993, -0.2...","[[-0.6558703204441466, -0.3993365324161662, -0...",bed
3,"[[-0.19789257887175704, -0.19176426496606228, ...","[[0.011882200598090163, 0.009293751574537351, ...","[[4.4662728760687e-06, 2.6258644590753796e-06,...","[[0.1123046875, 0.15966796875, 0.1953125, 0.18...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[-2.86036306482868, -1.4728317460106488, -0.2...","[[-0.6558703204441466, -0.3993365324161662, -0...",bed
4,"[[-0.18748303586367043, -0.1949403284832351, -...","[[2.8792433556922997e-05, 8.247128168659806e-0...","[[-7.717891492684842e-06, -7.2206441119186415e...","[[0.03173828125, 0.04248046875, 0.04833984375,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[-2.8603629498162317, -1.4728316867895006, -0...","[[-0.6558703204441466, -0.3993365324161662, -0...",bed


In [None]:
# np.set_printoptions(threshold=sys.maxsize)

In [None]:
#Saving DataFrame into .CSV

# Featured_data.to_csv('data.csv', index = False)

## Splitting the data into train and test sets

In [None]:
# Shuffled 80/20 split, stratified on the class column so every label keeps
# the same proportion in both parts; the fixed seed makes it repeatable.
Featured_data_train, Featured_data_test = train_test_split(
    Featured_data,
    train_size=.80,
    shuffle=True,
    stratify=Featured_data['class'].values,
    random_state=42,
)

## Converting DataFrame columns into NumPy arrays

In [None]:
def _column_to_array(frame, column):
  """Stack the per-row values of `column` into a single NumPy array."""
  return np.array(frame[column].tolist())


# Train features
MFCC_train = _column_to_array(Featured_data_train, 'MFCC')
MSS_train = _column_to_array(Featured_data_train, 'Mel-scaled-spectrogram')
poly_train = _column_to_array(Featured_data_train, 'Poly')
ZCR_train = _column_to_array(Featured_data_train, 'ZCR')

# Test features
MFCC_test = _column_to_array(Featured_data_test, 'MFCC')
MSS_test = _column_to_array(Featured_data_test, 'Mel-scaled-spectrogram')
poly_test = _column_to_array(Featured_data_test, 'Poly')
ZCR_test = _column_to_array(Featured_data_test, 'ZCR')

# Labels (still strings at this point)
labels_test = _column_to_array(Featured_data_test, 'class')
labels_train = _column_to_array(Featured_data_train, 'class')

## Transform Labels

In [None]:
# Fit the encoder on the training labels only and reuse that mapping for the
# test labels. Re-fitting on the test set could assign different integer ids
# (or a different one-hot width) if its set of classes differed.
le = LabelEncoder()
labels_train = to_categorical(le.fit_transform(labels_train),
                              num_classes=len(le.classes_))
labels_test = to_categorical(le.transform(labels_test),
                             num_classes=len(le.classes_))

## Reshaping 

In [None]:
# Fold every feature matrix into a 3-channel, image-like tensor
# (samples, height, width, 3) matching the Input shapes of the model.

# Train features
MFCC_train = MFCC_train.reshape((len(MFCC_train), 80, 50, 3))
MSS_train = MSS_train.reshape((len(MSS_train), 128, 100, 3))
poly_train = poly_train.reshape((len(poly_train), 32, 32, 3))
ZCR_train = ZCR_train.reshape((len(ZCR_train), 32, 32, 3))

# Test features
MFCC_test = MFCC_test.reshape((len(MFCC_test), 80, 50, 3))
MSS_test = MSS_test.reshape((len(MSS_test), 128, 100, 3))
poly_test = poly_test.reshape((len(poly_test), 32, 32, 3))
ZCR_test = ZCR_test.reshape((len(ZCR_test), 32, 32, 3))

## Modeling

In [None]:
# One input tensor per feature type; shapes mirror the reshaped feature arrays.
input_MFCC = Input(name='MFCC', shape=(80, 50, 3))
input_MSS = Input(name='MSS', shape=(128, 100, 3))
input_poly = Input(name='poly', shape=(32, 32, 3))
input_ZCR = Input(name='ZCR', shape=(32, 32, 3))

def compute_wave_features(conv, nam=None):
  """Run an input tensor through its own frozen VGG16 base and flatten it.

  A fresh VGG16 (ImageNet weights, no classification head) is created on
  every call, so each feature branch has an independent convolutional base.
  The base is marked non-trainable.

  Args:
    conv: Keras tensor with 3 channels (e.g. the output of ``Input``).
    nam: name given to the VGG16 sub-model. Layer names inside one model must
      be unique, so pass a distinct name per branch. When omitted, a unique
      name is generated from a Keras-managed counter.

  Returns:
    The flattened VGG16 feature tensor for ``conv``.
  """
  conv_base = VGG16(weights='imagenet',
                    include_top=False)
  conv_base.trainable = False

  # Deterministic, collision-free name (a random draw can repeat between
  # branches and changes from run to run).
  if nam is None:
    nam = 'vgg16_base_%d' % tf.keras.backend.get_uid('vgg16_base')
  conv_base._name = nam

  # Frozen VGG16 convolutional base, then flatten to a feature vector.
  return Flatten()(conv_base(conv))

# One VGG16 branch per input feature.
feature_MFCC = compute_wave_features(input_MFCC, 'feature_MFCC')
feature_MSS = compute_wave_features(input_MSS, 'feature_MSS')
feature_poly = compute_wave_features(input_poly, 'feature_poly')
feature_ZCR = compute_wave_features(input_ZCR, 'feature_ZCR')

# Merge the four flattened branches, then classify into the 10 word classes.
branch_outputs = [feature_MFCC, feature_MSS, feature_poly, feature_ZCR]
merged = Concatenate()(branch_outputs)
features = Dense(32, activation='relu')(merged)
readout = Dense(10, activation='softmax', name='readout')(features)

model = Model(inputs=[input_MFCC, input_MSS, input_poly, input_ZCR],
              outputs=readout)
model.summary()

In [None]:
# Multi-class setup: one-hot targets with a softmax readout.
model.compile(
    optimizer='RMSprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
class ActivationLogger(tf.keras.callbacks.Callback):
  """Callback that writes the model architecture and weights to Google Drive.

  Saving happens when training ends, so the weights on disk are the trained
  ones. Saving inside ``set_model`` would run before the first epoch and
  persist the untrained weights.
  """

  def set_model(self, model):
    """Attach the model being trained (called by Keras before training)."""
    super().set_model(model)

  def on_train_end(self, logs=None):
    """Serialize the architecture to JSON and the weights to HDF5."""
    # serialize model to JSON
    model_json = self.model.to_json()
    with open("/content/gdrive/MyDrive/word_data/model_new.json", "w") as json_file:
      json_file.write(model_json)
    # serialize weights to HDF5
    self.model.save_weights("/content/gdrive/MyDrive/word_data/model_new.h5")
    print("Saved model to disk")

In [None]:
# Stop once val_loss has not improved by at least 1e-4 for 10 epochs.
early_stopping = EarlyStopping(monitor='val_loss',
                               mode='min',
                               verbose=1,
                               patience=10,
                               min_delta=0.0001)

train_inputs = [MFCC_train, MSS_train, poly_train, ZCR_train]

history = model.fit(train_inputs,
                    labels_train,
                    epochs=500,
                    batch_size=42,
                    validation_split=.17,
                    callbacks=[early_stopping, ActivationLogger()])

Saved model to disk
Epoch 1/500


In [None]:
# Use the tf.keras loader: the model was built with tf.keras, and mixing in
# the standalone `keras` package can fail to deserialize it.
from tensorflow.keras.models import model_from_json

# NOTE(review): these paths (model.json / model.h5) differ from the files the
# ActivationLogger callback writes (model_new.json / model_new.h5) -- confirm
# which saved model is meant to be loaded here.

# load json and create model
with open('/content/gdrive/MyDrive/word_data/model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("/content/gdrive/MyDrive/word_data/model.h5")

# compile the loaded model so it can be evaluated
# (`learning_rate` replaces the deprecated `lr` argument)
loaded_model.compile(loss='categorical_crossentropy',
                     optimizer=RMSprop(learning_rate=2e-5),
                     metrics=['accuracy'])

In [None]:
# Per-epoch training/validation metrics recorded by model.fit, keyed by metric name.
history.history

In [None]:
# Pull the per-epoch curves out of the training history.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)

# (training curve, validation curve, training label, validation label, title)
curves = [
    (acc, val_acc, 'Training acc', 'Validation acc',
     'Training and validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss',
     'Training and validation loss'),
]

for position, (train_curve, val_curve, train_label, val_label, heading) in enumerate(curves):
  if position > 0:
    plt.figure()  # start a new figure for every plot after the first
  plt.plot(epochs, train_curve, 'ro', label=train_label)
  plt.plot(epochs, val_curve, 'b', label=val_label)
  plt.title(heading)
  plt.legend()

plt.show()

In [None]:
# Evaluate the model on the held-out test data using `evaluate`
print("Evaluate on test data")
test_loss, test_acc = model.evaluate([MFCC_test, MSS_test, poly_test, ZCR_test],
                                     labels_test,
                                     batch_size=32)
# Print both values the label announces, not just the loss.
print("test loss, test acc:", test_loss, test_acc)

Evaluate on test data
test loss, test acc: 2.7033603191375732


In [None]:
# Report the test-set accuracy and loss returned by model.evaluate.
print(f'Accuracy {test_acc} \nloss {test_loss}')

Accuracy 0.15000000596046448 
loss 2.7033603191375732
