In [212]:
import os
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf

from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras import layers
from tensorflow.keras import models
from IPython import display
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.utils import to_categorical
from keras import optimizers

In [213]:
# Root folder of the training clips. Its immediate sub-folder names are listed
# below as the command classes, and clips are globbed as <data_dir>/*/*.
# NOTE(review): hard-coded absolute Windows path; the notebook only runs on a
# machine with this exact layout.
data_dir = pathlib.Path("D:\\speech recog\\train set2")

In [216]:
# Class names are the sub-folder names of the training directory. The position
# of a name in this array is its integer label id (see the argmax against
# `commands` in get_spectrogram_and_label_id), so the order is pinned by
# sorting instead of being left to the filesystem's enumeration order.
commands = np.array(sorted(tf.io.gfile.listdir(str(data_dir))))
print('Commands:', commands)

Commands: ['คอส' 'คูณ' 'บวก' 'ยกกำลัง' 'ยี่' 'ร้อย' 'ลบ' 'วาย' 'ศูนย์' 'สอง' 'สาม'
 'สิบ' 'สี่' 'ส่วน' 'หก' 'หนึ่ง' 'หาร' 'ห้า' 'เก้า' 'เจ็ด' 'เท่ากับ' 'เศษ'
 'เอ็กซ์' 'เอ็ด' 'แซด' 'แทน' 'แปด' 'ไซน์']


In [217]:
# Fixed seed so that a fresh "Restart & Run All" shuffles the training files
# into the same order every time.
SEED = 42
tf.random.set_seed(SEED)

# Sort first so the shuffle starts from a well-defined order regardless of how
# the filesystem enumerates the clips.
filenames = sorted(tf.io.gfile.glob(str(data_dir) + '/*/*'))
filenames = tf.random.shuffle(filenames, seed=SEED)
num_samples = len(filenames)
print('Number of total examples:', num_samples)
# Only the first class folder is counted here; the figure is representative
# only if every class holds the same number of clips.
print('Number of examples per label:',
      len(tf.io.gfile.listdir(str(data_dir/commands[0]))))
print('Example file tensor:', filenames[0])

Number of total examples: 4480
Number of examples per label: 160
Example file tensor: tf.Tensor(b'D:\\speech recog\\train set2\\\xe0\xb9\x80\xe0\xb8\xa8\xe0\xb8\xa9\\\xe0\xb9\x80\xe0\xb8\xa8\xe0\xb8\xa9200cut238.wav', shape=(), dtype=string)


In [218]:
# The test and validation clips live in their own folders and are globbed with
# the same <folder>/*/* pattern as the training set.
test_dir = pathlib.Path("D:\\speech recog\\test set")
val_dir = pathlib.Path("D:\\speech recog\\validation")

testfiles = tf.io.gfile.glob(str(test_dir) + '/*/*')
valfiles = tf.io.gfile.glob(str(val_dir) + '/*/*')

# Training uses the shuffled file tensor built earlier; the other two splits
# are plain Python lists of paths.
train_files, val_files, test_files = filenames, valfiles, testfiles

print('Training set size', len(train_files))
print('Validation set size', len(val_files))
print('Test set size', len(test_files))

Training set size 4480
Validation set size 560
Test set size 840


In [219]:
def decode_audio(audio_binary):
    """Decode the bytes of a WAV file into a 1-D waveform tensor.

    Args:
        audio_binary: scalar string tensor holding the raw contents of a WAV
            file (as returned by `tf.io.read_file`).

    Returns:
        The decoded samples with the channel axis removed. The sample rate
        reported by the decoder is discarded.
    """
    # Ask the decoder for exactly one channel so the squeeze below is valid
    # for multi-channel recordings too, instead of raising on them.
    audio, _ = tf.audio.decode_wav(audio_binary, desired_channels=1)
    return tf.squeeze(audio, axis=-1)

In [220]:
def get_label(file_path):
    """Return the name of the folder that directly contains `file_path`.

    The path is split on the platform separator (`os.path.sep`) and the
    second-to-last component is returned as a string tensor.
    """
    # Plain indexing (not tuple unpacking) keeps this usable inside a
    # TensorFlow graph, e.g. when called from Dataset.map.
    return tf.strings.split(file_path, os.path.sep)[-2]

In [222]:
def get_waveform_and_label(file_path):
    """Load one audio file and pair its waveform with its folder label.

    Args:
        file_path: scalar string tensor with the path of a WAV file.

    Returns:
        A `(waveform, label)` tuple: the output of `decode_audio` on the file
        contents and the output of `get_label` on the path.
    """
    samples = decode_audio(tf.io.read_file(file_path))
    return samples, get_label(file_path)

In [223]:
# Training input pipeline: a dataset of file paths, each mapped to a
# (waveform, label) pair by get_waveform_and_label.
# AUTOTUNE is passed as num_parallel_calls to every Dataset.map in this notebook.
AUTOTUNE = tf.data.AUTOTUNE
files_ds = tf.data.Dataset.from_tensor_slices(train_files)
waveform_ds = files_ds.map(get_waveform_and_label, num_parallel_calls=AUTOTUNE)

In [110]:
# Inspect the element shapes/dtypes of the (waveform, label) dataset.
print(waveform_ds)

<ParallelMapDataset shapes: ((None,), ()), types: (tf.float32, tf.string)>


In [111]:
# Inspect the file-path dataset and the raw value of the AUTOTUNE constant.
print(files_ds)
print(AUTOTUNE)

<TensorSliceDataset shapes: (), types: tf.string>
-1


In [224]:
def get_spectrogram(waveform, input_len=217413):
    """Convert a 1-D waveform into a fixed-size magnitude spectrogram.

    The waveform is truncated or zero-padded to exactly `input_len` samples
    so that every clip yields a spectrogram of the same shape, then passed
    through a short-time Fourier transform.

    Args:
        waveform: 1-D tensor of audio samples.
        input_len: number of samples every clip is brought to. The default
            is presumably the longest training clip (see the cell that
            takes `np.max(shape)`) -- TODO confirm.

    Returns:
        A float32 tensor with the STFT magnitudes. No channel axis is added
        here; `get_spectrogram_and_label_id` appends it.
    """
    # Drop anything beyond input_len and make sure the samples are float32
    # so they can be concatenated with the float32 padding.
    waveform = tf.cast(waveform[:input_len], dtype=tf.float32)
    # Zero-pad clips shorter than input_len up to the common length.
    zero_padding = tf.zeros([input_len] - tf.shape(waveform), dtype=tf.float32)
    equal_length = tf.concat([waveform, zero_padding], 0)
    # STFT followed by the magnitude of the complex result.
    spectrogram = tf.signal.stft(equal_length, frame_length=255, frame_step=128)
    return tf.abs(spectrogram)

In [12]:
# Inspect a single training example. take(1) avoids decoding and transforming
# the entire training set just to display one clip.
for waveform, label in waveform_ds.take(1):
    label = label.numpy().decode('utf-8')
    spectrogram = get_spectrogram(waveform)

print('Label:', label)
print('Waveform shape:', waveform.shape)
print('Spectrogram shape:', spectrogram.shape)
print('Audio playback')
# NOTE(review): playback assumes 44.1 kHz recordings; decode_audio discards the
# sample rate stored in the file, so this is not checked -- confirm.
display.display(display.Audio(waveform, rate=44100))

Label: เศษ
Waveform shape: (20109,)
Spectrogram shape: (1697, 129)
Audio playback


In [None]:
# Length (in samples) of every training waveform, gathered in one pass.
# Building the array once avoids re-allocating it on every iteration, which is
# what np.append inside the loop did.
shape = np.array([int(waveform.shape[0]) for waveform, _ in waveform_ds])

In [None]:
# Largest value in `shape`, i.e. the longest waveform length that was collected.
np.max(shape)

In [225]:
def get_spectrogram_and_label_id(audio, label):
    """Turn a (waveform, label) pair into a (spectrogram, class id) pair.

    Args:
        audio: 1-D waveform, forwarded to `get_spectrogram`.
        label: class name; compared element-wise against the module-level
            `commands` array.

    Returns:
        `(spectrogram, label_id)`, where the spectrogram has a trailing
        channel axis of size 1 and `label_id` is the argmax of the
        comparison against `commands`.
    """
    matches = label == commands
    # Trailing axis so convolution layers see an image-like (H, W, 1) input.
    features = get_spectrogram(audio)[..., tf.newaxis]
    return features, tf.argmax(matches)

In [None]:
# Sanity check on the first ten training examples: print the
# (spectrogram, label_id) tuple and its length for each.
for waveform, label in waveform_ds.take(10):
    # The label is decoded to a Python str here, unlike in the dataset
    # pipeline where it stays a string tensor.
    label = label.numpy().decode('utf-8')
    A = get_spectrogram_and_label_id(waveform, label)
    print(A)
    print(len(A))

In [226]:
# Map every (waveform, label) training pair to (spectrogram, label_id).
spectrogram_ds = waveform_ds.map(
    map_func=get_spectrogram_and_label_id,
    num_parallel_calls=AUTOTUNE)

In [17]:
# Display the element shapes/dtypes of the spectrogram dataset.
spectrogram_ds

<ParallelMapDataset shapes: ((None, 129, 1), ()), types: (tf.float32, tf.int64)>

In [227]:
def preprocess_dataset(files):
    """Build a (spectrogram, label_id) dataset from a collection of file paths.

    Args:
        files: file paths accepted by `Dataset.from_tensor_slices`.

    Returns:
        An unbatched dataset in which every path has gone through
        `get_waveform_and_label` and then `get_spectrogram_and_label_id`.
    """
    return (
        tf.data.Dataset.from_tensor_slices(files)
        .map(get_waveform_and_label, num_parallel_calls=AUTOTUNE)
        .map(get_spectrogram_and_label_id, num_parallel_calls=AUTOTUNE)
    )

In [228]:
# Training reuses the spectrogram dataset built above; the validation and test
# splits go through the same two-step preprocessing via preprocess_dataset.
train_ds = spectrogram_ds
val_ds = preprocess_dataset(val_files)
test_ds = preprocess_dataset(test_files)

In [118]:
# Display the element shapes/dtypes of the validation dataset.
val_ds

<ParallelMapDataset shapes: ((None, 129, 1), ()), types: (tf.float32, tf.int64)>

In [None]:
# Print the first four (spectrogram, label id) training pairs for inspection.
for spectrogram,label in spectrogram_ds.take(4):
    print(spectrogram)
    print("label:",label)

In [229]:
# Group the training and validation examples into batches of `batch_size`.
# test_ds is not batched here.
# NOTE(review): train_ds / val_ds are rebound in place, so re-running this cell
# batches the already-batched datasets a second time.
batch_size = 64
train_ds = train_ds.batch(batch_size)
val_ds = val_ds.batch(batch_size)

In [230]:
# Cache the batched datasets and prefetch upcoming batches.
# The cache is applied after batching, so batch composition is fixed once cached.
train_ds = train_ds.cache().prefetch(AUTOTUNE)
val_ds = val_ds.cache().prefetch(AUTOTUNE)

In [22]:
# Read the spectrogram shape off the first training example and count the
# classes; get_model reads `num_labels` for the size of its output layer.
for spectrogram, _ in spectrogram_ds.take(1):
    input_shape = spectrogram.shape
print('Input shape:', input_shape)
num_labels = len(commands)

Input shape: (1697, 129, 1)


"norm_layer = preprocessing.Normalization()\nnorm_layer.adapt(data=spectrogram_ds.map(map_func=lambda spec, label: spec))\n\nmodel = models.Sequential([\n    layers.Input(shape=input_shape),\n    preprocessing.Resizing(32, 32), \n    norm_layer,\n    layers.Conv2D(32, 3, activation='relu'),\n    layers.Conv2D(64, 3, activation='relu'),\n    layers.MaxPooling2D(),\n    layers.Dropout(0.25),\n    layers.Flatten(),\n    layers.Dense(128, activation='relu'),\n    layers.Dropout(0.5),\n    layers.Dense(num_labels),\n])"

Model Training

In [51]:
def get_model(input_shape=(1697, 129, 1), n_classes=None):
    """Build the (uncompiled) CNN that classifies spectrograms into commands.

    Args:
        input_shape: shape of one spectrogram, without the batch axis.
        n_classes: size of the softmax output layer. Defaults to the
            module-level `num_labels`.

    Returns:
        A Keras Sequential model whose output is a softmax distribution
        over the classes (probabilities, not logits).
    """
    if n_classes is None:
        n_classes = num_labels

    # The input shape is declared on an explicit Input layer at the front of
    # the stack. It used to be passed to the second layer, where Sequential
    # does not use it to build the model.
    return models.Sequential([
        layers.Input(shape=input_shape),
        # Shrink every spectrogram to 32x32 before the convolutions.
        preprocessing.Resizing(32, 32),
        layers.Conv2D(32, kernel_size=(2, 2), activation='relu'),
        layers.Conv2D(48, kernel_size=(2, 2), activation='relu'),
        layers.Conv2D(120, kernel_size=(2, 2), activation='relu'),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Dropout(0.25),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.25),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.4),
        layers.Dense(n_classes, activation='softmax'),
    ])

In [53]:
model = get_model()

optimizer = tf.keras.optimizers.Adam()

# get_model ends in a softmax layer, so the network outputs probabilities.
# The loss must therefore be told NOT to expect logits; with from_logits=True
# a second softmax would be applied inside the loss.
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              optimizer=optimizer,
              metrics=['accuracy'])
# train_ds / val_ds are already batched tf.data datasets, so no batch_size is
# passed to fit.
model.fit(train_ds, epochs=30, verbose=1, validation_data=val_ds)

Epoch 1/30




Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x222837b2c10>

In [61]:
#model.save("model_Spectrogram1.h5")

In [231]:
# Replace whatever `model` currently holds with the network stored on disk.
# Uses the `models` module already imported from tensorflow.keras at the top
# of the notebook instead of a mid-notebook import from the `keras` package.
# The file must already exist: the save call in the cell above is commented out.
model = models.load_model('model_Spectrogram1.h5')
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
resizing_2 (Resizing)        (None, 32, 32, 1)         0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 31, 31, 32)        160       
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 30, 30, 48)        6192      
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 29, 29, 120)       23160     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 14, 14, 120)       0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 14, 14, 120)       0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 23520)            

In [240]:
# Materialise the (unbatched) test dataset as two NumPy arrays: one stacking
# the spectrograms, one holding the matching integer label ids.
test_pairs = [(spec.numpy(), label_id.numpy()) for spec, label_id in test_ds]

test_audio = np.array([spec for spec, _ in test_pairs])
test_labels = np.array([label_id for _, label_id in test_pairs])

In [124]:
# Predicted class = index of the highest score in each row of the model output.
class_scores = model.predict(test_audio)
y_pred = class_scores.argmax(axis=1)
y_true = test_labels

# Fraction of test clips whose predicted id equals the true id.
n_correct = (y_pred == y_true).sum()
test_acc = n_correct / len(y_true)
print(f'Test set accuracy: {test_acc:.0%}')

Test set accuracy: 92%


In [130]:
# Translate every true class id into its command word with a single indexing
# operation instead of growing an array with np.append inside a loop.
y_label = commands[np.asarray(y_true, dtype=int)]

In [132]:
# Translate every predicted class id into its command word with a single
# indexing operation instead of growing an array with np.append inside a loop.
ypred_label = commands[np.asarray(y_pred, dtype=int)]

In [133]:
# TRUE labels of the misclassified test clips, one entry per miss, in test-set
# order. Despite the name, these are not the wrong predictions themselves.
# A boolean mask replaces the element-by-element np.append loop.
_true_words = np.asarray(y_label)
wrong_pred = _true_words[np.asarray(ypred_label) != _true_words]

In [136]:
# Display the true labels of the clips that were misclassified.
wrong_pred

array(['คูณ', 'คูณ', 'คูณ', 'คอส', 'ลบ', 'ลบ', 'ลบ', 'ลบ', 'ลบ', 'ลบ',
       'ศูนย์', 'ศูนย์', 'ศูนย์', 'ศูนย์', 'ศูนย์', 'วาย', 'วาย', 'สอง',
       'ร้อย', 'หาร', 'หาร', 'แปด', 'ส่วน', 'แปด', 'แปด', 'ส่วน', 'ส่วน',
       'ส่วน', 'ส่วน', 'ส่วน', 'ส่วน', 'ส่วน', 'ส่วน', 'ส่วน', 'ไซน์',
       'แทน', 'หก', 'หก', 'หก', 'หก', 'หก', 'หก', 'หก', 'หก', 'หก', 'หก',
       'หก', 'หก', 'หก', 'หก', 'หก', 'เอ็กซ์', 'เอ็กซ์', 'เอ็กซ์',
       'เอ็กซ์', 'เอ็กซ์', 'เจ็ด', 'เจ็ด', 'เจ็ด', 'เจ็ด', 'เจ็ด', 'เจ็ด',
       'เจ็ด', 'เจ็ด', 'เก้า', 'ห้า'], dtype='<U32')

In [200]:
import pandas as pd

# Second evaluation set, described by a CSV with one row per clip:
# `file_location` holds the path and `label` the expected word.
df_test = pd.read_csv('testaccequation.csv')

file_n = df_test['file_location'].tolist()
label_n = df_test['label'].tolist()

In [201]:
# Run the CSV's file list through the same preprocessing as the other splits.
# The label ids in this dataset come from get_label (the folder name in each
# path), not from the CSV's `label` column; the next cell ignores them.
sampleCon_ds=preprocess_dataset(file_n)
sampleCon_ds

<ParallelMapDataset shapes: ((None, 129, 1), ()), types: (tf.float32, tf.int64)>

In [202]:
# Stack every spectrogram of the CSV-driven dataset into one NumPy array;
# the label ids carried by the dataset are dropped.
testCon = np.array([spec.numpy() for spec, _ in sampleCon_ds])

In [203]:
# Model output for every clip: one row of per-class scores per sample
# (ranked against `commands` in the next cell).
sampleCon = model.predict(testCon)
#sampleCon

In [204]:
# For every sample, rank the classes from highest to lowest score and keep the
# three best candidates as words.
# A stable sort on the negated scores puts tied classes in ascending index
# order, the same order the previous repeated argmax-and-delete loop produced.
ranked_ids = np.argsort(-np.asarray(sampleCon), axis=1, kind='stable')[:, :3]

finalCon = commands[ranked_ids[:, 0]]   # best guess per sample
final_2nd = commands[ranked_ids[:, 1]]  # second candidate
final_3rd = commands[ranked_ids[:, 2]]  # third candidate

print("Predict",finalCon)
print("2nd rank",final_2nd)
print("3rd rank",final_3rd)

Predict ['สิบ' 'ร้อย' 'สอง' 'ลบ' 'สิบ' 'สิบ' 'สาม' 'หก' 'สอง' 'สิบ' 'สิบ' 'บวก'
 'สิบ' 'สิบ' 'แปด' 'สิบ' 'แปด' 'บวก' 'สิบ' 'เอ็กซ์' 'ลบ' 'สิบ' 'สิบ' 'สิบ'
 'ลบ' 'แปด' 'สิบ' 'สิบ' 'สอง' 'ลบ' 'สี่' 'สิบ' 'สอง' 'ลบ' 'คอส' 'สิบ'
 'แซด' 'หก' 'สิบ' 'หก' 'สิบ' 'สอง' 'ลบ' 'สิบ' 'สิบ' 'บวก' 'สิบ' 'สิบ'
 'สิบ' 'สิบ' 'สิบ' 'สอง' 'ลบ' 'สิบ' 'เอ็ด' 'สิบ' 'สิบ' 'สิบ' 'สิบ' 'สิบ'
 'สิบ' 'แปด' 'สิบ' 'สิบ' 'สิบ' 'สิบ' 'ลบ' 'ห้า' 'สิบ' 'เอ็ด' 'ลบ' 'สิบ'
 'คอส' 'สิบ' 'สิบ' 'แปด' 'ลบ' 'คอส' 'สิบ' 'สี่' 'บวก' 'สิบ' 'ห้า' 'แปด'
 'ร้อย' 'สิบ' 'สาม' 'บวก' 'เก้า' 'สิบ' 'สิบ' 'บวก' 'เอ็กซ์' 'สิบ' 'สิบ'
 'ลบ' 'สิบ' 'สิบ' 'สิบ' 'แซด' 'ร้อย' 'สิบ' 'เก้า' 'บวก' 'เก้า' 'สิบ' 'สิบ'
 'สิบ' 'สิบ' 'สิบ' 'สิบ' 'ลบ' 'สิบ' 'สิบ' 'สิบ' 'เจ็ด' 'ร้อย' 'สี่' 'สิบ'
 'สิบ' 'ลบ' 'หก' 'สิบ' 'หก' 'ลบ' 'เก้า' 'บวก' 'สิบ' 'สิบ' 'สี่' 'เอ็กซ์'
 'ไซน์' 'สอง' 'บวก' 'สี่' 'เศษ' 'วาย' 'หาร' 'แซด' 'เท่ากับ' 'ศูนย์' 'เก้า'
 'ร้อย' 'ยกกำลัง' 'สอง' 'บวก' 'แปด' 'สิบ' 'เอ็ด' 'บวก' 'เอ็กซ์' 'หาร'
 'ไซน์' 'ยกกำลัง' 'ยี่' 'บวก' 'ห้า' 'เศษ' 'ยี่' '

In [208]:
# Top-1 accuracy on the CSV set: the best candidate must equal the CSV label.
y_predEq, y_trueEq = finalCon, label_n

top1_hits = (y_predEq == y_trueEq).sum()
test_accEq = top1_hits / len(y_trueEq)
print(f'Test set accuracy: {test_accEq:.02%}')

Test set accuracy: 64.89%


In [209]:
# Top-2 accuracy: a sample counts as a hit when the CSV label equals either
# the first or the second ranked candidate (hit counts are simply added).
hits_rank1 = (y_predEq == y_trueEq).sum()
hits_rank2 = (final_2nd == y_trueEq).sum()
test_accEq2 = (hits_rank1 + hits_rank2) / len(y_trueEq)
print(f'Test set accuracy: {test_accEq2:.0%}')

Test set accuracy: 84%


In [210]:
# Top-3 accuracy: add up the hits of the first, second and third ranked
# candidates and divide by the number of samples.
hits_per_rank = [
    (ranked == y_trueEq).sum()
    for ranked in (y_predEq, final_2nd, final_3rd)
]
test_accEq3 = (hits_per_rank[0] + hits_per_rank[1] + hits_per_rank[2]) / len(y_trueEq)
print(f'Test set accuracy: {test_accEq3:.0%}')

Test set accuracy: 91%


In [None]:
# Clips of one recorded utterance, matched by filename pattern.
# NOTE(review): the clips are later joined into a sentence in the order the
# glob returns them; verify that this order matches the spoken word order.
sample_pattern = "D:\\speech recog\\testttt\\T4Acut*.wav"
samples = tf.io.gfile.glob(sample_pattern)
# Rebinds `num_samples`, which earlier held the training-set size.
num_samples = len(samples)
print(samples)
print(num_samples)

In [37]:
# Predict one word per clip in `samples` (in list order) and join the words
# into a sentence.
PREDICT_BATCH = 32

if len(samples) > 0:
    sample_ds = (
        preprocess_dataset([str(path) for path in samples])
        # Keep only the spectrograms; the label ids derived from the folder
        # name are not needed for prediction.
        .map(lambda spectrogram, _: spectrogram)
        .batch(PREDICT_BATCH)
    )
    # argmax picks exactly one class per clip, even when scores tie, and is
    # unaffected by softmax, so the scores need no further normalisation.
    best_ids = np.argmax(model.predict(sample_ds), axis=1)
    final = commands[best_ids]
else:
    # No clips matched: produce an empty result instead of failing on final[0].
    final = np.array([], dtype=commands.dtype)

for result in final:
    print(result)

sen = "".join(final)
print(sen)

หาร
ร้อย
สิบ
เก้า
บวก
เก้า
สิบ
สิบ
บวก
เอ็กซ์
สิบ
สิบ
ลบ
เอ็ด
สิบ
สิบ
หารร้อยสิบเก้าบวกเก้าสิบสิบบวกเอ็กซ์สิบสิบลบเอ็ดสิบสิบ
