In [1]:
import os
import pathlib
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from keras import models, layers

In [None]:
# Fetch the Mini Speech Commands archive and extract it.
# cache_dir / cache_subdir place the download under ./data.
tf.keras.utils.get_file(
    cache_dir=".",
    cache_subdir="data",
    extract=True,
    origin="http://storage.googleapis.com/download.tensorflow.org/data/mini_speech_commands.zip",
)

'./data/mini_speech_commands.zip'

In [None]:
# Folder holding the extracted audio clips; data_dir is the pathlib view of
# the same location that the dataset loader is given.
DATASET_PATH = "data/mini_speech_commands/"
data_dir = pathlib.Path(DATASET_PATH)

In [None]:
# Build the batched training and hold-out datasets from the audio directory.
train_ds, val_ds = tf.keras.utils.audio_dataset_from_directory(
    directory=data_dir,            # root folder of the audio files
    batch_size=64,                 # number of samples in one batch
    validation_split=0.3,          # 70% training, 30% validation
    subset="both",                 # return the training and validation sets together
    seed=0,                        # fixed seed for the split
    output_sequence_length=16000,  # requested length of every audio sequence
)

Found 8000 files belonging to 8 classes.
Using 5600 files for training.
Using 2400 files for validation.


In [None]:
# Collect the label (class) names that the loaded dataset exposes
label_names = np.array(train_ds.class_names)
print(label_names)

['down' 'go' 'left' 'no' 'right' 'stop' 'up' 'yes']


In [None]:
# squeeze -> drop the last axis so the clip is treated as mono audio
# (num_samples, num_channels) -> (num_samples,)

def squeeze(audio, label):
  """Remove the trailing axis of `audio` and pass `label` through unchanged.

  Returns the pair `(squeezed_audio, label)`.
  """
  return tf.squeeze(audio, axis=-1), label

# Apply squeeze to both datasets.
# AUTOTUNE -> lets tf.data decide how many map calls run in parallel,
# based on the resources that are available.
train_ds = train_ds.map(squeeze, num_parallel_calls=tf.data.AUTOTUNE)
val_ds = val_ds.map(squeeze, num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
# Split the hold-out set with shard().
# The hold-out set is 30% of the data, but the task asks for 15% validation
# and 15% test, so it is cut in two halves:
#   test_ds -> shard 0 of 2 (first half of the 30%  = 15%)
#   val_ds  -> shard 1 of 2 (second half of the 30% = 15%)
#
# test_ds has to be taken first, while val_ds still refers to the full hold-out
# set. val_ds is then rebound on purpose: later cells build the validation
# pipeline from val_ds, so leaving it un-sharded would keep every test batch
# inside the validation data.
# NOTE: this cell is not idempotent - running it twice halves val_ds again.

test_ds = val_ds.shard(num_shards=2, index=0)
val_ds = val_ds.shard(num_shards=2, index=1)
# Alias kept so code that refers to the validation half as `val` keeps working.
val = val_ds

# Resulting split: training = 70%, validation = 15%, test = 15%

In [None]:
# Build a spectrogram from a waveform
def get_spectogram(waveform, frame_length=255, frame_step=128):
  """Return the magnitude of the STFT of `waveform` with one extra last axis.

  Args:
    waveform: audio samples, handed to `tf.signal.stft` as-is.
    frame_length: `frame_length` argument forwarded to `tf.signal.stft`.
      Defaults to 255.
    frame_step: `frame_step` argument forwarded to `tf.signal.stft`.
      Defaults to 128.

  Returns:
    `tf.abs` of the STFT result, indexed with `[..., tf.newaxis]` so that it
    carries a trailing axis of size 1.
  """
  # STFT -> Short Time Fourier Transform
  stft = tf.signal.stft(
      waveform,
      frame_length=frame_length,
      frame_step=frame_step,
  )

  # Keep only the magnitude of the complex STFT output
  magnitude = tf.abs(stft)

  # Add one more axis/dimension so the result can be fed to the CNN model
  return magnitude[..., tf.newaxis]

# Helper that makes it easy to apply get_spectogram to a whole dataset

def make_spec_ds(ds):
  """Map every `(audio, label)` element of `ds` to `(get_spectogram(audio), label)`.

  The label is passed through unchanged; the map runs with
  `num_parallel_calls=tf.data.AUTOTUNE`.
  """
  def to_spectrogram(audio, label):
    # only the audio is transformed
    return get_spectogram(audio), label

  return ds.map(map_func=to_spectrogram, num_parallel_calls=tf.data.AUTOTUNE)



In [None]:
# Convert each split to spectrograms and optimize the input pipeline:
# cache() + prefetch(AUTOTUNE) on every split, shuffle() on the training split only.
train_spectogram_ds = (
    make_spec_ds(train_ds)
    .cache()
    .shuffle(10000)
    .prefetch(tf.data.AUTOTUNE)
)
val_spectogram_ds = make_spec_ds(val_ds).cache().prefetch(tf.data.AUTOTUNE)
test_spectogram_ds = make_spec_ds(test_ds).cache().prefetch(tf.data.AUTOTUNE)

In [None]:
# Grab a single batch of the training data

one_batch_ds = train_spectogram_ds.take(1)
for example_sc, example_sc_lbl in one_batch_ds:
  break

In [None]:
# CNN architecture
# input layer  -> entry point of the data into the model (including the
#                 preprocessing done before the data reaches the hidden layers)
# hidden layer -> the core of the model: resize, convolution layers, max pooling
# output layer -> dense / fully connected layer; it has to match the number of
#                 labels / kinds of output

# Shape of a single example: the sample batch's shape without its first axis
input_shape = example_sc.shape[1:]
num_labels = len(label_names)

# The lambda keeps only the spectrograms, because adapt() does not need the labels
norm_layer = layers.Normalization()
norm_layer.adapt(data=train_spectogram_ds.map(lambda spec, label: spec))

model = models.Sequential([
    # input layer
    layers.Input(shape=input_shape),
    # resize layer -> resizes the spectrograms so they are all uniform
    layers.Resizing(32, 32),
    # normalization layer -> brings all the data into the same range
    norm_layer,

    # hidden layers -> convolution, max pooling, dropout
    # 32 filters, 3x3 filter size:
    # produces 32 basic patterns from every spectrogram
    layers.Conv2D(32, 3, activation="relu"),
    layers.Conv2D(64, 3, activation="relu"),

    # max pooling -> extracts the dominant features/patterns, helps prevent overfitting
    layers.MaxPooling2D(),
    # dropout -> randomly drops data to keep the model from overfitting (0.2 = 20%)
    layers.Dropout(0.2),

    # overfitting  = the model is too good at remembering what it was trained on,
    #                which makes it rigid and bad at recognizing new data; it
    #                usually happens because the model is too complex
    # underfitting = the model simply has not learned enough

    # flatten layer
    layers.Flatten(),

    # dense layer
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.1),

    # output layer -> one unit per label
    layers.Dense(num_labels),
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 resizing_3 (Resizing)       (None, 32, 32, 1)         0         
                                                                 
 normalization_3 (Normaliza  (None, 32, 32, 1)         3         
 tion)                                                           
                                                                 
 conv2d_6 (Conv2D)           (None, 30, 30, 32)        320       
                                                                 
 conv2d_7 (Conv2D)           (None, 28, 28, 64)        18496     
                                                                 
 max_pooling2d_3 (MaxPoolin  (None, 14, 14, 64)        0         
 g2D)                                                            
                                                                 
 dropout_3 (Dropout)         (None, 14, 14, 64)        0