In [None]:
# Install TensorFlow and tensorflow_io into the notebook runtime; -q keeps pip's output short.
!pip install -q tensorflow tensorflow_io

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Open Colab's upload dialog and keep its return value in `uploaded`.
# In the recorded run the uploaded file was guess.zip, which a later cell
# unzips from /content/guess.zip.
from google.colab import files
uploaded = files.upload()

Saving guess.zip to guess.zip


In [None]:
!wget http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz

--2025-05-05 01:06:19--  http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz
Resolving download.tensorflow.org (download.tensorflow.org)... 192.178.163.207, 142.251.188.207, 74.125.20.207, ...
Connecting to download.tensorflow.org (download.tensorflow.org)|192.178.163.207|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2428923189 (2.3G) [application/gzip]
Saving to: ‘speech_commands_v0.02.tar.gz’


2025-05-05 01:06:29 (237 MB/s) - ‘speech_commands_v0.02.tar.gz’ saved [2428923189/2428923189]



In [None]:
# Making "no" folders
!mkdir -p dataset/no

# Extract the entire "speech_commands" archive
!mkdir -p dataset/speech_commands
!tar -xzf speech_commands_v0.02.tar.gz -C dataset/speech_commands

# Copy only "no" samples into dataset
!cp dataset/speech_commands/no/*.wav dataset/no/

In [None]:
# Unzip "guess" into a temp directory
!unzip -q /content/guess.zip -d /content/tmp_guess

# Create "guess" folder for WAVs
!mkdir -p /content/dataset/guess

# Install ffmpeg
!apt-get -y install ffmpeg

# Convert all .m4a to .wav for "guess"
import os
import subprocess

m4a_dir = "/content/tmp_guess/guess"
wav_dir = "/content/dataset/guess"

for fname in os.listdir(m4a_dir):
    if fname.endswith(".m4a"):
        m4a_path = os.path.join(m4a_dir, fname)
        wav_path = os.path.join(wav_dir, fname.replace(".m4a", ".wav"))
        subprocess.run(["ffmpeg", "-y", "-i", m4a_path, wav_path],
                       stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

print("Conversion complete. WAV files:", len(os.listdir(wav_dir)))

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.
Conversion complete. WAV files: 20


In [None]:
import os
import shutil

guess_dir = "/content/dataset/guess"
no_dir = "/content/dataset/no"

# List the source WAVs once, sorted, so duplicate i always comes from the same
# original regardless of the order os.listdir happens to return.
files = sorted(f for f in os.listdir(guess_dir) if f.endswith('.wav'))

guess_count = len(files)
no_count = len([f for f in os.listdir(no_dir) if f.endswith('.wav')])

current_count = guess_count
target_count = no_count

# Never negative: if "guess" already has at least as many clips as "no",
# there is nothing to duplicate.
missing = max(0, target_count - current_count)

if missing and not files:
    # Without this guard the modulo below fails with ZeroDivisionError.
    raise RuntimeError(f"No .wav files found in {guess_dir}; nothing to duplicate.")

# NOTE(review): the copies are byte-identical to their originals. If the data
# is later split into train/validation at random, the same clip presumably
# lands on both sides - keep that in mind when reading validation accuracy.
print(f" Duplicating {missing} samples...")

for i in range(missing):
    # Cycle through the originals so every source is copied about equally often.
    original = os.path.join(guess_dir, files[i % current_count])
    duplicate_name = f"dup_{i}_{os.path.basename(original)}"
    duplicate_path = os.path.join(guess_dir, duplicate_name)
    shutil.copy(original, duplicate_path)

# Report WAVs only, consistent with how the counts above were taken.
total_guess = len([f for f in os.listdir(guess_dir) if f.endswith('.wav')])
print(f" Done! Total 'guess' samples: {total_guess}")

 Duplicating 3921 samples...
 Done! Total 'guess' samples: 3941


In [None]:
import tensorflow as tf
import numpy as np
import os
import random
from sklearn.model_selection import train_test_split
from tensorflow.signal import mfccs_from_log_mel_spectrograms

# Parameters
SAMPLE_RATE = 16000  # used as the decoded sample count in load_wav_file and as the rate for the mel matrix
DURATION = 1  # in seconds; NOTE(review): not referenced by the code in this cell
NUM_MFCC = 13  # number of leading MFCC coefficients kept by extract_mfcc

# Directories
data_dir = "/content/dataset"  # root folder; each label has a sub-folder of WAVs under it
labels = ["guess", "no"]  # position in this list is used as the integer class id when loading

def load_wav_file(filepath):
    """Read one WAV file and return its samples with the channel axis removed.

    The file is decoded requesting a single channel and SAMPLE_RATE samples;
    the sample rate reported by the decoder is discarded.

    NOTE(review): nothing here resamples the audio - confirm the input files
    are already recorded at SAMPLE_RATE.
    """
    wav_bytes = tf.io.read_file(filepath)
    waveform, _reported_rate = tf.audio.decode_wav(
        wav_bytes,
        desired_channels=1,
        desired_samples=SAMPLE_RATE,
    )
    # Drop the trailing channel axis requested above.
    mono = tf.squeeze(waveform, axis=-1)
    return mono

def extract_mfcc(audio):
    """Compute the first NUM_MFCC MFCC coefficients per STFT frame of `audio`.

    Steps: STFT magnitude (frame_length=256, frame_step=128) -> projection
    onto 40 mel bands between 20 and 4000 Hz -> log -> MFCCs, keeping the
    leading NUM_MFCC entries of the last axis.
    """
    magnitudes = tf.abs(tf.signal.stft(audio, frame_length=256, frame_step=128))
    num_freq_bins = magnitudes.shape[-1]
    mel_weights = tf.signal.linear_to_mel_weight_matrix(40, num_freq_bins, SAMPLE_RATE, 20, 4000)

    mel_energies = tf.tensordot(magnitudes, mel_weights, 1)
    # Re-attach the static shape: the spectrogram's leading dims plus the mel band count.
    mel_energies.set_shape(magnitudes.shape[:-1].concatenate(mel_weights.shape[-1:]))

    # The small offset keeps the log finite for all-zero bands.
    log_mel = tf.math.log(mel_energies + 1e-6)
    return mfccs_from_log_mel_spectrograms(log_mel)[..., :NUM_MFCC]

# Load data
# A dedicated, seeded generator makes the per-class subsample repeatable
# (the split below is already seeded) without changing the global `random`
# state that other cells rely on.
sampler = random.Random(42)

X, y = [], []
for idx, label in enumerate(labels):
    # Sort first: the seed only selects the same clips if the candidate list
    # is in the same order on every run.
    files = sorted(tf.io.gfile.glob(os.path.join(data_dir, label, "*.wav")))
    for file in sampler.sample(files, min(1000, len(files))):  # use up to 1000 samples per class
        audio = load_wav_file(file)
        mfcc = extract_mfcc(audio)
        X.append(mfcc.numpy())
        y.append(idx)  # class id = position of the label in `labels`

# Pad every feature matrix at the end to the longest one; pad_sequences
# already returns a NumPy array, so no further np.array() conversion is needed.
X = tf.keras.preprocessing.sequence.pad_sequences(X, padding='post', dtype='float32')
y = np.array(y)

# Train/Val split (stratified so both classes keep their proportions)
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

print(" Feature extraction complete")
print(" X_train:", X_train.shape)
print(" X_val:", X_val.shape)

 Feature extraction complete
 X_train: (1600, 124, 13)
 X_val: (400, 124, 13)


In [None]:
import os
import tarfile
import numpy as np
import soundfile as sf
import random
import shutil

# Extract Dataset
TARBALL_PATH = "/content/speech_commands_v0.02.tar.gz"
EXTRACT_DIR = "/content/dataset"
DATASET_DIR = os.path.join(EXTRACT_DIR, "speech_commands")

# Extract if not already extracted.
# The archive's word folders sit at its root (the earlier shell cell unpacks
# it with `-C dataset/speech_commands` and then reads speech_commands/no/),
# so it must be unpacked INTO DATASET_DIR. Unpacking into EXTRACT_DIR would
# scatter the word folders next to "guess"/"no" and leave DATASET_DIR empty.
if not os.path.exists(DATASET_DIR):
    os.makedirs(DATASET_DIR)
    with tarfile.open(TARBALL_PATH, "r:gz") as tar:
        tar.extractall(DATASET_DIR)
    print("Dataset extracted.")

# Define Parameters and Paths
SILENCE_DIR = os.path.join(DATASET_DIR, "silence")
UNKNOWN_DIR = os.path.join(DATASET_DIR, "unknown")
# Folder names that must not be used as a source of "unknown" clips.
TARGET_WORDS = {"guess", "no", "silence", "unknown"}

SAMPLE_RATE = 16000
DURATION = 1  # seconds
SILENCE_COUNT = 500
UNKNOWN_COUNT = 500

# Create target folders
os.makedirs(SILENCE_DIR, exist_ok=True)
os.makedirs(UNKNOWN_DIR, exist_ok=True)

# Generate Silence Samples
# Every file holds the same all-zero int16 buffer, so build it once.
silence = np.zeros(SAMPLE_RATE * DURATION, dtype=np.int16)
for i in range(SILENCE_COUNT):
    sf.write(f"{SILENCE_DIR}/silence_{i}.wav", silence, SAMPLE_RATE)

print(f"Generated {SILENCE_COUNT} silence samples.")

# Generate Unknown Samples
# Every sub-folder of the dataset except the target classes is a source.
# NOTE(review): this also picks up any non-word folder inside the archive
# (e.g. a background-noise folder, if present) - confirm that is intended.
source_dirs = sorted(
    os.path.join(DATASET_DIR, d)
    for d in os.listdir(DATASET_DIR)
    if os.path.isdir(os.path.join(DATASET_DIR, d)) and d not in TARGET_WORDS
)

unknown_files = []
for dir_path in source_dirs:
    unknown_files.extend(
        os.path.join(dir_path, f)
        for f in sorted(os.listdir(dir_path))
        if f.endswith(".wav")
    )

if not unknown_files:
    # Without this guard the modulo below fails with ZeroDivisionError.
    raise RuntimeError(f"No source .wav files found under {DATASET_DIR}; cannot build 'unknown' samples.")

# Sorted input + seeded private generator => the same clips are chosen on
# every run, and the global `random` state is left untouched.
random.Random(42).shuffle(unknown_files)

for i in range(UNKNOWN_COUNT):
    # The modulo wraps around if fewer source clips exist than requested.
    src_path = unknown_files[i % len(unknown_files)]
    dst_path = os.path.join(UNKNOWN_DIR, f"unknown_{i}.wav")
    shutil.copy(src_path, dst_path)

print(f"Created {UNKNOWN_COUNT} unknown samples from non-target datasets.")

# Final Structure Check
for d in [SILENCE_DIR, UNKNOWN_DIR]:
    print(f"{d}: {len(os.listdir(d))} files")

Generated 500 silence samples.
Created 500 unknown samples from non-target datasets.
/content/dataset/speech_commands/silence: 500 files
/content/dataset/speech_commands/unknown: 500 files


In [None]:
!mv /content/dataset/speech_commands/silence /content/dataset/
!mv /content/dataset/speech_commands/unknown /content/dataset/

In [None]:
# Pin NumPy 1.23.5 and librosa 0.9.2 ahead of the librosa-based extraction cell.
# NOTE(review): the recorded pip output lists resolver conflicts (xarray, jax,
# scikit-image and others require a newer NumPy), and NumPy was already
# imported by earlier cells - presumably the runtime has to be restarted for
# the pinned version to take effect; confirm.
!pip install numpy==1.23.5 --quiet
!pip install librosa==0.9.2

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m84.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
xarray 2025.3.1 requires numpy>=1.24, but you have numpy 1.23.5 which is incompatible.
jax 0.5.2 requires numpy>=1.25, but you have numpy 1.23.5 which is incompatible.
scikit-image 0.25.2 requires numpy>=1.24, but you have numpy 1.23.5 which is incompatible.
bigframes 2.1.0 requires numpy>=1.24.0, but you have numpy 1.23.5 which is incompatible.
treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.23.5 which is incompatible.
albumentations 2.0.6 requires numpy>=1.24.4, but you have numpy 1.23.5 which is incompatible.
imbalanced-learn 0.13.0 requires numpy<3,>=1.24.3, but you have numpy 1.23.5 which is incompatible.
pymc 5.22.0 requires numpy>=1.25.0, but you have numpy 1.23.5 wh

In [None]:
import os
import librosa
import numpy as np
from tqdm import tqdm

# Directory paths for each class: folder name under /content/dataset -> integer label
label_map = {
    "guess": 0,
    "no": 1,
    "silence": 2,
    "unknown": 3
}

# Fixed per-clip feature size: every clip becomes an (N_FRAMES, N_MFCC) matrix.
N_FRAMES = 124
N_MFCC = 13

X = []
y = []

print(" Extracting MFCC features for all classes...\n")

for folder, label in label_map.items():
    folder_path = f"/content/dataset/{folder}"
    # Filter and sort up front so the progress bar counts WAVs only and the
    # sample order (and therefore the later seeded split) is the same each run.
    wav_names = sorted(name for name in os.listdir(folder_path) if name.endswith(".wav"))
    for file_name in tqdm(wav_names, desc=f"Processing '{folder}'"):
        file_path = os.path.join(folder_path, file_name)
        try:
            audio, sr = librosa.load(file_path, sr=16000)
            # Pass the signal as `y=`: giving it positionally emits a
            # deprecation warning per file on librosa 0.9 (it flooded the
            # recorded output) and is an error from 0.10 on.
            mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=N_MFCC)
            if mfcc.shape[1] < N_FRAMES:
                # Pad if shorter (zeros appended along the frame axis)
                mfcc = np.pad(mfcc, ((0, 0), (0, N_FRAMES - mfcc.shape[1])), mode='constant')
            else:
                # Trim if longer
                mfcc = mfcc[:, :N_FRAMES]
            X.append(mfcc.T)  # Shape: (124, 13)
            y.append(label)
        except Exception as e:
            # Best effort: report the unreadable file and keep going.
            print(f"Skipping {file_path}: {e}")

X = np.array(X)
y = np.array(y)

print(f"\n Extracted: {X.shape[0]} samples")
print(f" X shape: {X.shape}, y shape: {y.shape}")

# Save arrays
os.makedirs("/content/array", exist_ok=True)
np.save("/content/array/X_full.npy", X)
np.save("/content/array/y_full.npy", y)

print(" Saved: X_full.npy and y_full.npy")

 Extracting MFCC features for all classes...



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 -0.00927734] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfcc = librosa.feature.mfcc(audio, sr=sr, n_mfcc=13)
  2.1362305e-04  2.4414062e-04] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfcc = librosa.feature.mfcc(audio, sr=sr, n_mfcc=13)
 -0.00091553] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfcc = librosa.feature.mfcc(audio, sr=sr, n_mfcc=13)
  0.00253296] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfcc = librosa.feature.mfcc(audio, sr=sr, n_mfcc=13)
 1.1596680e-03] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfcc = librosa.feature.mfcc(audio, sr=sr, n_mfcc=13)
 -0.00839233] as keyword args. From version 0.10 passing these as positional argument


 Extracted: 8882 samples
 X shape: (8882, 124, 13), y shape: (8882,)
 Saved: X_full.npy and y_full.npy


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
import tensorflow as tf

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling repeat from run to run (the split below is already seeded).
tf.keras.utils.set_random_seed(42)

# Load data
X = np.load('/content/array/X_full.npy')  # (8882, 124, 13)
y = np.load('/content/array/y_full.npy')  # (8882,)

# One-hot encode labels
y_cat = to_categorical(y, num_classes=4)

# Reshape for Conv2D: (samples, height, width, channels)
X = X[..., np.newaxis]

# Train-test split (stratified on the integer labels)
# NOTE(review): the "guess" class was grown by copying a small set of
# recordings in an earlier cell, so identical clips presumably end up on both
# sides of this split; validation accuracy for that class may be optimistic.
X_train, X_val, y_train, y_val = train_test_split(X, y_cat, test_size=0.2, random_state=42, stratify=y)

# Build model
# The input shape is declared with an explicit Input object, taken from the
# data itself. Passing `input_shape=` to the first Conv2D triggers the Keras
# warning seen in the recorded output.
model = Sequential([
    tf.keras.Input(shape=X_train.shape[1:]),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(4, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Train
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=15, batch_size=64)

# Save model
model.save("keyword_model.h5")
print(" Model saved as 'keyword_model.h5'")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/15
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 32ms/step - accuracy: 0.8045 - loss: 2.3303 - val_accuracy: 0.9494 - val_loss: 0.1376
Epoch 2/15
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9539 - loss: 0.1397 - val_accuracy: 0.9572 - val_loss: 0.1145
Epoch 3/15
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9573 - loss: 0.1246 - val_accuracy: 0.9550 - val_loss: 0.1114
Epoch 4/15
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9640 - loss: 0.0967 - val_accuracy: 0.9679 - val_loss: 0.0831
Epoch 5/15
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9756 - loss: 0.0715 - val_accuracy: 0.9685 - val_loss: 0.0794
Epoch 6/15
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9763 - loss: 0.0642 - val_accuracy: 0.9640 - val_loss: 0.1031
Epoch 7/15
[1m112/112[0m 



 Model saved as 'keyword_model.h5'


In [None]:
import tensorflow as tf

# Reload the trained Keras model from the file written by the training cell
model = tf.keras.models.load_model("keyword_model.h5")

# First pass: plain conversion with the converter's default settings (float32)
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

# Write the converted model bytes to disk
with open("keyword_model.tflite", "wb") as float_out:
    float_out.write(tflite_model)
print(" Saved: keyword_model.tflite (float32)")

# Now quantize to int8
def representative_dataset_gen(num_samples=100):
    """Yield calibration inputs for the int8 conversion.

    Each item is a one-element list holding a single example from the
    module-level `X_train`, with a leading batch axis of size 1 and cast to
    float32.

    Args:
        num_samples: Upper bound on the number of examples yielded
            (default 100, the value previously hard-coded).

    Iterating over the slice stops when `X_train` runs out, so a training set
    smaller than `num_samples` no longer produces empty batches.
    """
    for sample in X_train[:num_samples]:
        # Restore the batch axis that iterating over the array removed.
        yield [sample[np.newaxis, ...].astype(np.float32)]

# Reconfigure the same converter for quantization: default optimizations,
# calibration data from representative_dataset_gen, int8 builtin ops only,
# and int8 tensors at the model's input and output.
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_dataset_gen
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = converter.inference_output_type = tf.int8

tflite_quant_model = converter.convert()

# Write the quantized model bytes next to the float32 export
with open("keyword_model_quant.tflite", "wb") as quant_out:
    quant_out.write(tflite_quant_model)
print(" Saved: keyword_model_quant.tflite (int8)")



Saved artifact at '/tmp/tmpdv3_psm1'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 124, 13, 1), dtype=tf.float32, name='input_layer')
Output Type:
  TensorSpec(shape=(None, 4), dtype=tf.float32, name=None)
Captures:
  132611976821456: TensorSpec(shape=(), dtype=tf.resource, name=None)
  132611976816272: TensorSpec(shape=(), dtype=tf.resource, name=None)
  132614402475216: TensorSpec(shape=(), dtype=tf.resource, name=None)
  132614402476176: TensorSpec(shape=(), dtype=tf.resource, name=None)
  132614402475984: TensorSpec(shape=(), dtype=tf.resource, name=None)
  132614402478288: TensorSpec(shape=(), dtype=tf.resource, name=None)
  132614402476560: TensorSpec(shape=(), dtype=tf.resource, name=None)
  132614402488656: TensorSpec(shape=(), dtype=tf.resource, name=None)
 Saved: keyword_model.tflite (float32)
Saved artifact at '/tmp/tmpls2jc37q'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_O



 Saved: keyword_model_quant.tflite (int8)


In [None]:
# Convert TFLite model to C array
!xxd -i keyword_model_quant.tflite > model.cpp

# Fix the array name and wrap with C++ header includes
with open("model.cpp", "r") as f:
    lines = f.readlines()

# Replace variable name and wrap in header
with open("model.cpp", "w") as f:
    f.write('#include "model.h"\n\n')
    for line in lines:
        f.write(line.replace('unsigned char keyword_model_quant_tflite[]', 'const unsigned char g_model[]')
                  .replace('unsigned int keyword_model_quant_tflite_len', 'const int g_model_len'))