# Keyword-Spotting Smart-Home System: ELECT_ENG 475 (Machine Learning: Foundations, Applications, and Algorithms) Project Notebook

**Eric Oliveira, Justin Ansell, Vivek Matta**

**Note: The accompanying `requirements.txt` was generated from a working conda environment on an Apple Silicon MacBook running macOS, using Miniforge3 (conda/Anaconda-compatible) as the Python distribution and Jupyter/IPython for development. The package versions reflect that specific setup and were tested in that environment; users on other platforms may need to adjust versions or install platform-specific builds accordingly.**

Additional code / YouTube video:

- [All the file processing code (GitHub)](https://github.com/Eclo19/smart-home-kws)
- [Model training (Google Colab)](https://colab.research.google.com/drive/1zg0SPHg8Gk4uLPPREuo5_4F8gLJBUEz0?authuser=0)
- [Documentation video (YouTube)](https://youtu.be/afolWC9hfCI?si=sEM4GCfJR_onvcgo)


In [None]:
# Core dependencies used throughout the notebook. A missing package is reported
# (together with the underlying ImportError, which names the module) instead of
# being raised, so the notebook can still be opened and read without the full
# environment installed.
try:
    import os
    import sys
    import numpy as np
    import librosa
    import matplotlib.pyplot as plt
    from IPython.display import Audio, display
except ImportError as err:
    print("To run the code in its entirety, please install the requirements.txt file.")
    print(f"   --> Import failed: {err}")


## Clone the GitHub Repository 

If this does not work, please place the `smart-home-kws` directory in this notebook's working directory for a full demo.

In [None]:

# Clone the project repository (unless it is already present) and make its
# modules importable.
#
# An IPython `!git clone ...` shell escape does not raise a Python exception
# when the command fails, so a try/except around it can never detect a failed
# clone. subprocess.run(check=True) raises CalledProcessError on a non-zero exit
# status, and OSError if the git executable cannot be started.
import subprocess

REPO_URL = "https://github.com/Eclo19/smart-home-kws"
REPO_DIR = "smart-home-kws"

if os.path.isdir(REPO_DIR):
    # Re-running the cell (or a manually added copy): git refuses to clone into
    # an existing non-empty directory, so reuse what is already there.
    print(f"Found an existing '{REPO_DIR}' directory; skipping clone.")
else:
    try:
        subprocess.run(
            ["git", "clone", REPO_URL, REPO_DIR],
            check=True, capture_output=True, text=True,
        )
    except (OSError, subprocess.CalledProcessError) as err:
        # CalledProcessError carries git's stderr; OSError (git not installed) does not.
        git_stderr = getattr(err, "stderr", None)
        if git_stderr:
            print(git_stderr.strip())
        print("Could not clone repository. Please add the 'smart-home-kws' directory to this working space.")
        print("   --> After adding 'smart-home-kws', please run sys.path.append('smart-home-kws')")
    else:
        print("\nSuccessfully cloned the repository.")

# Only expose the directory on the import path when it exists, and only once,
# so that re-running the cell does not pile up duplicate sys.path entries.
if os.path.isdir(REPO_DIR) and REPO_DIR not in sys.path:
    sys.path.append(REPO_DIR)


## Data Gathering and Processing

In [None]:
# --- 1) Show a raw data point (label is the first word; 'blue' here) ---

raw_m4a_path = 'smart-home-kws/Toy_Dataset/Full/blue_eric_03_full.m4a'
# sr=None asks librosa for the file's native sample rate rather than its
# default resampling, so the values printed below describe the recording as-is.
raw_m4a_audio_data, sr = librosa.load(raw_m4a_path, sr=None)
print("Raw audio File info:\n"
      f"    Audio file format: {raw_m4a_path.split('.')[-1]}\n"
      f"    Audio Shape       = {raw_m4a_audio_data.shape}\n"
      f"    Sample rate       = {sr}\n"
      f"    Audio dtype       = {raw_m4a_audio_data.dtype}\n")
print("Displaying audio file:\n")
display(Audio(raw_m4a_audio_data, rate=sr))

# Time axis in seconds: sample index divided by the sample rate.
t = np.arange(len(raw_m4a_audio_data)) / sr
plt.figure(figsize=(6, 4))
plt.plot(t, raw_m4a_audio_data, color='black')
# Title is derived from the path so it cannot drift from the file actually loaded.
plt.title(f"Example Waveform: {os.path.basename(raw_m4a_path)}")
plt.xlabel("Time (s)")
plt.ylabel("Amplitude (float32)")
# Render explicitly, like the other plotting cells, so the cell does not echo
# the Text(...) repr of the last pyplot call.
plt.show()


In [None]:
# Tell the reader which project standards are about to be enforced.
print("".join((
    "Forcing project standards to the raw recording. This includes:\n",
    "    - forcing our sample rate of 22.05 kHz\n",
    "    - forcing wav format\n",
    "    - summing to mono if stereo\n",
    "    - ensuring dtype is float32\n",
    "    - normalizing\n",
)))

try:
    import data_augmentation
except ImportError:
    print("Could not import Python script from the cloned Repository.")
else:
    # Aim the repository's sanitizer at the raw toy recordings and run it.
    # NOTE(review): presumably this applies the standards listed above and
    # writes .wav files next to the originals -- confirm in data_augmentation.
    data_augmentation.VANILLA_DATA_PATH = "smart-home-kws/Toy_Dataset/Full"
    data_augmentation.sanitize_vanilla_dataset()

    # Load the same recording again, this time through its .wav path, at the
    # file's own sample rate.
    audio_path = "smart-home-kws/Toy_Dataset/Full/blue_eric_03_full.wav"
    audio_data, sr = librosa.load(audio_path, sr=None)

    print("".join((
        "Sanitized audio File info:\n\n",
        f"    Audio file format: {audio_path.rsplit('.', 1)[-1]}\n",
        f"    Audio Shape       = {audio_data.shape}\n",
        f"    Sample rate       = {sr}\n",
        f"    Audio dtype       = {audio_data.dtype}\n",
    )))


In [None]:
# --- 2) Show how sanitized files are chopped ---

try:
    import file_chopper
except ImportError:
    print("Could not import Python script from the cloned Repository.")
else:
    # Everything that touches file_chopper lives in this branch: running it
    # after a failed import would only replace the friendly message above with
    # a NameError.
    print(
        "The default parameters should be a good fit, but the user must choose\n"
        "their threshold and window_length to adapt to each recording.\n\n"
        "NOTE: A prompt will show up to check/reset chopping parameters.\n"
        "If you do not wish to write the chops, press 'esc' in each prompt\n"
        "(this will throw an error).\n"
        "For demo purposes, press 1 for both prompts and write the chops.\n"
        "\nThe parameters are already set, so simply press '1' for both prompts to generate data.\n"
    )

    # Project sample rate (22.05 kHz).
    # NOTE(review): wait/duration look like sample counts, i.e. 0.15 s and
    # 0.45 s at this rate -- confirm against file_chopper.parse.
    CHOP_SAMPLE_RATE = 22050

    file_chopper.FULL_DATA_PATH = "smart-home-kws/Toy_Dataset/Full"
    file_chopper.CHOPPED_DIR    = "smart-home-kws/Toy_Dataset/Live_chops"
    file_chopper.parse(
        wait=int(CHOP_SAMPLE_RATE * 0.15),
        duration=int(CHOP_SAMPLE_RATE * 0.45),
    )


In [None]:
live_chops_path = "smart-home-kws/Toy_Dataset/Live_chops"
# Bound unconditionally: the feature-extraction cell further down reads
# blue_1_path as well, and must not hit a NameError when this cell skips.
blue_1_path = os.path.join(live_chops_path, "blue_eric_03_01.wav")

if os.path.isdir(live_chops_path):
    # Run the same sanitizer as before, now pointed at the freshly written chops.
    data_augmentation.VANILLA_DATA_PATH = live_chops_path
    data_augmentation.sanitize_vanilla_dataset()

    if os.path.isfile(blue_1_path):
        print("Now displaying the first chop:")
        audio_data, sr = librosa.load(blue_1_path, sr=None)
        display(Audio(audio_data, rate=sr))

        # Time axis in seconds: sample index divided by the sample rate.
        t = np.arange(len(audio_data)) / sr
        plt.figure(figsize=(6, 4))
        plt.plot(t, audio_data, color="black")
        plt.title(f"Example Waveform: {os.path.basename(blue_1_path)}")
        plt.xlabel("Time (s)")
        plt.ylabel("Amplitude (float32)")
        plt.show()
    else:
        # The directory can exist without this chop, e.g. when the chopping
        # prompts in the previous cell were dismissed.
        print(f"First chop not found at '{blue_1_path}'. Skipping...")
else:
    print("Live chops directory not found. Skipping...")


In [None]:
# --- 3) Show data augmentation transformations ---
print("".join((
    "We split the data with wrapper.split_dataset(), but it is too costly to run here.\n",
    "Instead, we show how we expanded the training set:\n",
    "  - Filtering: taps designed with pyfda, convolved via scipy\n",
    "  - Pitch-shifting: librosa.effects\n\n",
    "Now displaying all audio transformations applied to a training chop:\n",
)))

# (caption, callable) pairs in playback order. The first entry is a pass-through
# so the untouched chop is heard before its variants; the remaining callables
# are looked up by name on the data_augmentation module.
transforms = [("Original chop:", lambda x: x)]
transforms.extend(
    (caption, getattr(data_augmentation, attr_name))
    for caption, attr_name in (
        ("Low-passed:",      "low_pass"),
        ("High-passed:",     "high_pass"),
        ("Band-passed:",     "band_pass"),
        ("Pitched Up:",      "pitch_up"),
        ("Pitched Down:",    "pitch_down"),
        ("Added red noise:", "add_noise"),
    )
)

# Apply every transform to the chop currently held in audio_data and play it.
for desc, fn in transforms:
    print(desc)
    transformed_audio_data = fn(audio_data)
    display(Audio(transformed_audio_data, rate=sr))


In [None]:
# --- 4) Show Feature Extraction ---
print("".join((
    "\nWe build json feature dictionaries for train/val/test using ",
    "feature_extraction.build_feature_dict(). This calls MFCC extraction and ",
    "label vectorization for each point, writing (flat_mfccs, vec_label) to JSON.",
)))
try:
    import feature_extraction
    import wrapper
except ImportError:
    print("Could not import Python script from the cloned Repository.")
else:
    # 39243 samples is the model's input size (see the real-time inference notes).
    # NOTE(review): presumably this pads/trims every chop in the directory to
    # that length -- confirm in wrapper.force_standard_size.
    wrapper.force_standard_size(dirname=live_chops_path, size=39243)

    # Reload the first chop after resizing and extract the model's input features.
    audio_data, sr = librosa.load(blue_1_path, sr=None)
    print(f"Data length = {len(audio_data)}")
    mfccs = feature_extraction.extract_mfccs(audio_data=audio_data)
    print(f"\nMFCCs shape: {mfccs.shape}")

    # The label is the part of the file name before the first underscore.
    l = os.path.basename(blue_1_path).partition("_")[0]
    vec_label = feature_extraction.vectorize_label(label=l)
    print(f"\nParsed label = {l}, vec label:\n{vec_label}")

    # Mask exact zeros and paint masked cells white so the zero-padding stands out.
    mfcc_masked = np.ma.masked_equal(mfccs, 0.0)
    cmap = plt.cm.viridis.copy()
    cmap.set_bad("white")

    mfcc_fig, mfcc_ax = plt.subplots(figsize=(8, 4))
    im = mfcc_ax.imshow(mfcc_masked, aspect="auto", origin="lower", cmap=cmap)
    mfcc_ax.set_title("Model's input - MFCC Visualization (Zero-padding shown in white)")
    mfcc_ax.set_xlabel("Time frames")
    mfcc_ax.set_ylabel("MFCC coefficient index")
    mfcc_fig.colorbar(im, ax=mfcc_ax, label="MFCC value")
    mfcc_fig.tight_layout()
    plt.show()


## The Model

The full script for training the model is available at https://colab.research.google.com/drive/1zg0SPHg8Gk4uLPPREuo5_4F8gLJBUEz0?usp=sharing. We show some basics of how the model was trained and some test data inferencing as well, but showing the full training would require too much time and memory.

In [None]:
try:
    import tensorflow as tf, keras

except ImportError:
    print("Could not import tensorflow. This is likely due to an environment/OS mismatch.")

In [None]:
# --- 1) Loading the data ---
# Nothing is loaded here: the cell only reports how the data is loaded and the
# array shapes to expect (4928 features per example, 9-dimensional labels).
print("".join((
    "Our processed data is stored as a json feature dictionary of (x, y) tuples.\n",
    "We load it with wrapper.wrapper(dirname, augmented=False).\n\n",
    "When loading the augmented training, validation, and test data, the shapes should be:\n\n",
    "Train's x shape: (15561, 4928)\n",
    "Train's y shape: (15561, 9)\n\n",
    "Val's x shape:   (485, 4928)\n",
    "Val's y shape:   (485, 9)\n\n",
    "Test's x shape:  (473, 4928)\n",
    "Test's y shape:  (473, 9)\n\n",
    "The split is roughly 70% train, 15% test, 15% validation (for the vanilla set).",
)))

In [None]:
# --- 2) Defining the Model ---
L = tf.keras.layers


def _conv_block(tensor, filters, kernel_size, drop_rate):
    """Apply one convolutional stage to `tensor` and return the result.

    The stage is Conv2D (ReLU, "same" padding) -> BatchNormalization ->
    MaxPool2D(2) -> SpatialDropout2D(drop_rate), created in that order.
    """
    tensor = L.Conv2D(filters, kernel_size, activation="relu", padding="same")(tensor)
    tensor = L.BatchNormalization()(tensor)
    tensor = L.MaxPool2D(2)(tensor)
    return L.SpatialDropout2D(drop_rate)(tensor)


# One single-channel 32 x 154 input per example (32 * 154 = 4928 values).
inputs = tf.keras.Input(shape=(32, 154, 1))

# Three convolutional stages: filters widen and dropout increases with depth.
x = _conv_block(inputs, 64, (4, 4), 0.1)    # Block 1
x = _conv_block(x, 64, (3, 3), 0.1)         # Block 2
x = _conv_block(x, 128, (3, 3), 0.2)        # Block 3

# Global pooling + Dense
x = L.GlobalAveragePooling2D()(x)
x = L.Dense(128, activation="relu")(x)
x = L.Dropout(0.3)(x)

# Outputs: softmax over the 9 classes
outputs = L.Dense(9, activation="softmax")(x)
cnn = tf.keras.Model(inputs, outputs)
cnn.summary()

In [None]:
# --- 3) Training sketch (no data provided to avoid computation) ---

# Hyperparameters: optimisation
lr = 1e-3
batch_size = 50
epochs = 20

# Hyperparameters: callbacks
stop_patience = 5
lr_patience = 2
lr_factor = 0.5

# Compile model
cnn.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
    metrics=["accuracy"],
)

# All three callbacks watch the validation loss.
C = tf.keras.callbacks
early_stop = C.EarlyStopping(
    monitor="val_loss",
    patience=stop_patience,
    restore_best_weights=True,
    verbose=1,
)
reduce_lr = C.ReduceLROnPlateau(
    monitor="val_loss",
    factor=lr_factor,
    patience=lr_patience,
    min_lr=1e-6,
    verbose=1,
)
checkpoint = C.ModelCheckpoint(
    # Keras fills the {epoch}/{val_loss}/{val_accuracy} placeholders when saving.
    filepath=(
        "best_cnn_run4_padrobust-epoch{epoch:02d}-"
        "valloss{val_loss:.4f}-valacc{val_accuracy:.4f}.keras"
    ),
    monitor="val_loss",
    save_best_only=True,
    verbose=1,
)

# Sketch of the training loop (disabled): the branch below never executes; it
# only documents how fit() was invoked.
if False:
    x = y = xv = yv = None
    history = cnn.fit(
        x=x,
        y=y,
        validation_data=(xv, yv),
        batch_size=batch_size,
        epochs=epochs,
        callbacks=[early_stop, reduce_lr, checkpoint],
        verbose=1,
    )

## Real-Time Inference

Real-time inference works in the following way:

1) Record a 4s long buffer when the button is pressed
2) Extract an input window of length 39243 (model's input size in samples) based on DSP techniques (Leaky integrator's maximum will determine the midpoint of the input window)
3) Normalize and force zeros around the midpoint of the window, to mimic zero-padded training data
4) Extract zeroed-out window's MFCCs and feed it to the model
5) Get a prediction
6) Take some action based on the prediction in the embedded device.

In [None]:
# --- 1) Run Inference on Test Data ---

test_path  = "smart-home-kws/Toy_Dataset/Test Data"
MODEL_PATH = "smart-home-kws/Models/best_cnn_run4_padrobust-epoch15-valloss0.0025-valacc1.0000.keras"
# Feature-matrix dimensions come from the extraction module so the reshape
# below stays in step with extract_mfccs.
N_MFCC, T_FRAMES = feature_extraction.N_MFCC, feature_extraction.T_FRAMES

# Random test point + true label
AUDIO_EXTENSIONS = (".wav", ".m4a", ".flac", ".ogg")
# Sorted because os.listdir order is arbitrary; a stable candidate list makes
# the draw repeatable if a NumPy seed is set.
test_points = sorted(f for f in os.listdir(test_path) if f.lower().endswith(AUDIO_EXTENSIONS))
if not test_points:
    # np.random.choice on an empty list fails with a message that does not
    # mention the directory; fail early with one that does.
    raise FileNotFoundError(
        f"No audio files {AUDIO_EXTENSIONS} found in '{test_path}'."
    )
test_filename  = np.random.choice(test_points)
test_data_path = os.path.join(test_path, test_filename)
# The true label is the part of the file name before the first underscore.
true_label_str = test_filename.split("_")[0]
print(f"Random test file: {test_filename}")
print(f"True label parsed from filename: {true_label_str}")

# Label mapping dim --> string
label_mapping = {
    "red": 0, "green": 1, "blue": 2, "white": 3, "off": 4,
    "time": 5, "temperature": 6, "unknown": 7, "noise": 8
}
idx_to_label = {v: k for k, v in label_mapping.items()}

# Audio + MFCCs
# 22050 Hz is librosa's default rate and the project's standard rate; it is
# spelled out here because every other load in this notebook passes sr explicitly.
audio_data, sr = librosa.load(test_data_path, sr=22050)
display(Audio(audio_data, rate=sr))
mfccs = feature_extraction.extract_mfccs(audio_data=audio_data)
print("MFCCs shape:", mfccs.shape)

# Load model (compile=False: the model is loaded for inference only)
print(f"\nLoading model from: {MODEL_PATH}")
model = keras.models.load_model(MODEL_PATH, compile=False)
print("Loaded model successfully.\n")

# Prepare input + predict: add batch and channel axes around the MFCC matrix.
input_matrix = mfccs.reshape(1, N_MFCC, T_FRAMES, 1).astype(np.float32)
print("input_matrix shape:", input_matrix.shape, "dtype:", input_matrix.dtype)
y_pred    = model.predict(input_matrix, verbose=0)[0]
pred_idx  = int(np.argmax(y_pred))
pred_label = idx_to_label[pred_idx]

print("\nInference Result")
print(f"    - True label      : {true_label_str}")
print(f"    - Predicted label : {pred_label}")
print("    - Model output (softmax probabilities):")
print(y_pred.ravel())

In [None]:
# --- 2) Run Inference on Pre-Recorded Window, Simulating Embedded Behavior ---

audio_data, sr = librosa.load("smart-home-kws/Toy_Dataset/Window/full_window.wav", sr=None)
# astype returns a new array, so the in-place scaling below does not touch the
# array returned by librosa.
audio_data = audio_data.astype(np.float32)
# Peak-normalise, but leave an all-zero (or empty) recording untouched:
# dividing by a zero peak would fill the buffer with NaNs.
peak = np.max(np.abs(audio_data)) if audio_data.size else 0.0
if peak > 0:
    audio_data /= peak
print("\nInput window test:")
display(Audio(audio_data, rate=sr))

try:
    import inference_test as it
except ImportError:
    print("Could not import Python script from the cloned Repository.")
else:
    # Extract best window + zero-out edges
    # NOTE(review): per the real-time inference notes, the window is presumably
    # centred on the leaky integrator's maximum -- confirm in inference_test.
    input_window        = it.plot_signal_and_loudest_window_leaky(audio_data, sr=sr, tau_ms=200)
    input_window_zeroed = it.zero_out(input_window)

    # Plot pipeline: extracted window on top, zeroed-out version below,
    # sharing one time axis in seconds.
    t = np.arange(len(input_window)) / sr
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 6), sharex=True)
    ax1.plot(t, input_window)
    ax1.set_title("Extracted Input Window")
    ax1.set_ylabel("Amplitude")
    ax2.plot(t, input_window_zeroed)
    ax2.set_title("Zeroed-Out Extracted Input Window")
    ax2.set_xlabel("Time(s)")
    ax2.set_ylabel("Amplitude")
    fig.tight_layout()
    plt.show()

    print("\nCleaned input window:")
    display(Audio(input_window_zeroed, rate=sr))

    # Extract features + run inference. This reuses model, N_MFCC, T_FRAMES and
    # idx_to_label from the previous inference cell.
    mfccs = feature_extraction.extract_mfccs(audio_data=input_window_zeroed)
    print("MFCCs shape:", mfccs.shape)
    input_matrix = mfccs.reshape(1, N_MFCC, T_FRAMES, 1).astype(np.float32)
    y_pred = model.predict(input_matrix, verbose=0)[0]
    pred_idx  = int(np.argmax(y_pred))
    pred_label = idx_to_label[pred_idx]

    print("\n===== Inference Result =====")
    print(f"    - Predicted label : {pred_label}\n"
          f"    - Model output (softmax probabilities):\n{y_pred.ravel()}")