## Training new Wake Words


# 1. Test Example Training Clip Generation
Since openWakeWord models are trained on synthetic examples of your
target wake word, it's a good idea to make sure that the examples
sound correct. Type in your target wake word below, and run the
cell to listen to it.
Here are some tips that can help get the wake word to sound right:

- If your wake word isn't being pronounced in the way
you want, try spelling out the sounds phonetically with underscores
separating each part.
For example: "hey siri" --> "hey_seer_e".

- Spell out numbers ("2" --> "two")

- Avoid all punctuation except for "?" and "!", and remove unicode characters

- Create multiple different ways of spelling out the same phrase phonetically. This helps the model learn various pronunciations and intonations people might use. For example, "Hey Zelda" could be varied as: ['hey Zelda', 'heyeh, zelda', 'hayeh, zelda', 'hey Zelda?', 'hay Zelduh', 'hay Zelda', 'hay Zelda?', 'hazelle_duh']
  - This could potentially increase false activations so be careful with this
  - You can check for duplicates in phoneme identification with the data generator

In [5]:
import os
import sys
from IPython.display import Audio

# if not os.path.exists("./piper-sample-generator"):
!git clone https://github.com/rhasspy/piper-sample-generator
!wget -O piper-sample-generator/models/en_US-libritts_r-medium.pt 'https://github.com/rhasspy/piper-sample-generator/releases/download/v2.0.0/en_US-libritts_r-medium.pt'

# Install system dependencies
!pip install piper-phonemize
!pip install webrtcvad

if "piper-sample-generator/" not in sys.path:
    sys.path.append("piper-sample-generator/")
from generate_samples import generate_samples


target_word = ["hey tester", "hay tester"]

def text_to_speech(text):
    """Synthesize a single test clip of ``text`` with ``generate_samples``.

    One sample is requested, with the output directory set to the current
    directory and the file name set to ``test_generation.wav``, so the
    pronunciation can be checked by ear.

    Args:
        text: The phrase to synthesize.
    """
    generation_options = {
        "max_samples": 1,
        "noise_scales": [1.0],
        "noise_scale_ws": [1.0],
        # A single speaking-rate value; an earlier variant used [0.75, 1.0, 1.25]
        "length_scales": [1.1],
        "output_dir": './',
        "batch_size": 1,
        "auto_reduce_batch_size": True,
        "file_names": ["test_generation.wav"],
    }
    generate_samples(text=text, **generation_options)
# Test the pronunciation of the first target phrase and confirm it is what you want
text_to_speech(target_word[0])
# Kept as the last expression of the cell so the notebook can render the audio player
Audio("test_generation.wav", autoplay=True)

fatal: destination path 'piper-sample-generator' already exists and is not an empty directory.
zsh:1: command not found: wget
ERROR: Could not find a version that satisfies the requirement piper-phonemize (from versions: none)
ERROR: No matching distribution found for piper-phonemize
Collecting webrtcvad
  Using cached webrtcvad-2.0.10-cp310-cp310-macosx_11_0_arm64.whl
Installing collected packages: webrtcvad
Successfully installed webrtcvad-2.0.10


ModuleNotFoundError: No module named 'piper_phonemize'

In [None]:
# Mount Google Drive so output files can be saved there when running in the background
from google.colab import drive
# The mount point is /content/drive; the export step later writes under /content/drive/MyDrive
drive.mount('/content/drive')

In [None]:
# Install all dependencies and download data

import locale
def getpreferredencoding(do_setlocale=True):
    """Report UTF-8 as the preferred encoding regardless of the system locale.

    ``do_setlocale`` is accepted only so the signature stays compatible with
    ``locale.getpreferredencoding``; its value is ignored.
    """
    return "UTF-8"


# Route every later locale.getpreferredencoding() call through the stub above
locale.getpreferredencoding = getpreferredencoding

# install openwakeword (full installation to support training)
!git clone https://github.com/EthanEpp/openwakeword
!pip install -e ./openwakeword
!cd openwakeword

# install other dependencies (every package is pinned to an exact version)
!pip install mutagen==1.47.0
!pip install torchinfo==1.8.0
!pip install torchmetrics==1.2.0
!pip install speechbrain==0.5.14
!pip install audiomentations==0.33.0
!pip install torch-audiomentations==0.11.0
!pip install acoustics==0.2.6
# Remove any existing tensorflow package before installing the pinned CPU-only build
!pip uninstall tensorflow -y
!pip install tensorflow-cpu==2.8.1
!pip install tensorflow_probability==0.16.0
# onnx_tf is imported by convert_onnx_to_tflite in the training cell
!pip install onnx_tf==1.10.0
!pip install pronouncing==0.2.0
# `datasets` is imported below to load the RIR, AudioSet and FMA audio
!pip install datasets==2.14.6
!pip install deep-phonemizer==0.0.19

# Download required models (workaround for Colab)
os.makedirs("./openwakeword/openwakeword/resources/models")
!wget https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/embedding_model.onnx -O ./openwakeword/openwakeword/resources/models/embedding_model.onnx
!wget https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/embedding_model.tflite -O ./openwakeword/openwakeword/resources/models/embedding_model.tflite
!wget https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/melspectrogram.onnx -O ./openwakeword/openwakeword/resources/models/melspectrogram.onnx
!wget https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/melspectrogram.tflite -O ./openwakeword/openwakeword/resources/models/melspectrogram.tflite

# Imports
if "piper-sample-generator/" not in sys.path:
    sys.path.append("piper-sample-generator/")
from generate_samples import generate_samples
import numpy as np
import torch
from pathlib import Path
import uuid
import yaml
import datasets
import scipy
from tqdm import tqdm

## Download all data

## Download MIR RIR data (impulse responses)
output_dir = "./mit_rirs"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
    !git lfs install
    !git clone https://huggingface.co/datasets/davidscripka/MIT_environmental_impulse_responses
    rir_dataset = datasets.Dataset.from_dict({"audio": [str(i) for i in Path("./MIT_environmental_impulse_responses/16khz").glob("*.wav")]}).cast_column("audio", datasets.Audio())
    # Save clips to 16-bit PCM wav files
    for row in tqdm(rir_dataset):
        name = row['audio']['path'].split('/')[-1]
        scipy.io.wavfile.write(os.path.join(output_dir, name), 16000, (row['audio']['array']*32767).astype(np.int16))

## Download noise and background audio

# Audioset Dataset (https://research.google.com/audioset/dataset/index.html)
# Download one part of the audioset .tar files, extract, and convert to 16khz
# For full-scale training, it's recommended to download the entire dataset from
# https://huggingface.co/datasets/agkphysics/AudioSet, and
# even potentially combine it with other background noise datasets (e.g., FSD50k, Freesound, etc.)

if not os.path.exists("audioset"):
    os.mkdir("audioset")

    fname = "bal_train09.tar"
    out_dir = f"audioset/{fname}"
    link = "https://huggingface.co/datasets/agkphysics/AudioSet/resolve/main/data/" + fname
    !wget -O {out_dir} {link}
    !cd audioset && tar -xvf bal_train09.tar

    output_dir = "./audioset_16k"
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    # Save clips to 16-bit PCM wav files
    audioset_dataset = datasets.Dataset.from_dict({"audio": [str(i) for i in Path("audioset/audio").glob("**/*.flac")]})
    audioset_dataset = audioset_dataset.cast_column("audio", datasets.Audio(sampling_rate=16000))
    for row in tqdm(audioset_dataset):
        name = row['audio']['path'].split('/')[-1].replace(".flac", ".wav")
        scipy.io.wavfile.write(os.path.join(output_dir, name), 16000, (row['audio']['array']*32767).astype(np.int16))

# Free Music Archive dataset
# https://github.com/mdeff/fma

output_dir = "./fma"
if not os.path.exists(output_dir):
    from itertools import islice

    os.mkdir(output_dir)
    # Stream the dataset so only the clips that are actually used get fetched
    fma_dataset = datasets.load_dataset("rudraml/fma", name="small", split="train", streaming=True)
    fma_dataset = iter(fma_dataset.cast_column("audio", datasets.Audio(sampling_rate=16000)))

    # Save clips to 16-bit PCM wav files
    n_hours = 1  # use only 1 hour of clips for this example notebook, recommend increasing for full-scale training
    n_clips = n_hours*3600//30  # this works because the FMA dataset is all 30 second clips
    # islice stops after n_clips rows, or earlier if the stream runs out, so no manual
    # counter or break is needed and a short stream does not raise StopIteration
    for row in tqdm(islice(fma_dataset, n_clips), total=n_clips):
        name = row['audio']['path'].split('/')[-1].replace(".mp3", ".wav")
        scipy.io.wavfile.write(os.path.join(output_dir, name), 16000, (row['audio']['array']*32767).astype(np.int16))

# Download pre-computed openWakeWord features for training and validation, validation is made in house for licensing

# training set (~2,000 hours from the ACAV100M Dataset)
# See https://huggingface.co/datasets/davidscripka/openwakeword_features for more information
if not os.path.exists("./openwakeword_features_ACAV100M_2000_hrs_16bit.npy"):
    !wget https://huggingface.co/datasets/davidscripka/openwakeword_features/resolve/main/openwakeword_features_ACAV100M_2000_hrs_16bit.npy


# validation set for false positive rate estimation
if not os.path.exists("negative_features_dipco_ccv11.npy"):
    !wget https://huggingface.co/datasets/ethan3048/validfeaturescup/resolve/main/negative_features_dipco_ccv11.npy


# Training Parameters

Each parameter controls a different aspect of training:
- `number_of_examples` controls how many examples of your wakeword
are generated.

- `number_of_training_steps` controls how long to train the model.
Similar to the number of examples, the default (10,000) usually works well
but training longer usually helps.

- `false_activation_penalty` controls how strongly false activations
are penalized during the training process. Higher values can make the model
much less likely to activate when it shouldn't, but may also cause it
to not activate when the wake word isn't spoken clearly and there is
background noise.

- `custom_negative_phrases` is similar to the phrases chosen at the beginning, but these are variations that should not be accepted; they are used as adversarial examples. For example, with the target phrase "Hey Zelda", negative phrases could be: ["Hey Melda", "Say Zelda", "They sell the"]
  - This could potentially increase false rejections, so be careful with how these are selected.

# Time For Different Scales on T4 Low Ram

- number_of_examples = 30000, number_of_training_steps = 100000: ~1 - 1.5 hours
  - Seemingly strong performance both on synthetic metrics and manual testing
- number_of_examples = 50000, number_of_training_steps = 500000: ~2.5 - 3 hours
  - Improved performance, but not relative to time increase
- number_of_examples = 300000, number_of_training_steps = 1000000: ~10 - 14 hours
  - Percentage of improvement is not relative to amount of time increase, but in theory should still do better


When the model finishes training, you can navigate to the `my_custom_model` folder
in the file browser on the left (click on the folder icon), and download
the `<your target wake word>.onnx` or `<your target wake word>.tflite` files.
They are also copied to a Google Drive folder whose path is built from `destination_phrase`.

In [None]:
# Load default YAML config file for training
import yaml
# The config comes from the cloned openwakeword repo. yaml.Loader can construct arbitrary
# Python objects, so only point this at a file you trust. The `with` block closes the
# file handle as soon as the contents have been read.
with open("openwakeword/examples/custom_model.yml", 'r') as config_file:
    config = yaml.load(config_file.read(), yaml.Loader)

# Modify values in the config and save a new version
number_of_examples = 250
number_of_training_steps = 1000
false_activation_penalty = 3000
config["target_phrase"] = target_word
# The model (and its output files) is named after the first target phrase
config["model_name"] = config["target_phrase"][0].replace(" ", "_")
config["n_samples"] = number_of_examples
# Use a tenth of the training examples for validation, but never fewer than 500
config["n_samples_val"] = max(500, number_of_examples//10)
config["steps"] = number_of_training_steps
config["target_accuracy"] = 0.8
config["target_recall"] = 0.8
config["output_dir"] = "./my_custom_model"
config["max_negative_weight"] = false_activation_penalty
# config["custom_negative_phrases"] = []
config["background_paths"] = ['./audioset_16k', './fma']  # multiple background datasets are supported
# config["false_positive_validation_data_path"] = "/content/drive/MyDrive/your_data_folder/negative_features_dipco_ccv11.npy"
# Resolve against the working directory, which is where the download cell saves this file.
# On Colab, where the working directory is normally /content, this yields the same
# /content/negative_features_dipco_ccv11.npy path that used to be hard-coded here.
config["false_positive_validation_data_path"] = os.path.abspath("negative_features_dipco_ccv11.npy")
config["feature_data_files"] = {"ACAV100M_sample": "openwakeword_features_ACAV100M_2000_hrs_16bit.npy"}

# Write the customised config next to the notebook; the train.py steps read it back
with open('my_model.yaml', 'w') as file:
    yaml.dump(config, file)

# Step 1: Generate clips
# Each step runs train.py with the kernel's own interpreter (sys.executable) against my_model.yaml
!{sys.executable} openwakeword/openwakeword/train.py --training_config my_model.yaml --generate_clips

# Step 2: Augment the generated clips

!{sys.executable} openwakeword/openwakeword/train.py --training_config my_model.yaml --augment_clips

# Step 3: Train model

!{sys.executable} openwakeword/openwakeword/train.py --training_config my_model.yaml --train_model

# Manually save to tflite as this doesn't work right in colab
def convert_onnx_to_tflite(onnx_model_path, output_path):
    """Convert an openwakeword ONNX model file into a TensorFlow Lite file.

    The ONNX model is loaded, exported as a TensorFlow graph into a temporary
    directory, converted with ``TFLiteConverter`` and written to ``output_path``.

    Args:
        onnx_model_path: Path of the ONNX model to read.
        output_path: Path the converted tflite model is written to.

    Returns:
        None.
    """
    # Conversion dependencies are imported here, at call time, rather than at the top of the cell
    import onnx
    import logging
    import tempfile
    from onnx_tf.backend import prepare
    import tensorflow as tf

    # Wrap the ONNX graph in its TensorFlow representation
    tf_backend_model = prepare(onnx.load(onnx_model_path), device="CPU")

    # The intermediate TensorFlow export only needs to live until the conversion is done
    with tempfile.TemporaryDirectory() as scratch_dir:
        saved_model_dir = os.path.join(scratch_dir, "tf_model")
        tf_backend_model.export_graph(saved_model_dir)
        tflite_bytes = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir).convert()

        logging.info(f"####\nSaving tflite mode to '{output_path}'")
        with open(output_path, 'wb') as out_file:
            out_file.write(tflite_bytes)

import shutil  # Import shutil for high-level file operations

source_path = './my_custom_model'
destination_phrase = "hey_tester"  # parent repo to each individual run
destination_phrase_version = "hey_tester_codeclean"  # Base name for the version + iteration number

base_destination_path = "/content/drive/MyDrive/SoftAcuity Models/completed_models"

# Probe <version>_1, <version>_2, ... and stop at the first directory that does not exist yet
counter = 1
while True:
    destination_path = os.path.join(base_destination_path, destination_phrase, f"{destination_phrase_version}_{counter}")
    if not os.path.exists(destination_path):
        break
    counter += 1

# destination_path is unused at this point; create it (and any missing parents)
os.makedirs(destination_path, exist_ok=True)

model_stem = config['model_name']

# Convert the trained ONNX model, writing the tflite result straight into the destination
convert_onnx_to_tflite(f"{source_path}/{model_stem}.onnx", f"{destination_path}/{model_stem}.tflite")

# Copy only .onnx and .tflite files
# NOTE(review): a same-named .tflite in source_path would replace the file converted above - confirm this is intended
for filename in os.listdir(source_path):
    if filename.endswith(('.onnx', '.tflite')):
        shutil.copy(os.path.join(source_path, filename), destination_path)

from google.colab import files

# Automatically download the trained model files (ONNX first, then tflite)
for extension in ("onnx", "tflite"):
    files.download(f"{destination_path}/{model_stem}.{extension}")