In [None]:
!npm cache clean -f
!npm install -g n
!n stable


In [None]:
!python -m pip install soundfile
!npm install -g --unsafe-perm edge-impulse-cli

In [None]:
# --- Working locations ------------------------------------------------------
BASE_DIR = "/content"
OUT_DIR = "keywords_curated"

# --- Google Speech Commands dataset -----------------------------------------
GOOGLE_DATASET_FILENAME = "speech_commands_v0.02.tar.gz"
GOOGLE_DATASET_URL = f"http://download.tensorflow.org/data/{GOOGLE_DATASET_FILENAME}"
GOOGLE_DATASET_DIR = "google_speech_commands"

# --- Custom keywords dataset (GitHub archive) -------------------------------
CUSTOM_KEYWORDS_FILENAME = "main.zip"
CUSTOM_KEYWORDS_URL = f"https://github.com/ShawnHymel/custom-speech-commands-dataset/archive/{CUSTOM_KEYWORDS_FILENAME}"
CUSTOM_KEYWORDS_DIR = "custom_keywords"
CUSTOM_KEYWORDS_REPO_NAME = "custom-speech-commands-dataset-main"

# --- Helper scripts fetched from the ei-keyword-spotting repository ---------
CURATION_SCRIPT = "dataset-curation.py"
CURATION_SCRIPT_URL = f"https://raw.githubusercontent.com/ShawnHymel/ei-keyword-spotting/master/{CURATION_SCRIPT}"
UTILS_SCRIPT_URL = "https://raw.githubusercontent.com/ShawnHymel/ei-keyword-spotting/master/utils.py"

# --- Mixing parameters for the output samples -------------------------------
NUM_SAMPLES = 1500           # Target number of samples to mix and send to Edge Impulse
WORD_VOL = 1.0               # Relative volume of word in output sample
BG_VOL = 0.1                 # Relative volume of noise in output sample
SAMPLE_TIME = 1.0            # Time (seconds) of output sample
SAMPLE_RATE = 16000          # Sample rate (Hz) of output sample
BIT_DEPTH = "PCM_16"         # Options: [PCM_16, PCM_24, PCM_32, PCM_U8, FLOAT, DOUBLE]
BG_DIR = "_background_noise_"

# --- Train/test split and Edge Impulse ingestion endpoints ------------------
TEST_RATIO = 0.2             # 20% reserved for test set, rest is for training
EI_INGEST_TEST_URL = "https://ingestion.edgeimpulse.com/api/test/data"
EI_INGEST_TRAIN_URL = "https://ingestion.edgeimpulse.com/api/training/data"

In [None]:
!cd {BASE_DIR}
!wget {GOOGLE_DATASET_URL}
!mkdir {GOOGLE_DATASET_DIR}
!echo "Extracting..."
!tar xfz {GOOGLE_DATASET_FILENAME} -C {GOOGLE_DATASET_DIR}

In [None]:
# ---- Settings to fill in before running the rest of the notebook ----

# Keywords to train on, as one comma-separated string. Each word must match
# the name of a directory that contains samples.
# Recommended: use 2 keywords for the microcontroller demo.
TARGETS = "go, stop"

# API key of the destination project; find it under
# Edge Impulse > your_project > Dashboard > Keys
EI_API_KEY = " "

# Left empty by default (presumably the location of your own recorded keyword
# samples -- it is not read by any cell visible here; confirm before relying on it).
CUSTOM_DATASET_PATH = ""

In [None]:
!wget {CURATION_SCRIPT_URL}
!wget {UTILS_SCRIPT_URL}

In [None]:
!cd {BASE_DIR}

# Imports
import os
import random

# Seed with system time
random.seed()

# Go through each category in our curated dataset
for dir in os.listdir(OUT_DIR):
  
  # Create list of files for one category
  paths = []
  for filename in os.listdir(os.path.join(OUT_DIR, dir)):
    paths.append(os.path.join(OUT_DIR, dir, filename))

  # Shuffle and divide into test and training sets
  random.shuffle(paths)
  num_test_samples = int(TEST_RATIO * len(paths))
  test_paths = paths[:num_test_samples]
  train_paths = paths[num_test_samples:]

  # Create arugments list (as a string) for CLI call
  test_paths = ['"' + s + '"' for s in test_paths]
  test_paths = ' '.join(test_paths)
  train_paths = ['"' + s + '"' for s in train_paths]
  train_paths = ' '.join(train_paths)

In [None]:
# Send test files to Edge Impulse
  !edge-impulse-uploader \
    --category testing \
    --label {dir} \
    --api-key {EI_API_KEY} \
    --silent \
    {test_paths}

  # # Send training files to Edge Impulse
  !edge-impulse-uploader \
    --category training \
    --label {dir} \
    --api-key {EI_API_KEY} \
    --silent \
    {train_paths}