In [1]:
import librosa
import numpy as np
import os
import glob
import pickle
import soundfile
import json

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
DATASET_PATH = "mini_speech_commands"
SAMPLE_RATE = 22050
JSON_PATH = "data.json"

# preprocess data
def preprocess_dataset(dataset_path, json_path, n_mfcc=13, n_fft=2048, hop_length=512):
  
      # dictionary to store data
      data = {
          "mapping": [],
          "labels": [],
          "mfcc": [],
          "files": []
      }
  
      # loop through all sub-dirs
      for i, (dirpath, dirnames, filenames) in enumerate(os.walk(dataset_path)):
  
          # ensure we're not at root level
          if dirpath is not dataset_path:
  
              # save label (i.e., sub-dir name) in the mapping
              dirpath_components = dirpath.split("/") # "mini_speech_commands/down" => ["mini_speech_commands", "down"]
              semantic_label = dirpath_components[-1]
              data["mapping"].append(semantic_label)
              print("\nProcessing: '{}'".format(semantic_label))
  
              # process files for a specific sub-dir
              for f in filenames:
  
                  # load audio file
                  file_path = os.path.join(dirpath, f)
                  signal, sample_rate = librosa.load(file_path)

                  if len(signal) >= SAMPLE_RATE: # ensure consistency of the length of the signal
                    signal = signal[:SAMPLE_RATE]
  
                    # extract MFCCs
                    mfcc = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
                    mfcc = mfcc.T
  
                    # store data for analysed track
                    data["mfcc"].append(mfcc.tolist())
                    data["labels"].append(i-1)
                    data["files"].append(file_path)
                    print("{}: {}".format(file_path, i-1))
  
      # save MFCCs to json file
      with open(json_path, "w") as fp:
          json.dump(data, fp, indent=4)
  
      print("Finished processing.")

if __name__ == "__main__":
  preprocess_dataset(DATASET_PATH, JSON_PATH)


Processing: 'mini_speech_commands\down'
mini_speech_commands\down\004ae714_nohash_0.wav: 0
mini_speech_commands\down\00b01445_nohash_1.wav: 0
mini_speech_commands\down\00f0204f_nohash_0.wav: 0
mini_speech_commands\down\0132a06d_nohash_1.wav: 0
mini_speech_commands\down\0132a06d_nohash_4.wav: 0
mini_speech_commands\down\0137b3f4_nohash_2.wav: 0
mini_speech_commands\down\014f9f65_nohash_0.wav: 0
mini_speech_commands\down\016e2c6d_nohash_0.wav: 0
mini_speech_commands\down\016e2c6d_nohash_2.wav: 0
mini_speech_commands\down\01b4757a_nohash_0.wav: 0
mini_speech_commands\down\01bb6a2a_nohash_1.wav: 0
mini_speech_commands\down\01bb6a2a_nohash_3.wav: 0
mini_speech_commands\down\01d22d03_nohash_0.wav: 0
mini_speech_commands\down\02746d24_nohash_0.wav: 0
mini_speech_commands\down\0447d7c1_nohash_2.wav: 0
mini_speech_commands\down\0474c92a_nohash_0.wav: 0
mini_speech_commands\down\05b2db80_nohash_2.wav: 0
mini_speech_commands\down\05cf43ef_nohash_0.wav: 0
mini_speech_commands\down\063d48cf_nohash

In [3]:
# Load Data
def load_data(dataset_path):
    with open(dataset_path, "r") as fp:
        data = json.load(fp)
  
    # convert lists into numpy arrays
    inputs = np.array(data["mfcc"])
    targets = np.array(data["labels"])
  
    return inputs, targets

In [4]:
# Train data using scikit learn
def train_test_model(dataset_path):
    # load data
    X, y = load_data(dataset_path)
  
    # create train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    # create network
    model = MLPClassifier(
        hidden_layer_sizes=(512, 256),
        activation="relu",
        solver="adam",
        batch_size=32,
        verbose=1,
        epsilon=1e-8,
        alpha=0.0001,
        learning_rate="adaptive",
        max_iter=100
    )

    # reshape the 3d array to 2d array
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1] * X_train.shape[2])
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1] * X_test.shape[2])
    
    # Print some details
    print("X_train.shape: {}".format(X_train.shape))
    print("X_test.shape: {}".format(X_test.shape))
    print("y_train.shape: {}".format(y_train.shape))
    print("y_test.shape: {}".format(y_test.shape))

    # train network
    # model = LogisticRegression(max_iter=200)
    model.fit(X_train, y_train)
  
    # evaluate network
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
  
    print("Accuracy: {:.2f}%".format(accuracy*100))

In [5]:
# Run the model
if __name__ == "__main__":
  print("Training model...")
  train_test_model(JSON_PATH)

Training model...
X_train.shape: (1877, 572)
X_test.shape: (805, 572)
y_train.shape: (1877,)
y_test.shape: (805,)
Iteration 1, loss = 10.90595422
Iteration 2, loss = 3.12507660
Iteration 3, loss = 1.98985027
Iteration 4, loss = 1.69457165
Iteration 5, loss = 1.24801847
Iteration 6, loss = 0.55298965
Iteration 7, loss = 0.62959053
Iteration 8, loss = 0.59108032
Iteration 9, loss = 0.32548261
Iteration 10, loss = 0.46215972
Iteration 11, loss = 0.74854468
Iteration 12, loss = 0.48559473
Iteration 13, loss = 0.25250022
Iteration 14, loss = 0.43078723
Iteration 15, loss = 0.31305936
Iteration 16, loss = 0.14202485
Iteration 17, loss = 0.17319033
Iteration 18, loss = 0.14398387
Iteration 19, loss = 0.18755204
Iteration 20, loss = 0.11848814
Iteration 21, loss = 0.14234597
Iteration 22, loss = 0.19218380
Iteration 23, loss = 0.25828339
Iteration 24, loss = 0.41216430
Iteration 25, loss = 0.26096148
Iteration 26, loss = 0.20692318
Iteration 27, loss = 0.55644164
Iteration 28, loss = 0.3285473