In [11]:
import librosa
import numpy as np
import os
import json
import pickle

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [7]:
# Root folder of the dataset; every sub-directory below it is treated as one class.
DATASET_PATH = "mini_speech_commands"
# Number of samples kept per clip: shorter clips are skipped, longer ones are truncated.
SAMPLE_RATE = 22050
# Destination file for the extracted MFCCs, labels, label mapping and file paths.
JSON_PATH = "data.json"

# preprocess data
def preprocess_dataset(dataset_path, json_path, n_mfcc=13, n_fft=2048, hop_length=512):
    """Extract MFCCs for every audio clip under ``dataset_path`` and save them as JSON.

    Each sub-directory of ``dataset_path`` is treated as one class: its name is
    appended to ``mapping`` and the position in that list is used as the
    integer label for all clips found inside it.

    Args:
        dataset_path: Root directory of the dataset (``str`` or path-like).
        json_path: File the resulting dictionary is written to.
        n_mfcc: Number of MFCC coefficients to compute per frame.
        n_fft: FFT window size handed to ``librosa.feature.mfcc``.
        hop_length: Hop length handed to ``librosa.feature.mfcc``.

    The JSON document has four keys: ``mapping`` (class names), ``labels``
    (index into ``mapping`` per clip), ``mfcc`` (one transposed MFCC matrix
    per clip, as nested lists) and ``files`` (path of each clip).

    Clips with fewer than ``SAMPLE_RATE`` samples are skipped and longer ones
    are truncated to ``SAMPLE_RATE`` samples so every MFCC matrix has the
    same number of frames.
    """
    data = {
        "mapping": [],
        "labels": [],
        "mfcc": [],
        "files": []
    }

    # Compare normalised paths: an identity check (`is not`) only works when
    # os.walk happens to hand back the very same string object, and fails for
    # path-like inputs or differently spelled roots.
    root = os.path.normpath(os.fspath(dataset_path))

    for dirpath, dirnames, filenames in os.walk(dataset_path):
        # Sorting in place makes os.walk visit sub-directories in a stable
        # order, so the label assigned to each class is reproducible.
        dirnames.sort()

        # The root level holds no clips of its own; only sub-dirs are classes.
        if os.path.normpath(dirpath) == root:
            continue

        # os.path.basename handles the platform separator, unlike split("/"),
        # which leaves "mini_speech_commands\\down" unsplit on Windows.
        semantic_label = os.path.basename(os.path.normpath(dirpath))
        data["mapping"].append(semantic_label)
        # Tie the label to the mapping position instead of the walk counter.
        label_index = len(data["mapping"]) - 1
        print("\nProcessing: '{}'".format(semantic_label))

        for f in sorted(filenames):
            file_path = os.path.join(dirpath, f)
            # Resample explicitly so the length check below always refers to
            # the same rate the signal was loaded with.
            signal, sample_rate = librosa.load(file_path, sr=SAMPLE_RATE)

            # Skip clips too short to yield a full-length feature matrix.
            if len(signal) < SAMPLE_RATE:
                continue
            signal = signal[:SAMPLE_RATE]

            mfcc = librosa.feature.mfcc(
                y=signal,
                sr=sample_rate,
                n_mfcc=n_mfcc,
                n_fft=n_fft,
                hop_length=hop_length,
            )

            # Store the transposed matrix as plain lists so it is JSON-serialisable.
            data["mfcc"].append(mfcc.T.tolist())
            data["labels"].append(label_index)
            data["files"].append(file_path)

    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)

    print("Finished processing.")

# Build the MFCC dataset file when this cell runs as the main program.
if __name__ == "__main__":
    preprocess_dataset(dataset_path=DATASET_PATH, json_path=JSON_PATH)


Processing: 'mini_speech_commands\down'

Processing: 'mini_speech_commands\stop'

Processing: 'mini_speech_commands\up'
Finished processing.


In [8]:
# Load Data
def load_data(dataset_path):
    """Read the preprocessed JSON file and return its features and labels.

    Args:
        dataset_path: Path of the JSON file holding "mfcc" and "labels" entries.

    Returns:
        A tuple ``(inputs, targets)`` of numpy arrays built from the "mfcc"
        and "labels" lists respectively.
    """
    with open(dataset_path, "r") as json_file:
        payload = json.load(json_file)

    # nested lists -> numpy arrays
    return np.array(payload["mfcc"]), np.array(payload["labels"])

In [12]:
# Train data using scikit learn
def train_test_model(dataset_path, test_size=0.3, random_state=None, model_path="model.sav"):
    """Train an MLP classifier on the preprocessed MFCCs, report accuracy and save it.

    Args:
        dataset_path: Path of the JSON file produced by the preprocessing step.
        test_size: Fraction of the samples held out for evaluation.
        random_state: Seed forwarded to both the train/test split and the
            classifier; ``None`` (the default) keeps the run non-deterministic.
        model_path: File the fitted model is pickled to.

    Raises:
        ValueError: If the loaded features are empty or not a 3-D array.
    """
    # load data
    X, y = load_data(dataset_path)

    # Fail early with a clear message instead of an opaque reshape/index error.
    if X.ndim != 3 or X.shape[0] == 0:
        raise ValueError(
            "Expected a non-empty 3-D feature array, got shape {}".format(X.shape)
        )

    # The classifier takes 2-D input, so flatten each sample's matrix to a vector.
    X = X.reshape(X.shape[0], X.shape[1] * X.shape[2])

    # create train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # create network
    # NOTE: scikit-learn documents `learning_rate` as only used by the "sgd"
    # solver, so "adaptive" has no effect with "adam"; kept to preserve the
    # original configuration.
    model = MLPClassifier(
        hidden_layer_sizes=(512, 256),
        activation="relu",
        solver="adam",
        batch_size=32,
        verbose=1,
        epsilon=1e-8,
        alpha=0.0001,
        learning_rate="adaptive",
        max_iter=100,
        random_state=random_state
    )

    # Print some details
    print("X_train.shape: {}".format(X_train.shape))
    print("X_test.shape: {}".format(X_test.shape))
    print("y_train.shape: {}".format(y_train.shape))
    print("y_test.shape: {}".format(y_test.shape))

    # train network
    model.fit(X_train, y_train)

    # evaluate network
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

    # Save the model; the context manager guarantees the file handle is closed.
    # Only unpickle this file from a trusted location: pickle can run arbitrary code.
    with open(model_path, "wb") as model_file:
        pickle.dump(model, model_file)

    print("Accuracy: {:.2f}%".format(accuracy*100))

In [13]:
# Run the model
# Train, evaluate and persist the classifier when this cell runs as the main program.
if __name__ == "__main__":
    print("Training model...")
    train_test_model(dataset_path=JSON_PATH)

Training model...
X_train.shape: (1877, 572)
X_test.shape: (805, 572)
y_train.shape: (1877,)
y_test.shape: (805,)
Iteration 1, loss = 12.07024103
Iteration 2, loss = 3.51150639
Iteration 3, loss = 2.66848892
Iteration 4, loss = 2.03278477
Iteration 5, loss = 1.86115911
Iteration 6, loss = 1.11647758
Iteration 7, loss = 0.93903701
Iteration 8, loss = 0.55191036
Iteration 9, loss = 0.31783776
Iteration 10, loss = 0.35345651
Iteration 11, loss = 0.38528393
Iteration 12, loss = 0.45499704
Iteration 13, loss = 0.28918085
Iteration 14, loss = 0.24256560
Iteration 15, loss = 0.23396956
Iteration 16, loss = 0.19265405
Iteration 17, loss = 0.15569327
Iteration 18, loss = 0.10428395
Iteration 19, loss = 0.14264763
Iteration 20, loss = 0.26988042
Iteration 21, loss = 0.36026191
Iteration 22, loss = 0.45348788
Iteration 23, loss = 0.27612799
Iteration 24, loss = 0.21322782
Iteration 25, loss = 0.28248682
Iteration 26, loss = 0.17637604
Iteration 27, loss = 0.20332008
Iteration 28, loss = 0.2158350