In [10]:
import librosa
import numpy as np
import os
import json
import pickle
import pyaudio
import wave
import speech_recognition as sr
import time
import serial

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from micromlgen import port

In [4]:
# Notebook-wide configuration. Names marked "not referenced" are not used by
# any of the cells visible in this notebook section.

# Root folder walked by preprocess_dataset; each sub-folder is one label.
DATASET_PATH = "mini_speech_commands"
# Sample count used to trim clips to a fixed length during feature extraction,
# and the capture/load rate (Hz) used by record_audio and predict_command.
SAMPLE_RATE = 22050
# File the extracted features are written to and later loaded from.
JSON_PATH = "data.json"
# Not referenced by the visible cells.
FRAMES = []
# Presumably the scale factor mapping int16 samples to [-1, 1) — not referenced
# by the visible cells; TODO confirm it is still needed.
SHORT_NORMALIZE = (1.0/32768.0)
# Not referenced by the visible cells (record_audio uses FRAMES_PER_BUFFER).
CHUNK = 1024
# Sample format passed to PyAudio when opening the input stream.
FORMAT = pyaudio.paInt16
# Channel count used for both the input stream and the saved WAV file.
CHANNELS = 1
# Multiplier in record_audio's chunk-count computation
# (SAMPLE_RATE / FRAMES_PER_BUFFER * DURATION), i.e. the recording length.
DURATION = 2
# Not referenced by the visible cells; recording uses SAMPLE_RATE instead.
RATE = 44100
# Not referenced by the visible cells (record_audio writes "test.wav").
TEMPORARY_FILE = "temp.wav"
# Not referenced by the visible cells.
SAMPLES_PER_SEGMENT = SAMPLE_RATE * DURATION
# Not referenced by the visible cells (the functions hard-code n_mfcc=13).
EXPECTED_MFCC_VECTOR_COUNT = 13
# Index -> spoken word, used to turn the predicted class index into text.
# NOTE(review): this must match the label indices assigned during
# preprocessing — verify against the "mapping" entry in data.json.
MAPPING = [ "down", "stop", "up"]
# Number of frames requested per stream.read() call in record_audio.
FRAMES_PER_BUFFER = 1024


# NOTE(review): the three objects below are not referenced by the visible
# cells; record_audio creates its own PyAudio instance. Confirm they are
# needed before keeping them, since they are created at import/run time.
mic = sr.Microphone(device_index=0)
rec = sr.Recognizer()
audio = pyaudio.PyAudio()

In [5]:
# preprocess data
def preprocess_dataset(dataset_path, json_path, n_mfcc=13, n_fft=2048, hop_length=512):
    """Extract MFCC features for every clip under ``dataset_path`` and save them as JSON.

    Every directory below ``dataset_path`` is treated as one class: its folder
    name is appended to ``"mapping"`` and its position in that list is the
    integer label stored for each of its clips. Clips shorter than
    ``SAMPLE_RATE`` samples are skipped; longer ones are trimmed to exactly
    ``SAMPLE_RATE`` samples so every feature matrix has the same shape.

    Args:
        dataset_path: Root folder of the dataset (``str`` or path-like).
        json_path: File the resulting dictionary is written to.
        n_mfcc: Number of MFCC coefficients per frame.
        n_fft: FFT window size passed to librosa.
        hop_length: Hop length passed to librosa.

    The JSON document has the keys ``"mapping"``, ``"labels"``, ``"mfcc"``
    and ``"files"``.
    """
    # dictionary to store data
    data = {
        "mapping": [],
        "labels": [],
        "mfcc": [],
        "files": []
    }

    # Compare normalised paths by value: identity (`is`) only matches when the
    # caller happens to pass the very same str object that os.walk yields.
    root = os.path.normpath(dataset_path)

    # loop through all sub-dirs
    for dirpath, dirnames, filenames in os.walk(dataset_path):

        # Sorting in place makes os.walk descend alphabetically, so the label
        # indices are reproducible across machines and file systems.
        dirnames.sort()

        # ensure we're not at root level
        if os.path.normpath(dirpath) == root:
            continue

        # save label (i.e., sub-dir name) in the mapping; basename handles
        # both "/" and "\\" separators, unlike splitting on "/"
        semantic_label = os.path.basename(os.path.normpath(dirpath))
        data["mapping"].append(semantic_label)
        label_index = len(data["mapping"]) - 1
        print("\nProcessing: '{}'".format(semantic_label))

        # process files for a specific sub-dir
        for f in sorted(filenames):

            # load audio file, resampled to the rate the trimming below assumes
            file_path = os.path.join(dirpath, f)
            signal, sample_rate = librosa.load(file_path, sr=SAMPLE_RATE)

            # ensure consistency of the length of the signal
            if len(signal) < SAMPLE_RATE:
                continue
            signal = signal[:SAMPLE_RATE]

            # extract MFCCs; transpose so rows are frames, columns coefficients
            mfcc = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
            mfcc = mfcc.T

            # store data for analysed track
            data["mfcc"].append(mfcc.tolist())
            data["labels"].append(label_index)
            data["files"].append(file_path)

    # save MFCCs to json file
    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)

    print("Finished processing.")

if __name__ == "__main__":
  # Walk DATASET_PATH and write the extracted MFCC features to JSON_PATH.
  preprocess_dataset(DATASET_PATH, JSON_PATH)


Processing: 'mini_speech_commands\down'

Processing: 'mini_speech_commands\stop'

Processing: 'mini_speech_commands\up'
Finished processing.


In [4]:
# Load Data
def load_data(dataset_path):
    """Read the feature JSON at ``dataset_path`` and return ``(inputs, targets)``.

    ``inputs`` is built from the ``"mfcc"`` entry and ``targets`` from the
    ``"labels"`` entry; both are returned as numpy arrays.
    """
    with open(dataset_path, "r") as json_file:
        payload = json.load(json_file)

    # turn the two JSON lists into numpy arrays, features first
    features, labels = (np.array(payload[key]) for key in ("mfcc", "labels"))

    return features, labels

In [140]:
# Train data using scikit learn
def train_test_model(dataset_path, test_size=0.2, random_state=None):
    """Train a logistic-regression classifier on the stored MFCC features.

    Loads the features via ``load_data``, splits them into train and test
    sets, flattens each sample's MFCC matrix into a single feature vector,
    fits the model and prints the test accuracy. The fitted model is saved
    to ``model.sav`` (pickle) and its coefficients/intercepts to
    ``model.json``.

    Args:
        dataset_path: Path of the JSON feature file written during preprocessing.
        test_size: Fraction of samples held out for evaluation.
        random_state: Seed for the train/test shuffle; pass an int for a
            reproducible split.
    """
    # load data
    X, y = load_data(dataset_path)

    # create train/test split (train_test_split has no `validation_split`
    # argument; passing one raises TypeError)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # create the logistic-regression classifier
    # NOTE(review): recent scikit-learn releases deprecate `multi_class`;
    # confirm against the installed version before upgrading.
    model = LogisticRegression(
        solver='lbfgs',
        multi_class='multinomial',
        max_iter=100,
        fit_intercept=True,
        n_jobs=3,
        C=0.1,
        class_weight=None,
        intercept_scaling=1,
        penalty='l2',
        random_state=None,
        tol=0.0001,
        verbose=0,
        warm_start=False
    )

    # flatten each (frames, coefficients) matrix into one feature vector
    X_train = X_train.reshape(X_train.shape[0], -1)
    X_test = X_test.reshape(X_test.shape[0], -1)

    # Print some details
    print("X_train.shape: {}".format(X_train.shape))
    print("X_test.shape: {}".format(X_test.shape))
    print("y_train.shape: {}".format(y_train.shape))
    print("y_test.shape: {}".format(y_test.shape))

    # train the model
    model.fit(X_train, y_train)

    # evaluate on the held-out samples
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

    # save the model using pickle; the context manager closes the file
    with open("model.sav", 'wb') as model_file:
        pickle.dump(model, model_file)

    # save the model parameters in json format
    model_param = {
        "coef_": model.coef_.tolist(),
        "intercept_": model.intercept_.tolist(),
    }
    with open("model.json", "w") as json_file:
        json_file.write(json.dumps(model_param, indent=4))

    # export to plain C (the generated source is currently neither stored
    # nor returned; print or write `c_code` to use it)
    c_code = port(model, instance_name="MLClassifier")

    print("Accuracy: {:.2f}%".format(accuracy*100))
    
    # print(c_code)
    

In [133]:
# Run the model
if __name__ == "__main__":
  # Fit and evaluate the classifier on the features stored at JSON_PATH;
  # train_test_model also writes model.sav and model.json.
  print("Training model...")
  train_test_model(JSON_PATH)

Training model...
X_train.shape: (1877, 572)
X_test.shape: (805, 572)
y_train.shape: (1877,)
y_test.shape: (805,)
Accuracy: 74.78%


In [17]:
# read audio and save it to a file
def record_audio(filename="test.wav"):
    """Record from the default input device and save the audio as a WAV file.

    Captures ``int(SAMPLE_RATE / FRAMES_PER_BUFFER * DURATION)`` buffers of
    ``FRAMES_PER_BUFFER`` frames each, using ``FORMAT``, ``CHANNELS`` and
    ``SAMPLE_RATE``, then writes them to ``filename``.

    Args:
        filename: Destination WAV path.

    Returns:
        The path of the written WAV file.
    """
    p = pyaudio.PyAudio()
    try:
        # read the sample width while the PyAudio instance is still alive
        sample_width = p.get_sample_size(FORMAT)

        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=SAMPLE_RATE,
                        input=True,
                        frames_per_buffer=FRAMES_PER_BUFFER)
        try:
            print("Recording audio...")

            # raw byte chunks exactly as delivered by the stream
            frames = []
            for _ in range(int(SAMPLE_RATE / FRAMES_PER_BUFFER * DURATION)):
                frames.append(stream.read(FRAMES_PER_BUFFER))

            print("Finished recording.")
        finally:
            # stop and close stream even if reading failed
            stream.stop_stream()
            stream.close()
    finally:
        # release the audio backend; without this every call leaks an instance
        p.terminate()

    # save as WAV file; the context manager closes it on error as well
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(SAMPLE_RATE)
        wf.writeframes(b''.join(frames))

    return filename

# preprocess and predict what class the command is
def predict_command(model, filename, n_mfcc=13, n_fft=2048, hop_length=512):
    """Predict the class index of the spoken command stored in ``filename``.

    The clip is loaded at ``SAMPLE_RATE`` and forced to exactly
    ``SAMPLE_RATE`` samples (trimmed, or zero-padded when shorter) so the
    flattened MFCC vector always has the length the model was trained on.

    Args:
        model: Fitted classifier exposing ``predict``.
        filename: Path of the audio file to classify.
        n_mfcc: Number of MFCC coefficients per frame.
        n_fft: FFT window size passed to librosa.
        hop_length: Hop length passed to librosa.

    Returns:
        The first (only) element of ``model.predict`` for this clip.
    """
    signal, sample_rate = librosa.load(filename, sr=SAMPLE_RATE)

    # ensure consistency of the length of the signal; a shorter clip would
    # otherwise produce fewer frames and a feature vector of the wrong size
    if len(signal) < SAMPLE_RATE:
        signal = np.pad(signal, (0, SAMPLE_RATE - len(signal)), mode="constant")
    else:
        signal = signal[:SAMPLE_RATE]

    # extract MFCCs; transpose so rows are frames, columns coefficients
    mfcc = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
    mfcc = mfcc.T

    # flatten into a single-row 2d array, the shape predict() expects
    features = mfcc.reshape(1, -1)

    # predict command
    return model.predict(features)[0]

# export prediction to arduino
# def export_to_arduino(prediction):
#     # connect to arduino
#     ser = serial.Serial('COM3', 9600)
#     time.sleep(2)

#     # send prediction to arduino
#     ser.write(bytes(prediction, 'utf-8'))

#     # close connection
#     ser.close()

# Run the model
if __name__ == "__main__":
    # load the model
    # NOTE: unpickling can execute arbitrary code; only load a model.sav
    # that this notebook produced itself.
    with open("model.sav", 'rb') as model_file:
        model = pickle.load(model_file)

    # record the audio
    filename = record_audio()

    try:
        # predict the command
        prediction = predict_command(model, filename)
        predic_string = MAPPING[prediction]
        print("Prediction: {}".format(predic_string))

        # send to arduino
        # export_to_arduino(predic_string)
    finally:
        # remove the recording even when prediction fails
        os.remove(filename)

Recording audio...
Finished recording.
Prediction: down
