# Imports

In [1]:
import os
import time

import librosa
import pickle
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm, metrics
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Constants

In [2]:
# Directories holding the raw WAV files of each NSynth split.
TRAIN_BASE_DIRECTORY = "/home/devin/Documents/datasets/nsynth-train/audio"
VALID_BASE_DIRECTORY = "/home/devin/Documents/datasets/nsynth-valid/audio"
TEST_BASE_DIRECTORY = "/home/devin/Documents/datasets/nsynth-test/audio"

# Number of MFCC coefficients extracted per audio frame.
NUM_MFCC_FEATURES = 40

# When True, the load cell reads the cached arrays from X_PICKLE / Y_PICKLE
# instead of recomputing the features from the WAV files.
LOAD_PICKLE = False
X_PICKLE = "/home/devin/Documents/datasets/nsynth-train/X_train.pickle"
Y_PICKLE = "/home/devin/Documents/datasets/nsynth-train/y_train.pickle"

# Fraction of the loaded samples held out as the test set.
TRAIN_TEST_SPLIT = 0.10

# Class index -> instrument label, in alphabetical order. The labels are the
# first '_'-separated word of each file name, so the tenth class is "synth"
# (it was previously an empty string, which made that class print as blank).
STR_INT_MAP = [
  "bass",
  "brass",
  "flute",
  "guitar",
  "keyboard",
  "mallet",
  "organ",
  "reed",
  "string",
  "synth",
  "vocal",
]

# Helper Methods

In [3]:
def grab_label(file: str) -> str:
    '''
    Extracts the instrument label from the path of a WAV audio file.

    The file names look like:
      /path/to/audio/bass_some_other_words.wav

    Steps:
      1. Drop the directory part         -> 'bass_some_other_words.wav'
      2. Keep the text before the first '.' -> 'bass_some_other_words'
      3. Keep the text before the first '_' -> 'bass'

    The directory part is removed with os.path.basename rather than a
    hard-coded '/' split, so paths built with os.path.join are handled
    with whatever separator the current platform uses.

    Note that only the first word is kept, so a multi-word family name is
    reduced to its first word.

    Args:
      file: Path (or bare file name) of the audio file.

    Returns:
      The instrument label, e.g. 'bass'.
    '''
    filename = os.path.basename(file)
    return filename.split(".")[0].split("_")[0]

In [4]:
def grab_features(
  file: str,
  num_features: int,
  sample_rate: int = 16_000,
  duration: float = 4.0
) -> np.ndarray:
  '''
  Computes a flat MFCC feature vector for one WAV file.

  The audio is loaded with librosa.load, resampled to `sample_rate` and
  limited to the first `duration` seconds. librosa.feature.mfcc then yields
  a 2D array of shape (num_features, frames), which is flattened to 1D.

  With the defaults (16 kHz, 4 seconds) nsynth files give 126 frames, so
  num_features = 40 produces a vector of 40 * 126 = 5040 values.

  Args:
    file: Path of the audio file to load.
    num_features: Number of MFCC coefficients per frame.
    sample_rate: Sampling rate, in Hz, the audio is loaded at.
    duration: Maximum number of seconds of audio to load.

  Returns:
    A 1D numpy array of length num_features * frames.
  '''
  sig, sr = librosa.load(
    path=file,
    sr=sample_rate,
    duration=float(duration)
  )

  features = librosa.feature.mfcc(
    y=sig,
    sr=sr,
    n_mfcc=num_features
  )

  # Row-major flatten: all frames of coefficient 0, then coefficient 1, ...
  return np.asarray(features).flatten()

In [12]:
def load(directory: str, num_features: int):
    '''
    Loads every audio file in `directory` into feature and label arrays.

    All regular files in the directory are assumed to be WAV files in the
    nsynth naming format. Entries that are not regular files (for example
    sub-directories) are skipped instead of being handed to the audio loader.

    The file list is shuffled before loading. The files are stored grouped by
    class, so without the shuffle a small test run over the first 100 or so
    files would only ever see one class.

    For each file, the flattened MFCC feature vector and the instrument label
    are collected and appended to the result lists.

    Args:
      directory: Directory containing the audio files.
      num_features: Number of MFCC coefficients passed to grab_features.

    Returns:
      A tuple (X, y, elapsed, error_msg):
        X: numpy array with one feature vector per file.
        y: numpy array with one label per file.
        elapsed: Wall-clock seconds spent, as a float.
        error_msg: None on success, otherwise a description of the problem
          (in which case X and y are empty arrays).
    '''
    start = time.time()

    # Report a bad directory through the error slot, keeping the same
    # return types as the success path.
    if not os.path.isdir(directory):
        return np.asarray([]), np.asarray([]), float(0), "'%s' is not a directory" % directory

    # Full paths of the regular files. listdir order is arbitrary, so sort
    # first to give the shuffle a stable starting point.
    files = [os.path.join(directory, name) for name in sorted(os.listdir(directory))]
    files = [path for path in files if os.path.isfile(path)]
    np.random.shuffle(files)

    # Grab the features and label for each audio file.
    X = []
    y = []
    for file in files:
        X.append(grab_features(file=file, num_features=num_features))
        y.append(grab_label(file=file))

    elapsed = time.time() - start
    return np.asarray(X), np.asarray(y), elapsed, None

# Load the data

In [13]:
# Training set took around 45 minutes

# Defaults, overwritten by whichever branch below runs.
X = np.asarray([])
y = np.asarray([])
elapsed = float(0)
error_msg = None

if LOAD_PICKLE:
  # NOTE: pickle.load can execute arbitrary code. Only point X_PICKLE and
  # Y_PICKLE at files this notebook wrote itself.
  start_time = time.time()
  with open(X_PICKLE, 'rb') as f:
    X = pickle.load(f)
  with open(Y_PICKLE, 'rb') as f:
    y = pickle.load(f)

  elapsed = time.time() - start_time
else:
  X, y, elapsed, error_msg = load(directory=TRAIN_BASE_DIRECTORY, num_features=NUM_MFCC_FEATURES)

  # Only refresh the cache after a successful load; otherwise a failed run
  # would overwrite a good cache with empty arrays.
  if error_msg is None:
    with open(X_PICKLE, 'wb') as f:
      pickle.dump(X, f)
      print("Pickled X.")

    with open(Y_PICKLE, 'wb') as f:
      pickle.dump(y, f)
      print("Pickled y.")

if error_msg is None:
  print("Loaded %d files in %f seconds." % (len(X), elapsed))
else:
  print("%s" % (error_msg))

Pickled X.
Pickled y.
Loaded 289205 files in 2751.667837 seconds.


In [14]:
# Hold out TRAIN_TEST_SPLIT of the samples for testing. The fixed random_state
# makes the split (and every metric computed from it) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=TRAIN_TEST_SPLIT,
  random_state=42
)

In [15]:
# Show the shape of each piece of the split, training pair first.
for split_name, split_array in (
  ("X_train", X_train),
  ("y_train", y_train),
  ("X_test", X_test),
  ("y_test", y_test),
):
  print("%s:" % split_name, split_array.shape)

X_train: (260284, 5040)
y_train: (260284,)
X_test: (28921, 5040)
y_test: (28921,)


# Run some classifiers from SKLearn

In [16]:
# Models to compare. Every estimator with a stochastic component gets a fixed
# random_state so that repeated runs produce the same scores.
# KNeighborsClassifier takes no random_state.
classifiers = [
  DecisionTreeClassifier(random_state=42),
  MLPClassifier(alpha=1, max_iter=1000, random_state=42),
  RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, random_state=42),
  AdaBoostClassifier(random_state=42),
  KNeighborsClassifier()
]

In [None]:
# Fit each model on the training split, then print the model followed by its
# per-class precision / recall / F1 on the held-out test split.
for model in classifiers:
  model.fit(X_train, y_train)
  y_hat = model.predict(X_test)

  # zero_division=0 reports 0 for classes that were never predicted.
  summary = metrics.classification_report(y_test, y_hat, zero_division=0)

  print(model)
  print(summary, "\n\n")

DecisionTreeClassifier()
              precision    recall  f1-score   support

        bass       0.92      0.91      0.92      6669
       brass       0.92      0.93      0.92      1266
       flute       0.90      0.90      0.90       927
      guitar       0.86      0.85      0.85      3310
    keyboard       0.88      0.89      0.89      4984
      mallet       0.88      0.87      0.87      3347
       organ       0.95      0.95      0.95      3432
        reed       0.91      0.93      0.92      1458
      string       0.94      0.96      0.95      1918
       synth       0.87      0.86      0.87       554
       vocal       0.92      0.95      0.93      1056

    accuracy                           0.90     28921
   macro avg       0.90      0.91      0.91     28921
weighted avg       0.90      0.90      0.90     28921
 


MLPClassifier(alpha=1, max_iter=1000)
              precision    recall  f1-score   support

        bass       0.56      0.73      0.63      6669
       brass

In [None]:
# Take a closer look at the decision tree, which is the first entry of
# `classifiers` (index 1 is the MLP).
dt = classifiers[0]
dt.fit(X_train, y_train)
predictions = dt.predict(X_test)

print(X_test.shape)

In [None]:
# Confusion matrix of `dt` on the test split.
# metrics.plot_confusion_matrix was removed from newer scikit-learn releases
# in favour of ConfusionMatrixDisplay.from_estimator, so prefer the latter
# and fall back to the old function only when it is not available.
if hasattr(metrics.ConfusionMatrixDisplay, "from_estimator"):
  cm = metrics.ConfusionMatrixDisplay.from_estimator(
    dt, X_test, y_test, xticks_rotation="vertical"
  )
else:
  cm = metrics.plot_confusion_matrix(dt, X_test, y_test, xticks_rotation="vertical")
plt.show()

In [None]:
# Print the predicted and the true label for the first few test samples.
# min() keeps the loop valid when fewer than 5 test samples exist.
for i in range(min(5, len(X_test))):
  sample = X_test[i:i+1]  # slice, not index, to keep the 2D shape
  idx = np.argmax(dt.predict_proba(sample))
  # predict_proba columns follow dt.classes_, so read the label from the
  # fitted model rather than from a hand-maintained list.
  instru = dt.classes_[idx]
  print("Predicted: %s, Real: %s" % (instru, y_test[i]))