# Imports

In [1]:
import os
import time

import librosa
import pickle
import numpy as np

from google.colab import drive

from sklearn import svm, metrics
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split

# Mount Google Drive

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


# Constants

In [3]:
BASE_DIRECTORY = "drive/MyDrive/nsynth-test/audio"
NUM_MFCC_FEATURES = 40

LOAD_PICKLE = True
X_PICKLE = "./drive/MyDrive/nsynth-test/pickled/X.pickle"
Y_PICKLE = "./drive/MyDrive/nsynth-test/pickled/y.pickle"

TRAIN_TEST_SPLIT = 0.10

# Helper Methods

In [4]:
def grab_label(file: str) -> str:
    '''
    Grabs the label (the instrument) from the filename of the WAV audio file.

    The file names are something like:
      /path/to/audio/bass_some_other_words.wav

    The first '/' split grabs 'bass_some_other_words.wav'.
    The second '.' split grabs 'bass_some_other_words'.
    The third '_' split grabs 'bass'.

    All of the nsynth audio files are in the same format. If we wanted to get
    fancy, nsynth also has a JSON file explaining each file and what class
    it is from, so I could write a JSON parser instead.
    '''
    return file.split("/")[-1].split(".")[0].split("_")[0]

In [6]:
def grab_features(file: str, num_features: int) -> str:
  '''
  Grabs `num_features` MFCC features from a WAV file.

  The WAV file is loaded in at 16kHz at 4 seconds using librosa.load. Then,
  MFCC features are calculated using librosa.feature.mfcc. For nsynth audio
  files, this loads a matrix of (num_features, 126). 126 is the time.

  For when num_features = 40 it loads a 2D array of (40, 126). We then flatten
  this array to 1D, which gives us a feature vector of 40 * 126 = 5040.
  '''
  sig, sr = librosa.load(
    path=file,
    sr=16_000,
    duration=float(4)
  )

  features = librosa.feature.mfcc(
    y=sig,
    sr=sr,
    n_mfcc=num_features
  )

  return np.asarray(features).flatten()

In [45]:
def load(directory: str, num_features: int):
    '''
    Loads an entire directory of WAV of audio files.

    We load all of the files in the directory. We assume that these files are
    all in the same format and all WAV files (which for our case is fine, since
    they are).

    I then give them a shuffle. This was mainly used for testing, when only
    loading 100 or so files. The files are in order, so the first 100 are
    all of the same class. Shuffling the file list fixes this issue.

    Then, we collect that 5040 feature vector and the label for each audio file,
    and appened it to our lists.
    '''
    start = time.time()
    X = []
    y = []
    
    # Check if the directory passed in is actually a directory
    if os.path.isdir(directory) is False:
        return [], [], 0, "'%s' is not a directory" % directory
    
    # Get a list of files in the directory. They should be full paths.
    files = np.asarray([os.path.join(directory, name) for name in os.listdir("%s" % directory)])
    np.random.shuffle(files)
    
    # Grab the features and label for each audio file.
    for file in files:
        features = grab_features(file=file, num_features=num_features)
        label = grab_label(file=file)
        
        X.append(features)
        y.append(label)
        
    elapsed = time.time() - start
    return np.asarray(X), np.asarray(y), elapsed, None

# Load the data

In [8]:
X = np.asarray([])
y = np.asarray([])
elapsed = float(0)
error_msg = None

if LOAD_PICKLE:
  start_time = time.time()
  with open(X_PICKLE, 'rb') as f:
    X = pickle.load(f)
  with open(Y_PICKLE, 'rb') as f:
    y = pickle.load(f)

  elapsed = time.time() - start_time
else:
  X, y, elapsed, error_msg = load(directory=BASE_DIRECTORY, num_features=NUM_MFCC_FEATURES)
  with open(X_PICKLE, 'wb') as f:
    pickle.dump(X, f)
    print("Pickled X.")

  with open(Y_PICKLE, 'wb') as f:
    pickle.dump(y, f)
    print("Pickled y.")

if error_msg is None:
    print("Loaded %d files in %f seconds." % (len(X), elapsed))
else:
    print("%s" % (error_msg))

Loaded 4096 files in 0.147074 seconds.


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TRAIN_TEST_SPLIT)

In [10]:
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)

print("X_test:", X_test.shape)
print("y_test:", y_test.shape)

X_train: (3686, 5040)
y_train: (3686,)
X_test: (410, 5040)
y_test: (410,)


# Run some classifiers from SKLearn

In [11]:
classifiers = [
  SVC(kernel="linear", C=0.025),
  DecisionTreeClassifier(),
  MLPClassifier(alpha=1, max_iter=1000),
  RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
]

In [12]:
for clf in classifiers:
  clf.fit(X_train, y_train)

  pred = clf.predict(X_test)
  report = metrics.classification_report(
    y_true=y_test,
    y_pred=pred
  )

  print(clf)
  print(report, "\n\n")

SVC(C=0.025, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
              precision    recall  f1-score   support

        bass       0.95      0.99      0.97        79
       brass       0.87      0.93      0.90        28
       flute       1.00      1.00      1.00        14
      guitar       0.97      0.97      0.97        65
    keyboard       0.99      0.97      0.98        68
      mallet       0.96      0.96      0.96        24
       organ       1.00      1.00      1.00        57
        reed       0.96      0.82      0.88        28
      string       1.00      1.00      1.00        33
       vocal       1.00      1.00      1.00        14

    accuracy                           0.97       410
   macro avg       0.97      0.96      0.97       410
weighted avg       0.97      0.97      0.

  _warn_prf(average, modifier, msg_start, len(result))
