## Parse TfRecord

In [1]:
import tensorflow as tf

In [2]:
def parse_tfrecord(file):
    feature_description = {
        'note': tf.io.FixedLenFeature([], tf.int64),
        'note_str': tf.io.FixedLenFeature([], tf.string),
        'instrument': tf.io.FixedLenFeature([], tf.int64),
        'instrument_str': tf.io.FixedLenFeature([], tf.string),
        'pitch': tf.io.FixedLenFeature([], tf.int64),
        'velocity': tf.io.FixedLenFeature([], tf.int64),
        'sample_rate': tf.io.FixedLenFeature([], tf.int64),
        'audio': tf.io.FixedLenFeature([64000], tf.float32), # 4 seconds at 16kHz
        'qualities': tf.io.FixedLenFeature([10], tf.int64),
        'qualities_str': tf.io.VarLenFeature(tf.string),
        'instrument_family': tf.io.FixedLenFeature([], tf.int64),
        'instrument_family_str': tf.io.FixedLenFeature([], tf.string),
        'instrument_source': tf.io.FixedLenFeature([], tf.int64),
        'instrument_source_str': tf.io.FixedLenFeature([], tf.string)
    }
    
    return tf.io.parse_single_example(file, feature_description)

## Extract Features and Process Dataset

In [3]:
from tqdm import tqdm
import numpy as np

### Extracting the Features

For most of the features, I used all integer values of their representations, and including the entire feature set of qualities (binary). 

In [4]:
SR = 16000
FFT_SIZE = 1024
HOP = 256
N_MELS = 128

In [5]:
mel_mat = tf.signal.linear_to_mel_weight_matrix(
    num_mel_bins   = N_MELS,
    num_spectrogram_bins = FFT_SIZE // 2 + 1,
    sample_rate    = SR,
    lower_edge_hertz  = 30.0,
    upper_edge_hertz  = SR/2.0
)

In [6]:
def extract_features(example):
    # audio features - extract statistical features from raw audio with mel spectrograms
    # raw audio is too high dimensional for naive bayes at 16kHz, so must use its statistics with timeframes as features
    audio = tf.cast(example['audio'], tf.float32)
    
    # compute stft
    stft = tf.signal.stft(audio, frame_length=FFT_SIZE, frame_step=HOP, fft_length=FFT_SIZE)
    spectogram = tf.abs(stft)
    mel_spec = tf.tensordot(spectogram, mel_mat, axes=1)
    mel_spec.set_shape(spectogram.shape[:-1].concatenate(mel_mat.shape[-1:]))
    
    # simple log normalization
    log_mel_spec = tf.math.log(mel_spec + 1e-6)
    
    # reduce dimensionality (mean over time axis)
    mel = tf.reduce_mean(log_mel_spec, axis=0)

    # Final 1D vector: concatenate instead of stack to avoid shape issues
    feature_vector = tf.concat([
        tf.cast([example['note'],
                 example['pitch'],
                 example['velocity'],
                 example['sample_rate'],
                 example['instrument_source']], tf.float32),
        mel                                           # 128-D
    ], axis=0)
    
    return feature_vector, example['instrument_family']

In [7]:
dataset_sizes = {
    'train': 289205,
    'valid': 12678,
    'test': 4096
}

In [8]:
def process_dataset(path):
    dataset = tf.data.TFRecordDataset(path)
    
    count = 0
    
    for split, size in dataset_sizes.items():
        if split in path:
            count = size
    print(f"Found {count} examples in dataset")
    
    # `map` with parallelization
    dataset = dataset.map(parse_tfrecord, num_parallel_calls=tf.data.AUTOTUNE)
    
    # extract features with parallelization
    dataset = dataset.map(lambda example: extract_features(example), num_parallel_calls=tf.data.AUTOTUNE)
    
    # convert the dataset to a NumPy array
    X, y = [], []
    for features, label in tqdm(dataset, total=count, desc="Loading data..."):
        X.append(features)
        y.append(label)
        
    return X, y

In [9]:
train_tfrecord = '../datasets/nsynth-train.tfrecord'
valid_tfrecord = '../datasets/nsynth-valid.tfrecord'
test_tfrecord = '../datasets/nsynth-test.tfrecord'

In [10]:
X_train, y_train = process_dataset(train_tfrecord)

Found 289205 examples in dataset


Loading data...: 100%|██████████| 289205/289205 [04:50<00:00, 994.25it/s] 


In [11]:
X_val, y_val = process_dataset(valid_tfrecord)

Found 12678 examples in dataset


Loading data...: 100%|██████████| 12678/12678 [00:10<00:00, 1212.24it/s]


In [12]:
X_test, y_test = process_dataset(test_tfrecord)

Found 4096 examples in dataset


Loading data...: 100%|██████████| 4096/4096 [00:03<00:00, 1189.28it/s]


## Training the Model

In [13]:
import time
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB

In [20]:
# from hw5
def train_model(model, X_train, y_train, model_name):
    print(f"\nTraining {model_name}...")
    start_time = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - start_time
    print(f"Training completed in {training_time:.2f} seconds")
    
    return model

In [21]:
print("Training Naive Bayes model...")

nb_model = GaussianNB()
nb_model = train_model(
    nb_model, X_train, y_train, "Naive Bayes"
)

Training Naive Bayes model...

Training Naive Bayes...
Training completed in 3.52 seconds


### Evaluate Model on Validation Set

In [22]:
train_pred = nb_model.predict(X_val)

In [23]:
print(classification_report(y_val, train_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2638
           1       0.00      0.00      0.00       886
           2       0.22      0.23      0.22       470
           3       0.00      0.00      0.00      2081
           4       0.12      0.10      0.11      2404
           5       0.13      0.53      0.21       663
           6       0.24      0.71      0.36      1598
           7       0.00      0.00      0.00       720
           8       0.00      0.00      0.00       814
          10       0.16      0.95      0.27       404

    accuracy                           0.18     12678
   macro avg       0.09      0.25      0.12     12678
weighted avg       0.07      0.18      0.09     12678



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Evaluate Test Set

In [24]:
test_pred = nb_model.predict(X_test)

In [25]:
print(classification_report(y_test, test_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       843
           1       0.00      0.00      0.00       269
           2       0.23      0.19      0.21       180
           3       0.00      0.00      0.00       652
           4       0.12      0.10      0.11       766
           5       0.13      0.52      0.21       202
           6       0.24      0.73      0.36       502
           7       0.00      0.00      0.00       235
           8       0.00      0.00      0.00       306
          10       0.15      0.91      0.26       141

    accuracy                           0.17      4096
   macro avg       0.09      0.24      0.11      4096
weighted avg       0.07      0.17      0.09      4096



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
