In [1]:
import csv
import datetime
import h5py
from sklearn.externals import joblib
import librosa
import numpy as np
import os
import pandas as pd
import pickle
import sklearn.svm
import skm
import soundfile as sf
import sys
import time

sys.path.append("../src")
import localmodule


# Notebook-wide configuration. Every name below is a global that the
# following cells read.
data_dir = localmodule.get_data_dir()
dataset_name = localmodule.get_dataset_name()
patch_width = 32            # number of logmelspec columns per patch
n_patches_per_clip = 1
aug_str = instanced_aug_str = "original"
log2Cs = range(-10, 10)     # base-2 exponents of the SVM parameter C


# Arguments are hard-coded here as a list of strings: the test unit
# followed by the trial index.
args = ["unit01", "0"]
test_unit_str, trial_id = args[0], int(args[1])


# Pick the first fold whose test set (element 0) contains test_unit_str.
# A fold is indexed as (test units, training units, validation units).
folds = localmodule.fold_units()
fold = [candidate for candidate in folds if test_unit_str in candidate[0]][0]
test_units, training_units, validation_units = fold[0], fold[1], fold[2]


# Print header: wall-clock start, run identification, library versions.
start_time = int(time.time())
print("".join([str(datetime.datetime.now()), " Start."]))
print("".join(["Training SVM for ", dataset_name, " clips."]))
print("".join(["Test Unit: ", test_unit_str, "."]))
print("".join(["Trial ID: ", str(trial_id), "."]))
print("")
for version_fmt, lib in (
        ("h5py version: {:s}", h5py),
        ("librosa version: {:s}", librosa),
        ("numpy version: {:s}", np),
        ("skm version: {:s}", skm)):
    print(version_fmt.format(lib.__version__))
print("")


# Input folder layout: <data_dir>/<dataset>_skm-logmelspec/<augmentation>.
logmelspec_name = "_".join((dataset_name, "skm-logmelspec"))
logmelspec_dir = os.path.join(data_dir, logmelspec_name)
aug_dir = os.path.join(data_dir, logmelspec_name, aug_str)


# Initialize matrix of training data.
# X_train collects one raveled time-frequency patch per clip;
# y_train collects the matching integer labels.
X_train = []
y_train = []

# Loop over training units.
for train_unit_str in training_units:

    # Locate the HDF5 container of logmelspecs for this unit.
    hdf5_name = "_".join([dataset_name, instanced_aug_str, train_unit_str])
    in_path = os.path.join(aug_dir, hdf5_name + ".hdf5")

    # Open read-only and close the file as soon as the unit is processed.
    # Without an explicit mode, older h5py releases open in append mode,
    # which creates an empty container when the path is wrong.
    with h5py.File(in_path, "r") as in_file:
        logmelspec_group = in_file["logmelspec"]

        # List clips.
        clip_names = list(logmelspec_group.keys())

        # Loop over clips.
        # Only the first 100 clips of each unit are used; the author marked
        # this truncation with "UPDATE".
        for clip_name in clip_names[:100]: #UPDATE
            # Read label: fourth underscore-separated field of the clip name.
            y_clip = int(clip_name.split("_")[3])

            # Load logmelspec into memory. `[()]` replaces the deprecated
            # `.value` attribute, which was removed in h5py 3.
            logmelspec = logmelspec_group[clip_name][()]

            # Center the run of n_patches_per_clip patches on the middle
            # column of the logmelspec.
            logmelspec_width = logmelspec.shape[1]
            logmelspec_mid = np.round(logmelspec_width * 0.5).astype('int')
            logmelspec_start = logmelspec_mid -\
                np.round(patch_width * n_patches_per_clip * 0.5).astype('int')

            # Extract patch (all rows, patch_width columns).
            patch_start = logmelspec_start
            patch_stop = patch_start + patch_width
            patch = logmelspec[:, patch_start:patch_stop]

            # Ravel patch.
            X_train.append(np.ravel(patch))

            # Append label.
            y_train.append(y_clip)


# Concatenate raveled patches as rows.
X_train = np.stack(X_train)


# Locate the SKM model of this test unit and trial:
#   <models_dir>/skm-cv/<test unit>/trial-<id>/<dataset>_skm-cv_<unit>_trial-<id>_model.pkl
trial_str = "trial-" + str(trial_id)
models_dir = localmodule.get_models_dir()
model_name = "skm-cv"
model_dir = os.path.join(models_dir, model_name)
unit_dir = os.path.join(model_dir, test_unit_str)
trial_dir = os.path.join(unit_dir, trial_str)

# From here on, model_name holds the pickle file name rather than the
# model family name.
model_name = "_".join(
    (dataset_name, model_name, test_unit_str, trial_str, "model.pkl"))
model_path = os.path.join(trial_dir, model_name)

# Load SKM model.
skm_model = skm.SKM(k=256)
skm_model = skm_model.load(model_path)


# Transform training set. The data is transposed on the way in and on the
# way out, so X_train keeps one row per clip.
X_train = np.transpose(skm_model.transform(np.transpose(X_train)))

2017-10-21 23:01:38.528896 Start.
Training SVM for BirdVox-70k clips.
Test Unit: unit01.
Trial ID: 0.

h5py version: 2.6.0
librosa version: 0.5.1
numpy version: 1.13.1
skm version: 0.0.1



In [2]:
# Initialize matrix of validation data.
# X_val collects one raveled time-frequency patch per clip;
# y_val collects the matching integer labels.
X_val = []
y_val = []


# Loop over validation units.
for val_unit_str in validation_units:

    # Locate the HDF5 container of logmelspecs for this unit.
    hdf5_name = "_".join([dataset_name, instanced_aug_str, val_unit_str])
    in_path = os.path.join(aug_dir, hdf5_name + ".hdf5")

    # Open read-only and close the file as soon as the unit is processed.
    # Without an explicit mode, older h5py releases open in append mode,
    # which creates an empty container when the path is wrong.
    with h5py.File(in_path, "r") as in_file:
        logmelspec_group = in_file["logmelspec"]

        # List clips.
        clip_names = list(logmelspec_group.keys())

        # Loop over clips (first 100 clips of each unit only).
        for clip_name in clip_names[:100]:
            # Read label: fourth underscore-separated field of the clip name.
            y_clip = int(clip_name.split("_")[3])

            # Load logmelspec into memory. `[()]` replaces the deprecated
            # `.value` attribute, which was removed in h5py 3.
            logmelspec = logmelspec_group[clip_name][()]

            # Center the run of n_patches_per_clip patches on the middle
            # column of the logmelspec.
            logmelspec_width = logmelspec.shape[1]
            logmelspec_mid = np.round(logmelspec_width * 0.5).astype('int')
            logmelspec_start = logmelspec_mid -\
                np.round(patch_width * n_patches_per_clip * 0.5).astype('int')

            # Extract patch (all rows, patch_width columns).
            patch_start = logmelspec_start
            patch_stop = patch_start + patch_width
            patch = logmelspec[:, patch_start:patch_stop]

            X_val.append(np.ravel(patch))
            y_val.append(y_clip)


# Concatenate raveled patches as rows.
X_val = np.stack(X_val)

# Transform validation set. The data is transposed on the way in and on the
# way out, so X_val keeps one row per clip.
X_val = skm_model.transform(X_val.T).T


# Define CSV file for validation metrics. It is stored next to the models
# of this trial, in trial_dir.
val_metrics_name = "_".join([
    dataset_name,
    "skm-cv",
    test_unit_str,
    trial_str,
    "svm-model",
    "val-metrics.csv"
])
val_metrics_path = os.path.join(
    trial_dir, val_metrics_name)


# Open CSV file and write the header row.
# newline='' lets the csv module control line endings itself, as its
# documentation recommends. The handle is deliberately left open because
# rows are written through csv_writer by a later cell.
csv_file = open(val_metrics_path, 'w', newline='')
csv_writer = csv.writer(csv_file, delimiter=',')
# One column per field of the rows written later: dataset, test unit,
# trial index, exponent of C, and a score formatted with two decimals
# (presumably the validation accuracy in percent -- confirm).
csv_header = [
    "Dataset",
    "Test unit",
    "Trial ID",
    "log2(C)",
    "Validation accuracy (%)"
]
csv_writer.writerow(csv_header)


# Loop over C (regularization parameter).
# For each exponent in log2Cs, an RBF-kernel SVM with C = 2**log2C is trained
# on the training set, saved to trial_dir, and scored on the validation set.
val_accs = []
for log2C in log2Cs:


    # Define SVM.
    svc = sklearn.svm.SVC(
        C=2.0**log2C,
        kernel='rbf',
        degree=3,
        gamma='auto',
        coef0=0.0,
        shrinking=True,
        probability=False,
        tol=0.001,
        cache_size=200,
        class_weight=None,
        verbose=False,
        max_iter=-1,
        random_state=None)


    # Train classifier.
    svc.fit(X_train, y_train)


    # Save model.
    # The exponent is rendered with an explicit sign and at least two
    # digits ("-10", "+00", "+09"). The sign must come from log2C itself:
    # testing a constant would give -k and +k the same file name, so one
    # model would overwrite the other.
    log2C_str = "{:+03d}".format(log2C)
    svm_name = "_".join([
        dataset_name,
        "skm-cv",
        test_unit_str,
        trial_str,
        "svm-model",
        "log2C-(" + log2C_str + ").pkl"
    ])
    svm_path = os.path.join(trial_dir, svm_name)
    joblib.dump(svc, svm_path)

    # Print validation score (mean accuracy on the validation set).
    val_acc = svc.score(X_val, y_val)
    val_accs.append(val_acc)
    print("C = {:10.6f}; acc = {:5.2f}%".format(2.0**log2C, 100*val_acc))


# Compute best C. np.argmax returns the first maximum, so ties are resolved
# in favor of the smallest C.
val_accs = np.array(val_accs)
best_acc = np.max(val_accs)
best_log2C = log2Cs[np.argmax(val_accs)]


# Print best C.
print("")
print("Best: C = {:10.6f}; acc = {:5.2f}%".format(
    2.0**best_log2C, 100*best_acc))



C =   0.000977; acc = 30.00%
C =   0.001953; acc = 30.00%
C =   0.003906; acc = 30.00%
C =   0.007812; acc = 30.00%
C =   0.015625; acc = 30.00%
C =   0.031250; acc = 30.00%
C =   0.062500; acc = 30.00%
C =   0.125000; acc = 37.50%
C =   0.250000; acc = 52.50%
C =   0.500000; acc = 78.50%
C =   1.000000; acc = 72.00%
C =   2.000000; acc = 69.00%
C =   4.000000; acc = 69.00%
C =   8.000000; acc = 69.00%
C =  16.000000; acc = 69.00%
C =  32.000000; acc = 69.00%
C =  64.000000; acc = 69.00%
C = 128.000000; acc = 69.00%
C = 256.000000; acc = 69.00%
C = 512.000000; acc = 69.00%

Best: C =   0.500000; acc = 78.50%


In [12]:
# Write one row per value of C: dataset, test unit, trial index, exponent of
# C, and validation accuracy in percent with two decimals.
# NOTE(review): this cell previously read undefined names and called
# format() without an argument, so it could not run. Writing every
# (log2C, accuracy) pair of the grid search is the assumed intent -- confirm.
for log2C, val_acc in zip(log2Cs, val_accs):
    row = [
        dataset_name,
        test_unit_str,
        str(trial_id),
        str(log2C),
        "{:5.2f}".format(100 * val_acc)
    ]
    csv_writer.writerow(row)

# Push the rows to disk; the file handle itself stays open.
csv_file.flush()

In [38]:

# Rebuild the trial sub-directory of unit_dir from the trial index.
trial_str = "".join(["trial-", str(trial_id)])
trial_dir = os.path.join(unit_dir, trial_str)

In [29]:
# Scratch cell: stores np.sign(log10C), a number rather than a string, in
# log10C_str.
# NOTE(review): `log10C` is not defined in any cell visible here (the grid
# search uses `log2C`), so this presumably raises NameError on a fresh
# kernel -- confirm whether this cell is still needed.
log10C_str = np.sign(log10C)

In [35]:
from sklearn.externals import joblib

In [11]:
import pandas as pd