In [17]:
import datetime
import h5py
import librosa
import numpy as np
import os
import sys
import time

import sys
sys.path.append('../src')
import localmodule


# Define constants: dataset-wide settings come from the project's localmodule.
data_dir = localmodule.get_data_dir()
dataset_name = localmodule.get_dataset_name()
sample_rate = localmodule.get_sample_rate()

# Job arguments, in order: augmentation kind, instance index, recording unit.
args = ["original", "0", "unit01"]
aug_str, instance_str, unit_str = args

# Round-trip the instance index through int so its string form is normalized.
instance_id = int(instance_str)
instance_str = str(instance_id)

# The "original" augmentation carries no instance suffix; any other
# augmentation is tagged with its instance index.
instanced_aug_str = (
    aug_str if aug_str == "original"
    else "-".join([aug_str, instance_str]))

pcen_settings = localmodule.get_pcen_settings()


# Print header: start timestamp, job description, and library versions.
start_time = int(time.time())
header_lines = [
    str(datetime.datetime.now()) + " Start.",
    "Computing per-channel energy normalization (PCEN) for " +
    dataset_name + " clips, with domain-specific librosa parameters.",
    "Unit: " + unit_str + ".",
    "Augmentation: " + instanced_aug_str + ".",
    "",
    "h5py version: {:s}".format(h5py.__version__),
    "librosa version: {:s}".format(librosa.__version__),
    "",
]
# One print of the newline-joined lines emits the same text as one print
# call per line.
print("\n".join(header_lines))


# Open HDF5 container of waveforms (read-only).
# Path layout: <data_dir>/<dataset>_hdf5/<aug>/<dataset>_<aug[-instance]>_<unit>.hdf5
hdf5_dataset_name = "_".join([dataset_name, "hdf5"])
hdf5_dir = os.path.join(data_dir, hdf5_dataset_name)
in_aug_dir = os.path.join(hdf5_dir, aug_str)
hdf5_name = "_".join([dataset_name, instanced_aug_str, unit_str])
in_path = os.path.join(in_aug_dir, hdf5_name + ".hdf5")
in_file = h5py.File(in_path, "r")

# Use the sample rate stored alongside the waveforms; this overrides the
# default obtained from localmodule. `Dataset.value` is deprecated (and
# removed in h5py 3.x); indexing with `[()]` reads the whole dataset instead.
sample_rate = in_file["sample_rate"][()]


# Create HDF5 container of PCENs.
# Path layout: <data_dir>/<dataset>_clip-pcen/<aug>/<same file name as input>
pcen_name = "_".join([dataset_name, "clip-pcen"])
pcen_dir = os.path.join(data_dir, pcen_name)
os.makedirs(pcen_dir, exist_ok=True)
out_aug_dir = os.path.join(pcen_dir, aug_str)
os.makedirs(out_aug_dir, exist_ok=True)
out_path = os.path.join(out_aug_dir, hdf5_name + ".hdf5")

# Delete any output left over from a previous run so that we always start
# from a fresh file; a missing file is not an error.
try:
    os.remove(out_path)
except FileNotFoundError:
    pass

# Open explicitly in write mode. Relying on the implicit default mode is not
# portable: h5py 3.x defaults to read-only, which fails on a missing file.
out_file = h5py.File(out_path, "w")


# Copy over metadata.
# Job-level fields come from the constants defined above; recording-level
# fields (start time, GPS coordinates) are copied from the input container.
# Datasets are read with `[()]` because `Dataset.value` is deprecated (and
# removed in h5py 3.x).
out_file["dataset_name"] = dataset_name
out_file["unit"] = unit_str
out_file["augmentation"] = aug_str
out_file["instance"] = instance_id
out_file["utc_start_time"] = in_file["utc_start_time"][()]
gps_group = out_file.create_group("gps_coordinates")
gps_group["latitude"] = in_file["gps_coordinates"]["latitude"][()]
gps_group["longitude"] = in_file["gps_coordinates"]["longitude"][()]

# Record the spectrogram settings used to compute the PCEN features.
settings_group = out_file.create_group("pcen_settings")
for settings_key in (
        "fmax", "fmin", "hop_length", "n_fft",
        "n_mels", "sr", "win_length", "window"):
    settings_group[settings_key] = pcen_settings[settings_key]


# These domain-specific parameters have shown to Gaussianize PCEN magnitudes.
settings_group["bias"] = 2.0
settings_group["time_constant"] = 0.015
settings_group["gain"] = 0.95
settings_group["power"] = 0.5
settings_group["eps"] = 1e-6


# List clips.
# NOTE: despite its name, `lms_group` is the output group named "pcen" and
# receives the PCEN features computed below.
lms_group = out_file.create_group("pcen")
clip_names = list(in_file["waveforms"].keys())

# Only the first clip is processed here.
clip_name = clip_names[0]


# Load waveform. `[()]` reads the full dataset; `Dataset.value` is deprecated
# (and removed in h5py 3.x).
waveform = in_file["waveforms"][clip_name][()]

# Resample from the recording's sample rate to the rate in the PCEN settings.
# The rates are passed by keyword: recent librosa releases no longer accept
# them positionally, and the keyword names are valid in older releases too.
waveform = librosa.resample(
    waveform, orig_sr=sample_rate, target_sr=pcen_settings["sr"])

# Compute Short-Term Fourier Transform (STFT).
stft = librosa.stft(
    waveform,
    n_fft=pcen_settings["n_fft"],
    win_length=pcen_settings["win_length"],
    hop_length=pcen_settings["hop_length"],
    window=pcen_settings["window"])

# Compute squared magnitude coefficients.
abs2_stft = (stft.real*stft.real) + (stft.imag*stft.imag)

# Gather frequency bins according to the Mel scale.
melspec = librosa.feature.melspectrogram(
    y=None,
    S=abs2_stft,
    sr=pcen_settings["sr"],
    n_fft=pcen_settings["n_fft"],
    n_mels=pcen_settings["n_mels"],
    htk=True,
    fmin=pcen_settings["fmin"],
    fmax=pcen_settings["fmax"])

# PCEN with librosa. Parameters are read back from the settings group of the
# output file, so the values used are exactly the values stored.
pcen = librosa.core.pcen(
    melspec,
    sr=settings_group["sr"][()],
    hop_length=settings_group["hop_length"][()],
    gain=settings_group["gain"][()],
    bias=settings_group["bias"][()],
    power=settings_group["power"][()],
    time_constant=settings_group["time_constant"][()],
    eps=settings_group["eps"][()])

# Convert to single floating-point precision (32 bits).
pcen = pcen.astype('float32')

# Save.
lms_group[clip_name] = pcen

# Push buffered data to disk. The file handles are deliberately left open
# because later cells still read from `settings_group`.
out_file.flush()

2018-05-23 19:27:34.900854 Start.
Computing per-channel energy normalization (PCEN) for BirdVox-70k clips, with domain-specific librosa parameters.
Unit: unit01.
Augmentation: original.

h5py version: 2.6.0
librosa version: 0.6.1rc0



In [20]:
# Display the float32 PCEN array computed above for the first clip.
pcen

array([[ 1.34249306,  1.06092572,  0.76078379, ...,  0.183127  ,
         0.120834  ,  0.03413041],
       [ 1.23717797,  1.26881146,  1.03282702, ...,  0.14260581,
         0.14249004,  0.10565899],
       [ 1.08494735,  1.39973581,  1.22895682, ...,  0.10703778,
         0.17413975,  0.19799085],
       ..., 
       [ 0.50533897,  0.33478567,  0.13175617, ...,  0.09990725,
         0.15272652,  0.18127421],
       [ 0.43410811,  0.28439787,  0.09732446, ...,  0.09427849,
         0.13297468,  0.11821666],
       [ 0.44761154,  0.29593226,  0.1085735 , ...,  0.14005187,
         0.33890489,  0.43145478]], dtype=float32)

In [16]:
# Read back the stored PCEN "power" setting. `[()]` replaces the deprecated
# `Dataset.value` accessor (removed in h5py 3.x).
settings_group["power"][()]

0.5