Modified from CREPE  

CREPE: A Convolutional Representation for Pitch Estimation  
Jong Wook Kim, Justin Salamon, Peter Li, Juan Pablo Bello.  
Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), 2018.

In [None]:
import numpy as np
import librosa
import os
from resampy import resample


In [None]:
# Capacity name used by build_and_load_model to pick the filter multiplier
# and the weights file ("model-<capacity>.h5").
MODEL_CAPACITY = 'full'

# Sample rate that get_activation requires of its input audio.
model_srate = 16000

# 360 values spaced 20 apart (0 .. 7180), shifted by a constant offset;
# to_local_average_cents indexes this table with the activation bin numbers.
CENTS_MAPPING = 1997.3794084376191 + np.linspace(0, 7180, num=360)

In [None]:
# Load both recordings; `sr` ends up holding the rate reported for the
# second file, which is then used as the source rate for both clips.
raw_a, sr = librosa.load('a.wav')
raw_i, sr = librosa.load('i.wav')
print('sr\'', sr)

# Resample each clip to the model rate and keep only the first 1024 samples.
raw_a, raw_i = (
    resample(clip, sr, model_srate)[:1024] for clip in (raw_a, raw_i)
)

# From here on the clips are at the model's sample rate.
sr = model_srate

In [None]:
def build_and_load_model():
    """Build the CREPE convolutional network and load pretrained weights.

    The network is six Conv2D -> BatchNormalization -> MaxPool2D -> Dropout
    stages followed by a 360-unit sigmoid classifier. Filter counts scale
    with the module-level ``MODEL_CAPACITY`` setting.

    Weights are read from ``model-<MODEL_CAPACITY>.h5``. The original
    hard-coded install directory is used when the file exists there;
    otherwise the directory of the installed ``crepe`` package is tried.
    If neither location holds the file, the original path is passed to
    ``load_weights`` so the failure is the same as before.

    Returns:
        A compiled Keras ``Model`` taking ``(batch, 1024)`` float32 input
        and producing ``(batch, 360)`` sigmoid activations.

    Raises:
        KeyError: if ``MODEL_CAPACITY`` is not one of the known capacities.
    """
    import importlib.util

    from tensorflow.keras.layers import Input, Reshape, Conv2D, BatchNormalization
    from tensorflow.keras.layers import MaxPool2D, Dropout, Permute, Flatten, Dense
    from tensorflow.keras.models import Model

    capacity_multiplier = {
        'tiny': 4, 'small': 8, 'medium': 16, 'large': 24, 'full': 32
    }[MODEL_CAPACITY]

    layers = [1, 2, 3, 4, 5, 6]
    filters = [n * capacity_multiplier for n in [32, 4, 4, 4, 8, 16]]
    widths = [512, 64, 64, 64, 64, 64]
    # Only the first convolution is strided.
    strides = [(4, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1)]

    x = Input(shape=(1024,), name='input', dtype='float32')
    # Treat the 1-D frame as a (1024, 1) image with a single channel so the
    # 2-D layers below can be used with (w, 1) kernels.
    y = Reshape(target_shape=(1024, 1, 1), name='input-reshape')(x)

    for index, n_filters, width, stride in zip(layers, filters, widths, strides):
        y = Conv2D(n_filters, (width, 1), strides=stride, padding='same',
                   activation='relu', name="conv%d" % index)(y)
        y = BatchNormalization(name="conv%d-BN" % index)(y)
        y = MaxPool2D(pool_size=(2, 1), strides=None, padding='valid',
                      name="conv%d-maxpool" % index)(y)
        y = Dropout(0.25, name="conv%d-dropout" % index)(y)

    y = Permute((2, 1, 3), name="transpose")(y)
    y = Flatten(name="flatten")(y)
    y = Dense(360, activation='sigmoid', name="classifier")(y)

    model = Model(inputs=x, outputs=y)

    filename = "model-{}.h5".format(MODEL_CAPACITY)

    # Original location; kept as the first choice so setups that already
    # work keep loading exactly the same file.
    package_dir = 'D:\\Programs\\Anaconda\\Lib\\site-packages\\crepe'
    if not os.path.isfile(os.path.join(package_dir, filename)):
        # Fall back to wherever the crepe package is installed, without
        # importing it.
        try:
            spec = importlib.util.find_spec('crepe')
        except (ImportError, ValueError):
            spec = None
        if spec is not None and spec.origin:
            installed_dir = os.path.dirname(spec.origin)
            if os.path.isfile(os.path.join(installed_dir, filename)):
                package_dir = installed_dir

    model.load_weights(os.path.join(package_dir, filename))
    model.compile('adam', 'binary_crossentropy')

    return model

# Build the network and load its weights once; get_activation reads this
# module-level `model` for every prediction.
model = build_and_load_model()

In [None]:
def get_activation(y, sr):
    """Run the model on a single 1024-sample frame and return its output.

    Args:
        y: one-dimensional sequence of exactly 1024 audio samples. The data
            is copied before normalization, so the caller's array is left
            untouched.
        sr: sample rate of ``y``; must equal ``model_srate``.

    Returns:
        The first row of ``model.predict`` for the normalized frame, i.e.
        the raw per-bin activations (no conversion to frequency is done
        here).

    Raises:
        ValueError: if ``sr`` differs from ``model_srate`` or ``y`` is not
            a 1-D sequence of 1024 samples.
    """
    if sr != model_srate:
        raise ValueError(
            'Sample rate incorrect: expected {}, got {}'.format(model_srate, sr))

    # Work on a float32 copy: the in-place normalization below must not
    # modify the caller's audio, and integer input must still be divisible.
    frame = np.array(y, dtype=np.float32)
    if frame.shape != (1024,):
        raise ValueError(
            'Expected a 1-D frame of 1024 samples, got shape {}'.format(
                frame.shape))

    # normalize each frame -- this is expected by the model
    frame -= np.mean(frame)
    # Clip the deviation so a silent (constant) frame yields zeros rather
    # than NaN from a division by zero.
    frame /= np.clip(np.std(frame), 1e-8, None)

    # run prediction on a batch of one and return that single row
    return model.predict(frame[None, :])[0]


In [None]:
def to_local_average_cents(salience):
    """Return the salience-weighted mean of CENTS_MAPPING around the peak.

    The averaging window covers up to four bins on each side of the
    strongest bin and is truncated at the ends of ``salience``.
    """
    peak = int(np.argmax(salience))
    window = slice(max(0, peak - 4), min(len(salience), peak + 5))

    weights = salience[window]
    weighted_total = np.sum(weights * CENTS_MAPPING[window])
    return weighted_total / np.sum(weights)


In [None]:
def predict(audio, sr):
    """Estimate the pitch of one audio frame.

    Returns a ``(frequency, confidence)`` tuple: the frequency is derived
    from the locally averaged cents value as ``10 * 2 ** (cents / 1200)``,
    and the confidence is the largest activation returned by the model.
    """
    salience = get_activation(audio, sr)
    cents = to_local_average_cents(salience)
    return 10 * 2 ** (cents / 1200), salience.max()

In [None]:
# Print the (frequency, confidence) estimate for each prepared clip.
for clip in (raw_a, raw_i):
    print(predict(clip, sr))