In [1]:
# One-time environment setup: uncomment and run the lines below only on a fresh machine.
# !pip install python_speech_features
# !pip install librosa
# !wget https://3f7xrg.bl.files.1drv.com/y4mzCOdmCDMRHErLHsESWkD0rmmY1j9ca3CfhfCpv6poE3j-0dZd9HmKVC3k0LWif3I2XgyC1tErV8SrVr1mJNVNHYPmU_qqNvvZVBhOijBfsdwWaYVs6Zd4QzsC4HaljGNbTWwtnQ-JrWog9EB0DbblDlKlNBYxcroYpLW9_qrHX7Ub2XEnYVcZ1gqMptzr3Us9Jj66IdrWRLoaYK_FJWiRQ -O dataset.zip
# !unzip -P Ymj26Uv5 dataset.zip
# !pip uninstall keras
# !pip install keras --upgrade

In [1]:
import os
import numpy as np
import pandas as pd
import wave
import librosa
from python_speech_features import *
import sys
import pickle
import tensorflow.compat.v1 as tf
import import_files.vggish.vggish_input as vggish_input  
import import_files.vggish.vggish_params as vggish_params  
import import_files.vggish.vggish_postprocess as vggish_postprocess  
import import_files.vggish.vggish_slim as vggish_slim
import import_files.loupe_keras as lpk

In [2]:
# Switch the TF1-compat API to eager mode so tensors expose .numpy().
tf.enable_eager_execution()
# NOTE(review): this variable is usually only honoured when it is set before
# tensorflow is imported -- confirm it has the intended effect here.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

# Absolute path of the working directory; root for the saved feature files.
prefix = os.path.abspath(os.getcwd())

# VGGish checkpoint and PCA parameters; both files must exist under
# ./import_files/vggish relative to the working directory.
checkpoint_path = os.path.join(os.getcwd(), './import_files/vggish/vggish_model.ckpt')
pca_params_path = os.path.join(os.getcwd(), './import_files/vggish/vggish_pca_params.npz')

# NetVLAD cluster count; the descriptor size used below is cluster_size * 16.
cluster_size = 16
# Running shortest / longest clip duration, updated by extract_features.
# They start at sentinel values so the first real clip replaces them.
min_len, max_len = 100, -1




In [3]:
def to_vggish_embedds(x, sr):
    """Compute post-processed VGGish embeddings for one waveform.

    Args:
        x: input audio waveform.
        sr: sample rate of ``x``.

    Returns:
        The post-processed embedding batch, cast to a float32 tensor.
    """
    examples = vggish_input.waveform_to_examples(x, sr)

    # Build the VGGish graph in a fresh tf.Graph and run it in its own session.
    with tf.Graph().as_default():
        with tf.Session() as session:
            vggish_slim.define_vggish_slim()
            vggish_slim.load_vggish_slim_checkpoint(session, checkpoint_path)

            graph = session.graph
            input_node = graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
            output_node = graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME)
            raw_embeddings = session.run(output_node, feed_dict={input_node: examples})

    # Postprocess the results to produce whitened quantized embeddings.
    postprocessor = vggish_postprocess.Postprocessor(pca_params_path)
    whitened = postprocessor.postprocess(raw_embeddings)

    return tf.cast(whitened, dtype='float32')

In [4]:
def wav2vlad(wave_data, sr):
    """Turn a waveform into a NetVLAD descriptor of its log-mel spectrogram.

    Args:
        wave_data: audio samples passed to librosa as ``y``.
        sr: sample rate of ``wave_data``.

    Returns:
        The value of ``.numpy()`` on the NetVLAD layer output; the layer is
        configured with ``output_dim = cluster_size * 16``.

    NOTE(review): a new ``lpk.NetVLAD`` layer is constructed on every call. If
    its weights are randomly initialised, descriptors from different calls are
    not directly comparable -- confirm against loupe_keras.
    """
    # 80-band mel spectrogram, transposed so the first axis comes from time.
    mel = librosa.feature.melspectrogram(y=wave_data, n_mels=80, sr=sr)
    # Floor at 1e-6 before the log so zero-energy bins do not produce -inf.
    log_mel = np.log(np.maximum(1e-6, mel.astype(np.float32).T))

    vlad_layer = lpk.NetVLAD(
        feature_size=log_mel.shape[1],
        max_samples=log_mel.shape[0],
        cluster_size=cluster_size,
        output_dim=cluster_size * 16,
    )
    pooled = vlad_layer(tf.convert_to_tensor(log_mel))

    # The initializer is run in a session before the eager value is read.
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        descriptor = pooled.numpy()
    return descriptor

In [5]:
def _load_wav(file_path):
    """Read one WAV file and return ``(samples, sample_rate, duration_seconds)``.

    The file handle is closed before returning. A file with no frames yields a
    placeholder of five seconds of near-silent samples (value 1e-4), while the
    reported duration stays at the real value of 0.
    """
    with wave.open(file_path) as wav_file:
        sample_rate = wav_file.getframerate()
        n_frames = wav_file.getnframes()
        raw_bytes = wav_file.readframes(n_frames)

    # NOTE(review): bytes are decoded as native 16-bit integers, which assumes
    # mono 16-bit PCM recordings -- confirm against the corpus.
    samples = np.frombuffer(raw_bytes, dtype=np.short).astype(float)
    duration = n_frames / sample_rate

    if samples.shape[0] < 1:
        samples = np.full(sample_rate * 5, 1e-4)
    return samples, sample_rate, duration


def extract_features(audio_features, audio_reg_targets, audio_clf_targets):
    """Extract NetVLAD features and labels for every subject in ``EATD-Corpus``.

    Each sub-directory of ``<cwd>/EATD-Corpus`` must contain
    ``positive_out.wav``, ``neutral_out.wav``, ``negative_out.wav`` and
    ``new_label.txt`` (whose first line is a number). Entries that are not
    directories are skipped.

    Args:
        audio_features: list that receives, per subject, the three ``wav2vlad``
            results in the order positive, neutral, negative.
        audio_reg_targets: list that receives the numeric label per subject.
        audio_clf_targets: list that receives 1 if the label is >= 53, else 0.

    Side effects:
        Updates the module-level ``max_len`` / ``min_len`` with the longest and
        shortest clip duration (seconds) seen.
    """
    global max_len, min_len

    clip_names = ('positive_out.wav', 'neutral_out.wav', 'negative_out.wav')
    base_path = os.path.join(os.getcwd(), 'EATD-Corpus')

    # os.listdir returns entries in arbitrary order; sorting makes the order
    # of the extracted samples reproducible between runs and machines.
    for dir_name in sorted(os.listdir(base_path)):
        dir_path = os.path.join(base_path, dir_name)
        if not os.path.isdir(dir_path):
            continue

        clips = [_load_wav(os.path.join(dir_path, name)) for name in clip_names]

        for _, _, duration in clips:
            max_len = max(max_len, duration)
            min_len = min(min_len, duration)

        with open(os.path.join(dir_path, 'new_label.txt')) as label_file:
            target = float(label_file.readline())

        audio_features.append([wav2vlad(samples, sr) for samples, sr, _ in clips])
        # NOTE(review): 53 is presumably the questionnaire cut-off used for the
        # binary label -- confirm against the corpus documentation.
        audio_clf_targets.append(1 if target >= 53 else 0)
        audio_reg_targets.append(target)

In [6]:
# Accumulators that extract_features fills in place: per-subject features,
# numeric (regression) labels and binary (classification) labels.
audio_features, audio_reg_targets, audio_clf_targets = [], [], []

extract_features(audio_features, audio_reg_targets, audio_clf_targets)

In [7]:
print("Saving npz file locally...")

# Create the output directory up front; np.savez does not create missing folders.
save_dir = os.path.join(prefix, './Features/AudioWhole')
os.makedirs(save_dir, exist_ok=True)

# Format only the file name, never the joined path: applying % to the full
# path would misread any '%' character in the working directory.
vlad_dim = cluster_size * 16

# The same features are stored twice, once next to each label set.
np.savez(os.path.join(save_dir, 'whole_samples_reg_%d.npz' % vlad_dim), audio_features)
np.savez(os.path.join(save_dir, 'whole_labels_reg_%d.npz' % vlad_dim), audio_reg_targets)
np.savez(os.path.join(save_dir, 'whole_samples_clf_%d.npz' % vlad_dim), audio_features)
np.savez(os.path.join(save_dir, 'whole_labels_clf_%d.npz' % vlad_dim), audio_clf_targets)

Saving npz file locally...


In [8]:
# Longest and shortest clip duration in seconds (frames / frame rate), as
# tracked by extract_features; shows the initial sentinels if no clip was read.
print(max_len, min_len)

111.02 0.0
