In [1]:
import numpy as np
import pandas as pd
import pathlib
from tqdm import tqdm
import F5signal as f5s
from models.research.audioset.vggish import vggish_params
from models.research.audioset.vggish import vggish_slim
from models.research.audioset.vggish import vggish_input
from models.research.audioset.vggish import vggish_postprocess

Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
def CreateVGGishNetwork(hop_size=0.96):   # Hop size is in seconds.
  """Define VGGish model, load the checkpoint, and return a dictionary that points
  to the different tensors defined by the model.
  """
  vggish_slim.define_vggish_slim()
  checkpoint_path = 'vggish_soup/vggish_model.ckpt'
  vggish_params.EXAMPLE_HOP_SECONDS = hop_size
  
  vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)

  features_tensor = sess.graph.get_tensor_by_name(
      vggish_params.INPUT_TENSOR_NAME)
  embedding_tensor = sess.graph.get_tensor_by_name(
      vggish_params.OUTPUT_TENSOR_NAME)

  layers = {'conv1': 'vggish/conv1/Relu',
            'pool1': 'vggish/pool1/MaxPool',
            'conv2': 'vggish/conv2/Relu',
            'pool2': 'vggish/pool2/MaxPool',
            'conv3': 'vggish/conv3/conv3_2/Relu',
            'pool3': 'vggish/pool3/MaxPool',
            'conv4': 'vggish/conv4/conv4_2/Relu',
            'pool4': 'vggish/pool4/MaxPool',
            'fc1': 'vggish/fc1/fc1_2/Relu',
            'fc2': 'vggish/fc2/Relu',
            'embedding': 'vggish/embedding',
            'features': 'vggish/input_features',
         }
  g = tf.get_default_graph()
  for k in layers:
    layers[k] = g.get_tensor_by_name( layers[k] + ':0')
    
  return {'features': features_tensor,
          'embedding': embedding_tensor,
          'layers': layers,
         }

In [3]:
def ProcessWithVGGish(vgg, x, sr):
  '''Run the VGGish model, starting with a sound (x) at sample rate
  (sr). Return a whitened version of the embeddings. Sound must be scaled to be
  floats between -1 and +1.'''

  # Produce a batch of log mel spectrogram examples.
  input_batch = vggish_input.waveform_to_examples(x, sr)
  # print('Log Mel Spectrogram example: ', input_batch[0])

  [embedding_batch] = sess.run([vgg['embedding']],
                               feed_dict={vgg['features']: input_batch})

  # Postprocess the results to produce whitened quantized embeddings.
  pca_params_path = 'vggish_soup/vggish_pca_params.npz'

  pproc = vggish_postprocess.Postprocessor(pca_params_path)
  postprocessed_batch = pproc.postprocess(embedding_batch)
  # print('Postprocessed VGGish embedding: ', postprocessed_batch[0])
  return postprocessed_batch[0]

In [4]:
def EmbeddingsFromVGGish(vgg, x, sr):
  '''Run the VGGish model, starting with a sound (x) at sample rate
  (sr). Return a dictionary of embeddings from the different layers
  of the model.'''
  # Produce a batch of log mel spectrogram examples.
  input_batch = vggish_input.waveform_to_examples(x, sr)
  # print('Log Mel Spectrogram example: ', input_batch[0])

  layer_names = vgg['layers'].keys()
  tensors = [vgg['layers'][k] for k in layer_names]
  
  results = sess.run(tensors,
                     feed_dict={vgg['features']: input_batch})

  resdict = {}
  for i, k in enumerate(layer_names):
    resdict[k] = results[i]
    
  return resdict

In [5]:
import tensorflow as tf
tf.reset_default_graph()
sess = tf.Session()

vgg = CreateVGGishNetwork(0.01)

Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Use keras.layers.flatten instead.
INFO:tensorflow:Restoring parameters from vggish_soup/vggish_model.ckpt


In [6]:
files = pathlib.Path(r'C:\Users\markh\Work\my_dcase\testing\dev_data\fan\test')
files = files.glob('*.wav')
files=list(files)

In [7]:
df = pd.DataFrame(columns=['name'] + ['y'] + ['id'] + list(range(0, 128)))

In [8]:
df

Unnamed: 0,name,y,id,0,1,2,3,4,5,6,...,118,119,120,121,122,123,124,125,126,127


In [9]:
for num,wav in tqdm(enumerate(files),total=len(files)):
    name=wav.stem.split('_')[0]
    id_wav=int(wav.stem.split('_')[2])
    num_id_wav=int(wav.stem.split('_')[3])
    wave = f5s.read_wave(str(wav))
    x, sr = wave.ys, wave.framerate
    postprocessed_batch = ProcessWithVGGish(vgg, x, sr)
    if name=='normal':
        l = [str(wav), 0, id_wav]
        l.extend(postprocessed_batch)
        df.loc[num] = l
    else:
        l = [str(wav), 1, id_wav]
        l.extend(postprocessed_batch)
        df.loc[num] = l

  0%|          | 2/1875 [00:39<10:23:49, 19.98s/it]

KeyboardInterrupt: 

In [21]:
df.to_csv('tmp.csv')

In [11]:
df = pd.read_csv('tmp.csv', index_col=0)

In [12]:
df

Unnamed: 0,name,y,id,0,1,2,3,4,5,6,...,118,119,120,121,122,123,124,125,126,127
0,C:\Users\markh\Work\my_dcase\testing\dev_data\...,1,0,155,22,136,33,250,81,136,...,0,143,150,127,26,108,255,0,0,255
1,C:\Users\markh\Work\my_dcase\testing\dev_data\...,1,0,154,28,160,42,255,74,143,...,0,66,166,90,0,54,255,0,114,255
2,C:\Users\markh\Work\my_dcase\testing\dev_data\...,1,0,160,33,148,68,236,45,126,...,1,65,141,107,0,164,213,88,84,255
3,C:\Users\markh\Work\my_dcase\testing\dev_data\...,1,0,154,29,157,49,255,40,129,...,0,0,203,53,0,16,255,0,100,255
4,C:\Users\markh\Work\my_dcase\testing\dev_data\...,1,0,157,27,173,70,242,34,123,...,0,101,5,37,0,61,255,59,100,255
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1870,C:\Users\markh\Work\my_dcase\testing\dev_data\...,0,6,156,29,136,25,255,116,137,...,0,216,158,112,134,218,255,0,0,255
1871,C:\Users\markh\Work\my_dcase\testing\dev_data\...,0,6,158,27,127,31,255,92,128,...,0,216,116,160,77,207,255,0,0,255
1872,C:\Users\markh\Work\my_dcase\testing\dev_data\...,0,6,157,28,131,25,255,106,139,...,0,226,140,194,69,244,255,0,0,255
1873,C:\Users\markh\Work\my_dcase\testing\dev_data\...,0,6,164,42,149,35,255,92,151,...,20,139,174,120,53,224,252,0,0,255
