In [11]:
import numpy as np
import pandas as pd
import pathlib
from tqdm import tqdm
import F5signal as f5s
from models.research.audioset.vggish import vggish_params
from models.research.audioset.vggish import vggish_slim
from models.research.audioset.vggish import vggish_input
from models.research.audioset.vggish import vggish_postprocess

In [12]:
def CreateVGGishNetwork(hop_size=0.96, checkpoint_path='vggish_model.ckpt'):
  """Define the VGGish model, restore its weights and expose its tensors.

  Relies on the notebook-level TensorFlow session `sess`, which must exist
  before this function is called.

  Args:
    hop_size: Hop between consecutive examples, in seconds. The value is
      written to `vggish_params.EXAMPLE_HOP_SECONDS`, a module-level setting,
      so it stays in effect after this call returns.
    checkpoint_path: Path of the VGGish checkpoint to restore into `sess`.

  Returns:
    A dict with three entries:
      'features':  the tensor named by `vggish_params.INPUT_TENSOR_NAME`,
      'embedding': the tensor named by `vggish_params.OUTPUT_TENSOR_NAME`,
      'layers':    a dict mapping a short layer name to that layer's tensor.
  """
  vggish_slim.define_vggish_slim()
  vggish_params.EXAMPLE_HOP_SECONDS = hop_size
  vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)

  # Every tensor is looked up in the session's own graph, so the function does
  # not depend on a separate "default graph" handle (or on `tf` being imported
  # before this cell is defined).
  graph = sess.graph

  # Short layer name -> name of the op producing that layer's activation.
  layer_ops = {'conv1': 'vggish/conv1/Relu',
               'pool1': 'vggish/pool1/MaxPool',
               'conv2': 'vggish/conv2/Relu',
               'pool2': 'vggish/pool2/MaxPool',
               'conv3': 'vggish/conv3/conv3_2/Relu',
               'pool3': 'vggish/pool3/MaxPool',
               'conv4': 'vggish/conv4/conv4_2/Relu',
               'pool4': 'vggish/pool4/MaxPool',
               'fc1': 'vggish/fc1/fc1_2/Relu',
               'fc2': 'vggish/fc2/Relu',
               'embedding': 'vggish/embedding',
               'features': 'vggish/input_features',
              }
  # ':0' selects the first output of each op.
  layers = {name: graph.get_tensor_by_name(op_name + ':0')
            for name, op_name in layer_ops.items()}

  return {'features': graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME),
          'embedding': graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME),
          'layers': layers,
         }

In [13]:
# Postprocessor instances keyed by PCA-parameter path, so the parameter file
# is read once per path instead of once per processed sound. Re-running this
# cell clears the cache.
_VGGISH_POSTPROCESSORS = {}


def _GetVGGishPostprocessor(pca_params_path):
  '''Return the cached Postprocessor for pca_params_path, creating it once.'''
  pproc = _VGGISH_POSTPROCESSORS.get(pca_params_path)
  if pproc is None:
    pproc = vggish_postprocess.Postprocessor(pca_params_path)
    _VGGISH_POSTPROCESSORS[pca_params_path] = pproc
  return pproc


def ProcessWithVGGish(vgg, x, sr, pca_params_path='vggish_pca_params.npz'):
  '''Run the VGGish model on a sound and return one postprocessed embedding.

  Relies on the notebook-level TensorFlow session `sess`.

  Args:
    vgg: Dict returned by CreateVGGishNetwork; its 'features' and 'embedding'
      tensors are used.
    x: The sound samples. Must be scaled to be floats between -1 and +1.
    sr: Sample rate of x.
    pca_params_path: Path of the PCA parameter file handed to
      vggish_postprocess.Postprocessor.

  Returns:
    The postprocessed (whitened, quantized) embedding of the FIRST example
    only, i.e. row 0 of the postprocessed batch. Embeddings of any further
    examples are computed but discarded.

  Raises:
    IndexError: if the postprocessed batch has no rows (presumably when the
      sound is too short to yield a single example -- TODO confirm).
  '''
  # Produce a batch of log mel spectrogram examples.
  input_batch = vggish_input.waveform_to_examples(x, sr)

  [embedding_batch] = sess.run([vgg['embedding']],
                               feed_dict={vgg['features']: input_batch})

  # Postprocess the results to produce whitened quantized embeddings.
  pproc = _GetVGGishPostprocessor(pca_params_path)
  postprocessed_batch = pproc.postprocess(embedding_batch)
  return postprocessed_batch[0]

In [14]:
def EmbeddingsFromVGGish(vgg, x, sr):
  '''Evaluate every tensor in vgg['layers'] for a sound.

  The sound x (sample rate sr) is converted to a batch of examples with
  vggish_input.waveform_to_examples, fed to vgg['features'], and all layer
  tensors are evaluated in a single run of the notebook-level session `sess`.

  Returns a dict with the same keys as vgg['layers'], each mapped to the
  value computed for that layer's tensor.
  '''
  examples = vggish_input.waveform_to_examples(x, sr)

  layer_tensors = vgg['layers']
  names = list(layer_tensors)

  # One session run for all layers; results come back in `names` order.
  activations = sess.run([layer_tensors[name] for name in names],
                         feed_dict={vgg['features']: examples})

  return dict(zip(names, activations))

In [15]:
import tensorflow as tf

# Start from an empty default graph so re-running this cell does not stack a
# second copy of the model on top of the first, then open the session that
# CreateVGGishNetwork / ProcessWithVGGish / EmbeddingsFromVGGish read as the
# global `sess`.
tf.compat.v1.reset_default_graph()
sess = tf.compat.v1.Session()

# Build the model with a 0.01 s hop between examples.
# NOTE(review): ProcessWithVGGish keeps only the first example of each sound,
# so a hop this small looks like it mostly adds compute -- confirm it is needed.
vgg = CreateVGGishNetwork(hop_size=0.01)

INFO:tensorflow:Restoring parameters from vggish_model.ckpt


In [16]:
# Training clips for the "fan" machine type. The directory is built from
# separate segments so the path also resolves on non-Windows systems (a
# literal backslash is not a separator there), and the list is sorted because
# glob() yields entries in no guaranteed order -- sorting keeps the row order
# of the resulting table reproducible.
files = sorted(pathlib.Path('dev_data', 'fan', 'train').glob('*.wav'))

In [17]:
# Empty table: file name, label, machine id, then one column per embedding
# dimension (integer column labels 0..127).
df = pd.DataFrame(columns=['name', 'y', 'id', *range(128)])

In [18]:
# Display the (still empty) frame to check the column layout.
df

Unnamed: 0,name,y,id,0,1,2,3,4,5,6,...,118,119,120,121,122,123,124,125,126,127


In [19]:
# Extract one VGGish embedding per clip. Rows are collected in a plain list
# and the frame is built once at the end: assigning df.loc[num] inside the
# loop re-allocates the frame on every iteration.
rows = []
for wav in tqdm(files, total=len(files)):
    # The stem is split on '_': field 0 is compared with 'normal' to derive
    # the label, field 2 is parsed as the integer id.
    stem_parts = wav.stem.split('_')
    label = 0 if stem_parts[0] == 'normal' else 1
    id_wav = int(stem_parts[2])

    wave = f5s.read_wave(str(wav))
    x, sr = wave.ys, wave.framerate
    postprocessed_batch = ProcessWithVGGish(vgg, x, sr)

    rows.append([str(wav), label, id_wav, *postprocessed_batch])

# Same column layout as the empty frame created earlier; the default integer
# index reproduces the 0..n-1 row labels the per-row assignment used.
df = pd.DataFrame(rows, columns=df.columns)

100%|██████████| 3675/3675 [32:14<00:00,  1.90it/s]


In [21]:
from config import TRAIN_WAV_DIR, DCASE_CSV_DIR_TRAIN

# The file name is appended by plain string concatenation.
# NOTE(review): this assumes DCASE_CSV_DIR_TRAIN already ends with a path
# separator -- confirm against config.py.
xtrain_csv_path = DCASE_CSV_DIR_TRAIN + 'Xtrain.csv'
df.to_csv(xtrain_csv_path)