# VGGish

Script to extract embeddings from audio using VGGish. 

Note: this is far slower than the other embedding scripts because it does not use the GPU.

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import csv

import matplotlib.pyplot as plt
from IPython.display import Audio
from scipy.io import wavfile

# Importing necessary modules
import json
import pandas as pd

2023-08-30 21:28:37.542298: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the pretrained VGGish model from TensorFlow Hub
model = hub.load('https://tfhub.dev/google/vggish/1')

# Placeholder input: 3 seconds of silence as mono 16 kHz float32 samples.
# NOTE(review): the original author reports this placeholder is needed "for some
# reason"; it is not referenced by any later cell shown here - confirm whether
# it can be removed.
waveform = np.zeros(3 * 16000, dtype=np.float32)

2023-08-30 21:28:51.360074: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [40]:
# Which dataset to use. Compared against each entry's 'dataset' field when
# filtering the json; the first 5 characters are dropped when naming the output csv.
test_dataset = 'test_kenya'

# Path where the json file describing the data is stored
json_path = '/home/ben/reef-audio-representation-learning/data/dataset.json'

# Path to the audio files. Must end with '/': file names are appended with '+'.
dataset_path = '/home/ben/data/full_dataset/'

# Path to the results folder, where the csv of embeddings will be saved.
# Must end with '/': the output file name is appended with '+'.
results_path = '/home/ben/reef-audio-representation-learning/code/simclr-pytorch-reefs/evaluation/embeddings/raw_embeddings/'

### Find the right data

In [41]:
# Load the dataset description; its 'audio' field holds one record per clip
with open(json_path, 'r') as json_file:
    dataset_json = json.load(json_file)

# Collect the test clips of the selected dataset as a list of dictionaries of
# the form {'file_name': <name>, 'class': <int>}
filtered_entries = []

for entry in dataset_json['audio']:
    # Skip everything that is not test data from the chosen dataset
    if not (entry['data_type'] == 'test_data' and entry['dataset'] == test_dataset):
        continue

    # Strip the 'class' text from the label and convert the remainder to an int
    label = int(entry['class'].replace('class', ''))

    filtered_entries.append({'file_name': entry['file_name'], 'class': label})

### Get embeddings

In [42]:
def ensure_sample_rate(original_sample_rate, waveform,
                       desired_sample_rate=16000):
    """Resample ``waveform`` to ``desired_sample_rate`` when the rates differ.

    Args:
        original_sample_rate: Sample rate of ``waveform`` in Hz.
        waveform: Array of audio samples. Resampling is applied along the
            first axis (the default axis of ``scipy.signal.resample``).
        desired_sample_rate: Target sample rate in Hz. Defaults to 16000.

    Returns:
        A tuple ``(desired_sample_rate, waveform)``. When the rates already
        match, the input object is returned untouched; otherwise the second
        element is the new array produced by ``scipy.signal.resample``.
    """
    if original_sample_rate == desired_sample_rate:
        return desired_sample_rate, waveform

    # Imported here because the notebook only runs `from scipy.io import wavfile`,
    # which does not bind the name `scipy`; without this import the resampling
    # path raises NameError.
    import scipy.signal

    # Scale the number of samples by the ratio of the two sample rates
    desired_length = int(round(float(len(waveform)) /
                               original_sample_rate * desired_sample_rate))
    resampled = scipy.signal.resample(waveform, desired_length)
    return desired_sample_rate, resampled

In [43]:
# Kept from the original cell; nothing in this cell appends to it.
all_embeddings = []

# One dictionary per audio clip: {'Label': class, 'Feature_1': ..., 'Feature_128': ...}
df_rows = []

# Loop through each filtered entry to read and process the WAV file
for entry in filtered_entries:
    # dataset_path is expected to end with a path separator
    wav_file_name = dataset_path + entry['file_name']

    # Read the WAV file. The second positional parameter of wavfile.read is
    # `mmap` (a bool), not a file mode: the previous 'rb' argument was truthy
    # and silently switched memory mapping on, so it is no longer passed.
    sample_rate, wav_data = wavfile.read(wav_file_name)

    # Resample to ensure_sample_rate's default target rate (16 kHz) if needed
    sample_rate, wav_data = ensure_sample_rate(sample_rate, wav_data)

    # Pad wav_data with 280 trailing zeros.
    # NOTE(review): presumably this makes each clip long enough for VGGish to
    # return two embedding frames - confirm where the value 280 comes from.
    wav_data = np.pad(wav_data, (0, 280), 'constant')

    # Compute the embeddings: one 128-dimensional row per frame
    embeddings = model(wav_data)
    embeddings.shape.assert_is_compatible_with([None, 128])

    # The averaging below needs two frames; fail with a readable message that
    # names the offending file rather than an opaque indexing error.
    num_frames = embeddings.shape[0]
    if num_frames is not None and num_frames < 2:
        raise ValueError(
            f'{wav_file_name}: expected at least 2 embedding frames, got {num_frames}'
        )

    # Convert the first two frames to numpy arrays (the original notes treat
    # these as one frame per second of audio)
    second_1 = np.array(embeddings[0])
    second_2 = np.array(embeddings[1])

    # Average the two frames feature-wise to get a single vector per clip
    mean = np.mean([second_1, second_2], axis=0)

    # Build the DataFrame row: the label followed by Feature_1 ... Feature_N
    df_row = {'Label': entry['class']}
    df_row.update({f'Feature_{i+1}': feature for i, feature in enumerate(mean)})

    df_rows.append(df_row)

# Create a DataFrame with one row per clip
df = pd.DataFrame(df_rows)

# Save the DataFrame to a CSV file; test_dataset[5:] drops the first five
# characters of the dataset name (the 'test_' prefix of e.g. 'test_kenya')
df.to_csv(results_path + 'VGGish-' + test_dataset[5:] + '-embeddings.csv', index=False)

In [44]:
# Display the embeddings DataFrame (one row per clip) to check the extraction worked
df

Unnamed: 0,Label,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,...,Feature_119,Feature_120,Feature_121,Feature_122,Feature_123,Feature_124,Feature_125,Feature_126,Feature_127,Feature_128
0,0,-0.531670,-0.155426,-0.126612,-0.607442,-0.533373,-1.403961,0.215491,-0.091740,-1.638240,...,-0.301662,0.278733,-0.043462,-0.075355,-0.344745,-0.096953,-0.244946,-0.488733,0.119457,0.244498
1,0,-0.725370,0.083025,0.303237,-0.747669,-0.783102,-1.590057,0.209133,-0.027515,-1.597260,...,-0.465347,-0.036606,-0.102980,-0.039147,-0.276079,-0.009233,-0.135941,-0.819754,-0.129676,-0.068956
2,0,-0.643539,-0.208261,-0.047800,-0.667112,-0.602154,-1.623559,0.061254,-0.110075,-1.831323,...,-0.283255,0.257867,-0.049758,-0.185740,-0.504355,-0.145695,-0.361664,-0.678735,0.039894,0.241768
3,0,-0.370116,-0.046576,-0.069044,-0.742968,-0.569284,-1.367344,0.281187,-0.009985,-1.362252,...,-0.275073,0.175524,0.052297,-0.034955,-0.199223,-0.031840,-0.236301,-0.445398,-0.022077,0.096568
4,0,-0.608085,-0.156335,0.005698,-0.712519,-0.621796,-1.562298,0.084200,-0.159776,-1.780433,...,-0.418385,0.320418,-0.086428,-0.099062,-0.279834,-0.193269,-0.361927,-0.630462,0.042491,0.210807
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8971,1,-0.905426,-0.163781,0.031415,-0.498533,-0.545617,-1.321246,0.155537,-0.116868,-1.992461,...,-0.165429,0.163375,-0.041009,-0.139338,-0.778265,-0.144409,-0.143418,-0.553675,0.257588,0.212784
8972,1,-0.844556,-0.111420,-0.053956,-0.548424,-0.560503,-1.416813,0.240758,-0.157134,-2.039112,...,-0.173989,0.153448,-0.048427,-0.131251,-0.758195,-0.135677,-0.116104,-0.545473,0.254376,0.191040
8973,1,-0.878407,-0.208880,-0.034390,-0.671651,-0.654795,-1.380428,0.304108,-0.099948,-2.042716,...,-0.271983,0.161555,-0.042885,-0.138411,-0.822259,-0.167612,-0.129122,-0.507333,0.169806,0.180257
8974,1,-0.863592,-0.245190,-0.010044,-0.634085,-0.799385,-1.572392,0.208671,-0.095400,-2.229637,...,-0.268722,0.145766,-0.061770,-0.168775,-0.815138,-0.176232,-0.131451,-0.659909,0.117851,0.143148


In [45]:
# Summary statistics of the 'Label' column in df, e.g. to eyeball the class balance
df['Label'].describe()

count    8976.000000
mean        0.498997
std         0.500027
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: Label, dtype: float64