# VGGish

Script to extract embeddings from audio using VGGish. 

Note: this is far slower than the other embedding scripts because it is not using the GPU.

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import csv

import matplotlib.pyplot as plt
from IPython.display import Audio
from scipy.io import wavfile

# Importing necessary modules
import json
import pandas as pd

2023-08-30 21:28:37.542298: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Fetch the pretrained VGGish model from TensorFlow Hub
model = hub.load('https://tfhub.dev/google/vggish/1')

# Placeholder input: 3 seconds of silence as mono 16 kHz float32 samples.
# NOTE(review): the original author reports this placeholder is needed, but
# nothing visible in this notebook reads `waveform` -- confirm before removing.
waveform = np.zeros(shape=3 * 16000, dtype=np.float32)

2023-08-30 21:28:51.360074: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [9]:
# Name of the dataset whose test clips are embedded
test_dataset = 'test_bermuda'

# Directory that holds the audio files
dataset_path = '/home/ben/data/full_dataset/'

# JSON file describing the data
json_path = '/home/ben/reef-audio-representation-learning/data/dataset.json'

# Results folder where the CSV of embeddings will be saved
results_path = '/home/ben/reef-audio-representation-learning/code/simclr-pytorch-reefs/evaluation/embeddings/raw_embeddings/'

### Find the right data

In [4]:
# Load the dataset description from disk
with open(json_path, 'r') as f:
    dataset_json = json.load(f)

# Keep only the 'test_data' entries that belong to the chosen dataset.
# Each kept entry becomes {'file_name': <file>, 'class': <int>}, where the
# integer is the original 'class' string with the text 'class' stripped out.
filtered_entries = [
    {
        'file_name': entry['file_name'],
        'class': int(entry['class'].replace('class', '')),
    }
    for entry in dataset_json['audio']
    if entry['data_type'] == 'test_data' and entry['dataset'] == test_dataset
]

### Get embeddings

In [5]:
def ensure_sample_rate(original_sample_rate, waveform,
                       desired_sample_rate=16000):
    """Resample ``waveform`` to ``desired_sample_rate`` if it is not already there.

    Args:
        original_sample_rate: Sample rate of ``waveform`` in Hz.
        waveform: Array of audio samples; its length along the first axis is
            used to compute the resampled length.
        desired_sample_rate: Target sample rate in Hz (default 16000).

    Returns:
        Tuple ``(desired_sample_rate, waveform)``. The waveform is returned
        unchanged when the two rates already match; otherwise it is the
        output of ``scipy.signal.resample``.
    """
    if original_sample_rate == desired_sample_rate:
        return desired_sample_rate, waveform

    # Imported here because the notebook's import cell only does
    # `from scipy.io import wavfile`, which never binds the bare name `scipy`.
    # Without this, the resample call below raised NameError for every file
    # whose sample rate differed from the target.
    import scipy.signal

    desired_length = int(round(float(len(waveform)) /
                               original_sample_rate * desired_sample_rate))
    waveform = scipy.signal.resample(waveform, desired_length)
    return desired_sample_rate, waveform

In [11]:
# Initialize an empty list to store the embeddings
# (kept for backward compatibility; nothing in this cell appends to it)
all_embeddings = []

# One dict per audio file: {'label': ..., 'Feature_1': ..., 'Feature_2': ..., ...}
df_rows = []

# Number of zero samples appended to the end of every clip before embedding
n_pad_samples = 280

# Number of leading VGGish frames averaged into a single embedding per clip
# (the original notebook treats each frame as roughly one second of audio)
n_frames_to_average = 2

# Loop through each filtered entry to read and process the WAV file
for entry in filtered_entries:
    wav_file_name = dataset_path + entry['file_name']

    # Read the WAV file. The second positional parameter of wavfile.read is
    # `mmap`, not a file mode, so the previous 'rb' argument silently turned
    # on memory mapping; it is dropped here.
    sample_rate, wav_data = wavfile.read(wav_file_name)

    # Resample to the 16 kHz default of ensure_sample_rate if needed
    sample_rate, wav_data = ensure_sample_rate(sample_rate, wav_data)

    # Pad the end of the clip with zeros.
    # NOTE(review): np.pad with a single (before, after) pair pads every axis,
    # so this assumes mono (1-D) audio -- confirm for this dataset.
    # NOTE(review): the TF Hub VGGish page describes the input as float32
    # samples in [-1.0, 1.0]; wavfile.read returns the file's native dtype and
    # no scaling is applied here -- confirm the files are float WAVs.
    wav_data = np.pad(wav_data, (0, n_pad_samples), 'constant')

    # Compute the embeddings: one 128-dimensional row per frame
    embeddings = model(wav_data)
    embeddings.shape.assert_is_compatible_with([None, 128])

    # Fail with the offending file name instead of an opaque indexing error
    # when a clip is too short to yield the frames we average over.
    frames = np.asarray(embeddings)
    if frames.shape[0] < n_frames_to_average:
        raise ValueError(
            f"{wav_file_name}: expected at least {n_frames_to_average} "
            f"VGGish frames, got {frames.shape[0]}"
        )

    # Average the leading frames feature-wise into a single vector
    mean_embedding = frames[:n_frames_to_average].mean(axis=0)

    # Create a row for the DataFrame: the label followed by Feature_1..Feature_N
    df_row = {'label': entry['class']}
    df_row.update(
        {f'Feature_{i}': feature
         for i, feature in enumerate(mean_embedding, start=1)}
    )
    df_rows.append(df_row)

# Create a DataFrame
df = pd.DataFrame(df_rows)

# Save the DataFrame to a CSV file; test_dataset[5:] drops the first five
# characters of the dataset name (the 'test_' prefix of 'test_bermuda')
df.to_csv(results_path + 'VGGish-' + test_dataset[5:] + '-embeddings.csv', index=False)

In [None]:
# Show the first 5 rows as a quick sanity check of the embeddings table
df.head(n=5)

Unnamed: 0,label,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,...,Feature_119,Feature_120,Feature_121,Feature_122,Feature_123,Feature_124,Feature_125,Feature_126,Feature_127,Feature_128
0,0,-0.75588,-0.239144,-0.006482,-0.660316,-0.661326,-1.564038,0.189483,-0.15079,-2.337072,...,-0.306457,0.085061,-0.06524,-0.174579,-0.748717,-0.202958,-0.170341,-0.619031,0.14404,0.159795
1,0,-0.56991,-0.196253,-0.012757,-0.733111,-0.702112,-1.603721,0.293776,-0.188705,-2.214564,...,-0.384656,0.058515,-0.087278,-0.202737,-0.680734,-0.189267,-0.165939,-0.563902,0.084017,0.065772
2,0,-0.767339,-0.215024,0.117208,-0.570487,-0.628667,-1.538399,0.244541,-0.060223,-2.132523,...,-0.199617,0.119985,-0.073416,-0.218369,-0.63246,-0.16581,-0.144961,-0.63034,0.159019,0.10795


In [None]:
# Summary statistics of the 'label' column in df
df.loc[:, 'label'].describe()