In [2]:
import numpy as np
import imageio
from skimage import transform
import matplotlib.pyplot as plt
from math import sqrt
import glob
import subprocess

import h5py
%matplotlib inline

In [2]:
p = subprocess.Popen("ffmpeg -i data/videos/animals/2.mp4 -f wav -vn data/tmp/animals_0.wav", \
                     stdout=subprocess.PIPE, shell=True)
(output, err) = p.communicate()
p_status = p.wait()

Checking if there is an audio channel. If not, the output audio stream will be empty.

In [3]:
p = subprocess.Popen("ffprobe -i data/videos/animals/0.mp4 -show_streams -select_streams a -loglevel error"\
                     , stdout=subprocess.PIPE, shell=True)
(output, err) = p.communicate()
p_status = p.wait()

In [4]:
"No audio channel" if output.decode()=='' else "Audio channels present:"+output.decode()

'No audio channel'

In [5]:
!ffprobe -i data/videos/animals/0.mp4 -show_streams -select_streams a -loglevel error

In [6]:
p = subprocess.Popen("ffprobe -i /home/hemant/Videos/Titans\ -\ Season\ 1.mp4 -show_streams -select_streams a -loglevel error", \
                     stdout=subprocess.PIPE, shell=True)
(output, err) = p.communicate()
p_status = p.wait()

In [7]:
"No audio channel" if output.decode()=='' else "Audio channels present:"+output.decode()

'Audio channels present:[STREAM]\nindex=1\ncodec_name=aac\ncodec_long_name=AAC (Advanced Audio Coding)\nprofile=LC\ncodec_type=audio\ncodec_time_base=1/44100\ncodec_tag_string=mp4a\ncodec_tag=0x6134706d\nsample_fmt=fltp\nsample_rate=44100\nchannels=2\nchannel_layout=stereo\nbits_per_sample=0\nid=N/A\nr_frame_rate=0/0\navg_frame_rate=0/0\ntime_base=1/44100\nstart_pts=0\nstart_time=0.000000\nduration_ts=114762752\nduration=2602.329977\nbit_rate=125588\nmax_bit_rate=N/A\nbits_per_raw_sample=N/A\nnb_frames=112073\nnb_read_frames=N/A\nnb_read_packets=N/A\nDISPOSITION:default=1\nDISPOSITION:dub=0\nDISPOSITION:original=0\nDISPOSITION:comment=0\nDISPOSITION:lyrics=0\nDISPOSITION:karaoke=0\nDISPOSITION:forced=0\nDISPOSITION:hearing_impaired=0\nDISPOSITION:visual_impaired=0\nDISPOSITION:clean_effects=0\nDISPOSITION:attached_pic=0\nDISPOSITION:timed_thumbnails=0\nTAG:creation_time=2018-12-03T23:09:27.000000Z\nTAG:language=eng\nTAG:handler_name=ISO Media file produced by Google Inc. Created on: 12

In [8]:
p = subprocess.Popen("ffmpeg -i /home/hemant/Videos/Titans\ -\ Season\ 1.mp4 -f wav -vn data/tmp/animals_0.wav", \
                     stdout=subprocess.PIPE, shell=True)
(output, err) = p.communicate()
p_status = p.wait()

In [9]:
# The file is huge. Taking a small chunk of it and delete the larger chunk
!ffmpeg -t 30 -i data/tmp/animals_0.wav data/tmp/mini_0.wav -loglevel error && rm animals_0.wav

In [10]:
# sample script to create a log_mel_spectrogram
from scipy.io import wavfile
from scipy import signal

sample_rate=16000
window_size=20
step_size=10
eps=1e-10
rate, data = wavfile.read('data/tmp/mini_0.wav')
if data.ndim > 1 : # ignore  channels 2+
    data = data[:, 0]
nperseg = int(round(window_size * sample_rate / 1e3))
noverlap = int(round(step_size * sample_rate / 1e3))
freqs, times, spec = signal.spectrogram(data,fs=sample_rate,window='hann',nperseg=nperseg,noverlap=noverlap)
log_specgram = np.log(spec.T.astype(np.float32) + eps)

In [11]:
print(log_specgram.shape)

(8267, 161)


In [3]:
import sys
sys.path.append("audioset/")

The module **vggish_input** has a function **wavfile_to_examples** which returns the log_mel_spectrogram (after correction intro standard form) of the input **wav_file**.

In [4]:
import vggish_input
features_tensor = vggish_input.wavfile_to_examples(wav_file='data/tmp/mini_0.wav')

827 ms ± 20 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [5]:
features_tensor.shape

(31, 96, 64)

The module **vggish_inference** has a function **main** which returns the vggish extracted embedding (from the log_mel_spectrogram that is in turn created by vggish_input) of the input **wav_file**.

In [6]:
import vggish_inference
embedding_batch, postprocessed_batch = vggish_inference.main(wav_file='data/tmp/mini_0.wav')

INFO:tensorflow:Restoring parameters from audioset/vggish_model.ckpt


1.94 s ± 6.51 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [16]:
embedding_batch.shape

(31, 128)

Here $31$ is the time in seconds of the audio stream and $128$ is the dimensionality of the extracted vggish embedding.

### Read extracted audioset embeddings

In [1]:
import tensorflow as tf
import numpy as np
import glob 
tf.enable_eager_execution()

files = sorted(glob.glob('audioset/data/audioset_v1_embeddings/bal_train/*'))

In [2]:
def readTfRecords(tfrecords_filename, verbose = False, std_frames=10):
    multiple_audio_embedding = []
    labels = []

    record_iterator = tf.python_io.tf_record_iterator(path=tfrecords_filename)
    for string_record in record_iterator:
        example = tf.train.SequenceExample.FromString(string_record)
        label = np.array(example.context.feature['labels'].int64_list.value)
        n_frames = len(example.feature_lists.feature_list['audio_embedding'].feature)
        audio_embedding = []
        for i in range(n_frames):
            audio_embedding.append(tf.cast(tf.decode_raw(example.feature_lists.feature_list['audio_embedding']\
                        .feature[i].bytes_list.value[0],tf.uint8),tf.float32).numpy())
        audio_embedding = np.array(audio_embedding)
        
        if n_frames==std_frames:
            labels.append(label)
            multiple_audio_embedding.append(audio_embedding)
        if verbose:
            print('labels: ' + str(label))
            print(audio_embedding.shape)
    return np.array(multiple_audio_embedding), labels

In [4]:
%time multiple_audio_embedding, labels = readTfRecords(files[1])

CPU times: user 40.1 ms, sys: 19 ms, total: 59.1 ms
Wall time: 44.8 ms


In [5]:
multiple_audio_embedding.shape, len(labels)

((8, 10, 128), 8)

In [6]:
len(files)

4070

In [6]:
from tqdm import tqdm
import h5py

In [7]:
all_audio_embedding = np.zeros((22176,10,128))
all_labels = []
pos = 0
for file in tqdm(files):
    multiple_audio_embedding, labels = readTfRecords(file)
    length = len(labels)
    if multiple_audio_embedding.shape!=(0,):
        try:
            all_audio_embedding[pos:(pos+length),] = np.array(multiple_audio_embedding)
            all_labels = all_labels + labels
            pos+=length
        except:
            print(multiple_audio_embedding.shape,file, length)
            break

100%|██████████| 4070/4070 [01:04<00:00, 63.43it/s]


In [8]:
all_audio_embedding = all_audio_embedding[:pos,]

In [9]:
all_audio_embedding.shape, len(all_labels)

((21782, 10, 128), 21782)

In [11]:
audioset_h5f = h5py.File('audioset_balanced_features_vggish.h5', 'w')
audioset_h5f.create_dataset('audio_embeddings', data=all_audio_embedding)
audioset_h5f.close()

<HDF5 dataset "audio_embeddings": shape (21782, 10, 128), type "<f8">

In [12]:
np.save('audioset_balanced_labels.npy', all_labels)

In [34]:
flat_labels = [item for sublist in all_labels for item in sublist]
from collections import Counter
Counter(flat_labels)

Counter({399: 58,
         0: 5668,
         451: 60,
         27: 517,
         466: 57,
         95: 57,
         137: 6201,
         427: 169,
         431: 59,
         375: 58,
         32: 60,
         34: 59,
         482: 57,
         233: 60,
         359: 51,
         506: 760,
         3: 154,
         39: 60,
         252: 60,
         288: 282,
         370: 87,
         371: 66,
         138: 532,
         139: 302,
         140: 350,
         143: 60,
         146: 60,
         229: 60,
         231: 60,
         251: 62,
         195: 262,
         212: 60,
         330: 60,
         220: 60,
         519: 59,
         135: 59,
         264: 60,
         343: 186,
         348: 60,
         352: 60,
         53: 60,
         361: 55,
         444: 60,
         456: 59,
         263: 61,
         301: 239,
         305: 60,
         155: 119,
         156: 61,
         198: 63,
         72: 720,
         87: 76,
         89: 55,
         300: 852,
         317: 59,
     