In [1]:
"""
Generate an array (or dict) in the form of 
[audio_id, probability_of_speech_array]

for each audio
"""

'\nGenerate an array (or dict) in the form of \n[audio_id, probability_of_speech_array]\n\nfor each audio\n'

### Create dataset

In [1]:
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], 'utils'))
sys.path.insert(1, os.path.join(sys.path[0], 'pytorch'))

import numpy as np
import argparse
import librosa
import matplotlib.pyplot as plt
import torch

from utilities import create_folder, get_filename
from models import *
from pytorch_utils import move_data_to_device
import config

  from ._conv import register_converters as _register_converters


In [2]:
import glob
import librosa
import numpy as np
import torchaudio
import requests
import matplotlib.pyplot as plt
import cv2
import torch

import pandas as pd
import pathlib
import IPython.display as ipd


%matplotlib inline

  '"sox" backend is being deprecated. '


In [3]:
# Index every resampled clip by its bare file name ("fn").
# - sorted(): glob returns matches in arbitrary, filesystem-dependent order;
#   sorting makes the row order (and everything derived from it) reproducible.
# - os.path.basename(): strips the directory using the platform's separator
#   rules instead of assuming "/".
audios_df = pd.DataFrame(
    {
        "fn": [
            os.path.basename(path)
            for path in sorted(glob.glob("../data/all_audio_resampled/*.wav"))
        ]
    }
)

In [4]:
def pad_audio(audio, duration_s=3, sample_rate=22050):
    """Force a 1-D waveform to exactly ``duration_s`` seconds.

    Longer inputs are truncated (the leading samples are kept); shorter inputs
    are zero-padded on both sides so the original signal stays centred. When
    the missing sample count is odd, the extra zero goes on the right.

    Args:
        audio: 1-D array of samples.
        duration_s: target duration in seconds (default 3).
        sample_rate: samples per second used to convert the duration into a
            sample count (default 22050).

    Returns:
        Array of length ``int(duration_s * sample_rate)``. The input dtype is
        preserved in both branches, so truncated and padded clips can be
        batched together without mixed dtypes.
    """
    target_len = int(duration_s * sample_rate)
    if len(audio) >= target_len:
        return audio[:target_len]

    # Short clip: split the missing samples between the left and right edges.
    missing = target_len - len(audio)
    pad_left = missing // 2
    pad_right = missing - pad_left
    # np.pad keeps the input dtype, unlike concatenating float64 np.zeros.
    return np.pad(audio, (pad_left, pad_right), mode="constant")

class AllAudiosDataset(torch.utils.data.Dataset):
    """Dataset that yields one fixed-length waveform per audio file path.

    Each item is loaded with librosa as mono audio at 22050 Hz and then passed
    through ``pad_audio``.

    Args:
        fns: iterable of audio file paths (list, pandas Series, ...).
    """

    def __init__(
        self,
        fns
    ):
        # Store a plain list so that ``idx`` is always positional. Indexing a
        # pandas Series with an integer is label-based and fails (or picks the
        # wrong row) when the Series does not carry a default 0..n-1 index.
        self.fns = list(fns)

    def __len__(self):
        """Return the number of audio files."""
        return len(self.fns)

    def __getitem__(self, idx):
        """Load the ``idx``-th file and return its padded/truncated waveform."""
        (waveform, _) = librosa.core.load(self.fns[idx], sr=22050, mono=True)
        return pad_audio(waveform)

In [5]:
# Absolute directory of the resampled clips, used to turn each bare file name
# in audios_df["fn"] into a loadable path.
folder_path = str(pathlib.Path("../data/all_audio_resampled/").resolve())
audios_df["full_path"] = audios_df["fn"].map(lambda name: f"{folder_path}/{name}")

In [6]:
# One dataset item per row of audios_df, in the same order.
aud_ds = AllAudiosDataset(fns=audios_df["full_path"])

In [7]:
# Sanity check: listen to the first clip after padding/truncation.
SR = 22050
first_clip = aud_ds[0]
ipd.Audio(first_clip, rate=SR)

In [1]:
!wget https://zenodo.org/record/3987831/files/Cnn14_DecisionLevelMax_mAP%3D0.385.pth?download=1

--2020-12-01 01:33:44--  https://zenodo.org/record/3987831/files/Cnn14_DecisionLevelMax_mAP%3D0.385.pth?download=1
Resolving zenodo.org (zenodo.org)... 137.138.76.77
Connecting to zenodo.org (zenodo.org)|137.138.76.77|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 327428481 (312M) [application/octet-stream]
Saving to: ‘Cnn14_DecisionLevelMax_mAP=0.385.pth?download=1’


2020-12-01 01:34:09 (13,1 MB/s) - ‘Cnn14_DecisionLevelMax_mAP=0.385.pth?download=1’ saved [327428481/327428481]



### Create model

In [8]:
class DetectSpeechConfig:
    """Settings for building the detection model and locating its checkpoint.

    The audio front-end values are forwarded to the model constructor;
    ``model_type`` names the model class and ``checkpoint_path`` the weights
    file to load.
    """

    # --- audio front-end (forwarded to the model constructor) ---
    sample_rate = 22050
    window_size = 1024
    hop_size = 320
    mel_bins = 64
    fmin = 50
    # NOTE(review): fmax is above the Nyquist frequency for sample_rate=22050
    # (11025 Hz); this is presumably what triggers the "Empty filters detected
    # in mel frequency basis" warning when the model is built. Confirm these
    # values against the configuration the checkpoint was trained with.
    fmax = 14000

    # --- model selection / weights ---
    model_type = "Cnn14_DecisionLevelMax"
    checkpoint_path = "Cnn14_DecisionLevelMax_mAP=0.385.pth"

    # --- runtime ---
    cuda = True

model_conf = DetectSpeechConfig()
classes_num = config.classes_num
labels = config.labels
# Use the GPU only when the config asks for it AND one is actually available;
# otherwise fall back to CPU so the notebook still runs on CPU-only machines.
device = "cuda" if (model_conf.cuda and torch.cuda.is_available()) else "cpu"

In [9]:
# Model
# Look the class up by name in the notebook namespace (it is brought in by the
# star-import from `models`) instead of eval()-ing the string: a plain lookup
# cannot execute arbitrary code and fails with a clear KeyError on a bad name.
Model = globals()[model_conf.model_type]
model = Model(
    sample_rate=model_conf.sample_rate,
    window_size=model_conf.window_size,
    hop_size=model_conf.hop_size,
    mel_bins=model_conf.mel_bins,
    fmin=model_conf.fmin,
    fmax=model_conf.fmax,
    classes_num=classes_num,
)

# Load the pretrained weights; map_location places the tensors on `device`.
checkpoint = torch.load(model_conf.checkpoint_path, map_location=device)
model.load_state_dict(checkpoint['model'])

# Parallel
print('GPU number: {}'.format(torch.cuda.device_count()))
model = torch.nn.DataParallel(model)

if 'cuda' in str(device):
    model.to(device)

# The model is only used for inference in this notebook, so switch to eval
# mode as soon as it is built rather than relying on each later cell to do it.
model.eval()

  "Empty filters detected in mel frequency basis. "


GPU number: 1


### Try getting predictions for one test batch:

In [10]:
# shuffle=False keeps batches in dataset order; the export step later pairs
# predictions with audios_df.fn positionally and relies on that order.
aud_loader = torch.utils.data.DataLoader(
    dataset=aud_ds,
    batch_size=32,
    shuffle=False,
    num_workers=4,
)

In [11]:
# Grab just the first batch for a smoke test. next(iter(...)) states the intent
# directly and does not leave a stray loop index behind.
batch = next(iter(aud_loader))

In [12]:
# Cast to float32 and move to the inference device in one call.
batch = batch.to(device=device, dtype=torch.float32)

In [13]:
# Smoke test: run the model on the test batch and keep the frame-wise
# probabilities of class 0 (speech) for the FIRST clip of the batch only.
model.eval()
with torch.no_grad():
    batch_output_dict = model(batch, None)
framewise_output = batch_output_dict['framewise_output'].detach().cpu().numpy()[0]
speech_prob_vector = np.array([framewise_output[:, 0]])  ## 0 class for speech

### Run on the whole dataset:

In [14]:
%%time

# Collect, for every clip in the dataset, the frame-wise probability of
# class 0 (speech). The result has one row per clip, in loader order.
model.eval()
per_batch_probs = []
with torch.no_grad():
    for batch in aud_loader:
        batch = batch.float().to(device)
        batch_output_dict = model(batch, None)
        framewise_output = batch_output_dict['framewise_output'].detach().cpu().numpy()
        # Keep only the speech column: (clips, frames, classes) -> (clips, frames).
        probs_per_sample = framewise_output[:, :, 0]
        per_batch_probs.append(probs_per_sample)
# An empty loader yields an empty array, as a plain np.array([]) would.
speech_prob_vector = np.concatenate(per_batch_probs) if per_batch_probs else np.array([])

CPU times: user 12.3 s, sys: 2.16 s, total: 14.4 s
Wall time: 8.26 s


In [15]:
# The export step pairs file names and prediction rows with zip(), which
# silently truncates on a length mismatch, so check the alignment explicitly.
assert len(speech_prob_vector) == len(audios_df), (
    f"got {len(speech_prob_vector)} prediction rows for {len(audios_df)} audio files"
)
speech_prob_vector.shape

(5726, 207)

In [16]:
# Per clip: the fraction of frames whose speech probability exceeds 0.4
# (despite the name, this is not the mean of the probabilities themselves).
above_threshold = speech_prob_vector > 0.4
mean_speech_probs = above_threshold.mean(axis=1)

In [17]:
# True for clips in which no frame crossed the speech threshold.
no_speech_mask = np.equal(mean_speech_probs, 0)

In [18]:
%%time

# Map each file name to its frame-wise speech probabilities as a plain list.
# Rows are paired with names positionally.
probs_dict = dict(
    zip(audios_df.fn.values, (probs.tolist() for probs in speech_prob_vector))
)

CPU times: user 19.9 ms, sys: 12 ms, total: 31.9 ms
Wall time: 31.7 ms


In [19]:
import joblib

In [20]:
# Persist the {file name: speech probabilities} mapping one directory above
# the notebook.
joblib.dump(value=probs_dict, filename="../probs_dict.joblib")

['../probs_dict.joblib']