In [None]:
import sys
sys.path.append("./..")
from config import config

import argparse
from smd.data import preprocessing
from smd.data import postprocessing
import smd.utils as utils
import numpy as np
import tensorflow as tf
import keras.models
import keras.backend as K
from tqdm import tqdm
from pathlib import Path
import os, sys
from glob import glob
import shutil
import json
import librosa
import torch

In [None]:
def test_data_processing(file, mean, std):
    if os.path.splitext(file)[1] == '.npy':
        spec = np.load(file)
    else:
        audio = utils.load_audio(file)
        spec = preprocessing.get_spectrogram(audio)
    mels = preprocessing.get_scaled_mel_bands(spec)
    mels = preprocessing.normalize(mels, mean, std)
    return mels.T


def _prediction_path(file):
    """Return the annotation path for `file`.

    A trailing ``.npy`` or ``.wav`` extension is dropped before
    ``"_prediction.txt"`` is appended; any other name is kept whole. Only the
    final extension is touched, so directory names containing ".wav"/".npy"
    are left intact.
    """
    stem, extension = os.path.splitext(file)
    if extension not in ('.npy', '.wav'):
        stem = file
    return stem + "_prediction.txt"


def _predict_file(model, file, mean, std, smoothing, output_path=None):
    """Run the model on one file and save the resulting annotation."""
    x = test_data_processing(file, mean, std)
    # Add a leading batch axis of size 1 for model.predict.
    x = x.reshape((1, x.shape[0], x.shape[1]))
    output = model.predict(x, batch_size=1, verbose=0)[0].T
    output = postprocessing.apply_threshold(output)
    if smoothing:
        output = postprocessing.smooth_output(output)
    annotation = preprocessing.label_to_annotation(output)
    utils.save_annotation(annotation, output_path or _prediction_path(file))


def predict(data_path, output_file, model_path, mean_path, std_path, smoothing):
    """Predict annotations for one file or for every file in a directory.

    Args:
        data_path: A single input file, or a directory whose ``*.npy`` and
            ``*.wav`` entries are all processed.
        output_file: Destination of the annotation when `data_path` is a file.
            Pass ``""`` to derive it from the input name
            (``<input without .npy/.wav>_prediction.txt``). Must be ``""`` when
            `data_path` is a directory.
        model_path: Path given to ``keras.models.load_model``.
        mean_path: ``.npy`` file holding the normalization mean.
        std_path: ``.npy`` file holding the normalization std.
        smoothing: When truthy, ``postprocessing.smooth_output`` is applied
            after thresholding.

    Raises:
        ValueError: If `data_path` is a directory and `output_file` is set.
    """
    is_directory = os.path.isdir(data_path)
    # Reject the invalid combination before paying for the model load.
    if is_directory and output_file != "":
        raise ValueError("It is possible to set an output file only if the input is a file.")

    mean = np.load(mean_path)
    std = np.load(std_path)

    print("Loading the model " + model_path + "..")
    model = keras.models.load_model(model_path)
    print("Start the prediction..")

    if is_directory:
        root = os.path.abspath(data_path)
        # `glob` is imported as a function (`from glob import glob`), so it is
        # called directly; `glob.glob(...)` would raise AttributeError.
        files = glob(os.path.join(root, "*.npy")) + glob(os.path.join(root, "*.wav"))
        for file in tqdm(files):
            _predict_file(model, file, mean, std, smoothing)
    else:
        file = os.path.abspath(data_path)
        _predict_file(model, file, mean, std, smoothing, output_file if output_file != "" else None)

In [None]:
# Root folder that contains the "speech-music-detection" checkout.
file_dir = "./"

# The trained weights and the normalization statistics all sit in the same
# checkpoint folder below `file_dir`.
_checkpoint_parts = ("speech-music-detection", "checkpoint")
model_path = os.path.join(file_dir, *_checkpoint_parts, "weights.28-0.13exp1_blstm.hdf5")
mean_path = os.path.join(file_dir, *_checkpoint_parts, "mean_gtzan_esc-50_muspeak_musan.npy")
std_path = os.path.join(file_dir, *_checkpoint_parts, "std_gtzan_esc-50_muspeak_musan.npy")

# Read by the prediction loop further down to decide whether to smooth.
smoothing = True

# Normalization statistics handed to preprocessing.normalize.
mean, std = np.load(mean_path), np.load(std_path)

# Load the Keras model once; later cells reuse the module-level `model`.
print("Loading the model " + model_path + "..")
model = keras.models.load_model(model_path)
print("Start the prediction..")

In [None]:
# Read the metadata JSON; later cells index it as all_data[podcast][segment].
with open(config['json_path']) as json_file:
    all_data = json.load(json_file)

In [None]:
# Map each .wav file name (the part before its first ".") to its full path.
wav_paths = sorted(glob(os.path.join(config['podcast']['path'], "*.wav")))
audio = {os.path.basename(path).split('.')[0]: path for path in wav_paths}

In [None]:
def test_data_processing(audio, mean, std):
    spec = preprocessing.get_spectrogram(audio)
    mels = preprocessing.get_scaled_mel_bands(spec)
    mels = preprocessing.normalize(mels, mean, std)
    return mels.T

In [None]:
class ProcessPipeline(torch.utils.data.Dataset):
    """Dataset yielding one audio segment, and its model features, per index.

    Each item is built from ``all_data[podcast][segment]``, whose ``'start'``
    and ``'end'`` values are passed to ``librosa.load`` as ``offset`` and
    ``duration = end - start``.
    """

    def __init__(self, all_data, audios, segment_name='310', sample_rate=16000):
        """Index the segments to process and load the normalization statistics.

        Args:
            all_data: Mapping ``podcast name -> {segment name -> segment dict}``.
            audios: Mapping ``podcast name -> audio file path``.
            segment_name: Only segments with exactly this name are indexed.
                Defaults to ``'310'``, the value previously hard-coded here.
                Pass ``None`` to index every segment.
            sample_rate: Value passed as ``sr`` to ``librosa.load``. Defaults
                to the previously hard-coded ``16000``.

        The mean/std ``.npy`` files are read from
        ``<file_dir>/speech-music-detection/checkpoint``, where ``file_dir`` is
        the notebook-level variable.
        """
        print("Organizing the data")
        self.podcasts = []
        self.segments = []
        self.audios = audios
        self.data = all_data
        self.sample_rate = sample_rate
        for pod_name in tqdm(list(all_data.keys())):
            for seg_name in all_data[pod_name]:
                if segment_name is not None and seg_name != segment_name:
                    continue
                self.podcasts.append(pod_name)
                self.segments.append(seg_name)

        checkpoint_dir = os.path.join(file_dir, "speech-music-detection", "checkpoint")
        self.mean = np.load(os.path.join(checkpoint_dir, "mean_gtzan_esc-50_muspeak_musan.npy"))
        self.std = np.load(os.path.join(checkpoint_dir, "std_gtzan_esc-50_muspeak_musan.npy"))

    def __len__(self):
        """Number of indexed (podcast, segment) pairs."""
        return len(self.podcasts)

    def __getitem__(self, index):
        """Load one segment and compute its features.

        Returns:
            ``(podcast, seg_name, x, wav)`` where ``wav`` is the audio returned
            by ``librosa.load`` for the segment and ``x`` is the result of
            ``test_data_processing(wav, self.mean, self.std)``.
        """
        podcast = self.podcasts[index]
        seg_name = self.segments[index]

        seg = self.data[podcast][seg_name]
        start = seg['start']
        end = seg['end']

        # Only the [start, end) slice of the podcast audio is read.
        wav, _ = librosa.load(self.audios[podcast], offset=start, duration=end - start, sr=self.sample_rate)
        x = test_data_processing(wav, self.mean, self.std)

        return podcast, seg_name, x, wav


In [None]:
def colate_fun(x):
    """Collate dataset samples into a zero-padded feature batch.

    Args:
        x: List of samples. For each sample, index 2 is a 2-D feature sequence
            (indexed as ``sample[2][frame][feature]``) and index 3 is converted
            with ``np.array``; indices 0 and 1 are passed through untouched.

    Returns:
        ``(data, aud, lengths)`` where ``data`` holds one
        ``[sample[0], sample[1], np.array(sample[2]), np.array(sample[3])]``
        list per sample, ``aud`` is a zero-initialized array of shape
        ``(len(x), max(lengths), feature_count)`` with each sample's features
        copied into its leading rows, and ``lengths`` is ``len(sample[2])``
        for every sample.
    """
    data = [[s[0], s[1], np.array(s[2]), np.array(s[3])] for s in x]
    lengths = [len(s[2]) for s in x]

    # The feature width is taken from the first frame of the first sample.
    batch_shape = (len(x), max(lengths), len(x[0][2][0]))
    aud = np.zeros(batch_shape)
    for row, entry in enumerate(data):
        features = entry[2]
        aud[row, :len(features)] = features
    return data, aud, lengths

In [None]:
# Dataset over the metadata plus a loader that pads each batch via colate_fun.
process_data = ProcessPipeline(all_data, audio)

loader_options = dict(
    batch_size=1,
    shuffle=False,
    num_workers=1,
    pin_memory=True,
    collate_fn=colate_fun,
)
process_loader = torch.utils.data.DataLoader(process_data, **loader_options)

In [None]:
# Run the model batch by batch and attach each segment's prediction to all_data.
for info, auds, lengths in tqdm(process_loader):
    outputs = model.predict(auds, verbose=0)

    for sample, output, length in zip(info, outputs, lengths):
        podcast, seg_name = sample[0], sample[1]
        # Keep only the first `length` rows, i.e. the rows that were filled
        # from the real features before colate_fun zero-padded the batch.
        # assumes the model emits one output row per input row - TODO confirm
        output = output[:length].T
        output = postprocessing.apply_threshold(output)
        if smoothing:
            output = postprocessing.smooth_output(output)
        # NOTE(review): `annotation` is computed but never used in this cell;
        # confirm whether it was meant to be stored instead of `output`.
        annotation = preprocessing.label_to_annotation(output)
        # Store plain nested lists: all_data is written with json.dump later,
        # and the json module cannot serialize numpy arrays.
        all_data[podcast][seg_name]['speech_music_pred'] = np.asarray(output).tolist()

In [None]:
# Serialize first, then open the file. Opening with 'w' truncates the file
# immediately, and this is the same path the metadata was read from, so a
# serialization error raised mid-dump would otherwise destroy the only copy.
serialized = json.dumps(all_data)
with open(config['json_path'], 'w') as fp:
    fp.write(serialized)