In [1]:
import tensorflow as tf
#import tensorflow_hub as hub
import numpy as np
import csv
import math
        
import matplotlib.pyplot as plt
from IPython.display import Audio
from scipy.io import wavfile
from scipy import signal
from pathlib import Path

from utils import *
%matplotlib inline

In [2]:
# Load the SavedModel stored in the local 'yamnet' directory (relative to the
# notebook's working directory).
model = tf.saved_model.load('yamnet')
# The model exposes the location of its class-map CSV; .numpy() unwraps the
# TensorFlow value so it can be handed to the CSV reader.
class_map_path = model.class_map_path().numpy()
# class_names_from_csv is not defined in this notebook — presumably it comes
# from `from utils import *`. Later code calls class_names.index(label).
class_names = class_names_from_csv(class_map_path)

# Class-name groups. Entries of `labels` are looked up in `class_names` when a
# file is processed, so the spelling must match the class map exactly.

# Not referenced by any other cell visible in this notebook.
emotion_labels = [
    'Happy music',
    'Sad music',
    'Tender music',
    'Exciting music',
    'Angry music',
    'Scary music',
]

music_labels = [
    'Music',
    'Musical instrument',
    'Singing',
    'Drum',
    'Rapping',
]

non_music_labels = [
    'Silence',
    'Speech',
    'Narration, monologue',
    'Chatter',
    'Cheering',
    'Applause',
]

neutral_labels = []

# Column order of the score matrix used downstream: music classes first, then
# non-music classes, then neutral ones.
labels = [*music_labels, *non_music_labels, *neutral_labels]

2024-03-11 13:36:33.925473: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-11 13:36:49.331484: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


In [3]:
# Scatter-marker code used by plot() for each predicted segment label.
label2marker = {
    'music': '>',
    'applause': 's',
    'intro': '<',
    'interlude': 'D',
    'speech': 'X',
    'comeback': 'P',
    'silence': 'o',
}
# One-letter colour code per segment label (presumably matplotlib colours;
# not referenced by the cells visible here).
label2color = {
    'music': 'r',
    'applause': 'b',
    'intro': 'c',
    'interlude': 'k',
    'speech': 'm',
    'comeback': 'g',
    'silence': 'y',
}
def plot(t_shift=0., music_only=False, non_music_only=False):
    """Draw a three-panel overview: waveform, spectrogram and class scores.

    Nothing is passed in for the data: the function reads the notebook globals
    ``waveform``, ``spec``, ``probs``, ``labels``, ``predicted_onsets`` and
    ``predicted_labels``, plus ``label2marker`` and ``time_format``.

    NOTE(review): ``spec`` is not assigned anywhere in the visible notebook
    (the driver cell stores the spectrogram as ``spectrogram_np``) — make sure
    it is defined before calling this function.

    Args:
        t_shift: Offset in seconds added to the tick labels and subtracted
            from every onset before it is drawn.
        music_only: If True, only onsets labelled 'music' are drawn.
        non_music_only: If True, onsets labelled 'music' are skipped.
    """
    fig, axs = plt.subplots(3, 1, figsize=(10, 5), gridspec_kw={'height_ratios': [1, 2, 4]})
    # Address each panel through its Axes object instead of plt.subplot(3, 1, k):
    # which axes plt.subplot selects on a grid with custom height ratios depends
    # on the matplotlib version.
    wave_ax, spec_ax, score_ax = axs

    # Build the time axis from integer sample indices. np.arange with a float
    # step can return one element too many, which would make plot() fail on a
    # length mismatch with the waveform.
    t = np.arange(len(waveform)) / 16000

    # Panel 1: the waveform.
    wave_ax.plot(t, waveform, linewidth=.2)
    wave_ax.get_xaxis().set_visible(False)
    wave_ax.get_yaxis().set_visible(False)
    wave_ax.set_xlim(t[0], t[-1])

    # Panel 2: the log-mel spectrogram (returned by the model). Both axes are
    # hidden, so no tick labels are computed for it.
    spec_ax.imshow(spec.T, aspect='auto', interpolation='nearest', origin='lower', cmap='inferno')
    spec_ax.get_xaxis().set_visible(False)
    spec_ax.get_yaxis().set_visible(False)

    # Panel 3: the model output scores for the selected classes.
    score_ax.imshow(probs.T, aspect='auto', interpolation='nearest', cmap='gray_r')
    patch_padding = (0.025 / 2) / 0.01
    score_ax.set_xlim(-patch_padding - 0.5, probs.shape[0] + patch_padding - 0.5)
    score_ax.set_yticks(list(range(len(labels))))
    score_ax.set_yticklabels(labels)
    # Inverted limits put the first label at the top.
    score_ax.set_ylim(len(labels) - 0.5, -0.5)
    # One score column spans 0.48 s; [:-4] trims the tail of time_format's output.
    score_ax.set_xticklabels([time_format((x * 0.48) + t_shift)[:-4] for x in score_ax.get_xticks()])
    score_ax.set_xlabel('time [s]')

    # Overlay the predicted onsets on the spectrogram, whose x axis is in
    # 10 ms frames (hence the division by 0.01).
    xmin, xmax, ymin, ymax = spec_ax.axis()
    y_mid = ymin + (ymax - ymin) / 2
    for t_start, label in zip(predicted_onsets, predicted_labels):
        is_music = label == 'music'
        if (music_only and not is_music) or (non_music_only and is_music):
            continue

        t_start -= t_shift
        if t_start > t[-1]:
            # Relies on the onsets being in ascending order: nothing later can
            # fall inside the plotted range.
            break
        if t_start < t[0]:
            continue

        x = t_start / 0.01
        spec_ax.vlines(x, ymin, ymax, colors='white', linewidth=1)
        marker = label2marker.get(label)
        if marker is None:
            # Unknown label: keep the onset line but draw no marker, and carry
            # on with the remaining onsets instead of silently stopping.
            continue
        spec_ax.scatter(x, y_mid, s=64, c='white', marker=marker)

    plt.show()

def processAudioFile(path):
    """Run the model on one WAV file and extract the scores of the tracked classes.

    Uses the notebook globals ``model``, ``class_names`` and ``labels``, and the
    helpers ``ensure_sample_rate`` / ``normalize_matrix`` (not defined in this
    notebook — presumably from ``utils``).

    Args:
        path: Location of the WAV file, as a ``str`` or ``pathlib.Path``.

    Returns:
        A tuple ``(filename, waveform, spectrogram_np, probs)`` where
        ``filename`` is the last component of ``path``, ``waveform`` is the
        sample data divided by ``tf.int16.max``, ``spectrogram_np`` is the
        model's third output as a NumPy array, and ``probs`` is
        ``normalize_matrix`` applied to the score columns of the classes in
        ``labels`` (same column order as ``labels``).

    Raises:
        ValueError: If an entry of ``labels`` is missing from ``class_names``.
    """
    filename = Path(path).name
    print(filename)

    # The second parameter of scipy's wavfile.read is `mmap`, not a file mode:
    # the data is memory-mapped rather than read eagerly.
    sample_rate, wav_data = wavfile.read(path, mmap=True)
    sample_rate, wav_data = ensure_sample_rate(sample_rate, wav_data)

    # Show some basic information about the audio.
    duration = len(wav_data) / sample_rate
    print(f'Sample rate: {sample_rate} Hz')
    print(f'Total duration: {duration:.2f}s')
    print(f'Size of the input: {len(wav_data)}')

    # Scale by the int16 maximum (assumes 16-bit PCM samples — TODO confirm
    # for other sample formats).
    waveform = wav_data / tf.int16.max

    # Run the model. One row of scores comes back per analysis frame; the rest
    # of this notebook treats consecutive rows as 0.48 s apart (the `dt`
    # defaults of the functions below).
    scores, embeddings, spectrogram = model(waveform)
    scores_np = scores.numpy()
    spectrogram_np = spectrogram.numpy()

    # Keep only the columns of the classes we track, in `labels` order.
    class_indices = [class_names.index(name) for name in labels]
    probs = normalize_matrix(scores_np[:, class_indices])
    return filename, waveform, spectrogram_np, probs

def findMusicBorders(probs, dt = 0.48, music_indexes=None, music_threshold=0.3,
                     no_music_threshold=0.1, min_music_s=20., min_silence_s=4.,
                     patience_s=4.):
    """Locate where music starts and stops in a sequence of per-frame scores.

    The scan alternates between two states. While looking for a start, frames
    whose summed music score exceeds ``music_threshold`` are counted; frames
    below it are tolerated up to a patience budget, after which the count
    restarts. Once ``min_music_s`` worth of music frames has been counted, a
    start is emitted at the first frame of that run. While looking for a stop,
    ``min_silence_s`` worth of consecutive frames below ``no_music_threshold``
    emits a stop at the first of those frames.

    Args:
        probs: 2-D NumPy array, one row per frame and one column per class.
        dt: Duration of one frame in seconds.
        music_indexes: Columns of ``probs`` summed into the music score.
            Defaults to the positions of the global ``music_labels`` inside
            the global ``labels``.
        music_threshold: A frame counts as music above this summed score.
        no_music_threshold: A frame counts as non-music below this summed score.
        min_music_s: Seconds of music frames required to confirm a start.
        min_silence_s: Seconds of consecutive non-music frames required to
            confirm a stop.
        patience_s: Seconds of below-threshold frames tolerated while a start
            is being confirmed.

    Returns:
        ``(music_starts, music_stops)``: two lists of times in seconds.
    """
    if music_indexes is None:
        music_indexes = [labels.index(name) for name in music_labels]

    L_music = int(min_music_s / dt)
    L_nomusic = int(min_silence_s / dt)
    L_patience = int(patience_s / dt)

    music_starts = []
    music_stops = []
    find_start = True
    find_stop = False
    music_count = 0
    nomusic_count = 0
    patience = 0

    for i, prob in enumerate(probs):
        music_prob = np.sum(prob[music_indexes])

        if find_start:
            if music_prob > music_threshold:
                music_count += 1
                if music_count == L_music:
                    # Step back over the counted frames and the tolerated gaps
                    # to reach the first frame of the run.
                    music_starts.append((i - L_music - patience + 1) * dt)
                    find_start = False
                    find_stop = True
                    music_count = 0
                    # Reset the gap counter as well; otherwise it would leak
                    # into the next search and shift that start too early.
                    patience = 0
            elif music_count > 0:
                patience += 1
                if patience == L_patience:
                    patience = 0
                    music_count = 0

        # Deliberately a separate `if`: the frame that confirms a start is also
        # the first frame examined for a stop.
        if find_stop:
            if music_prob < no_music_threshold:
                nomusic_count += 1
                if nomusic_count == L_nomusic:
                    music_stops.append((i - L_nomusic + 1) * dt)
                    find_stop = False
                    find_start = True
                    nomusic_count = 0
            else:
                nomusic_count = 0

    return music_starts, music_stops

def Xtrack(probs, dt=0.48, MUSIC_START_OFFSET = -0.1, MUSIC_STOP_OFFSET = 4.):
    """Turn per-frame class scores into a list of labelled segment onsets.

    Music borders come from ``findMusicBorders``. Every music start opens a
    'music' segment. Every music stop opens a segment labelled 'applause' by
    default, 'speech' if more than 10 s worth of its frames have a summed
    'Speech' + 'Narration, monologue' score above 0.5, and 'silence' if its
    mean 'Silence' score is above 0.5 (the silence check wins over speech).

    Reads the notebook global ``labels`` to find the score columns.

    Args:
        probs: 2-D NumPy array, one row per frame, one column per entry of
            ``labels``.
        dt: Duration of one frame in seconds.
        MUSIC_START_OFFSET: Seconds added to each music start (skipped when the
            result would not be positive).
        MUSIC_STOP_OFFSET: Seconds added to each music stop.

    Returns:
        ``(onsets, segment_labels)``: parallel lists holding the onset time in
        seconds and the label of every segment.
    """
    T = len(probs) * dt
    # Forward dt so border detection uses the same frame duration as this function.
    music_starts, music_stops = findMusicBorders(probs, dt)
    # T is appended only as the end of the last segment; it is dropped again
    # before returning.
    all_onsets = sorted(music_starts + music_stops + [T])

    speech_cols = [labels.index(name) for name in ('Speech', 'Narration, monologue')]
    silence_col = labels.index('Silence')

    all_labels = []
    to_remove = []
    for i, o in enumerate(all_onsets[:-1]):
        next_onset = all_onsets[i + 1]

        if o in music_starts:
            all_labels.append('music')
            if o + MUSIC_START_OFFSET > 0:
                all_onsets[i] += MUSIC_START_OFFSET

        if o in music_stops:
            all_labels.append('applause')
            all_onsets[i] += MUSIC_STOP_OFFSET

            # Frames of this segment. Onsets are exact multiples of dt, so
            # round instead of truncating to avoid landing one frame short.
            first = int(round(o / dt))
            last = int(round(next_onset / dt))
            segment = probs[first:last]

            if segment.shape[0] > 0:
                # Count over the segment itself: a fixed window of 15 frames
                # can never contain more than 10 s worth of frames.
                speech_frames = np.sum(segment[:, speech_cols], axis=1) > 0.5
                if np.count_nonzero(speech_frames) > 10 / dt:
                    all_labels[-1] = 'speech'

                if np.mean(segment[:, silence_col]) > 0.5:
                    all_labels[-1] = 'silence'

        # After shifting, an onset may no longer come after its predecessor;
        # such onsets are dropped below.
        if i > 1 and all_onsets[i - 1] >= all_onsets[i]:
            to_remove.append(i)

    dropped = set(to_remove)
    all_labels = [lab for idx, lab in enumerate(all_labels) if idx not in dropped]
    all_onsets = [onset for idx, onset in enumerate(all_onsets) if idx not in dropped]

    return all_onsets[:-1], all_labels

def writeIndividualTracks(sample_rate=16000, output_path='../Audio/XTrack/'):
    """Write every segment labelled 'music' to its own WAV file.

    Reads the notebook globals ``predicted_labels``, ``predicted_onsets``,
    ``waveform`` and ``filename``. Files are named
    ``<filename without its last 4 characters>_track<N>.wav`` with N counting
    from 1, and are written with ``scipy.io.wavfile.write``.

    Args:
        sample_rate: Sample rate of ``waveform`` in Hz; used both to convert
            onset times to sample indices and as the rate of the written files.
        output_path: Directory prefix for the output files. It is concatenated
            as a string, so it must end with a path separator.
    """
    n_segments = len(predicted_labels)
    track_id = 1
    for i, label in enumerate(predicted_labels):
        if label != 'music':
            continue

        # Clamp at 0: a negative index would wrap around to the end of the array.
        start_idx = max(int(predicted_onsets[i] * sample_rate), 0)
        if i < n_segments - 1:
            stop_idx = int(predicted_onsets[i + 1] * sample_rate)
        else:
            # Last segment: slice through the final sample (an end index of -1
            # would leave the last sample out).
            stop_idx = None
        track = waveform[start_idx:stop_idx]

        wavfile.write(output_path + filename[:-4] + '_track' + str(track_id) + '.wav', sample_rate, track)
        track_id += 1

def rounddown(x):
    """Return ``x`` floored to a multiple of 1000, as an int."""
    thousands = math.floor(x / 1000.0)
    return 1000 * int(thousands)

def writeIndexesCSV(original_sample_rate = 48000, output_path='../Data/Markers/'):
    """Write the predicted onsets as a tab-delimited marker file.

    Reads the notebook globals ``filename``, ``predicted_onsets`` and
    ``predicted_labels``. The file is written to
    ``output_path + filename + '.csv'`` with a header row followed by one row
    per onset: the label, the onset position in samples rounded down to a
    multiple of 1000, a duration of '0', the time format, 'Cue', 'autoMarker'.

    Args:
        original_sample_rate: Sample rate in Hz used to convert onset times
            (seconds) to sample positions; also reported in the 'Time Format'
            column so the two always agree.
        output_path: Directory prefix for the output file. It is concatenated
            as a string, so it must end with a path separator.
    """
    time_format_label = f'{original_sample_rate} Hz'
    with open(output_path + filename + '.csv', 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter='\t',
                               quotechar='|', quoting=csv.QUOTE_MINIMAL)
        csvwriter.writerow(['Name', 'Start', 'Duration', 'Time Format', 'Type', 'Description'])
        for onset, label in zip(predicted_onsets, predicted_labels):
            sample = int(onset * original_sample_rate)
            csvwriter.writerow([label, rounddown(sample), '0', time_format_label, 'Cue', 'autoMarker'])

In [4]:
# Single-file mode: put the path of one WAV file here to process only that
# file. The empty string is a placeholder and is not a readable path.
input_audio_file = ''
# The driver loop iterates over `datapaths`; note that the folder-scan cell
# reassigns this name when it is run.
datapaths = [input_audio_file]

In [9]:
# Folder mode: collect every matching WAV file below `input_folder`, searching
# sub-folders too. Matches of the first pattern come before those of the second.
input_folder = '/Volumes/MJF_SAMPLE/2022/'

datapaths = []
for pattern in ('*MLAB3*.wav', '*MLAB4*.wav'):
    # `p` stays a module-level name after the loop, as before.
    for p in Path(input_folder).rglob(pattern):
        datapaths.append(p)
len(datapaths)

13

In [None]:
# Output layout: <output root>/<name of the input folder>/{Audio,CSV}/
output_path = '/Volumes/XTrack/'
# Path(...).name gives the last folder name whether or not `input_folder`
# ends with a slash.
output_folder_name = Path(input_folder).name + '/'
output_path = output_path + output_folder_name

# Both sub-folders must be defined (and exist) before the loop runs; the writer
# functions build file names by string concatenation, hence the trailing '/'.
audio_output_path = output_path + 'Audio/'
csv_output_path = output_path + 'CSV/'
for out_dir in (audio_output_path, csv_output_path):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

for d in datapaths:
    # These names are assigned at module level on purpose: the two writer
    # functions read filename, waveform, predicted_onsets and predicted_labels
    # as globals.
    filename, waveform, spectrogram_np, probs = processAudioFile(d)
    predicted_onsets, predicted_labels = Xtrack(probs)
    writeIndividualTracks(output_path=audio_output_path)
    # NOTE(review): the markers are written with the default 48000 Hz; the
    # file's own sample rate is not returned by processAudioFile — confirm
    # that every input really is 48 kHz.
    writeIndexesCSV(output_path=csv_output_path)

22MLAB31A11BR.v0-fr128-sstereo.wav
Sample rate: 16000 Hz
Total duration: 2950.44s
Size of the input: 47207040
22MLAB36A11BR.v0-fr128-sstereo.wav
Sample rate: 16000 Hz
Total duration: 3210.36s
Size of the input: 51365760
22MLAB33A11BR.v0-fr128-sstereo.wav
Sample rate: 16000 Hz
Total duration: 3580.64s
Size of the input: 57290240
22MLAB34A11BR.v0-fr128-sstereo.wav
Sample rate: 16000 Hz
Total duration: 3445.76s
Size of the input: 55132160
22MLAB35A11BR.v0-fr128-sstereo.wav
Sample rate: 16000 Hz
Total duration: 3611.36s
Size of the input: 57781760
22MLAB32A11BR.v0-fr128-sstereo.wav
Sample rate: 16000 Hz
Total duration: 2718.24s
Size of the input: 43491840
22MLAB37A11BR.v0-fr128-sstereo.wav
Sample rate: 16000 Hz
Total duration: 3232.36s
Size of the input: 51717760
22MLAB38A11BR.v0-fr128-sstereo.wav
Sample rate: 16000 Hz
Total duration: 3090.96s
Size of the input: 49455360
22MLAB30A11BR.v0-fr128-sstereo.wav
Sample rate: 16000 Hz
Total duration: 2630.80s
Size of the input: 42092800
22MLAB41A1