# Preprocessing of the audios

Pour le recouvrement il faudrait inverser les fonctions cuttingAudio et createAnnotatedFile dans trimmingAudio.
Et ainsi ne découper que les audios concernés.

### 1 - Creation of the functions

In [1]:
import os
import sys
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import warnings
import pathlib
import soundfile as sf

In [18]:
def save_boundaries(start, stop):
    """
    Pair every start time with the stop time found at the same index.

    Returns a list holding one ``[start[k], stop[k]]`` entry per element
    of ``start``.
    """
    return [[start[k], stop[k]] for k in range(len(start))]


def readAudioFile(audio_full_file_path, sr, duration, hop):
    """
    Load an audio file and compute the time boundaries of its sliding windows.

    Parameters
    ----------
    audio_full_file_path : path of the audio file handed to ``librosa.load``.
    sr : sampling rate requested from ``librosa.load``.
    duration : length of one window, in seconds.
    hop : step between two consecutive window starts, in seconds.

    Returns
    -------
    tuple ``(y, sr, start, stop, list_time_boundaries)`` where ``y`` and
    ``sr`` are the signal and rate returned by ``librosa.load``, ``start``
    is the array of window start times in seconds, ``stop`` the list of
    matching end times (``start + duration``) and ``list_time_boundaries``
    the ``[start, stop]`` pairs built by ``save_boundaries``.
    """
    frame_length = duration * sr
    hop_length = hop * sr

    # ``sr`` is passed by keyword: recent librosa releases only accept the
    # path positionally, every other argument of ``load`` is keyword-only.
    y, sr = librosa.load(audio_full_file_path, sr=sr)

    # One window start every ``hop`` seconds; the end of a window is not
    # clamped here, so the last windows may extend past the end of the signal.
    start = np.arange(0, len(y) - hop_length, hop_length) / sr
    stop = [x + (frame_length / float(sr)) for x in start]

    list_time_boundaries = save_boundaries(start, stop)

    return y, sr, start, stop, list_time_boundaries

def saveAudioFile(directory, list_time_boundaries):
    """
    Dump the window time boundaries to a comma-separated text file.

    ``directory`` is the output target given to ``np.savetxt``; every value
    of ``list_time_boundaries`` is written with the ``%10d`` format.
    """
    np.savetxt(
        fname=directory,
        X=list_time_boundaries,
        delimiter=',',
        fmt='%10d',
    )
    
def createAudioDirectory(directory_audio_cut, file_name):
    """
    Create (if needed) the folder that receives the excerpts of one audio file.

    The folder is ``directory_audio_cut`` joined with ``file_name`` stripped
    of its extension.

    Returns the path of that folder.

    Raises ``OSError`` when the folder cannot be created for a reason other
    than already existing as a directory.
    """
    samples_folder = os.path.join(directory_audio_cut, os.path.splitext(file_name)[0])
    # exist_ok tolerates a folder left by a previous run, while real failures
    # (permissions, invalid path, ...) are no longer silently swallowed.
    os.makedirs(samples_folder, exist_ok=True)
    return samples_folder

def cuttingAudio(y, sr, samples_folder, start, stop, list_time_boundaries, file_name):
    """
    Write one .wav excerpt of ``y`` per entry of ``list_time_boundaries``.

    Parameters
    ----------
    y : audio signal to slice.
    sr : sampling rate of ``y``; also the rate written to each excerpt.
    samples_folder : folder receiving the excerpts.
    start : window start times in seconds, only used to name the files.
    stop : unused, kept so existing callers keep working.
    list_time_boundaries : ``[start, stop]`` pairs in seconds, one per excerpt.
    file_name : name of the source audio file; its stem prefixes every excerpt.
    """
    base_name = os.path.splitext(file_name)[0]

    for i, window in enumerate(list_time_boundaries):
        # Convert seconds to samples AFTER scaling by sr: truncating the time
        # to an int first would shift any fractional boundary to a whole second.
        first_sample = int(round(window[0] * sr))
        last_sample = int(round(window[1] * sr))
        excerpt = y[first_sample:last_sample]

        # NOTE: the file name keeps only the integer part of the start time,
        # zero-padded to 4 digits.
        filename = os.path.join(samples_folder, base_name + "_" + str(int(start[i])).zfill(4) + '.wav')
        sf.write(filename, excerpt, sr)
        
def _window_is_labelled(win_start, win_stop, seg_start, seg_stop, rec):
    # Tell whether one annotated segment labels one window under mode ``rec``.
    starts_inside = win_start <= seg_start <= win_stop
    ends_inside = win_start <= seg_stop <= win_stop
    if rec == 'and':
        # The segment must lie entirely inside the window.
        return starts_inside and ends_inside
    # 'or': any shared portion counts, including a segment that spans the
    # whole window (neither of its ends falls inside the window then).
    covers_window = seg_start < win_start and seg_stop > win_stop
    return starts_inside or ends_inside or covers_window


def createAnnotatedFile(audio_full_file_path, annotated_file_name, file_name, list_time_boundaries, start, stop, rec):
    """
    Label every window of an audio file and write the labels to a text file.

    The annotated segments are read from the ``.txt`` file that shares the
    path of ``audio_full_file_path`` (extension replaced); its first two
    columns are used as segment start and stop. Each line written to
    ``annotated_file_name`` holds the excerpt file name and ``1`` or ``0``.

    Parameters
    ----------
    audio_full_file_path : path of the audio file; locates the annotation file.
    annotated_file_name : path of the label file to write.
    file_name : audio file name; its stem prefixes the excerpt names.
    list_time_boundaries : ``[start, stop]`` pairs, one per window.
    start, stop : window start / stop times, indexed like
        ``list_time_boundaries``.
    rec : ``'and'`` labels a window only when a segment lies entirely inside
        it; ``'or'`` labels a window as soon as it shares any part of a segment.

    Returns
    -------
    tuple ``(windows, starts, stops)`` for the windows that were kept.
    Filtering of unlabelled overlapping windows is currently disabled, so
    every window is kept.

    Raises
    ------
    ValueError if ``rec`` is neither ``'and'`` nor ``'or'``.
    """
    if rec not in ('and', 'or'):
        raise ValueError("rec must be 'and' or 'or', got {!r}".format(rec))

    audio_stem = os.path.splitext(audio_full_file_path)[0]
    print("current_audio_file: ", audio_stem)

    with open('{}.txt'.format(audio_stem)) as current_audio_file:
        # ndmin=2 keeps one row per segment even when the file has one line.
        annotated_file = np.loadtxt(current_audio_file, ndmin=2)
    if annotated_file.size == 0:
        # No segment at all: normalise to zero rows so the loop below is a no-op.
        annotated_file = annotated_file.reshape(0, 2)

    # One [window, label] entry per window; the label is "1" or 0.
    annotated_boudaries = []
    for i, window in enumerate(list_time_boundaries):
        labelled = any(
            _window_is_labelled(start[i], stop[i], segment[0], segment[1], rec)
            for segment in annotated_file
        )
        annotated_boudaries.append([window, "1" if labelled else 0])

    new_list_time_boudaries = []
    new_start = []
    new_stop = []
    for entry in annotated_boudaries:
        new_list_time_boudaries.append(entry[0])
        new_start.append(entry[0][0])
        new_stop.append(entry[0][1])
        print(entry)

    print(len(new_list_time_boudaries), len(new_start))

    base_name = os.path.splitext(file_name)[0]
    annotated_files = [
        [base_name + "_" + str(int(window[0])).zfill(4) + '.wav', str(label)]
        for window, label in annotated_boudaries
    ]
    # Written once, after all rows are known (an empty file when there is no
    # window).
    np.savetxt(annotated_file_name, annotated_files, fmt='%s')

    return (new_list_time_boudaries, new_start, new_stop)

### 2 - Process

In [19]:
import shutil
import os
import time
from sys import *
import sys
sys.path.append(os.path.abspath("preprocessing/"))
from progressbar import * 

def getAudioFiles(audio_files_path):
    """
    Walk ``audio_files_path`` recursively and collect every ``.wav`` file.

    Prints how many files were found and returns them as a list of
    ``(containing directory, file name)`` tuples.
    """
    wav_entries = [
        (folder, entry)
        for folder, _subfolders, entries in os.walk(audio_files_path)
        for entry in entries
        if entry.endswith(".wav")
    ]
    print("-", len(wav_entries), "files found in the directory", audio_files_path, '\n')
    return wav_entries

In [20]:
def trimmingAudio(audio_files_path, sr, duration, hop, rec):
    """
    Run the whole cutting / labelling pipeline on every .wav file found
    under ``audio_files_path``.

    Check the working directory before calling: the generated data is stored
    under ``<cwd>/tmp/bulls_audio_recov/`` in three folders, one for the
    window boundaries, one for the cut audios and one for their labels.

    ``sr``, ``duration`` and ``hop`` are forwarded to ``readAudioFile`` and
    ``rec`` to ``createAnnotatedFile``.
    """
    working_dir = os.getcwd()
    features_dir = working_dir + "/tmp/bulls_audio_recov/audio_boundaries/"
    s_dir = working_dir + "/tmp/bulls_audio_recov/audio_out/"
    bull_test = working_dir + "/tmp/bulls_audio_recov/audio_annotated/"

    for needed_dir in (features_dir, s_dir, bull_test):
        if not os.path.exists(needed_dir):
            os.makedirs(needed_dir)

    files_path = getAudioFiles(audio_files_path)

    pbar = ProgressBar(
        widgets=[
            'Preprocessing cut audio files : ',
            Percentage(),
            ' ',
            Bar(marker='0', left='[', right=']'),
            ' ',
            ETA(),
            ' ',
            FileTransferSpeed(),
        ],
        maxval=len(files_path),
    )
    pbar.start()

    for done, (file_path, file_name) in enumerate(sorted(files_path), start=1):
        stem = os.path.splitext(file_name)[0]
        audio_full_file_path = os.path.join(file_path, file_name)
        feature_file_name = os.path.join(features_dir, stem + "_boundaries.txt")
        annotated_file_name = os.path.join(bull_test, stem + "_annotated.txt")

        # sr is rebound to the rate returned by the loader, as before.
        y, sr, start, stop, tmp_list_time_boundaries = readAudioFile(
            audio_full_file_path, sr, duration, hop
        )

        saveAudioFile(feature_file_name, tmp_list_time_boundaries)
        samples_folder = createAudioDirectory(s_dir, file_name)

        # Labelling runs first; the windows it returns are the ones cut.
        kept_windows, kept_start, kept_stop = createAnnotatedFile(
            audio_full_file_path,
            annotated_file_name,
            file_name,
            tmp_list_time_boundaries,
            start,
            stop,
            rec,
        )

        cuttingAudio(y, sr, samples_folder, kept_start, kept_stop, kept_windows, file_name)

        pbar.update(done)

    pbar.finish()

### 3 - Preprocessing

In [21]:
# Folder holding the original audio files; each audio's annotation file is
# read from the same path with a .txt extension (see createAnnotatedFile).
audio_files_path = "C:\\Users\\ElieL\\Desktop\\Mes Cours\\Etude de cas\\test\\"

# Length of each cut audio window, in seconds (multiplied by sr in readAudioFile)
duration = 15    
hop = 5          # overlap step: seconds between two consecutive window starts
rec = 'or'      # and = exclusive overlap / or = inclusive overlap

# Run the whole pipeline; outputs go under <cwd>/tmp/bulls_audio_recov/
trimmingAudio(audio_files_path, sr = 44100, duration=duration, hop=hop, rec=rec)

Preprocessing cut audio files :   0% [             ] ETA:  --:--:--   0.00  B/s

- 1 files found in the directory C:\Users\ElieL\Desktop\Mes Cours\Etude de cas\test\ 

current_audio_file:  C:\Users\ElieL\Desktop\Mes Cours\Etude de cas\test\2009_05_08-01_00_00
[[0.0, 15.0], '1']
[[5.0, 20.0], '1']
[[10.0, 25.0], 0]
[[15.0, 30.0], 0]
[[20.0, 35.0], 0]
[[25.0, 40.0], 0]
[[30.0, 45.0], 0]
[[35.0, 50.0], 0]
[[40.0, 55.0], 0]
[[45.0, 60.0], 0]
[[50.0, 65.0], '1']
[[55.0, 70.0], '1']
[[60.0, 75.0], '1']
[[65.0, 80.0], '1']
[[70.0, 85.0], 0]
[[75.0, 90.0], 0]
[[80.0, 95.0], 0]
[[85.0, 100.0], 0]
[[90.0, 105.0], 0]
[[95.0, 110.0], '1']
[[100.0, 115.0], '1']
[[105.0, 120.0], '1']
[[110.0, 125.0], '1']
[[115.0, 130.0], '1']
[[120.0, 135.0], '1']
[[125.0, 140.0], '1']
[[130.0, 145.0], '1']
[[135.0, 150.0], '1']
[[140.0, 155.0], '1']
[[145.0, 160.0], '1']
[[150.0, 165.0], '1']
[[155.0, 170.0], '1']
[[160.0, 175.0], '1']
[[165.0, 180.0], '1']
[[170.0, 185.0], '1']
[[175.0, 190.0], '1']
[[180.0, 195.0], 0]
[[185.0, 200.0], 0]
[[190.0, 205.0], '1']
[[195.0, 210.0], '1']
[[200.0, 2

Preprocessing cut audio files : 100% [00000000000000] Time: 0:00:23   0.04  B/s
