# Audio preprocess

    


In [24]:
import tensorflow as tf
import os
import numpy as np
import pandas as pd
from uritools import uricompose, urijoin, urisplit, uriunsplit


In [25]:
curr_dir = os.getcwd()

dataFrame = pd.read_csv('../data.csv')

def getAudioName(row):
    strategy = row['strategy']
    url = row['url']
    label = row['label']
    
    if (strategy == "youtube"):
        url_data = urisplit(url)
        video_id = url_data.getquerydict().get("v")[0]
        return label + "_" + video_id + ".wav"
    else:
        print("Strategy "+strategy+" not defined!")

dataFrame['fileName'] = dataFrame.apply (getAudioName, axis=1)

In [26]:
from pydub import AudioSegment
from pydub.silence import split_on_silence
from tqdm import tqdm
import os

FILE_DIR = "./audio_data/"

def preprocess(filePath, label, silence_in_msec):
    sound_file = AudioSegment.from_wav(filePath)
    audio_chunks = split_on_silence(sound_file, 
        # must be silent for at least half a second
        min_silence_len=silence_in_msec,

        # consider it silent if quieter than -16 dBFS
        silence_thresh=-16
    )
    outDir = ".//preprocessed_audio//" + label
    os.makedirs(outDir, exist_ok=True)
    for i, chunk in enumerate(audio_chunks):
        out_file = outDir + "//chunk{0}.wav".format(i)
        print ("exporting", out_file)
        chunk.export(out_file, format="wav")

In [27]:
MIN_SECONDS_SILENCE_LEN = 200

for index, row in tqdm(dataFrame.iterrows()):
    filePath = FILE_DIR + row['fileName']
    label = row['label']
    print(filePath)
    preprocess(filePath, label, MIN_SECONDS_SILENCE_LEN)



0it [00:00, ?it/s][A[A

./audio_data/angelamerkel_4APIf4O6Inc.wav




1it [00:01,  1.85s/it][A[A

./audio_data/angelamerkel_v63SKddbG2w.wav




2it [00:04,  1.95s/it][A[A

./audio_data/angelamerkel_ZhIurnatBJw.wav




3it [00:06,  1.99s/it][A[A

./audio_data/angelamerkel_ltIDG1xzSWc.wav




4it [00:07,  1.70s/it][A[A

./audio_data/angelamerkel_SD939Q6LxRg.wav




5it [00:08,  1.56s/it][A[A

exporting .//preprocessed_audio//angelamerkel//chunk0.wav
exporting .//preprocessed_audio//angelamerkel//chunk1.wav
exporting .//preprocessed_audio//angelamerkel//chunk2.wav
exporting .//preprocessed_audio//angelamerkel//chunk3.wav
exporting .//preprocessed_audio//angelamerkel//chunk4.wav
exporting .//preprocessed_audio//angelamerkel//chunk5.wav
exporting .//preprocessed_audio//angelamerkel//chunk6.wav
exporting .//preprocessed_audio//angelamerkel//chunk7.wav
exporting .//preprocessed_audio//angelamerkel//chunk8.wav
exporting .//preprocessed_audio//angelamerkel//chunk9.wav
exporting .//preprocessed_audio//angelamerkel//chunk10.wav
exporting .//preprocessed_audio//angelamerkel//chunk11.wav
exporting .//preprocessed_audio//angelamerkel//chunk12.wav
./audio_data/angelamerkel_Wf9bvwPlzgs.wav




6it [00:12,  2.21s/it][A[A

./audio_data/aliceweidel_D6I0jH4uT8I.wav




7it [00:13,  2.05s/it][A[A

./audio_data/aliceweidel_So_3I-wQVpc.wav




8it [00:14,  1.79s/it][A[A

./audio_data/aliceweidel_w-m5O_upKkk.wav




9it [00:15,  1.52s/it][A[A

./audio_data/aliceweidel_AO8OShVa8u4.wav




10it [00:16,  1.34s/it][A[A

./audio_data/aliceweidel_yiLA8P8ECQE.wav




11it [00:19,  1.91s/it][A[A

./audio_data/karambadiaby_fNihW_QE168.wav




12it [00:22,  2.15s/it][A[A

exporting .//preprocessed_audio//karambadiaby//chunk0.wav
./audio_data/karambadiaby_Wj7-r2O-bzc.wav




13it [00:24,  2.11s/it][A[A

./audio_data/karambadiaby_F6vjLGZJA4w.wav




14it [00:26,  2.10s/it][A[A

exporting .//preprocessed_audio//karambadiaby//chunk0.wav
exporting .//preprocessed_audio//karambadiaby//chunk1.wav
exporting .//preprocessed_audio//karambadiaby//chunk2.wav
exporting .//preprocessed_audio//karambadiaby//chunk3.wav
./audio_data/karambadiaby_ZtLEcdcd58U.wav




15it [00:27,  1.84s/it][A[A
