# Audio Dataset Preprocessing
These are the preprocessing steps that convert raw audio data (.ogg, .aac, .mp3) into WAV files.

## Import Dependencies

In [87]:
import os
import shutil
import random
import librosa
import numpy as np
import soundfile as sf
import re

from pathlib import Path
from pydub import AudioSegment
from pydub.playback import play

from IPython.display import clear_output

In [88]:
# Absolute path of the project root; machine-specific (Windows drive letter),
# so adjust it when running the notebook elsewhere.
parent_dir = "d:/1mportant!/bangkit/capstone/"

In [3]:
# Make the project root the working directory so that relative paths are
# resolved against it.
os.chdir(parent_dir)

In [4]:
def file_to_wav(filepath, name, outputpath):
    """Convert an audio file to a mono WAV file.

    Args:
        filepath: Path of the source audio file.
        name: Base name of the output file, without extension.
        outputpath: Prefix prepended verbatim to ``name``; pass a directory
            path that ends with a path separator.
    """
    # pydub only honours the ``channels`` keyword for raw/PCM input and
    # ignores it for container formats (ogg, mp3, aac, ...), so the down-mix
    # to one channel has to be requested explicitly.
    audio = AudioSegment.from_file(filepath).set_channels(1)
    audio.export(f"{outputpath}{name}.wav", format="wav")

In [7]:
# Scratch check: take a sample recording path and derive its bare file name
# (no directory, no extension).
tes = "New Data/begal Ara.ogg"
name = Path(tes).with_suffix("").name

In [25]:
# Data folder, relative to the working directory.
data_dir = "New Data"

# File-name suffixes treated as audio input.
ext = [
    ".aac",
    ".ogg",
    ".mpeg",
    ".mp3",
    ".wav",
    ".mp4",
]

In [27]:
def convert(data_dir, outdir='New Data/WAV'):
    """Convert every supported audio file in ``data_dir`` to WAV, grouped by incident.

    The incident keyword is searched for in the lower-cased source path
    (directory included); the first hit selects both the sub-folder of
    ``outdir`` and the output name prefix. Outputs are named
    ``<incident>_<n>.wav`` with the lowest ``n`` (starting at 1) that is not
    taken yet, so existing files are never overwritten.

    Files whose suffix is not listed in the module-level ``ext`` are ignored.
    Files without an incident keyword in their path are reported and skipped
    instead of aborting the whole run with an ``IndexError``.

    Args:
        data_dir: Directory whose direct entries are scanned.
        outdir: Root directory for the converted files. Defaults to the
            location that used to be hard-coded.
    """
    # Compiled once: the pattern does not change between files.
    incident_pattern = re.compile(
        r'(begal|rampok|maling|pencuri|tabrakan|kecelakaan|kebakaran)'
    )
    extensions = tuple(ext)

    for file in sorted(os.listdir(data_dir)):
        filename = os.fsdecode(file)

        # Compare case-insensitively, like the keyword search below, so that
        # e.g. ".MP3" is accepted as well.
        if not filename.lower().endswith(extensions):
            continue

        file_dir = os.path.join(data_dir, filename)
        match = incident_pattern.search(file_dir.lower())
        if match is None:
            print(f"[Skipping \"{file_dir}\": no incident keyword found]")
            continue
        insiden = match.group(1)

        # file_to_wav() concatenates prefix and name, hence the trailing "/".
        outputpath = os.path.join(outdir, insiden + "/")
        # The incident folder may not exist yet on a first run.
        os.makedirs(outputpath, exist_ok=True)

        # Pick the first free running number for this incident.
        i = 1
        nama = f"{insiden}_{i}"
        while os.path.exists(os.path.join(outputpath, nama + ".wav")):
            i += 1
            nama = f"{insiden}_{i}"

        file_to_wav(file_dir, nama, outputpath)

In [29]:
#Iterate in files
#insiden = ["begal","rampok","maling","pencuri","tabrakan","kecelakaan","kebakaran"]
#for i in insiden:
#    convert(f'New Data/{i}/')

In [52]:
# Background-noise tracks ("Hujan" = rain, "Jalanan" = street) used for the
# overlay augmentation; "- 5" lowers each track's gain by 5 dB.
# The paths use forward slashes: the former back-slashed literals contained
# the invalid escape sequences "\S", "\H" and "\J" and only worked on Windows.
# NOTE(review): pydub documents ``channels`` as a raw/PCM-only option, so it
# probably has no effect for these MP3 files - confirm whether mono is needed.
Hujan = AudioSegment.from_file('New Data/SFX/Hujan SFX.mp3', channels=1) - 5
Jalanan = AudioSegment.from_file('New Data/SFX/Jalanan SFX.mp3', channels=1) - 5

SFX = [Hujan, Jalanan]

In [53]:
def overlay(filepath, outputpath):
    """Mix each background-noise track in ``SFX`` over an audio file.

    For every noise track one augmented copy is exported as
    ``<outputpath><stem>_<LABEL>.wav``, where ``<stem>`` is the input file
    name without extension and ``<LABEL>`` names the noise track.

    Args:
        filepath: Path of the recording to augment.
        outputpath: Prefix prepended verbatim to the output file name; pass a
            directory path that ends with a path separator.
    """
    # NOTE(review): pydub documents ``channels`` as a raw/PCM-only option, so
    # it probably has no effect here - confirm whether mono is required.
    audio = AudioSegment.from_file(filepath, channels=1)
    name = Path(filepath).stem
    len_audio = len(audio)
    # Labels are positional: they must stay in the same order as SFX.
    labels = ['HUJAN', 'JALANAN']

    for current_label, gangguan in zip(labels, SFX):
        # Latest start offset that still leaves the noise excerpt at least as
        # long as the recording, so no part of it stays without noise.
        max_start = len(gangguan) - len_audio
        if max_start > 0:
            part = random.randrange(0, max_start)
            output = audio.overlay(gangguan[part:])
        else:
            # The recording is at least as long as the noise track; the
            # former unconditional randrange(0, max_start) raised ValueError
            # here. Use the whole track and repeat it to cover the recording.
            output = audio.overlay(gangguan, loop=True)

        # Export under a name that carries the noise label.
        output.export("{}{}_{}.wav".format(outputpath, name, current_label), format="wav")

        clear_output(wait=True)
        print("[Overlaying \"{}\" with \"{}\" Succeed]".format(name, current_label))

In [71]:
# Root folder that receives the noise-overlaid copies.
outputoverlay = os.path.join(data_dir,'Overlay/')
#tes = os.path.join(data_dir,'WAV/RAMPOK/rampok_1.wav')
#overlay(tes,outputoverlay)

insiden = ["begal","rampok","maling","pencuri","tabrakan","kecelakaan","kebakaran"]

# Run overlay() on every file of every incident folder, writing the result
# into the incident's sub-folder below the overlay root.
for i in insiden:
    source_dir = f'New Data/WAV/{i}/'
    for entry in sorted(os.listdir(source_dir)):
        file = source_dir + entry
        outputpath = os.path.join(outputoverlay, i + "/")
        overlay(file, outputpath)

[Overlaying "kebakaran_9" with "JALANAN" Succeed]


In [72]:
def pitch(filepath, semitones, outputpath):
    """Write two pitch-shifted copies (up and down) of an audio file.

    The copies are saved as ``<outputpath><stem> [UP].wav`` (shifted by
    ``+semitones``) and ``<outputpath><stem> [DOWN].wav`` (shifted by
    ``-semitones``), where ``<stem>`` is the input file name without
    extension.

    Args:
        filepath: Path of the audio file to augment.
        semitones: Size of the shift, in semitones.
        outputpath: Prefix prepended verbatim to the output file name; pass a
            directory path that ends with a path separator.
    """
    name = Path(filepath).stem
    # NOTE(review): librosa.load is called with its defaults, which per the
    # librosa docs resample to 22050 Hz mono - pass sr=None if the file's
    # native rate should be kept.
    samples, sr = librosa.load(filepath)

    for label, n_steps in (('UP', semitones), ('DOWN', -semitones)):
        # ``sr`` and ``n_steps`` are keyword-only from librosa 0.10 on;
        # passing them by name works on older releases too.
        shifted = librosa.effects.pitch_shift(samples, sr=sr, n_steps=n_steps)
        sf.write("{}{} [{}].wav".format(outputpath, name, label), shifted, sr)

    clear_output(wait=True)
    print("[Pitch Shifting to \"{}\" Succeed]".format(name))

In [73]:
# Root folder for the pitch-shifted copies and the shift size in semitones.
outputpitch = os.path.join(data_dir,'PITCH/')
semitones = 2
insiden = ["begal","rampok","maling","pencuri","tabrakan","kecelakaan","kebakaran"]

# Pitch-shift every converted WAV file, incident folder by incident folder.
for i in insiden:
    source_dir = f'New Data/WAV/{i}/'
    for entry in sorted(os.listdir(source_dir)):
        file = source_dir + entry
        outputpath = os.path.join(outputpitch, i + "/")
        pitch(file, semitones, outputpath)

[Pitch Shifting to "kebakaran_9" Succeed]


In [74]:
# Pitch-shift the noise-overlaid audio as well; the results go to the same
# PITCH root as the shifted plain recordings.
outputpitch = os.path.join(data_dir,'PITCH/')
semitones = 2
insiden = ["begal","rampok","maling","pencuri","tabrakan","kecelakaan","kebakaran"]

for i in insiden:
    source_dir = f'New Data/Overlay/{i}/'
    for entry in sorted(os.listdir(source_dir)):
        file = source_dir + entry
        outputpath = os.path.join(outputpitch, i + "/")
        pitch(file, semitones, outputpath)

[Pitch Shifting to "kebakaran_9_JALANAN" Succeed]


In [106]:
# Upper-cased incident names serve as the sub-folder names below.
finalsubfold = [i.upper() for i in insiden]

# Move the audio data in the "WAV", "Overlay" and "Pitch" directories
# into the "FINAL" directory for creating the model.
categories = ['WAV','Overlay','Pitch']
# NOTE(review): ``dir`` shadows the builtin of the same name; the name is
# kept because other cells may read it - rename once that is ruled out.
dir = 'New Data/'
for category in categories:
    for i in finalsubfold:
        filedir = f'{dir}{category}/{i}/'
        finaldir = f'{dir}FINAL/{i}/'
        entries = sorted(os.listdir(filedir))
        # shutil.move() does not create missing parent folders, so make sure
        # the destination exists (replaces the manual, commented-out mkdir
        # calls that had to be run by hand once).
        os.makedirs(finaldir, exist_ok=True)
        for file in entries:
            old = os.path.join(filedir,file)
            new = f'{finaldir}{file}'
            shutil.move(old,new)


In [None]:
# Reset the category list and restrict the sub-folder list to one entry.
categories = ['WAV', 'Overlay', 'Pitch']
finalsubfold = ['rampok']