# **Audio Dataset Preprocessing**
These cells preprocess the raw audio data (.ogg, .aac, .mp3) by converting it to WAV.

## **Import Dependencies**

In [1]:
import os
import shutil
import random
import librosa
import numpy as np
import soundfile as sf
import re
import matplotlib.pyplot as plt

from pathlib import Path
from pydub import AudioSegment
from pydub.playback import play
from scipy.io.wavfile import read

from IPython.display import clear_output




## **Change Directory**

In [6]:
# Hard-coded absolute path that becomes the working directory (see os.chdir below);
# change it before running the notebook on another machine.
parent_dir = "d:/1mportant!/bangkit/capstone/"

In [7]:
# Switch the working directory so the relative 'New Data/...' paths used below
# resolve against parent_dir.
os.chdir(parent_dir)

In [4]:
def file_to_wav(filepath, name, outputpath):
    """Convert an audio file to WAV and save it as ``<name>.wav``.

    Args:
        filepath: Path of the source audio file; decoding is delegated to
            ``AudioSegment.from_file``.
        name: Base name of the output file, without extension.
        outputpath: Directory that receives the file. A trailing separator
            is optional.
    """
    # NOTE(review): ``channels=1`` is forwarded to pydub unchanged; confirm
    # that it really produces mono output for non-raw input formats.
    audio = AudioSegment.from_file(filepath, channels=1)
    # os.path.join only inserts a separator when outputpath lacks one, so
    # both "dir/" and "dir" end up as "dir/<name>.wav".
    audio.export(os.path.join(outputpath, f"{name}.wav"), format="wav")

In [8]:
# Base folder; joined with 'Overlay/' and 'PITCH/' further down to build the
# augmentation output directories.
data_dir = "New Data"
# File extensions accepted by convert(); files with any other extension are ignored.
ext = [".aac",".ogg",".mpeg",".mp3",".wav",".mp4"]

In [7]:
def convert(data_dir):
    """Convert every supported audio file in ``data_dir`` to WAV.

    Each file is labelled with the first incident keyword found in its
    lower-cased path (the directory part included) and written to
    ``New Data/WAV/<incident>/<incident>_<n>.wav``, where ``n`` is the
    smallest positive integer whose file name is not taken yet.

    Files whose extension is not listed in the module-level ``ext``, or whose
    path contains no incident keyword, are skipped.

    Args:
        data_dir: Directory holding the raw recordings.
    """
    outdir = 'New Data/WAV'
    incident_pattern = re.compile(
        r'(begal|rampok|maling|pencuri|tabrakan|kecelakaan|kebakaran|random)'
    )

    for file in sorted(os.listdir(data_dir)):
        filename = os.fsdecode(file)

        # Only the extensions listed in `ext` are converted.
        if not filename.endswith(tuple(ext)):
            continue

        file_dir = os.path.join(data_dir, filename)

        # A path without any keyword used to crash with IndexError; skip it
        # instead so one stray file does not abort the whole batch.
        match = incident_pattern.search(file_dir.lower())
        if match is None:
            print(f'Skipping {filename}: no incident keyword in path')
            continue
        insiden = match.group(1)
        print(f'Converting {filename}')

        # The trailing "/" is kept because file_to_wav receives this value
        # as its output location.
        outputpath = os.path.join(outdir, insiden + "/")
        os.makedirs(outputpath, exist_ok=True)

        # List the target folder once per file rather than once per
        # candidate name, then pick the first free "<incident>_<n>".
        existing = set(os.listdir(outputpath))
        i = 1
        while f"{insiden}_{i}.wav" in existing:
            i += 1
        nama = f"{insiden}_{i}"

        file_to_wav(file_dir, nama, outputpath)

In [4]:
# Incident keywords; each one names a sub-folder of 'New Data/' that is passed to convert().
insiden = ["begal","rampok","maling","pencuri","tabrakan","kecelakaan","kebakaran","random"]

In [None]:
# Convert the raw recordings in every incident folder to WAV.

for i in insiden:
    convert(f'New Data/{i}/')

In [89]:
# Background noises (rain and street ambience) that overlay() mixes into the
# recordings. The paths are built with os.path.join because the previous
# literals relied on backslashes, which are invalid escape sequences in a
# normal string and only act as separators on Windows.
# `- 3` applies pydub's gain arithmetic (3 dB quieter).
Hujan = AudioSegment.from_file(os.path.join('New Data', 'SFX', 'Hujan SFX.wav'), channels=1) - 3
Jalanan = AudioSegment.from_file(os.path.join('New Data', 'SFX', 'Jalanan SFX.wav'), channels=1) - 3

# Order matters: overlay() pairs these positionally with its labels.
SFX = [Hujan, Jalanan]

In [91]:
def overlay(filepath, outputpath):
    """Overlay an audio file with each background noise in ``SFX``.

    One file per noise is exported as ``<outputpath><stem>_<LABEL>.wav``,
    where ``<stem>`` is the input file name without extension and
    ``<LABEL>`` is ``HUJAN`` or ``JALANAN``.

    Args:
        filepath: Path of the audio file to augment.
        outputpath: Output location that is prefixed to the file name as-is,
            so it must end with a path separator.
    """
    file = AudioSegment.from_file(filepath, channels=1)
    name = Path(filepath).stem
    len_audio = len(file)
    # Labels are positional: labels[k] names SFX[k].
    labels = ['HUJAN', 'JALANAN']

    for current_label, gangguan in zip(labels, SFX):
        # Cap the random start so the rest of the noise still spans the
        # whole clip and no silent part is picked up.
        max_start = len(gangguan) - len_audio
        # randrange(0, n) raises ValueError for n <= 0, which happened for
        # clips at least as long as the noise; start at 0 in that case.
        part = random.randrange(0, max_start) if max_start > 0 else 0
        output = file.overlay(gangguan[part:])
        # Export with the noise label appended to the original name.
        output.export("{}{}_{}.wav".format(outputpath, name, current_label), format="wav")

        clear_output(wait=True)
        print("[Overlaying \"{}\" with \"{}\" Succeed]".format(name, current_label))

In [11]:

def get_array(path,event, iteration):
    """Load the samples of ``<path><event>/<event>_<iteration>.wav``.

    Only the sample array returned by ``scipy.io.wavfile.read`` is handed
    back; the sample rate is dropped.
    """
    wav_file = f'{path}{event}/{event}_{iteration}.wav'
    samples = read(wav_file)[1]
    return samples

In [12]:
# Sample numbers 1..69; every run of the plotting cell below pulls the next
# number from `iterator`.
test = range(1,70)
iterator = iter(test)

In [None]:
# Plot the waveform of sample number `iteration` for each incident class.
rows = 2
cols = 4
fig, axes = plt.subplots(rows, cols, figsize=(20, 10))
iteration = next(iterator)

path = 'New Data/WAV/'
plotted_events = ['begal', 'rampok', 'maling', 'pencuri',
                  'tabrakan', 'kecelakaan', 'kebakaran']
# axes.flat walks the grid row by row; with seven classes the eighth panel
# stays empty.
for panel, event in zip(axes.flat, plotted_events):
    panel.plot(get_array(path, event, iteration))
    panel.set_title(f'{event.upper()}_{iteration}')
plt.show()


In [92]:
# Overlay results go to '<data_dir>/Overlay/<incident>/'.
outputoverlay = os.path.join(data_dir,'Overlay/')
# Single-file smoke test:
#   overlay(os.path.join(data_dir,'WAV/RAMPOK/rampok_1.wav'), outputoverlay)

insiden = ["begal","rampok","maling","pencuri","tabrakan","kecelakaan","kebakaran"]

for i in insiden:
    wav_dir = f'New Data/WAV/{i}/'
    outputpath = os.path.join(outputoverlay, i + "/")
    # Add both noise overlays to every converted WAV of this incident.
    for wav_name in sorted(os.listdir(wav_dir)):
        file = f'{wav_dir}{wav_name}'
        overlay(file, outputpath)

[Overlaying "kecelakaan_71" with "JALANAN" Succeed]


In [93]:
def pitch(filepath, semitones, outputpath):
    """Write two pitch-shifted copies of an audio file, one up and one down.

    The copies are saved as ``<outputpath><stem> [UP].wav`` and
    ``<outputpath><stem> [DOWN].wav``, where ``<stem>`` is the input file
    name without extension.

    Args:
        filepath: Path of the audio file to augment.
        semitones: Size of the shift; applied as ``+semitones`` for UP and
            ``-semitones`` for DOWN.
        outputpath: Output location that is prefixed to the file name as-is,
            so it must end with a path separator.
    """
    name = Path(filepath).stem
    # NOTE(review): librosa.load is called with its defaults, so the audio is
    # loaded at librosa's default sample rate - confirm that is intended.
    file, sr = librosa.load(filepath)

    for label, n_steps in (('UP', semitones), ('DOWN', -semitones)):
        # sr and n_steps are keyword-only in current librosa releases;
        # passing them by keyword works on older releases as well.
        shifted = librosa.effects.pitch_shift(file, sr=sr, n_steps=n_steps)
        sf.write("{}{} [{}].wav".format(outputpath, name, label), shifted, sr)

    clear_output(wait=True)
    print("[Pitch Shifting to \"{}\" Succeed]".format(name))

In [94]:
# Pitch-shifted copies go to '<data_dir>/PITCH/<incident>/'.
outputpitch = os.path.join(data_dir,'PITCH/')
semitones = 2

for i in insiden:
    wav_dir = f'New Data/WAV/{i}/'
    outputpath = os.path.join(outputpitch, i + "/")
    # Shift every converted WAV of this incident up and down.
    for wav_name in sorted(os.listdir(wav_dir)):
        file = f'{wav_dir}{wav_name}'
        pitch(file, semitones, outputpath)

[Pitch Shifting to "kecelakaan_71" Succeed]


In [95]:
# Pitch-shift the noise-overlaid recordings as well; results share the
# '<data_dir>/PITCH/<incident>/' folders.
outputpitch = os.path.join(data_dir,'PITCH/')
semitones = 2
insiden = ["begal","rampok","maling","pencuri","tabrakan","kecelakaan","kebakaran"]

for i in insiden:
    overlay_dir = f'New Data/Overlay/{i}/'
    outputpath = os.path.join(outputpitch, i + "/")
    for wav_name in sorted(os.listdir(overlay_dir)):
        file = f'{overlay_dir}{wav_name}'
        pitch(file, semitones, outputpath)

[Pitch Shifting to "kecelakaan_71_JALANAN" Succeed]


In [96]:
# Sub-folder names are the upper-cased incident keywords.
finalsubfold = [i.upper() for i in insiden]

# Move audio data in "WAV", "OVERLAY", and "PITCH" directory
# into "FINAL" directory for creating model

categories = ['WAV','Overlay','Pitch']
# NOTE: `dir` shadows the builtin; the name is kept so that any cell relying
# on it keeps working.
dir = 'New Data/'

# Create the destination folders up front; shutil.move fails when the target
# directory is missing, and exist_ok makes the cell safe to re-run.
for i in finalsubfold:
    os.makedirs(f'{dir}FINAL/{i}', exist_ok=True)

for category in categories:
    for i in finalsubfold:
        filedir = f'{dir}{category}/{i}/'
        for file in sorted(os.listdir(filedir)):
            old = os.path.join(filedir,file)
            new = f'{dir}FINAL/{i}/{file}'
            shutil.move(old,new)


In [1]:
def convertAllFilesInDirectoryTo16Bit(directory):
    """Rewrite every ``.wav`` file in ``directory`` in place as 16-bit PCM.

    Files with any other extension are left untouched.

    Args:
        directory: Folder to process; a trailing separator is optional.
    """
    for file in os.listdir(directory):
        if not file.endswith('.wav'):
            continue
        # Joined instead of concatenated so the folder may be given with or
        # without a trailing separator.
        wav_path = os.path.join(directory, file)
        data, samplerate = soundfile.read(wav_path)

        # Write back to the same path; only the sample format changes.
        soundfile.write(wav_path, data, samplerate, subtype='PCM_16')
        print("converting " + file + " to 16 - bit")

In [2]:
import soundfile

In [9]:
# Re-encode the files of every incident folder under 'New Data/FINAL/' as 16-bit PCM.
for e in insiden:
    old_dir = f'New Data/FINAL/{e}/'
    convertAllFilesInDirectoryTo16Bit(old_dir)


converting begal_1 [DOWN].wavto 16 - bit
converting begal_1 [UP].wavto 16 - bit
converting begal_1.wavto 16 - bit
converting begal_10 [DOWN].wavto 16 - bit
converting begal_10 [UP].wavto 16 - bit
converting begal_10.wavto 16 - bit
converting begal_10_HUJAN [DOWN].wavto 16 - bit
converting begal_10_HUJAN [UP].wavto 16 - bit
converting begal_10_HUJAN.wavto 16 - bit
converting begal_10_JALANAN [DOWN].wavto 16 - bit
converting begal_10_JALANAN [UP].wavto 16 - bit
converting begal_10_JALANAN.wavto 16 - bit
converting begal_11 [DOWN].wavto 16 - bit
converting begal_11 [UP].wavto 16 - bit
converting begal_11.wavto 16 - bit
converting begal_11_HUJAN [DOWN].wavto 16 - bit
converting begal_11_HUJAN [UP].wavto 16 - bit
converting begal_11_HUJAN.wavto 16 - bit
converting begal_11_JALANAN [DOWN].wavto 16 - bit
converting begal_11_JALANAN [UP].wavto 16 - bit
converting begal_11_JALANAN.wavto 16 - bit
converting begal_12 [DOWN].wavto 16 - bit
converting begal_12 [UP].wavto 16 - bit
converting begal_1