In [12]:
import soundfile as sf
import math
import numpy as np
from scipy.signal import resample_poly
import scipy.signal
import scipy.fftpack
import librosa

In [13]:
# Target sampling rate in Hz; every clip is resampled to this rate.
SAMPLE_RATE = 48000
# Analysis window length, expressed in samples (one second at SAMPLE_RATE).
SEGMENT_DURATION = SAMPLE_RATE * 1
# Step between consecutive window starts, in samples (half a second at
# SAMPLE_RATE). split_audio uses this value as the hop size.
OVERLAP_DURATION = SAMPLE_RATE // 2

In [14]:
def input_audio_file(path):
    """Read the audio file at ``path`` with soundfile.

    Returns:
        tuple: ``(samples, sample_rate)`` where ``samples`` is the decoded
        data wrapped in a NumPy array and ``sample_rate`` is the rate
        reported by ``soundfile.read``.
    """
    # NOTE(review): soundfile is also imported at the top of the file, so this
    # function-local import is redundant.
    import soundfile as sf

    samples, sample_rate = sf.read(path)
    return np.array(samples), sample_rate

def preprocess_downmix(audio):
    """Collapse multi-channel audio to a single mono channel.

    Axis 1 is treated as the channel axis and is averaged away.

    Args:
        audio: NumPy array of samples, either 1-D or 2-D with channels on
            axis 1.

    Returns:
        The input object itself when it has fewer than two dimensions;
        otherwise a new array holding the mean over axis 1.
    """
    if audio.ndim < 2:
        return audio
    # Averaging also flattens a single-channel (n, 1) array to (n,), so the
    # downstream 1-D steps never receive a 2-D "mono" signal.
    return np.mean(audio, axis=1)

def preprocessing_resample(audio, sr, target_sr=None):
    """Resample 1-D ``audio`` from ``sr`` to ``target_sr`` by linear interpolation.

    Args:
        audio: 1-D NumPy array of samples.
        sr: Sampling rate of ``audio``.
        target_sr: Desired sampling rate. Defaults to the module-level
            ``SAMPLE_RATE`` when omitted.

    Returns:
        tuple: ``(resampled_audio, target_sr)``. The tuple shape is the same
        on every path, so callers can always unpack two values. When no
        resampling is needed (rates already match, or the input is empty) a
        copy of the input is returned.

    Note:
        ``np.interp`` performs plain linear interpolation; no anti-aliasing
        filter is applied when reducing the rate.
    """
    if target_sr is None:
        target_sr = SAMPLE_RATE

    # Previously the matching-rate path returned a bare array while the other
    # path returned a tuple, which broke ``audio, sr = ...`` at the call site.
    # An empty signal is handled here too because np.interp rejects empty xp.
    if sr == target_sr or len(audio) == 0:
        return audio.copy(), target_sr

    ratio = target_sr / sr
    n_samples = int(np.round(len(audio) * ratio))

    # Map both the old and the new sample grids onto [0, 1] and interpolate.
    x_old = np.linspace(0, 1, len(audio))
    x_new = np.linspace(0, 1, n_samples)
    return np.interp(x_new, x_old, audio), target_sr

def preprocessing_padding(audio, segment_len=None):
    """Zero-pad ``audio`` at the end so its length is a multiple of ``segment_len``.

    Args:
        audio: NumPy array whose first axis indexes samples.
        segment_len: Window length in samples. Defaults to the module-level
            ``SEGMENT_DURATION`` when omitted.

    Returns:
        The input object itself when its length is already a multiple of
        ``segment_len``; otherwise a new array with trailing zeros appended
        along axis 0.
    """
    if segment_len is None:
        segment_len = SEGMENT_DURATION

    remainder = audio.shape[0] % segment_len
    if remainder == 0:
        return audio

    padding = segment_len - remainder
    # Pad the sample axis only. A bare (0, padding) is broadcast by np.pad to
    # every axis, which would append bogus channels to multi-channel input.
    pad_width = [(0, padding)] + [(0, 0)] * (audio.ndim - 1)
    return np.pad(audio, pad_width)

In [15]:
def split_audio(audio, segment_len=None, hop_len=None):
    """Cut ``audio`` into fixed-length, overlapping windows.

    Window ``k`` starts at sample ``k * hop_len`` and spans ``segment_len``
    samples. A final window that runs past the end of the signal is
    zero-padded, so no trailing samples are dropped and a signal shorter than
    one window still yields a single padded segment. When
    ``len(audio) - segment_len`` is a non-negative multiple of ``hop_len``
    (the case after padding to a whole number of windows with a hop that
    divides the window), the result is the same as a floor-based window count.

    Args:
        audio: 1-D array-like of samples.
        segment_len: Window length in samples. Defaults to the module-level
            ``SEGMENT_DURATION`` when omitted.
        hop_len: Step between window starts in samples. Defaults to the
            module-level ``OVERLAP_DURATION`` when omitted.

    Returns:
        np.ndarray of shape ``(num_segments, segment_len)``; an empty input
        gives shape ``(0, segment_len)``.

    Raises:
        ValueError: If ``segment_len`` or ``hop_len`` is not positive.
    """
    if segment_len is None:
        segment_len = SEGMENT_DURATION
    if hop_len is None:
        hop_len = OVERLAP_DURATION
    if segment_len <= 0 or hop_len <= 0:
        raise ValueError("segment_len and hop_len must be positive")

    audio = np.asarray(audio)
    total = len(audio)
    if total == 0:
        return np.empty((0, segment_len), dtype=audio.dtype)

    # Ceiling division (written with floor-div on negated operands) so the
    # tail of the signal is covered; clamp at 0 for signals shorter than one
    # window so they still produce a single padded segment.
    extra_hops = max(0, -(-(total - segment_len) // hop_len))
    num_segments = extra_hops + 1

    segments = []
    for index in range(num_segments):
        start = index * hop_len
        segment = audio[start:start + segment_len]
        shortfall = segment_len - len(segment)
        if shortfall > 0:
            segment = np.pad(segment, (0, shortfall), mode='constant')
        segments.append(segment)

    return np.array(segments)

In [16]:
def get_rms(segment):
    """Return the root-mean-square amplitude of ``segment``.

    Args:
        segment: Array-like of samples (any numeric dtype).

    Returns:
        The RMS value as a NumPy float64 scalar. An empty segment yields NaN,
        as produced by ``np.mean`` on empty input.
    """
    # Promote to float64 before squaring: with integer PCM dtypes such as
    # int16 the square is computed in that dtype and wraps around silently.
    samples = np.asarray(segment, dtype=np.float64)
    mean_power = np.mean(np.square(samples))
    return np.sqrt(mean_power)

def get_zcr(segment):
    """Return the zero-crossing rate of ``segment``.

    The rate is the number of sign changes between consecutive samples
    divided by the number of samples in the segment. For a segment whose
    length equals ``SAMPLE_RATE`` (a one-second window) this is numerically
    the same as dividing by ``SAMPLE_RATE``, but it stays correct for any
    other window length.

    Args:
        segment: 1-D array-like of samples.

    Returns:
        The zero-crossing rate as a float; ``0.0`` for an empty segment.
    """
    signs = np.signbit(segment)
    n_samples = signs.size
    if n_samples == 0:
        return 0.0
    # np.diff on a boolean array marks positions where neighbours differ,
    # i.e. where the sign flips.
    crossings = np.sum(np.diff(signs))
    return crossings / n_samples

In [17]:
def get_logmel_spectrogram(segment):
    """Return the log-scaled (dB) mel spectrogram of ``segment``.

    The mel power spectrogram is computed with librosa at ``SAMPLE_RATE`` and
    then converted to decibels, so the result matches the "log-mel" name.

    Args:
        segment: Audio samples passed to librosa as ``y``.

    Returns:
        The mel spectrogram in dB, as returned by ``librosa.power_to_db``
        with its default reference.
    """
    mel_power = librosa.feature.melspectrogram(y=segment, sr=SAMPLE_RATE)
    # melspectrogram yields linear power values; apply the log step here.
    # NOTE(review): if any caller already converts to dB, drop that extra
    # conversion to avoid taking the log twice.
    return librosa.power_to_db(mel_power)

In [18]:
def get_timestamp(i, hop_seconds=None, segment_seconds=None):
    """Return the ``(start, end)`` time in seconds of segment number ``i``.

    Args:
        i: Zero-based segment index.
        hop_seconds: Time between consecutive segment starts. Defaults to
            ``OVERLAP_DURATION / SAMPLE_RATE`` so it follows the configured
            hop instead of a hard-coded 0.5.
        segment_seconds: Segment length in seconds. Defaults to
            ``SEGMENT_DURATION / SAMPLE_RATE`` instead of a hard-coded 1.

    Returns:
        tuple: ``(start_seconds, end_seconds)`` as floats.
    """
    if hop_seconds is None:
        hop_seconds = OVERLAP_DURATION / SAMPLE_RATE
    if segment_seconds is None:
        segment_seconds = SEGMENT_DURATION / SAMPLE_RATE
    start = i * hop_seconds
    return (start, start + segment_seconds)

In [19]:
# NOTE(review): absolute, machine-specific path; this cell only runs on a
# machine where this exact file exists.
audio_path = r"C:\Users\Lulay\Documents\GitHub\Dasar-Kecerdasan-Artificial_Tugas-Besar\Dataset\noise-audio-data\1-7974-A-49.wav"
# Load the samples and the file's native sampling rate.
audio, sr = input_audio_file(audio_path)
print(audio.shape, sr)
# Mix multi-channel audio down to a single channel.
audio = preprocess_downmix(audio)
print(audio.shape, sr)
# Resample to SAMPLE_RATE; the call site expects an (audio, rate) pair back.
audio, sr = preprocessing_resample(audio, sr)
print(audio.shape, sr)
# Zero-pad so the length is a whole number of SEGMENT_DURATION windows.
audio = preprocessing_padding(audio)
print(audio.shape, sr)
# Cut into overlapping fixed-length windows; `segments` feeds the feature cells.
segments = split_audio(audio)
print(segments.shape)

(220500,) 44100
(220500,) 44100
(240000,) 48000
(240000,) 48000
(9, 48000)


In [22]:
# One (rms, zcr, (start_s, end_s)) tuple per segment, in segment order.
# enumerate replaces the hand-maintained counter, so no stray module-level
# `i` is left behind for later cells to pick up by accident.
result = [
    (get_rms(segment), get_zcr(segment), get_timestamp(index))
    for index, segment in enumerate(segments)
]

In [26]:
# Hand-labelled (start_s, end_s) windows considered "loud".
# The duplicated (1.5, 2.5) entry was removed; it had no effect on membership.
loud_list = [(0.0, 1.0), (0.5, 1.5), (1.0, 2.0), (1.5, 2.5), (3.5, 4.5), (4.0, 5.0)]
# Set lookup for membership tests; timestamps are compared by exact equality
# with the tuples stored in `result`.
loud_timestamps = set(loud_list)
result_loud = [entry for entry in result if entry[2] in loud_timestamps]
result_bg = [entry for entry in result if entry[2] not in loud_timestamps]

In [28]:
# Show the feature tuples whose timestamp is listed in loud_list.
print(result_loud)

[(0.12247626112586694, 0.05608333333333333, (0.0, 1.0)), (0.11444451039852403, 0.056, (0.5, 1.5)), (0.1050360746520724, 0.0523125, (1.0, 2.0)), (0.10020610057137613, 0.05291666666666667, (1.5, 2.5)), (0.12153389998660953, 0.05222916666666667, (3.5, 4.5)), (0.14131249932656487, 0.0591875, (4.0, 5.0))]


In [29]:
# Show the feature tuples whose timestamp is not listed in loud_list.
print(result_bg)

[(0.07927951233258367, 0.043729166666666666, (2.0, 3.0)), (0.03878785275019558, 0.028, (2.5, 3.5)), (0.07103521929957307, 0.032375, (3.0, 4.0))]
