In [6]:
import soundfile as sf
import math
import numpy as np
from scipy.signal import resample_poly

In [7]:
# Target sampling rate in Hz used by the resampling step.
SAMPLE_RATE = 48000
# Length of one segment in samples: exactly one second at SAMPLE_RATE.
SEGMENT_DURATION = SAMPLE_RATE
# Distance in samples between the starts of consecutive segments (half a
# second), so neighbouring segments share half of their samples.
OVERLAP_DURATION = SAMPLE_RATE // 2

In [8]:
def input_audio_file(path):
    """Read an audio file from disk with ``soundfile``.

    Args:
        path: Location of the audio file; passed unchanged to ``sf.read``.

    Returns:
        tuple: ``(audio, sr)`` where ``audio`` is a NumPy array with the
        decoded samples and ``sr`` is the sampling rate reported by
        ``sf.read``.
    """
    # ``sf`` is bound by the notebook's import cell, so no function-local
    # re-import is needed.
    audio, sr = sf.read(path)
    # ``asarray`` guarantees an ndarray without duplicating the buffer when
    # the reader already returned one.
    return np.asarray(audio), sr

def preprocess_downmix(audio):
    """Collapse multi-channel audio into a single mono channel.

    Args:
        audio: NumPy array, either 1-D ``(samples,)`` or 2-D
            ``(samples, channels)``.

    Returns:
        A 1-D array. 1-D input is returned as-is (same object); 2-D input is
        reduced by averaging across axis 1.
    """
    # Any 2-D input is reduced, including a single-channel ``(samples, 1)``
    # array: letting that through unchanged would hand a 2-D array to the
    # later 1-D steps (interpolation, padding, slicing).
    if audio.ndim > 1:
        return np.mean(audio, axis=1)
    return audio

def preprocessing_resample(audio, sr, target_sr=None):
    """Resample 1-D audio to a target sampling rate by linear interpolation.

    Args:
        audio: 1-D NumPy array of samples.
        sr: Sampling rate of ``audio`` in Hz.
        target_sr: Desired sampling rate in Hz. Defaults to the module-level
            ``SAMPLE_RATE`` when omitted.

    Returns:
        tuple: ``(resampled_audio, target_sr)``. The same two-element shape is
        returned on every path so callers can always unpack it. When no
        resampling is needed the audio is returned as a copy.
    """
    if target_sr is None:
        target_sr = SAMPLE_RATE

    # Nothing to do when the rate already matches. Empty input is also
    # returned directly because ``np.interp`` rejects empty sample points.
    if sr == target_sr or len(audio) == 0:
        return audio.copy(), target_sr

    ratio = target_sr / sr
    n_samples = int(np.round(len(audio) * ratio))

    # Map both the old and the new sample grids onto [0, 1] and interpolate.
    # NOTE(review): plain linear interpolation applies no anti-aliasing
    # filter; a polyphase resampler may be preferable for audio quality.
    x_old = np.linspace(0, 1, len(audio))
    x_new = np.linspace(0, 1, n_samples)
    return np.interp(x_new, x_old, audio), target_sr

def preprocessing_padding(audio, segment_len=None):
    """Zero-pad audio at the end so its length is a multiple of a segment.

    Args:
        audio: NumPy array whose first axis is the sample axis.
        segment_len: Segment length in samples. Defaults to the module-level
            ``SEGMENT_DURATION`` when omitted.

    Returns:
        The input array itself when its length is already a whole number of
        segments; otherwise a new array with trailing zeros appended along
        the first axis.
    """
    if segment_len is None:
        segment_len = SEGMENT_DURATION

    remainder = audio.shape[0] % segment_len
    if remainder == 0:
        return audio

    # Pad only the sample axis. A bare ``(0, n)`` pad width would be applied
    # to every axis and add bogus zero channels to multi-channel input.
    pad_width = [(0, segment_len - remainder)] + [(0, 0)] * (audio.ndim - 1)
    return np.pad(audio, pad_width)

In [9]:
def split_audio(audio, segment_len=None, hop_len=None):
    """Cut 1-D audio into fixed-length, overlapping segments.

    Segment ``i`` starts at sample ``i * hop_len`` and spans ``segment_len``
    samples. A final segment that runs past the end of the audio is
    zero-padded, so trailing samples and clips shorter than one segment are
    kept instead of being dropped.

    Args:
        audio: 1-D array-like of samples.
        segment_len: Samples per segment. Defaults to the module-level
            ``SEGMENT_DURATION`` when omitted.
        hop_len: Samples between consecutive segment starts. Defaults to the
            module-level ``OVERLAP_DURATION`` when omitted.

    Returns:
        2-D NumPy array of shape ``(num_segments, segment_len)`` with the
        dtype of ``audio``. Empty input yields shape ``(0, segment_len)``.

    Raises:
        ValueError: If ``segment_len`` or ``hop_len`` is not positive.
    """
    if segment_len is None:
        segment_len = SEGMENT_DURATION
    if hop_len is None:
        hop_len = OVERLAP_DURATION
    if segment_len <= 0 or hop_len <= 0:
        raise ValueError("segment_len and hop_len must be positive")

    audio = np.asarray(audio)
    n_samples = len(audio)
    if n_samples == 0:
        return np.empty((0, segment_len), dtype=audio.dtype)

    # Ceiling division (-(-a // b)): enough segments for the last one to reach
    # the end of the audio. Flooring here would make the zero-padding of a
    # partial last segment unreachable and silently discard the tail.
    covering = -(-max(n_samples - segment_len, 0) // hop_len) + 1
    # Never start a segment at or beyond the end of the audio; this only
    # matters when hop_len exceeds segment_len.
    starts_in_range = -(-n_samples // hop_len)
    num_segments = min(covering, starts_in_range)

    # Pre-filled with zeros, so a short final chunk ends up zero-padded.
    segments = np.zeros((num_segments, segment_len), dtype=audio.dtype)
    for i in range(num_segments):
        start = i * hop_len
        chunk = audio[start:start + segment_len]
        segments[i, :len(chunk)] = chunk

    return segments

In [10]:
# Run the whole preprocessing pipeline on a single recording, printing the
# array shape and sampling rate after every stage.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where that file exists — consider building it from a configurable data
# directory instead.
audio_path = r"C:\Users\Lulay\Documents\GitHub\Dasar-Kecerdasan-Artificial_Tugas-Besar\Dataset\xeno-canto\940964.mp3"
# Stage 1: decode the file into a sample array plus its sampling rate.
audio, sr = input_audio_file(audio_path)
print(audio.shape, sr)
# Stage 2: average the channels into one.
audio = preprocess_downmix(audio)
print(audio.shape, sr)
# Stage 3: resample to SAMPLE_RATE; `sr` is rebound to the new rate.
audio, sr = preprocessing_resample(audio, sr)
print(audio.shape, sr)
# Stage 4: zero-pad the end so the length is a multiple of SEGMENT_DURATION.
audio = preprocessing_padding(audio)
print(audio.shape, sr)
# Stage 5: cut into SEGMENT_DURATION-sample segments whose starts are
# OVERLAP_DURATION samples apart.
segments = split_audio(audio)
print(segments.shape)

(1059367, 2) 44100
(1059367,) 44100
(1153053,) 48000
(1200000,) 48000
(49, 48000)
