Library


In [1]:
import math
import joblib
import numpy as np
import scipy.signal
import scipy.fftpack
import librosa
import soundfile as sf
import skfuzzy as fuzz
from skfuzzy import control as ctrl
import matplotlib.pyplot as plt
from scipy.signal import resample_poly
import os
import pandas as pd

Konstanta


In [2]:
SAMPLE_RATE = 48000
SEGMENT_DURATION = int(1 * SAMPLE_RATE)
OVERLAP_DURATION = int(0.5 * SAMPLE_RATE)

# Pra-Pemrosesan


In [3]:
def load_file_audio(path):
    audio, sr = sf.read(path)
    return np.array(audio), sr

def prapemrosesan_downmixing(audio):
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)
    return audio.astype(np.float32)

def prapemrosesan_resampling(audio, sr):
    if sr == SAMPLE_RATE:
        return audio.copy(), SAMPLE_RATE
    
    ratio = SAMPLE_RATE / sr
    n_samples = int(np.round(len(audio) * ratio))
    
    x_old = np.linspace(0, 1, len(audio))
    x_new = np.linspace(0, 1, n_samples)
    return np.interp(x_new, x_old, audio), SAMPLE_RATE

def prapemrosesan_padding(audio):
    if np.mod(audio.shape[0], SEGMENT_DURATION) != 0:
        padding = SEGMENT_DURATION - (audio.shape[0] % SEGMENT_DURATION)
        audio = np.pad(audio, (0, padding))
    return audio

def prapemrosesan_splitting(audio):
    num_segments = int(np.floor((len(audio) - SEGMENT_DURATION) / OVERLAP_DURATION)) + 1
    segments = []

    for i in range(num_segments):
        start = int(i * OVERLAP_DURATION)
        end = int(start + SEGMENT_DURATION)
        segment = audio[start:end]
        if len(segment) < SEGMENT_DURATION:
            segment = np.pad(segment, (0, SEGMENT_DURATION - len(segment)), mode='constant')
        segments.append(segment)

    return np.array(segments)

# Ekstraksi Fitur


In [4]:
def get_rms(segment):
    return np.sqrt(np.mean(segment ** 2))

def get_zcr(segment):
    return np.sum(np.abs(np.diff(np.signbit(segment)))) / (len(segment) / SAMPLE_RATE)

def get_lms(segment):
    return librosa.feature.melspectrogram(y=segment, sr=SAMPLE_RATE)

# Ekstraksi Data


In [5]:
list_segments_xc = [
    ("19655.mp3", [[1.5, 2.5], [2.0, 3.0], [2.5, 3.5], [3.0, 4.0], [3.5, 4.5], [5.0, 6.0], [8.5, 9.5], [9.0, 10.0], [9.5, 10.5], [10.0, 11.0], [10.5, 11.5], [14.0, 15.0], [14.5, 15.5], [15.0, 16.0], [15.5, 16.5], [16.0, 17.0], [16.5, 17.5]]),
    ("90809.mp3", [[0.0, 1.0], [0.5, 1.5], [1.0, 2.0], [1.5, 2.5], [2.0, 3.0], [2.5, 3.5], [3.0, 4.0], [3.5, 4.5], [4.0, 5.0], [4.5, 5.5], [5.0, 6.0], [5.5, 6.5], [6.0, 7.0], [6.5, 7.5], [7.0, 8.0], [7.5, 8.5], [8.0, 9.0], [8.5, 9.5], [9.0, 10.0], [9.5, 10.5], [10.0, 11.0], [10.5, 11.5], [11.0, 12.0], [11.5, 12.5], [12.0, 13.0], [12.5, 13.5], [13.0, 14.0], [13.5, 14.5], [14.0, 15.0], [14.5, 15.5], [15.0, 16.0], [15.5, 16.5], [16.0, 17.0], [16.5, 17.5], [17.0, 18.0], [17.5, 18.5], [18.0, 19.0], [18.5, 19.5], [19.0, 20.0], [19.5, 20.5], [20.0, 21.0], [20.5, 21.5], [21.0, 22.0], [21.5, 22.5]]),
    ("151761.mp3", [[0.5, 1.5], [1.0, 2.0], [1.5, 2.5], [11.0, 12.0], [11.5, 12.5], [12.0, 13.0], [12.5, 13.5], [13.0, 14.0], [13.5, 14.5], [21.0, 22.0], [21.5, 22.5], [22.0, 23.0]]),
    ("193835.mp3", [[1.5, 2.5], [3.0, 4.0], [4.5, 5.5], [7.5, 8.5], [9.5, 10.5], [13.5, 14.5], [14.0, 15.0], [14.5, 15.5], [18.0, 19.0], [19.5, 20.5], [21.0, 22.0]]),
    ("279210.mp3", [[0.0, 1.0], [0.5, 1.5], [1.0, 2.0], [1.5, 2.5], [2.0, 3.0], [2.5, 3.5], [15.0, 16.0], [15.5, 16.5], [16.0, 17.0], [16.5, 17.5], [17.0, 18.0]]),
    ("282469.mp3", [[0.5, 1.5], [1.0, 2.0], [1.5, 2.5], [2.0, 3.0], [2.5, 3.5], [4.5, 5.5], [8.0, 9.0], [9.5, 10.5], [10.0, 11.0], [10.5, 11.5], [11.0, 12.0], [11.5, 12.5], [12.0, 13.0], [12.5, 13.5], [13.0, 14.0], [14.5, 15.5], [15.0, 16.0], [15.5, 16.5], [16.0, 17.0], [16.5, 17.5], [17.0, 18.0], [17.5, 18.5], [18.0, 19.0], [18.5, 19.5], [19.0, 20.0], [20.5, 21.5], [21.0, 22.0], [21.5, 22.5], [22.0, 23.0]]),
    ("359523.mp3", [[0.5, 1.5], [1.0, 2.0], [1.5, 2.5], [2.0, 3.0], [5.0, 6.0], [5.5, 6.5], [10.0, 11.0], [10.5, 11.5], [11.0, 12.0], [11.5, 12.5], [15.0, 16.0], [16.5, 17.5]]),
    ("359552.mp3", [[0.5, 1.5], [1.0, 2.0], [4.0, 5.0], [4.5, 5.5], [6.0, 7.0], [6.5, 7.5], [8.5, 9.5], [12.0, 13.0], [14.0, 15.0]]),
    ("498380.mp3", [[0.0, 1.0], [0.5, 1.5], [1.0, 2.0], [1.5, 2.5], [2.0, 3.0], [3.5, 4.5], [4.0, 5.0], [4.5, 5.5], [5.0, 6.0], [6.5, 7.5], [7.0, 8.0], [7.5, 8.5], [8.0, 9.0], [8.5, 9.5], [10.0, 11.0], [10.5, 11.5], [11.0, 12.0], [11.5, 12.5], [12.0, 13.0], [14.5, 15.5], [15.0, 16.0], [15.5, 16.5], [17.0, 18.0], [17.5, 18.5], [18.0, 19.0], [18.5, 19.5], [19.0, 20.0], [19.5, 20.5], [20.0, 21.0], [20.5, 21.5]]),
    ("505545.mp3", [[2.5, 3.5], [4.0, 5.0], [10.5, 11.5], [11.0, 12.0], [11.5, 12.5], [18.5, 19.5], [19.0, 20.0], [19.5, 20.5], [20.0, 21.0], [20.5, 21.5], [22.0, 23.0], [22.5, 23.5]]),
    ("531042.mp3", [[0.5, 1.5], [1.0, 2.0], [1.5, 2.5], [2.0, 3.0], [2.5, 3.5], [3.0, 4.0], [3.5, 4.5], [4.0, 5.0], [4.5, 5.5], [5.0, 6.0], [5.5, 6.5], [6.0, 7.0], [6.5, 7.5], [7.0, 8.0], [7.5, 8.5], [8.0, 9.0], [8.5, 9.5], [9.0, 10.0], [9.5, 10.5], [11.0, 12.0], [11.5, 12.5], [12.0, 13.0], [13.5, 14.5], [14.0, 15.0], [14.5, 15.5], [15.0, 16.0], [15.5, 16.5], [16.0, 17.0], [16.5, 17.5], [17.0, 18.0], [17.5, 18.5], [18.0, 19.0], [18.5, 19.5], [19.0, 20.0], [19.5, 20.5], [20.0, 21.0]]),
]

list_segments_nad = [
    ("1-977-A-39.wav", [[0.0, 1.0]]),
    ("1-1791-A-26.wav", [[0.0, 1.0], [0.5, 1.5], [1.0, 2.0]]),
    ("1-7974-A-49.wav", [[0.0, 1.0], [0.5, 1.5], [1.0, 2.0], [1.5, 2.5], [2.0, 3.0], [3.5, 4.5], [4.0, 5.0]]),
    ("1-7974-B-49.wav", [[0.0, 1.0], [0.5, 1.5], [1.0, 2.0]]),
]

In [6]:
import os
import numpy as np

def extract_segments_data(list_time, folder_path):
    list_data_sound = []
    list_data_background = []

    for file_info in list_time:
        file = file_info[0]
        time_ranges = sorted(file_info[1])  # List of (start, end) for sound

        try:
            audio, sr = load_file_audio(os.path.join(folder_path, file))
            audio = prapemrosesan_downmixing(audio)
            audio, sr = prapemrosesan_resampling(audio, sr)
            audio = prapemrosesan_padding(audio)

            duration = len(audio) / sr

            # --- Extract sound segments ---
            for start_time, end_time in time_ranges:
                start = int(start_time * sr)
                end = int(end_time * sr)
                segment = audio[start:end]

                if len(segment) == 0:
                    continue

                if len(segment) < SEGMENT_DURATION:
                    segment = np.pad(segment, (0, SEGMENT_DURATION - len(segment)))
                elif len(segment) > SEGMENT_DURATION:
                    segment = segment[:SEGMENT_DURATION]

                data = {
                    'rms': get_rms(segment),
                    'zcr': get_zcr(segment),
                    'lms': get_lms(segment)
                }
                list_data_sound.append(data)

            # --- Extract background segments ---
            background_ranges = []
            last_end = 0.0
            for start_time, end_time in time_ranges:
                if start_time > last_end:
                    background_ranges.append((last_end, start_time))
                last_end = max(last_end, end_time)
            if last_end < duration:
                background_ranges.append((last_end, duration))

            for start_time, end_time in background_ranges:
                start = int(start_time * sr)
                end = int(end_time * sr)
                segment = audio[start:end]

                if len(segment) == 0:
                    continue

                if len(segment) < SEGMENT_DURATION:
                    segment = np.pad(segment, (0, SEGMENT_DURATION - len(segment)))
                elif len(segment) > SEGMENT_DURATION:
                    segment = segment[:SEGMENT_DURATION]

                data = {
                    'rms': get_rms(segment),
                    'zcr': get_zcr(segment),
                    'lms': get_lms(segment)
                }
                list_data_background.append(data)

        except Exception as e:
            print(f"Error processing {file}: {str(e)}")
            continue

    return list_data_sound, list_data_background

# Main


In [7]:
dir_data_xc = r"C:\Users\Lulay\Documents\GitHub\Dasar-Kecerdasan-Artificial_Tugas-Besar\Dataset\xeno-canto"
dir_data_nad = r"C:\Users\Lulay\Documents\GitHub\Dasar-Kecerdasan-Artificial_Tugas-Besar\Dataset\noise-audio-data"

data_xc_sound, data_xc_background = extract_segments_data(list_segments_xc, dir_data_xc)
data_nad_sound, data_nad_background = extract_segments_data(list_segments_nad, dir_data_nad)

In [8]:
df_xc = pd.DataFrame(data_xc_sound)[['rms', 'zcr', 'lms']]
df_xc['label'] = 0

df_nad = pd.DataFrame(data_nad_sound)[['rms', 'zcr', 'lms']]
df_nad['label'] = 1

df_xc_background = pd.DataFrame(data_xc_background)[['rms', 'zcr', 'lms']]
df_xc_background['label'] = 2

df_nad_background = pd.DataFrame(data_nad_background)[['rms', 'zcr', 'lms']]
df_nad_background['label'] = 2

df = pd.concat([df_xc, df_nad, df_xc_background, df_nad_background], ignore_index=True)

In [9]:
df

Unnamed: 0,rms,zcr,lms,label
0,0.093347,11828.0,"[[0.00010968434832911574, 2.857292763094134e-0...",0
1,0.098560,11788.0,"[[0.00010208991754756462, 4.643659007823317e-0...",0
2,0.086463,12038.0,"[[0.000131525308218609, 4.165293552816641e-05,...",0
3,0.099265,12071.0,"[[0.0028985666985891973, 0.000702836856945281,...",0
4,0.104131,12207.0,"[[0.011493215067814122, 0.0028270847576774127,...",0
...,...,...,...,...
293,0.006890,5535.0,"[[0.00026822562, 7.4043586e-05, 1.2498328e-06,...",2
294,0.000050,2888.0,"[[1.4979400866744984e-05, 1.823274197815728e-0...",2
295,0.000000,0.0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2
296,0.003297,595.0,"[[0.010453902823931343, 0.04364713041217438, 0...",2


In [11]:
df.to_excel('dataset_labeled.xlsx', index=False)

In [10]:
stats_rms = df['rms'].describe()
stats_zcr = df['zcr'].describe()

print("Statistical Information for RMS:")
print(stats_rms)
print("\nStatistical Information for ZCR:")
print(stats_zcr)

Statistical Information for RMS:
count    298.000000
mean       0.046692
std        0.037863
min        0.000000
25%        0.012809
50%        0.037309
75%        0.071389
max        0.184377
Name: rms, dtype: float64

Statistical Information for ZCR:
count      298.000000
mean      8360.768456
std       3419.328479
min          0.000000
25%       6088.750000
50%       9044.000000
75%      10599.750000
max      14626.000000
Name: zcr, dtype: float64
