# Library


In [11]:
import math
import joblib
import numpy as np
import scipy.signal
import scipy.fftpack
import librosa
import soundfile as sf
import skfuzzy as fuzz
from skfuzzy import control as ctrl
import matplotlib.pyplot as plt
from scipy.signal import resample_poly
import os
import pandas as pd
import json

# Konstanta


In [12]:
# Target sampling rate in Hz; audio loaded at another rate is resampled to it.
SAMPLE_RATE = 48000
# Length of one analysis segment, expressed in samples (one second at SAMPLE_RATE).
SEGMENT_DURATION = SAMPLE_RATE
# Distance between consecutive segment starts, in samples (half a second at SAMPLE_RATE).
OVERLAP_DURATION = SAMPLE_RATE // 2

# Pra-Pemrosesan


In [13]:
def load_file_audio(path):
    """Read an audio file with soundfile.

    Args:
        path: Location of the audio file to read.

    Returns:
        Tuple ``(samples, sample_rate)`` where ``samples`` is a NumPy copy of
        the data returned by ``sf.read`` and ``sample_rate`` is the rate
        reported by ``sf.read``.
    """
    samples, sample_rate = sf.read(path)
    samples = np.array(samples)
    return samples, sample_rate

def prapemrosesan_downmixing(audio):
    """Collapse multi-dimensional audio to one channel and cast to float32.

    Args:
        audio: NumPy array; when it has more than one dimension the mean is
            taken along axis 1.

    Returns:
        A new float32 array (``astype`` always copies).
    """
    mono = audio if audio.ndim <= 1 else np.mean(audio, axis=1)
    return mono.astype(np.float32)

def prapemrosesan_resampling(audio, sr):
    """Resample ``audio`` from ``sr`` Hz to ``SAMPLE_RATE`` Hz.

    Uses polyphase resampling (``scipy.signal.resample_poly``), which applies
    a low-pass FIR filter, instead of plain linear interpolation. Linear
    interpolation has no anti-alias/anti-image filtering and noticeably
    distorts high-frequency content.

    Args:
        audio: NumPy array of samples; resampling runs along axis 0.
        sr: Sampling rate of ``audio`` in Hz. It is converted with ``int``,
            so it is expected to be integer-valued.

    Returns:
        Tuple ``(resampled_audio, SAMPLE_RATE)``. When ``sr`` already equals
        ``SAMPLE_RATE`` (or ``audio`` is empty) a copy of the input is
        returned unchanged.
    """
    if sr == SAMPLE_RATE:
        return audio.copy(), SAMPLE_RATE

    if len(audio) == 0:
        # Nothing to filter; avoid handing an empty array to the resampler.
        return audio.copy(), SAMPLE_RATE

    source_rate = int(sr)
    # Reduce the rate ratio to the smallest integer up/down factors.
    divisor = math.gcd(SAMPLE_RATE, source_rate)
    up = SAMPLE_RATE // divisor
    down = source_rate // divisor
    return resample_poly(audio, up, down), SAMPLE_RATE

def prapemrosesan_padding(audio):
    """Zero-pad ``audio`` at the end up to a multiple of SEGMENT_DURATION.

    Args:
        audio: NumPy array of samples.

    Returns:
        The input object itself when its first dimension is already a
        multiple of SEGMENT_DURATION, otherwise a padded copy.
    """
    remainder = audio.shape[0] % SEGMENT_DURATION
    if remainder == 0:
        return audio
    return np.pad(audio, (0, SEGMENT_DURATION - remainder))

def prapemrosesan_splitting(audio):
    """Cut ``audio`` into overlapping windows of SEGMENT_DURATION samples.

    Window ``i`` starts at ``i * OVERLAP_DURATION``. The window count is
    rounded up so that a trailing partial window (or audio shorter than one
    window) is kept and zero-padded instead of being silently dropped. For
    input whose length is already a multiple of SEGMENT_DURATION the result
    is the same as with a rounded-down count.

    Args:
        audio: One-dimensional NumPy array of samples.

    Returns:
        Array of shape ``(num_segments, SEGMENT_DURATION)``; an empty array
        when ``audio`` is empty.
    """
    total = len(audio)
    if total == 0:
        return np.array([])

    # Number of hops needed so the last window reaches the end of the audio.
    hops = int(np.ceil((total - SEGMENT_DURATION) / OVERLAP_DURATION))
    num_segments = max(hops, 0) + 1
    segments = []

    for i in range(num_segments):
        start = int(i * OVERLAP_DURATION)
        end = int(start + SEGMENT_DURATION)
        segment = audio[start:end]
        if len(segment) < SEGMENT_DURATION:
            # Last window runs past the end of the audio: fill with zeros.
            segment = np.pad(segment, (0, SEGMENT_DURATION - len(segment)), mode='constant')
        segments.append(segment)

    return np.array(segments)

# Ekstraksi Fitur


In [14]:
def get_rms(segment):
    """Return the root-mean-square value of ``segment`` (a NumPy array)."""
    squared = segment * segment
    mean_square = np.mean(squared)
    return np.sqrt(mean_square)

def get_zcr(segment):
    """Return the number of sign changes in ``segment`` per second.

    The count of adjacent samples whose sign bit differs is divided by the
    segment length in seconds (``len(segment) / SAMPLE_RATE``).
    """
    signs = np.signbit(segment)
    # A crossing is any neighbouring pair with a different sign bit.
    crossings = np.sum(signs[..., 1:] != signs[..., :-1])
    seconds = len(segment) / SAMPLE_RATE
    return crossings / seconds

def get_lms(segment):
    """Return the mel spectrogram of ``segment`` computed by librosa.

    ``librosa.feature.melspectrogram`` is called with ``sr=SAMPLE_RATE`` and
    library defaults for every other setting.

    NOTE(review): no log/dB conversion is applied in this function even
    though callers store the result under the name 'lms' — confirm whether a
    ``librosa.power_to_db`` step is intended.
    """
    mel_spectrogram = librosa.feature.melspectrogram(y=segment, sr=SAMPLE_RATE)
    return mel_spectrogram

# Ekstraksi Data


In [15]:
# Time ranges per file for the recordings read from the xeno-canto folder.
# Each entry is (file name, list of [start, end] ranges in seconds).
# extract_rms_zcr / extract_lms turn every listed range into a "sound" record
# and the parts of the file not covered by any range into "background" records.
list_segments_xc = [
    ("19655.mp3", [[1.5, 2.5], [2.0, 3.0], [2.5, 3.5], [3.0, 4.0], [3.5, 4.5], [5.0, 6.0], [8.5, 9.5], [9.0, 10.0], [9.5, 10.5], [10.0, 11.0], [10.5, 11.5], [14.0, 15.0], [14.5, 15.5], [15.0, 16.0], [15.5, 16.5], [16.0, 17.0], [16.5, 17.5]]),
    ("90809.mp3", [[0.0, 1.0], [0.5, 1.5], [1.0, 2.0], [1.5, 2.5], [2.0, 3.0], [2.5, 3.5], [3.0, 4.0], [3.5, 4.5], [4.0, 5.0], [4.5, 5.5], [5.0, 6.0], [5.5, 6.5], [6.0, 7.0], [6.5, 7.5], [7.0, 8.0], [7.5, 8.5], [8.0, 9.0], [8.5, 9.5], [9.0, 10.0], [9.5, 10.5], [10.0, 11.0], [10.5, 11.5], [11.0, 12.0], [11.5, 12.5], [12.0, 13.0], [12.5, 13.5], [13.0, 14.0], [13.5, 14.5], [14.0, 15.0], [14.5, 15.5], [15.0, 16.0], [15.5, 16.5], [16.0, 17.0], [16.5, 17.5], [17.0, 18.0], [17.5, 18.5], [18.0, 19.0], [18.5, 19.5], [19.0, 20.0], [19.5, 20.5], [20.0, 21.0], [20.5, 21.5], [21.0, 22.0], [21.5, 22.5]]),
    ("151761.mp3", [[0.5, 1.5], [1.0, 2.0], [1.5, 2.5], [11.0, 12.0], [11.5, 12.5], [12.0, 13.0], [12.5, 13.5], [13.0, 14.0], [13.5, 14.5], [21.0, 22.0], [21.5, 22.5], [22.0, 23.0]]),
    ("193835.mp3", [[1.5, 2.5], [3.0, 4.0], [4.5, 5.5], [7.5, 8.5], [9.5, 10.5], [13.5, 14.5], [14.0, 15.0], [14.5, 15.5], [18.0, 19.0], [19.5, 20.5], [21.0, 22.0]]),
    ("279210.mp3", [[0.0, 1.0], [0.5, 1.5], [1.0, 2.0], [1.5, 2.5], [2.0, 3.0], [2.5, 3.5], [15.0, 16.0], [15.5, 16.5], [16.0, 17.0], [16.5, 17.5], [17.0, 18.0]]),
    ("282469.mp3", [[0.5, 1.5], [1.0, 2.0], [1.5, 2.5], [2.0, 3.0], [2.5, 3.5], [4.5, 5.5], [8.0, 9.0], [9.5, 10.5], [10.0, 11.0], [10.5, 11.5], [11.0, 12.0], [11.5, 12.5], [12.0, 13.0], [12.5, 13.5], [13.0, 14.0], [14.5, 15.5], [15.0, 16.0], [15.5, 16.5], [16.0, 17.0], [16.5, 17.5], [17.0, 18.0], [17.5, 18.5], [18.0, 19.0], [18.5, 19.5], [19.0, 20.0], [20.5, 21.5], [21.0, 22.0], [21.5, 22.5], [22.0, 23.0]]),
    ("359523.mp3", [[0.5, 1.5], [1.0, 2.0], [1.5, 2.5], [2.0, 3.0], [5.0, 6.0], [5.5, 6.5], [10.0, 11.0], [10.5, 11.5], [11.0, 12.0], [11.5, 12.5], [15.0, 16.0], [16.5, 17.5]]),
    ("359552.mp3", [[0.5, 1.5], [1.0, 2.0], [4.0, 5.0], [4.5, 5.5], [6.0, 7.0], [6.5, 7.5], [8.5, 9.5], [12.0, 13.0], [14.0, 15.0]]),
    ("498380.mp3", [[0.0, 1.0], [0.5, 1.5], [1.0, 2.0], [1.5, 2.5], [2.0, 3.0], [3.5, 4.5], [4.0, 5.0], [4.5, 5.5], [5.0, 6.0], [6.5, 7.5], [7.0, 8.0], [7.5, 8.5], [8.0, 9.0], [8.5, 9.5], [10.0, 11.0], [10.5, 11.5], [11.0, 12.0], [11.5, 12.5], [12.0, 13.0], [14.5, 15.5], [15.0, 16.0], [15.5, 16.5], [17.0, 18.0], [17.5, 18.5], [18.0, 19.0], [18.5, 19.5], [19.0, 20.0], [19.5, 20.5], [20.0, 21.0], [20.5, 21.5]]),
    ("505545.mp3", [[2.5, 3.5], [4.0, 5.0], [10.5, 11.5], [11.0, 12.0], [11.5, 12.5], [18.5, 19.5], [19.0, 20.0], [19.5, 20.5], [20.0, 21.0], [20.5, 21.5], [22.0, 23.0], [22.5, 23.5]]),
    ("531042.mp3", [[0.5, 1.5], [1.0, 2.0], [1.5, 2.5], [2.0, 3.0], [2.5, 3.5], [3.0, 4.0], [3.5, 4.5], [4.0, 5.0], [4.5, 5.5], [5.0, 6.0], [5.5, 6.5], [6.0, 7.0], [6.5, 7.5], [7.0, 8.0], [7.5, 8.5], [8.0, 9.0], [8.5, 9.5], [9.0, 10.0], [9.5, 10.5], [11.0, 12.0], [11.5, 12.5], [12.0, 13.0], [13.5, 14.5], [14.0, 15.0], [14.5, 15.5], [15.0, 16.0], [15.5, 16.5], [16.0, 17.0], [16.5, 17.5], [17.0, 18.0], [17.5, 18.5], [18.0, 19.0], [18.5, 19.5], [19.0, 20.0], [19.5, 20.5], [20.0, 21.0]]),
]

# Time ranges per file for the recordings read from the noise-audio-data folder.
# Each entry is (file name, list of [start, end] ranges in seconds); an empty
# list means the whole file is only used for "background" records by
# extract_rms_zcr / extract_lms.
list_segments_nad = [
    ("1-100032-A-0.wav", [[2.0, 3.0]]),
    ("1-100038-A-14.wav", [[0.0, 1.0]]),
    ("1-100210-A-36.wav", [[0.5, 1.5], [1.0, 2.0], [1.5, 2.5], [2.0, 3.0], [2.5, 3.5], [3.0, 4.0], [3.5, 4.5], [4.0, 5.0]]),
    ("1-100210-B-36.wav", [[0.0, 1.0], [0.5, 1.5], [1.0, 2.0], [1.5, 2.5], [2.0, 3.0], [2.5, 3.5], [3.0, 4.0], [3.5, 4.5]]),
    ("1-101296-A-19.wav", []),
    ("1-101296-B-19.wav", []),
    ("1-101336-A-30.wav", [[0.0, 1.0], [0.5, 1.5], [1.0, 2.0], [1.5, 2.5], [2.0, 3.0]]),
    ("1-101404-A-34.wav", [[0.0, 1.0]]),
    ("1-103298-A-9.wav", [[0.0, 1.0], [1.5, 2.5], [4.0, 5.0]]),
    ("1-103995-A-30.wav", [[0.0, 1.0]]),
    ("1-103999-A-30.wav", [[0.0, 1.0]]),
    ("1-104089-A-22.wav", [[0.5, 1.5], [1.0, 2.0], [1.5, 2.5], [2.0, 3.0], [2.5, 3.5], [3.0, 4.0], [3.5, 4.5], [4.0, 5.0]]),
    ("1-104089-B-22.wav", [[0.0, 1.0], [0.5, 1.5], [1.0, 2.0], [1.5, 2.5], [2.0, 3.0], [2.5, 3.5], [3.0, 4.0], [3.5, 4.5], [4.0, 5.0]]),
    ("1-105224-A-22.wav", [[0.0, 1.0], [0.5, 1.5], [1.0, 2.0], [1.5, 2.5], [2.0, 3.0], [2.5, 3.5], [3.0, 4.0], [3.5, 4.5], [4.0, 5.0]]),
    ("1-110389-A-0.wav", [[0.0, 1.0]]),
    ("1-110537-A-22.wav", [[0.0, 1.0], [0.5, 1.5], [1.0, 2.0], [1.5, 2.5], [2.0, 3.0], [2.5, 3.5], [3.0, 4.0], [3.5, 4.5], [4.0, 5.0]]),
    ("1-115521-A-19.wav", [[0.0, 1.0], [3.5, 4.5]]),
    ("1-115545-A-48.wav", []),
    ("1-115545-B-48.wav", [[0.0, 1.0]]),
    ("1-115545-C-48.wav", [[0.5, 1.5]]),
    ("1-115546-A-48.wav", [[0.5, 1.5]]),
    ("1-115920-B-22.wav", []),
    ("1-115921-A-22.wav", []),
    ("1-116765-A-41.wav", [[1.0, 2.0], [1.5, 2.5], [2.0, 3.0], [2.5, 3.5], [3.0, 4.0], [3.5, 4.5], [4.0, 5.0]]),
    ("1-11687-A-47.wav", [[0.0, 1.0], [0.5, 1.5], [1.0, 2.0], [1.5, 2.5]]),
    ("1-118206-A-31.wav", []),
    ("1-118559-A-17.wav", []),
    ("1-121951-A-8.wav", []),
    ("1-13571-A-46.wav", [[0.0, 1.0], [2.0, 3.0], [3.5, 4.5], [4.0, 5.0]]),
    ("1-13572-A-46.wav", [[2.0, 3.0]]),
    ("1-13613-A-37.wav", [[0.0, 1.0], [0.5, 1.5], [1.0, 2.0], [1.5, 2.5], [2.0, 3.0], [2.5, 3.5], [3.0, 4.0], [3.5, 4.5], [4.0, 5.0]]),
    ("1-137-A-32.wav", []),
    ("1-977-A-39.wav", [[0.0, 1.0]]),
    ("1-1791-A-26.wav", [[0.0, 1.0], [0.5, 1.5], [1.0, 2.0]]),
    ("1-7974-A-49.wav", [[0.0, 1.0], [0.5, 1.5], [1.0, 2.0], [1.5, 2.5], [2.0, 3.0], [3.5, 4.5], [4.0, 5.0]]),
    ("1-7974-B-49.wav", [[0.0, 1.0], [0.5, 1.5], [1.0, 2.0]]),
]

### RMS and ZCR Extraction


In [16]:
def extract_rms_zcr(list_time, folder_path):
    """Compute RMS and ZCR features for labelled and unlabelled parts of files.

    For every ``(file name, time ranges)`` entry the file is loaded,
    down-mixed, resampled and padded. Each listed time range yields one
    "sound" record; every stretch of the file not covered by a range yields
    one "background" record. Each slice is zero-padded or cropped to exactly
    SEGMENT_DURATION samples before the features are computed, so a
    background stretch longer than one segment contributes only its first
    SEGMENT_DURATION samples.

    Args:
        list_time: Iterable of entries whose item 0 is a file name and item 1
            is a list of ``[start, end]`` ranges in seconds.
        folder_path: Directory the file names are relative to.

    Returns:
        Tuple ``(list_data_sound, list_data_background)`` of lists of dicts
        with the keys ``'rms'`` and ``'zcr'``.

    Any exception raised while handling a file is printed and the file is
    skipped (best effort); records appended before the failure are kept.
    """

    def fit_to_segment(segment):
        # Force the slice to exactly SEGMENT_DURATION samples.
        if len(segment) < SEGMENT_DURATION:
            return np.pad(segment, (0, SEGMENT_DURATION - len(segment)))
        return segment[:SEGMENT_DURATION]

    def collect(audio, sr, ranges, sink):
        # Append one feature record to ``sink`` per non-empty time range.
        for start_time, end_time in ranges:
            segment = audio[int(start_time * sr):int(end_time * sr)]
            if len(segment) == 0:
                continue
            segment = fit_to_segment(segment)
            sink.append({
                'rms': get_rms(segment),
                'zcr': get_zcr(segment),
            })

    def uncovered_ranges(time_ranges, duration):
        # Gaps between the sorted ranges, plus the tail after the last one.
        gaps = []
        last_end = 0.0
        for start_time, end_time in time_ranges:
            if start_time > last_end:
                gaps.append((last_end, start_time))
            last_end = max(last_end, end_time)
        if last_end < duration:
            gaps.append((last_end, duration))
        return gaps

    list_data_sound = []
    list_data_background = []

    for file_info in list_time:
        file = file_info[0]
        time_ranges = sorted(file_info[1])

        try:
            audio, sr = load_file_audio(os.path.join(folder_path, file))
            audio = prapemrosesan_downmixing(audio)
            audio, sr = prapemrosesan_resampling(audio, sr)
            audio = prapemrosesan_padding(audio)

            # Duration is measured after padding, so it includes the padded tail.
            duration = len(audio) / sr

            collect(audio, sr, time_ranges, list_data_sound)
            collect(audio, sr, uncovered_ranges(time_ranges, duration), list_data_background)

        except Exception as e:
            print(f"Error processing {file}: {str(e)}")
            continue

    return list_data_sound, list_data_background

### Log-Mel Spectrogram Extraction


In [17]:
def extract_lms(list_time, folder_path, SAMPLE_RATE, SEGMENT_DURATION):
    """Compute ``get_lms`` spectrograms for labelled and unlabelled parts of files.

    For every ``(file name, time ranges)`` entry the file is loaded,
    down-mixed, resampled and padded. Each listed time range yields one
    "sound" record; every stretch of the file not covered by a range yields
    one "background" record. Each slice is zero-padded or cropped to exactly
    ``SEGMENT_DURATION`` samples before ``get_lms`` is called.

    Args:
        list_time: Iterable of entries whose item 0 is a file name and item 1
            is a list of ``[start, end]`` ranges in seconds.
        folder_path: Directory the file names are relative to.
        SAMPLE_RATE: Kept for interface compatibility; this function never
            reads it (the rate returned by ``prapemrosesan_resampling`` is
            used for all time-to-sample conversions).
        SEGMENT_DURATION: Number of samples every slice is padded/cropped to.
            Note that ``prapemrosesan_padding`` and ``get_lms`` read the
            module-level constants, not these parameters.

    Returns:
        Tuple ``(list_data_sound, list_data_background)`` of lists of dicts
        with the keys ``'file'``, ``'start_time'``, ``'end_time'`` and
        ``'lms'`` (the spectrogram converted to nested lists).

    Any exception raised while handling a file is printed and the file is
    skipped (best effort); records appended before the failure are kept.
    """

    def fit_to_segment(segment):
        # Force the slice to exactly SEGMENT_DURATION samples.
        if len(segment) < SEGMENT_DURATION:
            return np.pad(segment, (0, SEGMENT_DURATION - len(segment)))
        return segment[:SEGMENT_DURATION]

    def collect(file, audio, sr, ranges, sink):
        # Append one spectrogram record to ``sink`` per non-empty time range.
        for start_time, end_time in ranges:
            segment = audio[int(start_time * sr):int(end_time * sr)]
            if len(segment) == 0:
                continue
            lms = get_lms(fit_to_segment(segment))
            sink.append({
                'file': file,
                'start_time': start_time,
                'end_time': end_time,
                'lms': lms.tolist()
            })

    def uncovered_ranges(time_ranges, duration):
        # Gaps between the sorted ranges, plus the tail after the last one.
        gaps = []
        last_end = 0.0
        for start_time, end_time in time_ranges:
            if start_time > last_end:
                gaps.append((last_end, start_time))
            last_end = max(last_end, end_time)
        if last_end < duration:
            gaps.append((last_end, duration))
        return gaps

    list_data_sound = []
    list_data_background = []

    for file_info in list_time:
        file = file_info[0]
        time_ranges = sorted(file_info[1])
        try:
            audio, sr = load_file_audio(os.path.join(folder_path, file))
            audio = prapemrosesan_downmixing(audio)
            audio, sr = prapemrosesan_resampling(audio, sr)
            audio = prapemrosesan_padding(audio)

            # Duration is measured after padding, so it includes the padded tail.
            duration = len(audio) / sr

            collect(file, audio, sr, time_ranges, list_data_sound)
            collect(file, audio, sr, uncovered_ranges(time_ranges, duration), list_data_background)

        except Exception as e:
            print(f"Error processing {file}: {str(e)}")
            continue

    return list_data_sound, list_data_background

# Main


In [18]:
# Root folder that holds both datasets. It defaults to the original checkout
# location; set the DATASET_DIR environment variable to run the notebook on
# another machine without editing this cell.
dir_dataset = os.environ.get(
    "DATASET_DIR",
    r"C:\Users\Lulay\Documents\GitHub\Dasar-Kecerdasan-Artificial_Tugas-Besar\Dataset",
)
dir_data_xc = os.path.join(dir_dataset, "xeno-canto")
dir_data_nad = os.path.join(dir_dataset, "noise-audio-data")

# RMS / ZCR features, split into labelled "sound" ranges and the remaining "background".
data_xc_rmszcr_sound, data_xc_rmszcr_background = extract_rms_zcr(list_segments_xc, dir_data_xc)
data_nad_rmszcr_sound, data_nad_rmszcr_background = extract_rms_zcr(list_segments_nad, dir_data_nad)

# Spectrogram records for the same ranges (each file is loaded again here).
data_xc_lms_sound, data_xc_lms_background = extract_lms(list_segments_xc, dir_data_xc, SAMPLE_RATE, SEGMENT_DURATION)
data_nad_lms_sound, data_nad_lms_background = extract_lms(list_segments_nad, dir_data_nad, SAMPLE_RATE, SEGMENT_DURATION)

In [19]:
# Build one labelled frame per source. Labels assigned below:
#   0 = sound ranges from the xeno-canto list
#   1 = sound ranges from the noise-audio-data list
#   2 = background stretches from either list
df_xc = pd.DataFrame(data_xc_rmszcr_sound)[['rms', 'zcr']].assign(label=0)
df_nad = pd.DataFrame(data_nad_rmszcr_sound)[['rms', 'zcr']].assign(label=1)
df_xc_background = pd.DataFrame(data_xc_rmszcr_background)[['rms', 'zcr']].assign(label=2)
df_nad_background = pd.DataFrame(data_nad_rmszcr_background)[['rms', 'zcr']].assign(label=2)

# Stack the four frames into a single table with a fresh 0..n-1 index.
labelled_frames = [df_xc, df_nad, df_xc_background, df_nad_background]
df_rmszcr = pd.concat(labelled_frames, ignore_index=True)

In [None]:
# Write the labelled RMS/ZCR table to an Excel workbook, without the index column.
output_path = 'dataset_rms-zcr_labelled.xlsx'
df_rmszcr.to_excel(output_path, index=False)

In [27]:
# Available spectrogram lists: data_xc_lms_sound, data_xc_lms_background,
# data_nad_lms_sound, data_nad_lms_background.
# Show one record in full; its 'lms' value is a nested list, so the output is long.
sample_record = data_xc_lms_sound[8]
sample_record

{'file': '19655.mp3',
 'start_time': 9.5,
 'end_time': 10.5,
 'lms': [[0.00015211140402955086,
   4.158081254441372e-05,
   2.8356954115830346e-06,
   5.536220363644749e-06,
   9.390737547105003e-07,
   2.042307781909994e-06,
   7.704102902336852e-06,
   5.039988715682946e-06,
   7.011298533416616e-06,
   2.4033225690330253e-06,
   1.601166633784978e-06,
   1.2307689805070773e-05,
   1.5161743801934265e-05,
   2.873832801577255e-06,
   6.83698452419414e-06,
   6.921946980975582e-06,
   2.6303826476705215e-06,
   7.134195735369494e-07,
   1.2312802251034906e-06,
   1.599269282599557e-06,
   2.1340862930237734e-06,
   6.057444365666411e-06,
   8.624122354532943e-06,
   3.713563379794413e-06,
   3.1048089986029898e-06,
   1.2981482093712146e-05,
   1.26715341805759e-05,
   7.895643787883912e-06,
   1.6250026588549968e-06,
   5.302115443296807e-07,
   2.3860667033598043e-06,
   4.560175465175053e-06,
   4.312337009837679e-06,
   1.4222492320333771e-06,
   9.757962469784232e-07,
   6.712839