In [None]:
!pip install spafe
!pip install gammatone
!pip install scipy

In [None]:
import json
import os
import math
import librosa
import numpy as np
from spafe.features.gfcc import gfcc
from gammatone.gtgram import gtgram
from scipy.fftpack import dct

In [None]:
from google.colab import files
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; DATASET_PATH and JSON_PATH point below this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder that save_feature walks with os.walk when called from the __main__ guard.
DATASET_PATH = "/content/drive/MyDrive/audiov/"
# Destination of the JSON file written by save_feature.
JSON_PATH = "/content/drive/MyDrive/datav/gmccx1v3.json"
# Sample rate passed to librosa.load and to the feature extractors.
SAMPLE_RATE = 22050
TRACK_DURATION = 120 # measured in seconds
# Number of samples assumed per track; save_feature divides this by num_segments.
SAMPLES_PER_TRACK = SAMPLE_RATE * TRACK_DURATION

In [None]:
def compute_gammatone_spectrogram(y, sr, n_fft, hop_length, n_gammatone,f_min):
    """Return the gammatone spectrogram of ``y`` as computed by ``gtgram``.

    The window and hop sizes are supplied in samples and divided by ``sr``
    before being forwarded, because ``gtgram`` receives them as durations.

    Args:
        y: Audio signal.
        sr: Sample rate of ``y`` (samples per second).
        n_fft: Analysis window length in samples.
        hop_length: Hop between consecutive windows in samples.
        n_gammatone: Forwarded to ``gtgram`` as the number of filter channels.
        f_min: Forwarded to ``gtgram`` as the lowest filter frequency.

    Returns:
        The value returned by ``gtgram`` for these arguments.
    """
    window_seconds = n_fft / sr
    hop_seconds = hop_length / sr
    return gtgram(y, sr, window_seconds, hop_seconds, n_gammatone, f_min)

In [None]:
def mel_scale_gammatone(gt_spectrogram, sr, n_mel):
    """Project a gammatone spectrogram onto ``n_mel`` Mel filters.

    The filterbank is requested with ``n_fft = 2 * rows - 1`` (``rows`` being
    the first dimension of ``gt_spectrogram``) so that its column count lines
    up with the spectrogram's row count for the matrix product.

    Args:
        gt_spectrogram: Array whose first axis indexes the gammatone channels.
        sr: Sample rate handed to ``librosa.filters.mel``.
        n_mel: Number of Mel filters (rows of the result).

    Returns:
        ``filterbank . gt_spectrogram`` as computed by ``np.dot``.
    """
    n_channels = gt_spectrogram.shape[0]
    filters = librosa.filters.mel(sr=sr, n_fft=n_channels * 2 - 1, n_mels=n_mel)
    return np.dot(filters, gt_spectrogram)

In [None]:
def compute_gmcc(y, sr, n_fft, hop_length, n_gammatone, n_mel, n_ceps, f_min=20):
    """Compute cepstral coefficients from a Mel-weighted gammatone spectrogram.

    Pipeline: gammatone spectrogram -> Mel filterbank -> natural log ->
    orthonormal type-2 DCT along the filter axis.

    Args:
        y: Audio signal.
        sr: Sample rate of ``y``.
        n_fft: Analysis window length in samples.
        hop_length: Hop between windows in samples.
        n_gammatone: Number of gammatone channels.
        n_mel: Number of Mel filters applied on top of the gammatone channels.
        n_ceps: Number of coefficients to keep.
        f_min: Lowest gammatone filter frequency. Defaults to 20, the value
            that was previously hard-coded.

    Returns:
        The transposed coefficient matrix: one row per spectrogram column and
        up to ``n_ceps`` values per row.
    """
    # Compute gammatone spectrogram
    gt_spectrogram = compute_gammatone_spectrogram(y, sr, n_fft, hop_length, n_gammatone, f_min)

    # Apply Mel filterbank
    mel_spectrogram = mel_scale_gammatone(gt_spectrogram, sr, n_mel)

    # Log transform; the small offset keeps the log finite for zero-valued bins
    log_mel_spectrogram = np.log(mel_spectrogram + 1e-10)

    # DCT along the filter axis (axis 0)
    cepstra = dct(log_mel_spectrogram, type=2, axis=0, norm='ortho')

    # Coefficient 0 is discarded; coefficients 1..n_ceps are kept
    return cepstra[1:(n_ceps + 1)].T

In [None]:
# for amplitude envelope
def amplitude_envelope(signal, frame_size, hop_length):
    """Calculate the amplitude envelope of a signal with a given frame size and hop length.

    A frame starts every ``hop_length`` samples from index 0 up to the end of
    the signal; frames near the end are shorter than ``frame_size`` because
    slicing is not padded. The envelope value of a frame is its maximum sample.

    Args:
        signal: 1-D sequence of samples.
        frame_size: Maximum number of samples per frame.
        hop_length: Number of samples between consecutive frame starts.

    Returns:
        1-D ``np.ndarray`` with one maximum per frame (empty for an empty
        signal). Frames containing NaN yield NaN, as ``np.max`` propagates it.
    """
    frame_starts = range(0, len(signal), hop_length)
    # np.max reduces each slice in compiled code instead of iterating it in Python.
    return np.array([np.max(signal[start:start + frame_size]) for start in frame_starts])

In [None]:
# band energy ratio
def calculate_split_frequency_bin(split_frequency, sample_rate, num_frequency_bins):
    """Map a split frequency to the index of the frequency bin that contains it.

    The range from 0 to ``sample_rate / 2`` is divided evenly into
    ``num_frequency_bins`` bins, and the split frequency is floored to a bin
    index.

    Args:
        split_frequency: Frequency at which the spectrum is split.
        sample_rate: Sample rate of the analysed signal.
        num_frequency_bins: Number of frequency bins covering the range.

    Returns:
        The bin index as an ``int``.
    """
    max_frequency = sample_rate / 2
    bin_width = max_frequency / num_frequency_bins
    return int(math.floor(split_frequency / bin_width))

In [None]:
# Example call: bin index for a split frequency of 2000 with sample rate 22050 and 1025 bins.
# NOTE(review): band_energy_ratio computes its own local split bin and does not read this
# module-level value; it looks like leftover exploration — confirm before relying on it.
split_frequency_bin = calculate_split_frequency_bin(2000, 22050, 1025)

In [None]:
def band_energy_ratio(spectrogram, split_frequency, sample_rate):
    """Calculate band energy ratio with a given split frequency.

    For every frame, the power below the split bin is divided by the power at
    and above it.

    Args:
        spectrogram: 2-D (possibly complex) array with frequency bins along
            axis 0 and frames along axis 1 — the layout of the ``librosa.stft``
            output that ``save_feature`` passes in.
        split_frequency: Frequency separating the low band from the high band.
        sample_rate: Sample rate of the analysed signal.

    Returns:
        1-D ``np.ndarray`` with one ratio per frame. A frame with zero
        high-band power yields ``inf``/``nan`` (with a NumPy runtime warning).
    """
    spectrogram = np.asarray(spectrogram)

    # The number of frequency bins is the size of axis 0; axis 1 is the frame count.
    split_frequency_bin = calculate_split_frequency_bin(
        split_frequency, sample_rate, spectrogram.shape[0]
    )

    # calculate power spectrogram
    power_spectrogram = np.abs(spectrogram) ** 2

    # sum each band over the frequency axis for all frames at once
    sum_power_low_frequencies = power_spectrogram[:split_frequency_bin].sum(axis=0)
    sum_power_high_frequencies = power_spectrogram[split_frequency_bin:].sum(axis=0)

    return sum_power_low_frequencies / sum_power_high_frequencies

In [None]:


def save_feature(dataset_path, json_path, num_segments=4 ):
    """Extract audio features for every file under ``dataset_path`` and save them as JSON.

    Every directory yielded by ``os.walk`` other than ``dataset_path`` itself
    is treated as one class: its last path component is appended to
    ``"mapping"`` (and, with trailing backslashes stripped, to ``"tal"``).
    Each file in it is loaded with ``librosa.load`` at ``SAMPLE_RATE`` and cut
    into ``num_segments`` consecutive segments of
    ``int(SAMPLES_PER_TRACK / num_segments)`` samples. For each non-empty
    segment the features listed in ``data`` are computed, and a feature is
    stored only when its frame count equals
    ``ceil(samples_per_segment / hop_length)``.

    Args:
        dataset_path: Root directory that is walked for audio files.
        json_path: File the resulting dictionary is written to.
        num_segments: Number of segments each track is divided into.

    Side effects:
        Prints progress and writes ``json_path``.

    NOTE(review): one label is appended to ``"labels"`` for every *stored
    feature*, not once per segment, so ``"labels"`` only lines up with a
    feature list when a single feature is being stored. This mirrors the
    existing behaviour; confirm the intended schema before changing it.
    """
    # Dictionary to store mapping, labels, and every extracted feature
    data = {
        "mapping": [],
        "labels": [],
        "tal": [],
        "ae": [],
        "rmse": [],
        "zcr" : [],
        "ber" : [],
        "sc" : [],
        "sb" : [],
        "sf" : [],
        "mel":[],
        "logm":[],
        "mfcc": [],
        "stft":[],
        "gfcc": [],
        "gmcc":[]
    }

    # Framing parameters shared by the extractors below
    frame_size = 1024
    hop_length = 512
    n_fft = 2048

    # GMCC parameters
    n_gammatone = 64
    n_mel = 128
    n_ceps = 13

    samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
    num_mfcc_vectors_per_segment = math.ceil(samples_per_segment / hop_length)

    def _store(key, feature, label):
        """Append ``feature`` (and one label) when its first axis has the expected frame count."""
        if len(feature) == num_mfcc_vectors_per_segment:
            data[key].append(feature.tolist())
            data["labels"].append(label)

    # Loop through all genre sub-folders
    for i, (dirpath, dirnames, filenames) in enumerate(os.walk(dataset_path)):
        # Skip the root itself; compare by value, not identity
        if dirpath == dataset_path:
            continue

        # Save genre label (i.e., sub-folder name) in the mapping
        semantic_label = dirpath.split("/")[-1]
        s1 = semantic_label.rstrip("\\")  # Remove trailing backslashes
        label = i - 1

        data["mapping"].append(semantic_label)
        print("\nProcessing: {}".format(semantic_label))
        data["tal"].append(s1)

        # Process all audio files in genre sub-dir
        for f in filenames:
            # Load audio file
            file_path = os.path.join(dirpath, f)
            signal, sample_rate = librosa.load(file_path, sr=SAMPLE_RATE)

            for d in range(num_segments):
                # calculate start and finish sample for current segment
                start = samples_per_segment * d
                finish = start + samples_per_segment
                segment = signal[start:finish]

                # The track is shorter than expected: this and all later segments are empty
                if len(segment) == 0:
                    break

                # Amplitude envelope
                ae_scale = amplitude_envelope(segment, frame_size, hop_length)
                _store("ae", ae_scale, label)

                # Root-mean-square energy
                rms_scale = librosa.feature.rms(y=segment, frame_length=frame_size, hop_length=hop_length)[0]
                _store("rmse", rms_scale, label)

                # Zero crossing rate
                zcr_scale = librosa.feature.zero_crossing_rate(y=segment, frame_length=frame_size, hop_length=hop_length)[0]
                _store("zcr", zcr_scale, label)

                # Short-time Fourier transform, stored as a (frames, bins) power spectrogram
                stft = librosa.stft(y=segment, n_fft=n_fft, hop_length=hop_length)
                power_stft = (np.abs(stft) ** 2).T
                _store("stft", power_stft, label)

                # Band energy ratio with a 2000 split frequency
                ber_scale = band_energy_ratio(stft, 2000, SAMPLE_RATE)
                _store("ber", ber_scale, label)

                # Spectral centroid
                sc_scale = librosa.feature.spectral_centroid(y=segment, sr=SAMPLE_RATE, n_fft=frame_size, hop_length=hop_length)[0]
                _store("sc", sc_scale, label)

                # Spectral bandwidth
                ban_scale = librosa.feature.spectral_bandwidth(y=segment, sr=SAMPLE_RATE, n_fft=frame_size, hop_length=hop_length)[0]
                _store("sb", ban_scale, label)

                # Spectral flux (onset strength envelope, librosa default framing)
                spectral_flux = librosa.onset.onset_strength(y=segment, sr=SAMPLE_RATE)
                _store("sf", spectral_flux, label)

                # Mel spectrogram and its dB-scaled version, both stored as (frames, mels)
                mel_spectrogram = librosa.feature.melspectrogram(y=segment, sr=SAMPLE_RATE, n_fft=n_fft, hop_length=hop_length, n_mels=128)
                mel_spectrogram2 = mel_spectrogram.T
                log_mel_spectrogram = librosa.power_to_db(mel_spectrogram).T
                # Both representations share a single length check and a single label
                if len(log_mel_spectrogram) == num_mfcc_vectors_per_segment:
                    data["mel"].append(mel_spectrogram2.tolist())
                    data["logm"].append(log_mel_spectrogram.tolist())
                    data["labels"].append(label)

                # MFCCs, transposed to (frames, coefficients)
                mfcc = librosa.feature.mfcc(y=segment, sr=sample_rate, n_mfcc=40, n_fft=n_fft, hop_length=hop_length).T
                _store("mfcc", mfcc, label)

                # GFCCs via spafe
                gfcc_features2 = gfcc(
                    segment,
                    fs=SAMPLE_RATE,
                    num_ceps=40,   # Number of cepstral coefficients (default: 13)
                    nfilts=128,    # Number of filters in the filterbank (default: 40)
                    nfft=n_fft,    # Number of FFT points (default: 512)
                )
                gfcc_features = gfcc_features2.T  # Transpose for shape compatibility
                # NOTE(review): this check reads axis 1 of the transposed spafe output,
                # unlike the other features, which compare their first axis. spafe also
                # uses its own framing, so the frame count may never match — confirm.
                if gfcc_features.shape[1] == num_mfcc_vectors_per_segment:
                    data["gfcc"].append(gfcc_features.tolist())
                    data["labels"].append(label)

                # GMCCs; compute_gmcc returns one row per frame, so the frame count is
                # the length of the first axis (axis 1 holds the n_ceps coefficients)
                gmcc_features = compute_gmcc(segment, SAMPLE_RATE, n_fft, hop_length, n_gammatone, n_mel, n_ceps)
                # NOTE(review): gtgram's framing may yield a different frame count than
                # librosa's, in which case nothing is stored here — confirm.
                _store("gmcc", gmcc_features, label)

                print("{}, segment {} processed".format(file_path, d + 1))

    # Save all collected features to json file
    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)


# Run the full extraction over DATASET_PATH and write the result to JSON_PATH
# when this cell/script is executed directly.
if __name__ == "__main__":
    save_feature(DATASET_PATH, JSON_PATH)