<a href="https://colab.research.google.com/github/Ayrsz/SignalAndSistemyProject/blob/main/Features/FeatureExtract.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Scientific computation
import numpy as np
import jax
import jax.numpy as jnp

#Plot and view
from matplotlib import pyplot as plt
import IPython as ipy
from IPython import display
from IPython.display import Audio

#Data manipulation
import os
import gc

#Audio manipulation
import librosa
import soundfile as sf

#Image manipulation
import cv2 as cv
import skimage as ski

# Building Files

In [5]:
from zipfile import ZipFile

# Mount Google Drive when running on Colab. Any failure (for example the
# google.colab package not being importable) is printed and the notebook
# carries on, reporting that it is running locally.
try:
    from google.colab import drive

    drive.mount("/content/drive", force_remount=True)
except Exception as e:
    print(e)
    print("Rodando localmente")

Mounted at /content/drive


In [3]:
# Unzip the GTZAN archive into the dataset folder, only when it has not been
# extracted yet. The archive is opened lazily so that an already extracted
# dataset keeps working even if GTZAN.zip is no longer present.
path_root = "/content/drive/MyDrive/Datasets/DatasetAudios"
path_zip = path_root + "/GTZAN.zip"
if not os.path.exists(path_root + "/Data"):
    print(f"Unzipando em : {path_root}")
    with ZipFile(path_zip, "r") as zip_archive:
        zip_archive.extractall(path_root)
    print("Unzipado!")
else:
    print("Arquivo já unzipado, sem necessidade de ações.")

# From here on path_root points at the extracted "Data" folder.
path_root = path_root + "/Data"

Arquivo já unzipado, sem necessidade de ações.


In [4]:
def dir_genres(path_all_genres : str) -> list[str]:
    """Return the path of every genre folder inside ``path_all_genres``.

    Only sub-directories are returned: ``.ipynb_checkpoints`` and stray files
    are skipped so callers never treat them as a genre. The result is sorted
    because ``os.listdir`` yields entries in arbitrary order.

    Args:
        path_all_genres: Directory that contains one sub-folder per genre.

    Returns:
        Sorted list of ``path_all_genres + '/' + folder`` strings.
    """
    return [
        path_all_genres + '/' + pasta
        for pasta in sorted(os.listdir(path_all_genres))
        if pasta != '.ipynb_checkpoints'
        and os.path.isdir(path_all_genres + '/' + pasta)
    ]

#Return all .wav files of one genre folder
def dir_files_wav(path_genre : str) -> list[str]:
  """Return the path of every ``.wav`` file directly inside ``path_genre``.

  Entries that do not end in ``.wav`` (for example ``.ipynb_checkpoints``)
  are skipped so the audio loaders never receive a non-audio path. The
  result is sorted because ``os.listdir`` yields entries in arbitrary order.

  Args:
      path_genre: Directory holding the audio files of one genre.

  Returns:
      Sorted list of ``path_genre + '/' + file`` strings.
  """
  return [
      path_genre + '/' + arquivo
      for arquivo in sorted(os.listdir(path_genre))
      if arquivo.endswith('.wav')
  ]


# Generate Spectrogram

In [9]:
def is_dir_empty(path : str) -> bool:
    """Tell whether ``path`` holds nothing besides ``.ipynb_checkpoints``.

    Args:
        path: Directory to inspect.

    Returns:
        ``True`` when the directory has no entries, or only an entry named
        ``.ipynb_checkpoints``; ``False`` as soon as any other entry is found.
    """
    with os.scandir(path) as entries:
        # all() stops at the first entry that is not the checkpoints folder.
        return all(item.name == '.ipynb_checkpoints' for item in entries)
def spectro_feat(audio : jnp.ndarray, sample_rate : int) -> jnp.ndarray:
    """Compute a dB-scaled spectrogram of a 1-D audio signal.

    The signal is cut into Hamming-windowed frames of 512 samples taken every
    256 samples. Each frame goes through a real FFT, and the squared magnitude
    is converted to decibels relative to 1e-10.

    Args:
        audio: One-dimensional signal.
        sample_rate: Not used by the computation; kept for the interface.

    Returns:
        Array of shape ``(257, num_frames)`` (frequency bins by frames).
    """
    FFT_SIZE = 512
    HOP_SIZE = 256

    samples = jax.device_put(audio)
    window = jnp.hamming(FFT_SIZE)

    # Only complete frames are used; trailing samples are ignored.
    frame_count = (len(samples) - FFT_SIZE) // HOP_SIZE + 1
    frame_starts = jnp.arange(frame_count) * HOP_SIZE

    def frame_spectrum(offset):
        frame = jax.lax.dynamic_slice(samples, (offset,), (FFT_SIZE,))
        return jnp.fft.rfft(frame * window, n = FFT_SIZE)

    stft = jax.vmap(frame_spectrum)(frame_starts)

    power = jnp.abs(stft)**2
    # NOTE(review): a factor of 20 is applied to a *power* spectrum, which is
    # twice the conventional 10*log10(power) -- confirm this is intended.
    decibels = 20 * jnp.log10((power + 1e-10) / 1e-10)
    return decibels.T

def spectro_feat_batch(batch : jnp.ndarray, sample_rate : int) -> jnp.ndarray:
    """Apply ``spectro_feat`` to every audio along the first axis of ``batch``.

    Args:
        batch: Stack of audio signals, one per row.
        sample_rate: Forwarded unchanged to ``spectro_feat`` for every row.

    Returns:
        The stacked results of ``spectro_feat``, one per input row.
    """
    def per_audio(audio):
        return spectro_feat(audio, sample_rate)

    batched = jax.vmap(per_audio)
    return batched(batch)

def plt_spectogram(batch : jnp.ndarray, sample_rate : int, y_axis_type= "linear"):
  """Plot the dB spectrogram of every audio in ``batch`` on a 3x3 subplot grid.

  Args:
      batch: Stack of audio signals, one per row. The grid has nine cells,
          so ``plt.subplot`` raises for a tenth audio.
      sample_rate: Sample rate of the audios, used to label the axes.
      y_axis_type: Frequency-axis scale forwarded to ``librosa.display.specshow``.
  """
  escala_Y = spectro_feat_batch(batch, sample_rate)
  escala_Y = np.array(escala_Y)
  for (i,audio) in enumerate(escala_Y):
    plt.subplot(3,3,i+1)
    # hop_length must match HOP_SIZE in spectro_feat (256); without it
    # specshow falls back to its own default and mislabels the time axis.
    librosa.display.specshow(audio, sr = sample_rate, hop_length = 256, x_axis = "time", y_axis = y_axis_type)
    plt.set_cmap("magma")
    plt.colorbar()


def create_batch(files : list[str], size : int, genre : str, shape = 661794):
    """Load up to ``size`` audio files from the front of ``files`` into one batch.

    Every audio is zero-padded or truncated to ``shape`` samples. Files that
    cannot be read are reported and skipped. The returned batch has exactly
    one row per successfully loaded file, so rows, ``names`` and ``nums``
    always line up.

    All attempted files, loaded or failed, are removed from ``files`` in
    place. This guarantees that a caller looping until ``files`` is empty
    always terminates.

    Args:
        files: Paths still to be processed; mutated in place.
        size: Maximum number of files to load.
        genre: Not used by the loader; kept for the interface.
        shape: Number of samples every audio is padded/cut to.

    Returns:
        Tuple ``(batch, names, nums, sample_rate, files)``:
        ``batch`` is a ``(n_loaded, shape)`` float32 array, ``names`` and
        ``nums`` are the first and second-to-last dot-separated parts of each
        file name, ``sample_rate`` is the rate of the last loaded file
        (``None`` when nothing was loaded) and ``files`` is the same list
        object with the attempted entries removed.
    """
    batch_size = min(size, len(files))
    rows = []
    names = []
    nums = []
    sample_rate = None

    for path in files[:batch_size]:
        try:
            audio, file_sample_rate = sf.read(path)  # keeps the file's original sample rate
            audio = np.asarray(audio)
            if audio.ndim != 1:
                raise ValueError(f"expected mono audio, got array with shape {audio.shape}")

            if len(audio) < shape:
                # Shorter than expected: pad the tail with zeros
                audio = np.pad(audio, (0, shape - len(audio)))
            elif len(audio) > shape:
                # Longer than expected: cut the tail
                audio = audio[:shape]

            archive = path.split("/")[-1].split(".")
            num = archive[-2]
            name = archive[0]
        except Exception as e:
            print(f"Erro ao processar {path}: {e}")
            continue

        rows.append(audio)
        names.append(name)
        nums.append(num)
        sample_rate = file_sample_rate

    # Drop everything that was attempted so failed files are not retried forever
    del files[:batch_size]

    if rows:
        batch = jnp.asarray(np.stack(rows), dtype=jnp.float32)
    else:
        batch = jnp.zeros((0, shape))
    batch = jax.device_put(batch)
    return batch, names, nums, sample_rate, files


def write_spectrogram(audios : jnp.ndarray, sample_rate: int, paths_write : list[str]):
  """Compute the spectrogram of every audio and save each one as a colour image.

  Each spectrogram is min-max normalised to the 0..255 range with the scale
  inverted (loudest bin -> 0, quietest bin -> 255), coloured with OpenCV's
  INFERNO colormap and written to the matching path. The normalisation stays
  inside 0..255 before the uint8 cast, so the result does not depend on how
  the platform casts negative floats. A flat spectrogram (max == min) is
  written as all zeros instead of dividing by zero.

  Args:
      audios: Stack of audio signals, one per row.
      sample_rate: Forwarded to ``spectro_feat_batch``.
      paths_write: Output image path for each audio, in the same order.

  Raises:
      AssertionError: If the number of spectrograms and paths differ.
  """
  escala_Y = spectro_feat_batch(audios, sample_rate)

  # Drop the local reference to the raw audio before copying the result to the host
  del audios
  gc.collect()

  escala_Y = np.array(escala_Y)

  assert len(escala_Y) == len(paths_write)

  for (spec, path_write) in zip(escala_Y, paths_write):
    peak = np.max(spec)
    span = peak - np.min(spec)
    if span > 0:
      inverted = 255 * (peak - spec) / span
    else:
      inverted = np.zeros_like(spec)
    img = cv.applyColorMap(inverted.astype(np.uint8), cv.COLORMAP_INFERNO)
    cv.imwrite(path_write, img)


def write_spectrogram_from_genre(genre_path : str):
    """Write one spectrogram image per audio file of a genre folder.

    Images go to ``<two levels above genre_path>/images/<genre>``. Nothing is
    written when that folder already holds files. Audios are processed in
    batches of three. A failing batch is reported and skipped; it never stalls
    the loop.

    Args:
        genre_path: Folder with the audio files of one genre; its last path
            component is used as the genre name.
    """
    files = dir_files_wav(genre_path)
    total_files_start = len(files)
    paths = genre_path.split("/")
    genre = paths[-1]
    genre_images_path = '/'.join(paths[:-2]) + "/images/" + genre
    batch_size = 3

    print(f"Writing on: {genre_images_path}")

    # Make the function usable on its own, even if the folder was not created beforehand
    os.makedirs(genre_images_path, exist_ok=True)

    if is_dir_empty(genre_images_path):
        while files:
            remaining_before = len(files)
            pending = files[:batch_size]
            try:
                audios, names, nums, sample_rate, files = create_batch(files, batch_size, genre)

                paths_write = [genre_images_path + "/" + name + "." + num + ".png" for (num, name) in zip(nums, names)]
                write_spectrogram(audios, sample_rate, paths_write)

                if(len(files) % 100 == 0):
                    print(f"Carregando imagens, {genre} : {((total_files_start - len(files))/total_files_start)*100:.2f}%")
            except Exception as e:
                print(f"Erro {e}")
                print(f"Erro em {pending[0]}")

            if len(files) >= remaining_before:
                # No file was consumed: drop the batch so the loop always terminates
                del files[:batch_size]

    else:
        print(f"Diretorio com imagens, porfavor esvazie: {genre}")

def create_image_directory(root_dir, base_dir="images"):
    """Create ``root_dir/base_dir`` and one sub-folder per music genre.

    Paths that already exist are left untouched.

    Args:
        root_dir: Directory under which the image tree is created.
        base_dir: Name of the folder that will hold the genre sub-folders.
    """
    full_base_path = os.path.join(root_dir, base_dir)

    genre_names = (
        "blues", "classical", "country", "disco", "hiphop",
        "jazz", "metal", "pop", "reggae", "rock",
    )

    # Base folder first, then every genre folder below it
    targets = [full_base_path]
    targets.extend(os.path.join(full_base_path, genre_name) for genre_name in genre_names)

    for target in targets:
        if os.path.exists(target):
            continue
        os.makedirs(target)

    print(f"Diretório '{full_base_path}' e subpastas criadas com sucesso!")


#Writing the spectrogram images of every genre
def write_all_spectrograms(path_root : str):
  """Create the image folders and write the spectrograms of all genres.

  Args:
      path_root: Dataset folder containing ``genres_original``; the image
          folders are created under this same directory.
  """
  originals_dir = path_root + "/genres_original"
  genre_folders = dir_genres(originals_dir)
  create_image_directory(path_root)
  for folder in genre_folders:
    write_spectrogram_from_genre(folder)



# Generate the spectrogram images for every genre found under path_root.
write_all_spectrograms(path_root)

Diretório '/content/drive/MyDrive/Datasets/DatasetAudios/Data/images' e subpastas criadas com sucesso!
Writing on: /content/drive/MyDrive/Datasets/DatasetAudios/Data/images/jazz
Carregando imagens, jazz : 49.49%
Carregando imagens, jazz : 100.00%
Writing on: /content/drive/MyDrive/Datasets/DatasetAudios/Data/images/blues
Carregando imagens, blues : 50.00%
Carregando imagens, blues : 100.00%
Writing on: /content/drive/MyDrive/Datasets/DatasetAudios/Data/images/classical
Carregando imagens, classical : 50.00%
Carregando imagens, classical : 100.00%
Writing on: /content/drive/MyDrive/Datasets/DatasetAudios/Data/images/country
Carregando imagens, country : 50.00%
Carregando imagens, country : 100.00%
Writing on: /content/drive/MyDrive/Datasets/DatasetAudios/Data/images/disco
Carregando imagens, disco : 50.00%
Carregando imagens, disco : 100.00%
Writing on: /content/drive/MyDrive/Datasets/DatasetAudios/Data/images/hiphop
Carregando imagens, hiphop : 50.00%
Carregando imagens, hiphop : 100.0

# Features on time domain

In [15]:
# Show the devices JAX reports as available.
print(jax.devices())

[CpuDevice(id=0)]


In [16]:
# Load one example track, used by the demo calls in the cells below.
# NOTE(review): no `sr` argument is passed, so librosa's default sample rate
# applies rather than the file's own -- confirm this is intended.
audio_example, sr = librosa.load(path_root + "/genres_original/metal/metal.00041.wav")

In [17]:
def round_float_decimal(number):
    """Round to four decimal places through fixed-point string formatting.

    Args:
        number: A scalar, or a NumPy array whose elements are scalars.

    Returns:
        A Python ``float`` for scalar input, or a new NumPy array of rounded
        floats when ``number`` is an ``np.ndarray``.
    """
    if isinstance(number, np.ndarray):
        return np.array([float(f"{item:.4f}") for item in number])
    return float(f"{number:.4f}")

def get_statistics_feature(signal : np.typing.NDArray, name) -> dict:
    """Summarise ``signal`` by its mean and standard deviation.

    Args:
        signal: Values to summarise.
        name: Prefix used for the two dictionary keys.

    Returns:
        ``{name + "_mean": ..., name + "_std": ...}`` with both values rounded
        by ``round_float_decimal``.
    """
    stats = {}
    stats[name + "_mean"] = round_float_decimal(np.mean(signal))
    stats[name + "_std"] = round_float_decimal(np.std(signal))
    return stats

## Low level features

### Energy

In [18]:
def get_rms(audio, FRAME_SIZE = 1024):
    """Return the mean and standard deviation of the frame-wise RMS energy.

    Args:
        audio: Audio signal passed to ``librosa.feature.rms``.
        FRAME_SIZE: Frame length, in samples, used for each RMS value.

    Returns:
        Dictionary with the keys ``rms_mean`` and ``rms_std``.
    """
    frame_energy = librosa.feature.rms(y = audio, frame_length = FRAME_SIZE)
    return get_statistics_feature(frame_energy, "rms")

# Demo on the example track
rms = get_rms(audio_example)
print(rms)


{'rms_mean': 0.0968, 'rms_std': 0.0227}


### Zero-Crossing Rate

In [19]:
def get_zero_cross_rate(audio, FRAME_SIZE = 1024):
    """Return the mean and standard deviation of the frame-wise zero-crossing rate.

    Args:
        audio: Audio signal passed to ``librosa.feature.zero_crossing_rate``.
        FRAME_SIZE: Frame length, in samples, used for each rate value.

    Returns:
        Dictionary with the keys ``zero_cross_mean`` and ``zero_cross_std``.
    """
    crossing_rate = librosa.feature.zero_crossing_rate(audio, frame_length = FRAME_SIZE)
    return get_statistics_feature(crossing_rate, "zero_cross")


# Demo on the example track
rate = get_zero_cross_rate(audio_example)


### Amplitude Envelope

In [20]:
def get_amplitude_envelope(audio, sample_rate, frame_size = 1024):
    """Return the mean and standard deviation of the amplitude envelope.

    The envelope is the maximum sample value of each consecutive,
    non-overlapping frame. Samples after the last complete frame are ignored.

    Args:
        audio: Audio signal.
        sample_rate: Not used by the computation; kept for the interface.
        frame_size: Number of samples per frame.

    Returns:
        Dictionary with the keys ``amplitude_envelope_mean`` and
        ``amplitude_envelope_std``.
    """
    full_frames = len(audio) // frame_size
    frame_starts = range(0, full_frames * frame_size, frame_size)
    envelope = np.array([np.max(audio[begin:begin + frame_size]) for begin in frame_starts])

    return get_statistics_feature(envelope, "amplitude_envelope")

## High level features

### Tempo

In [21]:
def get_tempo(audio, sample_rate):
    """Estimate the tempo with ``librosa.beat.beat_track`` and cast it to int32.

    Args:
        audio: Audio signal.
        sample_rate: Sample rate of ``audio``.

    Returns:
        The tempo estimate converted with ``np.int32``; the recorded output of
        this notebook shows a one-element array.
    """
    estimate = librosa.beat.beat_track(y = audio, sr = sample_rate)
    # The first item is the tempo; the beat positions are not needed here
    return np.int32(estimate[0])


print(get_tempo(audio_example, sr))

[117]


# Features on frequency domain

## Low level feature

### Spectral centroid

In [22]:
# Acts as a "centre of mass" of the spectrum:
# the mean of the frequencies, weighted by their amplitude, for each frame (uses the STFT)
def get_spectral_centroid(audio, sample_rate):
    """Return the mean and standard deviation of the spectral centroid.

    Args:
        audio: Audio signal.
        sample_rate: Sample rate of ``audio``.

    Returns:
        Dictionary with the keys ``spectral_centroid_mean`` and
        ``spectral_centroid_std``.
    """
    centroid_per_frame = librosa.feature.spectral_centroid(y = audio, sr = sample_rate)
    return get_statistics_feature(centroid_per_frame, "spectral_centroid")

# Demo on the example track
centroid = get_spectral_centroid(audio_example, sr)
print(centroid)

{'spectral_centroid_mean': 2269.1619, 'spectral_centroid_std': 368.4678}


### Bandwidth

In [23]:
# How "spread out" the spectrum of an audio is, computed for EACH FRAME
def get_bandwidth(audio, sample_rate):
    """Return the mean and standard deviation of the spectral bandwidth.

    Args:
        audio: Audio signal.
        sample_rate: Sample rate of ``audio``.

    Returns:
        Dictionary with the keys ``bandwidth_mean`` and ``bandwidth_std``.
    """
    per_frame = librosa.feature.spectral_bandwidth(y = audio, sr = sample_rate)
    # Only the first row of the result is summarised
    first_row = per_frame[0]
    return get_statistics_feature(first_row, "bandwidth")

# Demo on the example track
a = get_bandwidth(audio_example, sr)

print(a)

{'bandwidth_mean': 1928.9448, 'bandwidth_std': 297.4026}


## High level feature

In [29]:
def get_mel_coef(audio, sample_rate):
    """Return the mean and standard deviation of each of 20 MFCCs.

    Args:
        audio: Audio signal.
        sample_rate: Sample rate of ``audio``.

    Returns:
        Dictionary with the keys ``mfcc#NN_mean`` and ``mfcc#NN_std`` for
        ``NN`` from ``00`` to ``19``, each value rounded by
        ``round_float_decimal``.
    """
    mfcc = librosa.feature.mfcc(y = audio, sr = sample_rate, n_mfcc = 20)
    coefs_estatistics = {}
    for index, row in enumerate(mfcc):
        prefix = f"mfcc#{index:02d}"
        coefs_estatistics[prefix + "_mean"] = round_float_decimal(np.mean(row))
        coefs_estatistics[prefix + "_std"] = round_float_decimal(np.std(row))

    return coefs_estatistics

# Demo on the example track
mfcc = get_mel_coef(audio_example, sr)
print(mfcc)

{'mfcc#00_mean': -100.4234, 'mfcc#00_std': 36.4089, 'mfcc#01_mean': 104.6932, 'mfcc#01_std': 19.8516, 'mfcc#02_mean': -57.254, 'mfcc#02_std': 18.1442, 'mfcc#03_mean': 56.5769, 'mfcc#03_std': 10.628, 'mfcc#04_mean': -5.5556, 'mfcc#04_std': 9.6729, 'mfcc#05_mean': 22.8333, 'mfcc#05_std': 10.0632, 'mfcc#06_mean': -6.1715, 'mfcc#06_std': 10.5025, 'mfcc#07_mean': 23.0971, 'mfcc#07_std': 8.8752, 'mfcc#08_mean': -19.072, 'mfcc#08_std': 7.0803, 'mfcc#09_mean': 17.7954, 'mfcc#09_std': 7.2162, 'mfcc#10_mean': -13.783, 'mfcc#10_std': 6.9338, 'mfcc#11_mean': 9.9628, 'mfcc#11_std': 7.0728, 'mfcc#12_mean': -14.8587, 'mfcc#12_std': 6.9912, 'mfcc#13_mean': 0.3451, 'mfcc#13_std': 8.4316, 'mfcc#14_mean': -7.3378, 'mfcc#14_std': 7.2783, 'mfcc#15_mean': 6.9755, 'mfcc#15_std': 6.386, 'mfcc#16_mean': -12.6738, 'mfcc#16_std': 6.388, 'mfcc#17_mean': -0.5505, 'mfcc#17_std': 5.5762, 'mfcc#18_mean': -11.7564, 'mfcc#18_std': 6.1158, 'mfcc#19_mean': -1.5363, 'mfcc#19_std': 5.5106}


# Features on spectrogram images

### Hog

In [26]:
def get_hog(spectogram):
    """Return the mean and standard deviation of the HOG descriptor of an image.

    The image is first resized to a width of twice its original height and a
    height equal to its original height.

    Args:
        spectogram: Image array whose last axis holds the colour channels.

    Returns:
        Dictionary with the keys ``hog_mean`` and ``hog_std``.
    """
    height = spectogram.shape[0]
    # cv.resize takes dsize as (width, height)
    resized = cv.resize(spectogram, dsize = (2*height, height), interpolation = cv.INTER_AREA)

    descriptor = ski.feature.hog(resized, orientations = 8, channel_axis = -1)
    return get_statistics_feature(descriptor, "hog")





### Band Energy

In [27]:
def get_band_energy(spectogram):
    """Return statistics of the per-column ratio between two row bands.

    The image rows are split at ``(height // 10) * 6``. For every column, the
    sum of the pixels from the split row onward is divided by the sum of the
    pixels before it. The mean and standard deviation of those ratios are
    returned.

    Args:
        spectogram: Three-dimensional image array (rows, columns, channels).

    Returns:
        Dictionary with the keys ``band_mean`` and ``band_std``.
    """
    row_count = spectogram.shape[0]
    column_count = spectogram.shape[1]

    # Row that separates the two bands
    split_row = (row_count // 10) * 6
    upper_rows = spectogram[split_row:row_count:, :, :]
    lower_rows = spectogram[0:split_row, :, :]

    ratios = np.array([
        np.sum(upper_rows[:, column]) / np.sum(lower_rows[:, column])
        for column in range(column_count)
    ])

    return get_statistics_feature(ratios, "band")



# Create CSV

In [32]:
import csv


def get_features_spectro(file):
    """Compute the image features of the spectrogram that belongs to an audio file.

    The image is expected at ``<root>/images/<genre>/<name>.png``, where
    ``<root>`` is three levels above the audio file, ``<genre>`` is the
    audio's parent folder and ``<name>`` is the file name without ``.wav``.

    Args:
        file: Path of the audio file, using ``/`` as separator.

    Returns:
        Dictionary merging the results of ``get_hog`` and ``get_band_energy``.

    Raises:
        FileNotFoundError: If the spectrogram image cannot be read.
    """
    dirs = file.split("/")
    genre = dirs[-2]
    root = "/".join(dirs[0 : len(dirs) - 3])
    name = dirs[-1].replace(".wav", "")
    name_png = root + "/images/" + genre + "/" + name + ".png"

    spectro = cv.imread(name_png)
    if spectro is None:
        # cv.imread signals failure by returning None instead of raising
        raise FileNotFoundError(f"Could not read spectrogram image: {name_png}")

    hog = get_hog(spectro)
    band = get_band_energy(spectro)

    feat = hog | band
    return feat


def get_features_dict(audio, sample_rate, name, dir_image):
    """Build the audio-feature row of one track.

    Args:
        audio: Audio signal.
        sample_rate: Sample rate of ``audio``.
        name: File name without the ``.wav`` extension; the text before the
            first ``.`` (and before the first ``_``) becomes the label.
        dir_image: Path of the track's spectrogram image, stored as is.

    Returns:
        Dictionary with ``name``, ``dir_image``, the statistics of every audio
        feature, ``tempo`` (a plain ``int``) and ``label``.
    """
    features_dict = {"name": name + ".wav", "dir_image": dir_image}

    # get_rms takes a frame length as second argument, not a sample rate,
    # so the default frame length is used, as get_zero_cross_rate does.
    rms = get_rms(audio)
    zero_cross = get_zero_cross_rate(audio)
    amplitude_env = get_amplitude_envelope(audio, sample_rate)
    spectral_centroid = get_spectral_centroid(audio, sample_rate)
    mfcc_coef = get_mel_coef(audio, sample_rate)
    band_wid = get_bandwidth(audio, sample_rate)
    features_dict = (
        features_dict
        | rms
        | zero_cross
        | amplitude_env
        | spectral_centroid
        | mfcc_coef
        | band_wid
    )

    # get_tempo may return a one-element array; store a scalar so the CSV
    # cell holds "117" rather than "[117]".
    tempo = int(np.ravel(get_tempo(audio, sample_rate))[0])
    label = (name.split(".")[0].split("_"))[0]

    features_dict.update({"tempo": tempo, "label": label})

    return features_dict


def write_csv_genre(genre_path, path_write):
    """Extract the feature rows of every audio file of one genre.

    Args:
        genre_path: Folder with the audio files of the genre. Each file path
            must contain a ``Data`` component, which locates the image folder.
        path_write: Not used by the extraction; kept for the interface.

    Returns:
        List with one feature dictionary per file (audio and image features).
    """
    features_vector = []

    for file in dir_files_wav(genre_path):
        path = file.split("/")
        name = path[-1].replace(".wav", "")
        genre = name.split(".")[0]

        # Position of the first "Data" component; IndexError when absent
        idx_data = [pos for pos, part in enumerate(path) if part == "Data"][0]
        dir_image = "/".join(path[:idx_data]) + "/Data/images/" + genre + "/" + name + ".png"

        audio, sample_rate = librosa.load(file)
        audio_features = get_features_dict(audio, sample_rate, name, dir_image)
        features_vector.append(audio_features | get_features_spectro(file))

    return features_vector


def write_csv(root_dir, path_write):
    """Write the features of every track of every genre to ``feats.csv``.

    Args:
        root_dir: Folder that contains one sub-folder per genre.
        path_write: Folder in which ``feats.csv`` is created (overwritten if
            it already exists).
    """
    genres_dir = dir_genres(root_dir)

    # mfcc#00_mean, mfcc#00_std, ..., mfcc#19_mean, mfcc#19_std
    mfcc_columns = [
        f"mfcc#{index:02d}_{stat}"
        for index in range(20)
        for stat in ("mean", "std")
    ]
    columns = [
        "name",
        "dir_image",
        "rms_mean",
        "rms_std",
        "zero_cross_mean",
        "zero_cross_std",
        "amplitude_envelope_mean",
        "amplitude_envelope_std",
        "spectral_centroid_mean",
        "spectral_centroid_std",
        "hog_mean",
        "hog_std",
        "band_mean",
        "band_std",
        "bandwidth_mean",
        "bandwidth_std",
        *mfcc_columns,
        "tempo",
        "label",
    ]

    # newline="" lets the csv module control line endings itself
    # (avoids blank rows on platforms that translate "\n").
    with open(path_write + "/feats.csv", "w", newline="") as f:
        writer = csv.DictWriter(f, columns)
        writer.writeheader()
        for genre in genres_dir:
            print("Acessing: ", genre)
            lines = write_csv_genre(genre, path_write)
            writer.writerows(lines)


# Extract the features of every track and write feats.csv into path_root.
write_csv(path_root + "/genres_original", path_root)

Acessing:  /content/drive/MyDrive/Datasets/DatasetAudios/Data/genres_original/jazz
Acessing:  /content/drive/MyDrive/Datasets/DatasetAudios/Data/genres_original/blues
Acessing:  /content/drive/MyDrive/Datasets/DatasetAudios/Data/genres_original/classical
Acessing:  /content/drive/MyDrive/Datasets/DatasetAudios/Data/genres_original/country
Acessing:  /content/drive/MyDrive/Datasets/DatasetAudios/Data/genres_original/disco
Acessing:  /content/drive/MyDrive/Datasets/DatasetAudios/Data/genres_original/hiphop
Acessing:  /content/drive/MyDrive/Datasets/DatasetAudios/Data/genres_original/metal
Acessing:  /content/drive/MyDrive/Datasets/DatasetAudios/Data/genres_original/pop
Acessing:  /content/drive/MyDrive/Datasets/DatasetAudios/Data/genres_original/reggae
Acessing:  /content/drive/MyDrive/Datasets/DatasetAudios/Data/genres_original/rock
