# Mass Feature Grabbing

Within the librosa library, there are a large number of features that can be extracted from each song.

## 1 - Do the imports

In [1]:
import os
import numpy as np # For numerical operations
import matplotlib.pyplot as plt # For plotting
import gc # For garbage collection
import librosa # For audio processing

import sys

!{sys.executable} -m pip install tensorflow
!{sys.executable} -m pip install tensorflow-hub
!{sys.executable} -m pip install tensorflow_hub
!{sys.executable} -m pip install opensmile

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m


## 2 - Setup the basepath and the genres

In [2]:
BASEPATH = os.path.join("./Data", "genres_original")
GENRES = ["blues", "classical", "country", "disco", "hiphop", "jazz", "metal", "pop", "reggae", "rock"]

## 3 - Grabbing all the features (30 seconds)

In [3]:
import os
import numpy as np
import pandas as pd
import librosa
import opensmile
import gc  # For garbage collection

allSongs = []

import tensorflow as tf
import tensorflow_hub as hub

yamnet_model_handle = "https://tfhub.dev/google/yamnet/1"
yamnet_model = hub.load(yamnet_model_handle)

def get_yamnet_embedding(y, sr):
    import librosa
    if sr != 16000:
        y = librosa.resample(y, orig_sr=sr, target_sr=16000)
        sr = 16000
    waveform = y.astype('float32')
    scores, embeddings, spectrogram = yamnet_model(waveform)
    embedding_vector = tf.reduce_mean(embeddings, axis=0).numpy()
    return embedding_vector

for genre in GENRES:
    genre_path = os.path.join(BASEPATH, genre)

    for filename in os.listdir(genre_path):
        if not filename.endswith(".wav"):
            continue

        file_path = os.path.join(genre_path, filename)
        y, sr = librosa.load(file_path, sr=22050)

        # Ensure the audio is exactly 30 seconds
        y = librosa.util.fix_length(y, size=30 * sr)

        features = {}

        # Unique filename for each segment
        features["filename"] = filename
        features["label"] = genre
        features["song_id"] = f"{genre}_{filename.split('.')[0]}"

        y_segment = y  # Use the current segment for feature extraction
        # Compute chroma_stft
        
        chroma = librosa.feature.chroma_stft(y=y_segment, sr=sr)
        chroma_mean = np.mean(chroma, axis=1)
        chroma_var = np.var(chroma, axis=1)
        for i in range(len(chroma_mean)):
            features[f"chroma_stft_mean_{i}"] = chroma_mean[i]
            features[f"chroma_stft_var_{i}"] = chroma_var[i]

        # Spectral centroid
        centroid = librosa.feature.spectral_centroid(y=y_segment, sr=sr)
        features["centroid_mean"] = np.mean(centroid)
        features["centroid_var"] = np.var(centroid)

        # Spectral bandwidth
        bandwidth = librosa.feature.spectral_bandwidth(y=y_segment, sr=sr)
        features["bandwidth_mean"] = np.mean(bandwidth)
        features["bandwidth_var"] = np.var(bandwidth)

        # Spectral rolloff
        rolloff = librosa.feature.spectral_rolloff(y=y_segment, sr=sr)
        features["rolloff_mean"] = np.mean(rolloff)
        features["rolloff_var"] = np.var(rolloff)

        # Zero Crossing Rate
        zcr = librosa.feature.zero_crossing_rate(y=y_segment)
        features["zcr_mean"] = np.mean(zcr)
        features["zcr_var"] = np.var(zcr)

        # Harmonic and Percussive components
        y_harmonic, y_percussive = librosa.effects.hpss(y_segment)

        harmony = librosa.feature.chroma_stft(y=y_harmonic, sr=sr)
        features["harmony_mean"] = np.mean(harmony)
        features["harmony_var"] = np.var(harmony)
        
        percussive = librosa.feature.chroma_stft(y=y_percussive, sr=sr)
        features["percussive_mean"] = np.mean(percussive)
        features["percussive_var"] = np.var(percussive)
                
        mfccs = librosa.feature.mfcc(y=y_segment, sr=sr, n_mfcc=13)
        delta_mfccs = librosa.feature.delta(mfccs)
        delta2_mfccs = librosa.feature.delta(mfccs, order=2)

        for i in range(mfccs.shape[0]):
            features[f"mfcc_{i}_mean"] = np.mean(mfccs[i])
            features[f"mfcc_{i}_var"] = np.var(mfccs[i])
            features[f"delta_mfcc_{i}_mean"] = np.mean(delta_mfccs[i])
            features[f"delta_mfcc_{i}_var"] = np.var(delta_mfccs[i])
            features[f"delta2_mfcc_{i}_mean"] = np.mean(delta2_mfccs[i])
            features[f"delta2_mfcc_{i}_var"] = np.var(delta2_mfccs[i])

        # Spectral contrast
        spectral_contrast = librosa.feature.spectral_contrast(y=y_segment, sr=sr)
        for i in range(spectral_contrast.shape[0]):
            features[f"spectral_contrast_mean_{i}"] = np.mean(spectral_contrast[i])
            features[f"spectral_contrast_var_{i}"] = np.var(spectral_contrast[i])
    
        # Tonnetz
        tonnetz = librosa.feature.tonnetz(y=y_segment, sr=sr)
        for i in range(tonnetz.shape[0]):
            features[f"tonnetz_mean_{i}"] = np.mean(tonnetz[i])
            features[f"tonnetz_var_{i}"] = np.var(tonnetz[i])
    
        # Chroma CQT and CENS
        chroma_cqt = librosa.feature.chroma_cqt(y=y_segment, sr=sr)
        for i in range(chroma_cqt.shape[0]):
            features[f"chroma_cqt_mean_{i}"] = np.mean(chroma_cqt[i])
            features[f"chroma_cqt_var_{i}"] = np.var(chroma_cqt[i])
    
        chroma_cens = librosa.feature.chroma_cens(y=y_segment, sr=sr)
        features["chroma_cens_mean"] = np.mean(chroma_cens)
        features["chroma_cens_var"] = np.var(chroma_cens)

        # Chroma VQT
        chroma_vqt = librosa.feature.chroma_vqt(y=y_segment, sr=sr, intervals='equal')
        features["chroma_vqt_mean"] = np.mean(chroma_vqt)
        features["chroma_vqt_var"] = np.var(chroma_vqt)

        # Mel spectrogram
        mel_spectrogram = librosa.feature.melspectrogram(y=y_segment, sr=sr, n_mels=64)
        for i in range(mel_spectrogram.shape[0]):
            features[f"mel_spec_mean_{i}"] = np.mean(mel_spectrogram[i])
            features[f"mel_spec_var_{i}"] = np.var(mel_spectrogram[i])
        
        # Spectral flatness
        spectral_flatness = librosa.feature.spectral_flatness(y=y_segment)
        features["spectral_flatness_mean"] = np.mean(spectral_flatness)
        features["spectral_flatness_var"] = np.var(spectral_flatness)

        # Polynomial features
        poly_features = librosa.feature.poly_features(y=y_segment, sr=sr)
        features["poly_features_mean"] = np.mean(poly_features)
        features["poly_features_var"] = np.var(poly_features)

        # Tempo and tempogram
        tempo, _ = librosa.beat.beat_track(y=y_segment, sr=sr)
        features["tempo"] = tempo

        tempogram = librosa.feature.tempogram(y=y_segment, sr=sr)
        features["tempogram_mean"] = np.mean(tempogram)
        features["tempogram_var"] = np.var(tempogram)

        fourier_tempogram = librosa.feature.fourier_tempogram(y=y_segment, sr=sr)
        features["fourier_tempogram_mean"] = np.mean(fourier_tempogram)
        features["fourier_tempogram_var"] = np.var(fourier_tempogram)
        
        tempogram_ratio = librosa.feature.tempogram(y=y_segment, sr=sr)
        features["tempogram_ratio_mean"] = np.mean(tempogram_ratio)
        features["tempogram_ratio_var"] = np.var(tempogram_ratio)
        
        # Stack memory
        stack_memory = librosa.feature.stack_memory(y_segment)
        features["stack_memory_mean"] = np.mean(stack_memory)
        features["stack_memory_var"] = np.var(stack_memory)
            
        yamnet_emb = get_yamnet_embedding(y_segment, sr)
        features["yamnet_emb_mean"] = np.mean(yamnet_emb)
        features["yamnet_emb_var"] = np.var(yamnet_emb)

        print(len(features))
        allSongs.append(features)
        print(f"Processed {filename} in genre {genre}")

        gc.collect()  # Run garbage collection to free memory

# Save to CSV
df = pd.DataFrame(allSongs)
os.makedirs(BASEPATH, exist_ok=True)
df.to_csv(os.path.join("all_30_second_features_full.csv"), index=False)
print("Feature extraction complete and saved.")



314
Processed blues.00093.wav in genre blues
314
Processed blues.00087.wav in genre blues
314
Processed blues.00050.wav in genre blues
314
Processed blues.00044.wav in genre blues
314
Processed blues.00078.wav in genre blues
314
Processed blues.00079.wav in genre blues
314
Processed blues.00045.wav in genre blues
314
Processed blues.00051.wav in genre blues
314
Processed blues.00086.wav in genre blues
314
Processed blues.00092.wav in genre blues
314
Processed blues.00084.wav in genre blues
314
Processed blues.00090.wav in genre blues
314
Processed blues.00047.wav in genre blues
314
Processed blues.00053.wav in genre blues
314
Processed blues.00052.wav in genre blues
314
Processed blues.00046.wav in genre blues
314
Processed blues.00091.wav in genre blues
314
Processed blues.00085.wav in genre blues
314
Processed blues.00081.wav in genre blues
314
Processed blues.00095.wav in genre blues
314
Processed blues.00042.wav in genre blues
314
Processed blues.00056.wav in genre blues
314
Proces

## 4 - Grabbing all the features (3 seconds)

In [4]:
import os
import numpy as np
import pandas as pd
import librosa

allSongs = []

import tensorflow as tf
import tensorflow_hub as hub

yamnet_model_handle = "https://tfhub.dev/google/yamnet/1"
yamnet_model = hub.load(yamnet_model_handle)

def get_yamnet_embedding(y, sr):
    import librosa
    if sr != 16000:
        y = librosa.resample(y, orig_sr=sr, target_sr=16000)
        sr = 16000
    waveform = y.astype('float32')
    scores, embeddings, spectrogram = yamnet_model(waveform)
    embedding_vector = tf.reduce_mean(embeddings, axis=0).numpy()
    return embedding_vector

for genre in GENRES:
    genre_path = os.path.join(BASEPATH, genre)
    for filename in os.listdir(genre_path):
        if not filename.endswith(".wav"):
            continue

        file_path = os.path.join(genre_path, filename)
        y, sr = librosa.load(file_path, sr=22050)

        # Ensure the audio is exactly 30 seconds
        y = librosa.util.fix_length(y, size=30 * sr)

        # Split into 10 segments (3 seconds each)
        segments = np.array_split(y, 10)

        for segment_idx, segment in enumerate(segments, start=1):
            features = {}

            # Unique filename for each segment
            specificFilename = filename.replace(".wav", f"_{segment_idx}.wav")
            features["filename"] = specificFilename
            features["label"] = genre
            features["song_id"] = f"{genre}_{filename.split('.')[0]}"

            y_segment = segment  # Use the current segment for feature extraction

            # Compute chroma_stft
            chroma = librosa.feature.chroma_stft(y=y_segment, sr=sr)
            chroma_mean = np.mean(chroma, axis=1)
            chroma_var = np.var(chroma, axis=1)
            for i in range(len(chroma_mean)):
                features[f"chroma_stft_mean_{i}"] = chroma_mean[i]
                features[f"chroma_stft_var_{i}"] = chroma_var[i]

            # Spectral centroid
            centroid = librosa.feature.spectral_centroid(y=y_segment, sr=sr)
            features["centroid_mean"] = np.mean(centroid)
            features["centroid_var"] = np.var(centroid)

            # Spectral bandwidth
            bandwidth = librosa.feature.spectral_bandwidth(y=y_segment, sr=sr)
            features["bandwidth_mean"] = np.mean(bandwidth)
            features["bandwidth_var"] = np.var(bandwidth)

            # Spectral rolloff
            rolloff = librosa.feature.spectral_rolloff(y=y_segment, sr=sr)
            features["rolloff_mean"] = np.mean(rolloff)
            features["rolloff_var"] = np.var(rolloff)

            # Zero Crossing Rate
            zcr = librosa.feature.zero_crossing_rate(y=y_segment)
            features["zcr_mean"] = np.mean(zcr)
            features["zcr_var"] = np.var(zcr)

            # Harmonic and Percussive components
            y_harmonic, y_percussive = librosa.effects.hpss(y_segment)

            harmony = librosa.feature.chroma_stft(y=y_harmonic, sr=sr)
            features["harmony_mean"] = np.mean(harmony)
            features["harmony_var"] = np.var(harmony)

            percussive = librosa.feature.chroma_stft(y=y_percussive, sr=sr)
            features["percussive_mean"] = np.mean(percussive)
            features["percussive_var"] = np.var(percussive)
            
            # MFCCs and deltas
            mfccs = librosa.feature.mfcc(y=y_segment, sr=sr, n_mfcc=13)
            delta_mfccs = librosa.feature.delta(mfccs)
            delta2_mfccs = librosa.feature.delta(mfccs, order=2)

            for i in range(mfccs.shape[0]):
                features[f"mfcc_{i}_mean"] = np.mean(mfccs[i])
                features[f"mfcc_{i}_var"] = np.var(mfccs[i])
                features[f"delta_mfcc_{i}_mean"] = np.mean(delta_mfccs[i])
                features[f"delta_mfcc_{i}_var"] = np.var(delta_mfccs[i])
                features[f"delta2_mfcc_{i}_mean"] = np.mean(delta2_mfccs[i])
                features[f"delta2_mfcc_{i}_var"] = np.var(delta2_mfccs[i])

            # Spectral contrast
            spectral_contrast = librosa.feature.spectral_contrast(y=y_segment, sr=sr)
            for i in range(spectral_contrast.shape[0]):
                features[f"spectral_contrast_{i}_mean"] = np.mean(spectral_contrast[i])
                features[f"spectral_contrast_{i}_var"] = np.var(spectral_contrast[i])

            # Tonnetz
            tonnetz = librosa.feature.tonnetz(y=y_segment, sr=sr)
            for i in range(tonnetz.shape[0]):
                features[f"tonnetz_{i}_mean"] = np.mean(tonnetz[i])
                features[f"tonnetz_{i}_var"] = np.var(tonnetz[i])

            # Chroma CQT and CENS
            chroma_cqt = librosa.feature.chroma_cqt(y=y_segment, sr=sr)
            for i in range(chroma_cqt.shape[0]):
                features[f"chroma_cqt_mean_{i}"] = np.mean(chroma_cqt[i])
                features[f"chroma_cqt_var_{i}"] = np.var(chroma_cqt[i])

            chroma_cens = librosa.feature.chroma_cens(y=y_segment, sr=sr)
            features["chroma_cens_mean"] = np.mean(chroma_cens)
            features["chroma_cens_var"] = np.var(chroma_cens)

            # Chroma VQT
            chroma_vqt = librosa.feature.chroma_vqt(y=y_segment, sr=sr, intervals='equal')
            features["chroma_cens_mean"] = np.mean(chroma_cens)
            features["chroma_cens_var"] = np.var(chroma_cens)
            
            # Chroma VQT
            chroma_vqt = librosa.feature.chroma_vqt(y=y_segment, sr=sr, intervals='equal')
            features["chroma_vqt_mean"] = np.mean(chroma_vqt)
            features["chroma_vqt_var"] = np.var(chroma_vqt)

            # Mel spectrogram
            mel_spectrogram = librosa.feature.melspectrogram(y=y_segment, sr=sr, n_mels=64)
            for i in range(mel_spectrogram.shape[0]):
                features[f"mel_spec_mean_{i}"] = np.mean(mel_spectrogram[i])
                features[f"mel_spec_var_{i}"] = np.var(mel_spectrogram[i])

            # Spectral flatness
            spectral_flatness = librosa.feature.spectral_flatness(y=y_segment)
            features["spectral_flatness_mean"] = np.mean(spectral_flatness)
            features["spectral_flatness_var"] = np.var(spectral_flatness)

            # Polynomial features
            poly_features = librosa.feature.poly_features(y=y_segment, sr=sr)
            features["poly_features_mean"] = np.mean(poly_features)
            features["poly_features_var"] = np.var(poly_features)
            
            # Tempo and tempogram
            tempo, _ = librosa.beat.beat_track(y=y_segment, sr=sr)
            features["tempo"] = tempo

            tempogram = librosa.feature.tempogram(y=y_segment, sr=sr)
            features["tempogram_mean"] = np.mean(tempogram)
            features["tempogram_var"] = np.var(tempogram)
            
            fourier_tempogram = librosa.feature.fourier_tempogram(y=y_segment, sr=sr)
            features["fourier_tempogram_mean"] = np.mean(fourier_tempogram)
            features["fourier_tempogram_var"] = np.var(fourier_tempogram)

            tempogram_ratio = librosa.feature.tempogram(y=y_segment, sr=sr)
            features["tempogram_ratio_mean"] = np.mean(tempogram_ratio)
            features["tempogram_ratio_var"] = np.var(tempogram_ratio)

            # Stack memory
            stack_memory = librosa.feature.stack_memory(y_segment)
            features["stack_memory_mean"] = np.mean(stack_memory)
            features["stack_memory_var"] = np.var(stack_memory)
                
            yamnet_emb = get_yamnet_embedding(y_segment, sr)
            features["yamnet_emb_mean"] = np.mean(yamnet_emb)
            features["yamnet_emb_var"] = np.var(yamnet_emb)
                
            print(len(features))
            allSongs.append(features)
            print(f"Processed segment {segment_idx} of {filename} in genre {genre}")
            
            gc.collect()  # Run garbage collection to free memory

# Save to CSV
df = pd.DataFrame(allSongs)
os.makedirs(BASEPATH, exist_ok=True)
output_path = os.path.join("all_segmented_features.csv")
df.to_csv(output_path, index=False)
print(f"All features saved to {output_path}")



314
Processed segment 1 of blues.00093.wav in genre blues
314
Processed segment 2 of blues.00093.wav in genre blues
314
Processed segment 3 of blues.00093.wav in genre blues
314
Processed segment 4 of blues.00093.wav in genre blues
314
Processed segment 5 of blues.00093.wav in genre blues
314
Processed segment 6 of blues.00093.wav in genre blues
314
Processed segment 7 of blues.00093.wav in genre blues
314
Processed segment 8 of blues.00093.wav in genre blues
314
Processed segment 9 of blues.00093.wav in genre blues
314
Processed segment 10 of blues.00093.wav in genre blues
314
Processed segment 1 of blues.00087.wav in genre blues
314
Processed segment 2 of blues.00087.wav in genre blues
314
Processed segment 3 of blues.00087.wav in genre blues
314
Processed segment 4 of blues.00087.wav in genre blues
314
Processed segment 5 of blues.00087.wav in genre blues
314
Processed segment 6 of blues.00087.wav in genre blues
314
Processed segment 7 of blues.00087.wav in genre blues
314
Processed

  return pitch_tuning(


314
Processed segment 6 of classical.00080.wav in genre classical
314
Processed segment 7 of classical.00080.wav in genre classical
314
Processed segment 8 of classical.00080.wav in genre classical
314
Processed segment 9 of classical.00080.wav in genre classical
314
Processed segment 10 of classical.00080.wav in genre classical
314
Processed segment 1 of classical.00081.wav in genre classical
314
Processed segment 2 of classical.00081.wav in genre classical
314
Processed segment 3 of classical.00081.wav in genre classical
314
Processed segment 4 of classical.00081.wav in genre classical
314
Processed segment 5 of classical.00081.wav in genre classical
314
Processed segment 6 of classical.00081.wav in genre classical
314
Processed segment 7 of classical.00081.wav in genre classical
314
Processed segment 8 of classical.00081.wav in genre classical
314
Processed segment 9 of classical.00081.wav in genre classical
314
Processed segment 10 of classical.00081.wav in genre classical
314
Proc