In [3]:
import os
import json
import librosa
import numpy as np

from pydub import AudioSegment
from scipy.io.wavfile import read as read_wav

def extract_features(file_path):
    try:
        # Load the first 30 seconds of the audio file
        audio = AudioSegment.from_file(file_path)[:30000]  # Get first 30 seconds
        audio.export("temp.wav", format="wav")  # Export as wav
        sample_rate, audio_data = read_wav("temp.wav")  # Read wav file

        # Compute MFCCs (you'll need to install the python_speech_features library)
        from python_speech_features import mfcc
        mfccs = mfcc(audio_data, samplerate=sample_rate, numcep=40)
        mfccs_processed = np.mean(mfccs, axis=0)
    except Exception as e:
        print(f"Error encountered while parsing file: {file_path}")
        print(f"Exception: {e}")
        return None 
    return mfccs_processed.tolist()

features = []
labels = []
song_names = []  # List to store the names of the songs

folders = ['happy_exciting', 'heavy_rock', 'sad_reflective']

for i, folder in enumerate(folders):
    print(f'Processing folder {folder}...')
    for file_name in os.listdir(f'/Users/danielporras/Musica/musica_analysis/music_files/3_class_audio extraction/{folder}'):
        if file_name.endswith('.mp3') or file_name.endswith('.m4a'):
            file_path = os.path.join(f'/Users/danielporras/Musica/musica_analysis/music_files/3_class_audio extraction/{folder}', file_name)
            mfccs = extract_features(file_path)
            
            if mfccs is not None:  # Only append features and labels if mfccs could be computed
                features.append(mfccs)
                labels.append(i)  # use the folder index as the label
                song_names.append(file_name)  # Store the name of the song

# Save the features, labels, and song names to a JSON file
data = {
    "features": features,
    "labels": labels,
    "song_names": song_names
}

with open('data.json_3_classes', 'w') as fp:
    json.dump(data, fp)

Processing folder happy_exciting...




Processing folder heavy_rock...




Processing folder sad_reflective...




In [None]:
import pandas as pd
import json

# Load data from JSON file
with open('data.json_3_classes', 'r') as fp:
    data = json.load(fp)

# Convert features and labels to a pandas DataFrame
df_3_classes = pd.DataFrame(data['features'])
df_3_classes['label'] = data['labels']

# Print the first few rows of the DataFrame
print(df_3_classes.head())

print(len(data['features']))
print(len(data['labels']))

# Print all unique labels in the random data
print(df_3_classes['label'].unique())

# Print the count of each label in the random data
print(df_3_classes['label'].value_counts())

           0          1          2          3          4          5  \
0  17.455314  24.800676  -5.103231 -23.136596  33.288783 -19.275285   
1  17.455314  24.800676  -5.103231 -23.136596  33.288783 -19.275285   
2  11.545859  26.639756 -17.372481 -19.021037   0.494736 -11.841601   
3  18.789278  19.627897  -0.085293 -35.398495  40.855391 -22.839208   
4  16.690823  21.316411   2.995136 -29.554774  25.635387 -17.527180   

           6          7          8          9  ...        17        18  \
0  13.942809  -3.272066  18.486237 -23.011616  ...  3.685020 -2.222353   
1  13.942809  -3.272066  18.486237 -23.011616  ...  3.685020 -2.222353   
2  -4.756531  -5.546539 -14.678859 -11.733434  ...  0.269959 -1.013637   
3  18.239648  -8.829613  25.015727 -29.003930  ...  4.473168 -1.203284   
4  16.694295 -13.699385  20.033533 -20.921513  ...  0.049865  0.483697   

         19        20        21        22        23        24        25  label  
0  0.365056  0.036479  0.883356 -0.463687 -0.20

In [15]:
import os
import json
import librosa
import numpy as np
from pydub import AudioSegment
from sklearn.preprocessing import StandardScaler
import pickle
import traceback


# Function to extract features from an audio file
def extract_features(file_path, num_segments=5, max_pad_length=174):
    # max_pad_length depends on your data; adjust so it fits your longest sample or desired fixed length
    try:
        # Load the full audio file
        audio = AudioSegment.from_file(file_path)
        audio = audio.set_frame_rate(44100)  # Set frame rate
        audio = audio.set_channels(1)  # Set to mono
        audio.export("temp.wav", format="wav")  # Export as wav

        # Load using librosa now
        y, sr = librosa.load('temp.wav', sr=44100)

        # Ensure length is at least 30s for consistency
        if len(y) < 30 * sr:
            y = np.pad(y, int(np.ceil((30 * sr - len(y)) / 2)), mode='reflect')

        # Trim to 30s in the middle of the file to avoid leading/trailing silence
        y = y[int(len(y) / 2 - 15 * sr):int(len(y) / 2 + 15 * sr)]

        # Compute MFCCs from the audio data
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)

        # Pad or truncate the MFCCs to have the same shape
        if mfccs.shape[1] > max_pad_length:
            mfccs = mfccs[:, :max_pad_length]
        else:
            padding = max_pad_length - mfccs.shape[1]
            mfccs = np.pad(mfccs, ((0, 0), (0, padding)), mode='constant')

        # Remove the temporary file
        os.remove("temp.wav")

        # Return the 2D MFCC array directly, without flattening
        return mfccs
    except Exception as e:
        print(f"Error encountered while parsing file: {file_path}")
        print(traceback.format_exc())
        return None

# Continue with your data collection and preprocessing as before, now with adjusted feature extraction

# Path to the dataset
base_dir = '/Users/danielporras/Musica/musica_analysis/music_files/3_class_audio extraction'
folders = ['happy_exciting', 'heavy_rock', 'sad_reflective']
features, labels, song_names = [], [], []

# Assume earlier parts of the script are unchanged

# Extract features for each audio file
for label, folder in enumerate(folders):
    folder_path = os.path.join(base_dir, folder)
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.mp3') or file_name.endswith('.m4a'):
            file_path = os.path.join(folder_path, file_name)
            file_features = extract_features(file_path)
            # Ensure file_features is not None before proceeding
            if file_features is not None:  # This is the correct placement of the check
                # Now we append the features and other information safely
                features.append(file_features)
                labels.append(label)
                song_names.append(file_name)


if features:
    # Convert features list to a 3D numpy array: samples, mfcc_features, time_steps
    features = np.array(features)  # This becomes a 3D array [num_samples, n_mfcc, max_pad_length]
    
    # Flatten the features to 2D for standardization (samples, features)
    features_flattened = features.reshape(features.shape[0], -1)
    
    scaler = StandardScaler().fit(features_flattened)
    features_normalized = scaler.transform(features_flattened)
    
    # Reshape back to 3D after normalization
    features_normalized = features_normalized.reshape(features.shape)
    
    # Prepare the dataset for CNN input
    # Adding a channel dimension: [num_samples, n_mfcc, max_pad_length, 1]
    features_normalized = np.expand_dims(features_normalized, axis=-1)
    
    # Save the normalized features, labels, and song names
    data = {
        "features": features_normalized.tolist(),  # Storing as list for JSON serialization
        "labels": labels,
        "song_names": song_names
    }
    with open('data.json_3_classes_2', 'w') as fp:
        json.dump(data, fp)

    # Save the scaler for later use
    with open('scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)
else:
    print("No features were extracted. Check the audio files and extraction process.")

# Assuming 'features_normalized' is what you refer to as 'X' in your print statement
print(features_normalized.shape)  # To understand the current shape of your feature array for CNN input

(534, 40, 174, 1)
