In [3]:
import os
import json
import librosa
import numpy as np

from pydub import AudioSegment
from scipy.io.wavfile import read as read_wav

def extract_features(file_path):
    try:
        # Load the first 30 seconds of the audio file
        audio = AudioSegment.from_file(file_path)[:30000]  # Get first 30 seconds
        audio.export("temp.wav", format="wav")  # Export as wav
        sample_rate, audio_data = read_wav("temp.wav")  # Read wav file

        # Compute MFCCs (you'll need to install the python_speech_features library)
        from python_speech_features import mfcc
        mfccs = mfcc(audio_data, samplerate=sample_rate, numcep=40)
        mfccs_processed = np.mean(mfccs, axis=0)
    except Exception as e:
        print(f"Error encountered while parsing file: {file_path}")
        print(f"Exception: {e}")
        return None 
    return mfccs_processed.tolist()

features = []
labels = []
song_names = []  # List to store the names of the songs

folders = ['happy_exciting', 'heavy_rock', 'sad_reflective']

for i, folder in enumerate(folders):
    print(f'Processing folder {folder}...')
    for file_name in os.listdir(f'/Users/danielporras/Musica/musica_analysis/music_files/3_class_audio extraction/{folder}'):
        if file_name.endswith('.mp3') or file_name.endswith('.m4a'):
            file_path = os.path.join(f'/Users/danielporras/Musica/musica_analysis/music_files/3_class_audio extraction/{folder}', file_name)
            mfccs = extract_features(file_path)
            
            if mfccs is not None:  # Only append features and labels if mfccs could be computed
                features.append(mfccs)
                labels.append(i)  # use the folder index as the label
                song_names.append(file_name)  # Store the name of the song

# Save the features, labels, and song names to a JSON file
data = {
    "features": features,
    "labels": labels,
    "song_names": song_names
}

with open('data.json_3_classes', 'w') as fp:
    json.dump(data, fp)

Processing folder happy_exciting...




Processing folder heavy_rock...




Processing folder sad_reflective...




In [11]:
import pandas as pd
import json

# Load data from JSON file
with open('data.json_3_classes', 'r') as fp:
    data = json.load(fp)

# Convert features and labels to a pandas DataFrame
df_3_classes = pd.DataFrame(data['features'])
df_3_classes['label'] = data['labels']

# Print the first few rows of the DataFrame
print(df_3_classes.head())

print(len(data['features']))
print(len(data['labels']))

# Print all unique labels in the random data
print(df_3_classes['label'].unique())

# Print the count of each label in the random data
print(df_3_classes['label'].value_counts())

           0          1          2          3          4          5  \
0  17.455314  24.800676  -5.103231 -23.136596  33.288783 -19.275285   
1  17.455314  24.800676  -5.103231 -23.136596  33.288783 -19.275285   
2  11.545859  26.639756 -17.372481 -19.021037   0.494736 -11.841601   
3  18.789278  19.627897  -0.085293 -35.398495  40.855391 -22.839208   
4  16.690823  21.316411   2.995136 -29.554774  25.635387 -17.527180   

           6          7          8          9  ...        17        18  \
0  13.942809  -3.272066  18.486237 -23.011616  ...  3.685020 -2.222353   
1  13.942809  -3.272066  18.486237 -23.011616  ...  3.685020 -2.222353   
2  -4.756531  -5.546539 -14.678859 -11.733434  ...  0.269959 -1.013637   
3  18.239648  -8.829613  25.015727 -29.003930  ...  4.473168 -1.203284   
4  16.694295 -13.699385  20.033533 -20.921513  ...  0.049865  0.483697   

         19        20        21        22        23        24        25  label  
0  0.365056  0.036479  0.883356 -0.463687 -0.20