# Investigate the relationship between Spotify energy and librosa audio features

In [None]:
import pandas as pd
import os
import random
import librosa
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Audio
from IPython.display import display

In [None]:
# Track metadata table (one row per track, including Spotify's 'energy' value)
# and the folder holding the mp3 preview clips.
metadata_filepath = '../user_evaluation_app/static/SpotifyAudioFeaturesApril2019_scraped_selection.csv'
metadata = pd.read_csv(metadata_filepath)
mp3_folder = '../user_evaluation_app/static/mp3_previews'
# File names without the .mp3 extension; these are matched against
# metadata['track_id'] further down. Only entries ending in '.mp3' are kept,
# because os.listdir also returns hidden files and sub-folders, which would
# later be loaded as '<name>.mp3' and fail. The list is sorted because
# os.listdir returns entries in arbitrary order, which would make slices such
# as mp3_files[:1000] differ between runs/machines.
mp3_files = sorted(
    os.path.splitext(file)[0]
    for file in os.listdir(mp3_folder)
    if file.endswith('.mp3')
)

In [None]:
# Select tracks from each Spotify energy range
# The energy distribution is cut into 50 quantile bins (2% steps) and up to
# `num_samples` track ids are drawn at random from each bin, so the selection
# spans the whole energy range instead of clustering around the mode.
metadata_sorted = metadata.sort_values(by='energy')
quantiles = np.linspace(0, 1, 51)
selected_energy_range_ids = []

num_samples = 10
energy_column = metadata_sorted['energy']
# All 51 bin edges in one call instead of two quantile computations per bin.
bin_edges = energy_column.quantile(quantiles).to_numpy()
last_bin = len(bin_edges) - 2
for i in range(len(bin_edges) - 1):
    start = bin_edges[i]
    end = bin_edges[i + 1]
    # Bins are half-open [start, end) so a track never lands in two bins; the
    # last bin is closed on the right, otherwise the track(s) with the maximum
    # energy could never be selected.
    if i == last_bin:
        in_bin = (energy_column >= start) & (energy_column <= end)
    else:
        in_bin = (energy_column >= start) & (energy_column < end)
    selected_ids = metadata_sorted.loc[in_bin, 'track_id']
    if len(selected_ids) >= num_samples:
        selected_energy_range_ids.extend(selected_ids.sample(num_samples).values.tolist())
    else:
        # Sparse bin: take everything it contains.
        selected_energy_range_ids.extend(selected_ids.values.tolist())

# Per-track librosa features, collected in parallel lists together with the
# Spotify energy value of the same track.
tempo_values = []
rms_energy_values = []
contrast_values = []
dynamic_range_values = []
spotify_energy_values = []

# Build the track_id -> energy lookup once instead of scanning the whole
# metadata table for every track. Keeping the first row per id matches the
# previous `.values[0]` behaviour for duplicated ids.
energy_by_track = metadata.drop_duplicates(subset='track_id').set_index('track_id')['energy']

for track_id in selected_energy_range_ids:
    y, sr = librosa.load(f'{mp3_folder}/{track_id}.mp3')
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    # Frame-wise RMS is computed once and reused for both the mean level and
    # the dynamic range.
    rms_energy_frames = librosa.feature.rms(y=y)
    rms_energy = rms_energy_frames.mean()
    # Mean over frames per frequency band, then mean over the bands.
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr).mean(axis=1)
    contrast_avg = np.mean(spectral_contrast)
    dynamic_range = np.max(rms_energy_frames) - np.min(rms_energy_frames)
    spotify_energy = energy_by_track.loc[track_id]

    # Append the values to the lists
    # NOTE(review): depending on the librosa version, beat_track may return the
    # tempo as a 1-element array instead of a scalar; coercing to float keeps
    # the 'Tempo' column numeric so that .corr() works either way.
    tempo_values.append(float(np.atleast_1d(tempo)[0]))
    rms_energy_values.append(rms_energy)
    contrast_values.append(contrast_avg)
    dynamic_range_values.append(dynamic_range)
    spotify_energy_values.append(spotify_energy)

# Create a DataFrame
audio_features = pd.DataFrame({
    'Tempo': tempo_values,
    'RMS Energy': rms_energy_values,
    'Spectral Contrast': contrast_values,
    'Dynamic Range': dynamic_range_values,
    'Spotify Energy': spotify_energy_values,
})

In [None]:
# For every librosa feature: scatter it against Spotify energy, print the
# correlation coefficient and a verbal rating of its absolute strength.
spotify_energy_column = audio_features['Spotify Energy']
strength_levels = ((0.7, "Very Strong"), (0.5, "Strong"), (0.2, "Moderate"))

for feature in ['Tempo', 'RMS Energy', 'Spectral Contrast', 'Dynamic Range']:
    feature_column = audio_features[feature]

    plt.figure(figsize=(10, 5))
    plt.scatter(spotify_energy_column, feature_column)
    plt.xlabel('Spotify Energy')
    plt.ylabel(feature)
    plt.title(f'Spotify Energy vs {feature}')
    plt.show()

    correlation = spotify_energy_column.corr(feature_column)
    print(f"Correlation between Spotify Energy and {feature}: {correlation}")

    # First threshold exceeded wins; anything at or below 0.2 (or NaN) is "Weak".
    magnitude = abs(correlation)
    strength = "Weak"
    for threshold, label in strength_levels:
        if magnitude > threshold:
            strength = label
            break
    print(f"{strength} correlation between Spotify Energy and {feature}")
    print("--------------------\n\n\n")

In [None]:
from multiprocessing import Process, Queue

def retrieve_energy_scores(id):
    """Compute energy-related librosa features for one mp3 preview.

    Parameters
    ----------
    id : str
        File name, without the ``.mp3`` extension, inside ``mp3_folder``.

    Returns
    -------
    tuple
        ``(rms_energy, contrast_avg, dynamic_range)``: the mean of the
        frame-wise RMS, the mean spectral contrast (averaged over frames, then
        over bands) and the difference between the largest and smallest
        frame-wise RMS value.
    """
    y, sr = librosa.load(f'{mp3_folder}/{id}.mp3')
    # Frame-wise RMS is computed once and reused for mean and dynamic range.
    rms_energy_frames = librosa.feature.rms(y=y)
    rms_energy = rms_energy_frames.mean()
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr).mean(axis=1)
    contrast_avg = np.mean(spectral_contrast)
    dynamic_range = np.max(rms_energy_frames) - np.min(rms_energy_frames)

    return (rms_energy, contrast_avg, dynamic_range)


def process_mp3_files(mp3_files):
    """Extract energy features for several previews in parallel.

    Parameters
    ----------
    mp3_files : iterable of str, or str
        File names without the ``.mp3`` extension. A single string is treated
        as one id rather than being iterated character by character.

    Returns
    -------
    pandas.DataFrame
        One row per input id, in input order, with the columns ``id``,
        ``rms_energy``, ``contrast_avg`` and ``dynamic_range``. Empty input
        yields an empty frame with the same columns.

    Notes
    -----
    A bounded worker pool replaces the former one-process-per-file fan-out,
    whose workers were started with an extra ``queue`` argument that
    ``retrieve_energy_scores`` does not accept, so no result was ever
    collected. An exception raised while processing a file propagates to the
    caller instead of silently producing a missing row.
    NOTE(review): worker processes must be able to import/pickle
    ``retrieve_energy_scores``; with the 'spawn' start method this may not work
    for functions defined inside a notebook cell - confirm on the target OS.
    """
    # Local import keeps this block self-contained.
    from concurrent.futures import ProcessPoolExecutor

    columns = ['id', 'rms_energy', 'contrast_avg', 'dynamic_range']

    if isinstance(mp3_files, str):
        ids = [mp3_files]
    else:
        ids = list(mp3_files)
    if not ids:
        return pd.DataFrame(columns=columns)

    with ProcessPoolExecutor() as executor:
        # executor.map yields results in the order of `ids`.
        scores = list(executor.map(retrieve_energy_scores, ids))

    rows = [(track_id, *track_scores) for track_id, track_scores in zip(ids, scores)]
    return pd.DataFrame(rows, columns=columns)

# Run the extraction on the first 10 previews: pass a list of ids
# (mp3_files[:10]), not the single id string mp3_files[10].
audio_features_energy = process_mp3_files(mp3_files[:10])
audio_features_energy

In [None]:
# Sequential feature extraction for (at most) the first 1000 previews.
n_files = len(mp3_files)
processed_ids = mp3_files[:1000]
n_processed = len(processed_ids)

# Result arrays are sized to the files actually processed; sizing them to
# n_files left all-zero rows for every file beyond the first 1000.
rms_energy_values = np.zeros(n_processed)
contrast_values = np.zeros(n_processed)
dynamic_range_values = np.zeros(n_processed)

for i, track_id in enumerate(processed_ids):
    y, sr = librosa.load(f'{mp3_folder}/{track_id}.mp3')
    # Frame-wise RMS is computed once and reused for mean and dynamic range.
    rms_energy_frames = librosa.feature.rms(y=y)
    rms_energy = rms_energy_frames.mean()
    # Mean over frames per frequency band, then mean over the bands.
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr).mean(axis=1)
    contrast_avg = np.mean(spectral_contrast)
    dynamic_range = np.max(rms_energy_frames) - np.min(rms_energy_frames)

    # Store results
    rms_energy_values[i] = rms_energy
    contrast_values[i] = contrast_avg
    dynamic_range_values[i] = dynamic_range

# Create a DataFrame
audio_features = pd.DataFrame({
    'ID': processed_ids,
    'RMS Energy': rms_energy_values,
    'Spectral Contrast': contrast_values,
    'Dynamic Range': dynamic_range_values,
})
audio_features