In [22]:
"""imports"""
import numpy as np
import librosa
import matplotlib.pyplot as plt

In [23]:
"""spotify authentication"""
# import tomllib
# import spotipy
# from spotipy.oauth2 import SpotifyClientCredentials
# with open("cfg.toml", "rb") as cfg:
#     keys = tomllib.load(cfg)["spotify"]
#     c_id = keys["client_id"]
#     c_secret = keys["client_secret"]
#     auth_manager = SpotifyClientCredentials(client_id=c_id, client_secret=c_secret)
    
# sp = spotipy.Spotify(auth_manager=auth_manager)

'spotify authentication'

In [24]:
"""Old Input Loaders"""
# def downsampleAudio(audio_file_path, target_sample_rate=16000):
#     #load the audio and downsample it. 
#     #librosa.load converts the audio file input (.wav, preferrably) into a time series. 
#     signal, sampling_rate = librosa.load(audio_file_path, sr=target_sample_rate, mono=True)
#     #normalize amplitude of the audio signal
#     signal = signal / np.max(np.abs(signal))
#     return signal, sampling_rate

# #draft preprocessor module. Downsamples and STFTs the audio. 
# def processAudio(audio_file_path):
#     signal, sampling_rate = downsampleAudio(audio_file_path)
#     #stft signal for feature computation
#     stft_signal = librosa.stft(signal, window='hann')
#     return signal, stft_signal

'Old Input Loaders'

In [25]:
"""original RMS and dynamic range compute, operated over the entire signal."""
# def computeRMS(signal):
#     #RMS is the square root of the average of the squared signal. 
#     squared_signal = np.square(signal)
#     mean_squared = np.mean(squared_signal)
#     rms = np.sqrt(mean_squared)
#     #convert RMS to decibels
#     rms = 20*np.log10(rms)
#     return rms

# def computeDynamicRange(signal, rms):
#     #Dynamic range here is defined as peak - RMS, as peak - min would yield the max. 
#     #decibel conversion in order to avoid computing division, as that ends up being slower. 
#     max_level = 20 *np.log10(np.max(np.abs(signal)))
#     dynamic_range = max_level - rms
#     return dynamic_range

'original RMS and dynamic range compute, operated over the entire signal.'

In [26]:
"""spectrogram plotter for testing"""
def plotSpectrogram(stft_signal, target_sampling_rate):
    """Plot the power spectrogram (in dB) of an STFT matrix and print its shape.

    Adapted from @baleanu.

    Parameters:
        stft_signal: complex STFT matrix, e.g. the output of librosa.stft
        target_sampling_rate: sampling rate handed to specshow to scale the axes
    """
    # librosa.display is a submodule; import it explicitly so the specshow call
    # below does not rely on it having been loaded elsewhere.
    import librosa.display

    plt.figure(figsize=(12, 12))
    # |STFT|^2 is a power quantity, so it is converted with power_to_db
    # (10*log10). amplitude_to_db would square it a second time and double
    # every dB value shown on the colour bar.
    power_spectrogram = np.abs(stft_signal) ** 2
    spectrogram_db = librosa.power_to_db(power_spectrogram, ref=np.max)

    # display spectrogram
    librosa.display.specshow(spectrogram_db, sr=target_sampling_rate, x_axis='time', y_axis='linear', cmap='viridis')
    plt.colorbar(label='Power (dB)')
    plt.title('Power Spectrogram')
    plt.xlabel('Time (s)')
    plt.ylabel('Frequency (Hz)')
    plt.show()
    print(stft_signal.shape)

In [27]:
"""audio processor
taken from matthew baleanu and Mohamad-Hassan Bahsoun
16khz downsample (default 22.05khz, spotify audio is 44.1khz.)
default sampling rate set to 16khz as per @bahsoun"""
def processAudio(audio_file_path, target_sample_rate=16000):
    """Load an audio file as mono, peak-normalise it and compute its STFT.

    Parameters:
        audio_file_path: path of the audio file, passed to librosa.load
        target_sample_rate: sampling rate requested from librosa.load (default 16000)
    Returns:
        signal: the mono time series, scaled so its largest absolute sample is 1
                (left untouched when the file is silent or empty)
        stft_signal: librosa.stft of the normalised signal, Hann window
        sampling_rate: the sampling rate reported by librosa.load
    """
    # librosa.load converts the audio file into a time series at the desired sampling rate
    signal, sampling_rate = librosa.load(audio_file_path, sr=target_sample_rate, mono=True)

    # Normalise the amplitude. A silent file has a peak of 0; dividing by it
    # would fill the signal with NaNs, so the division is skipped in that case.
    peak = np.max(np.abs(signal)) if signal.size else 0.0
    if peak > 0:
        signal = signal / peak

    # STFT of the signal for feature computation (probably will be moved)
    stft_signal = librosa.stft(signal, window='hann')
    return signal, stft_signal, sampling_rate

In [28]:
"""Signal Divider Based on BPM
    As suggested by MVM, we divide the signal into windows (bins) and compute a window size so that each feature
    can be measured per window; the resulting variance lets us determine how relevant a feature would be for classification/recommendation."""
def divideSignal(signal, bpm, sampling_rate, beats_per_win = 4):
    """Split a 1D signal into equally sized windows derived from its tempo.

    Process:
        1. take the global tempo (bpm) and convert it to a beat duration
        2. derive how many beats the whole signal contains
        3. group the beats into windows and reshape the signal accordingly

    Parameters:
        signal: 1D audio time series
        bpm: global tempo in beats per minute, must be > 0
        sampling_rate: samples per second of `signal`, must be > 0
        beats_per_win: number of beats grouped into one window (default 4), must be > 0
    Returns:
        divided_signal: 2D ndarray of shape (window_count, window_size); the tail
                        of the last window is zero padded
        window_size: number of samples per window
        window_count: number of windows
    Raises:
        ValueError: if the signal is empty or bpm, sampling_rate or beats_per_win
                    is not a positive number
    """
    # Validate up front: an empty signal or non-positive tempo would otherwise
    # surface as an opaque division-by-zero further down.
    if len(signal) == 0:
        raise ValueError("signal must contain at least one sample")
    if not (bpm > 0 and sampling_rate > 0 and beats_per_win > 0):
        raise ValueError("bpm, sampling_rate and beats_per_win must be positive")

    beat_duration = 60 / bpm
    song_length_seconds = len(signal) / sampling_rate
    beat_count = song_length_seconds / beat_duration

    # Window count is beat_count / beats_per_win rounded up. The window size is
    # len(signal) / window_count rounded up so that every sample is contained.
    window_count = int(np.ceil(beat_count / beats_per_win))
    window_size = int(np.ceil(len(signal) / window_count))

    # Pad with zeroes at the end until the length is an exact multiple of the
    # window size, then reshape into (window_count, window_size).
    divided_signal = np.pad(signal,
                            (0, window_count * window_size - len(signal)),
                            mode="constant",
                            constant_values=0)
    divided_signal = divided_signal.reshape(window_count, window_size)
    return divided_signal, window_size, window_count

"""need to STFT individual pieces"""
def divideSTFT(divided_signal):
    """Run an STFT on every window of a divided signal.

    Parameters:
        divided_signal: ndarray with one window per row (presumably the output
                        of divideSignal)
    Returns:
        divided_stft_signal: complex128 ndarray (windows, bins, frames) holding
                             the STFT of each window
        divided_stft_magnitudes: float ndarray of the same shape holding np.abs
                                 of each window's STFT
    """
    window_total = divided_signal.shape[0]
    # Transform the first window once, purely to learn the per-window STFT
    # shape so both outputs can be preallocated.
    n_bins, n_frames = librosa.stft(divided_signal[0]).shape
    out_shape = (window_total, n_bins, n_frames)
    divided_stft_signal = np.zeros(out_shape, dtype=np.complex128)
    divided_stft_magnitudes = np.zeros(out_shape)

    for idx, window in enumerate(divided_signal):
        window_stft = librosa.stft(window)
        divided_stft_signal[idx] = window_stft
        divided_stft_magnitudes[idx] = np.abs(window_stft)

    return divided_stft_signal, divided_stft_magnitudes

In [29]:
"""Temporary BPM compute"""
def computeBPM(signal, target_sample_rate):
    """Temporary global tempo estimate, delegating to librosa.feature.tempo.

    This stands in for the real computeBPM written by @baleanu, which has not
    been merged into this branch yet; the BPM-based window division needs some
    tempo value in order to be tested.

    Parameters:
        signal: audio time series
        target_sample_rate: sampling rate of `signal`
    Returns:
        the first element of the array returned by librosa.feature.tempo
    """
    return librosa.feature.tempo(y=signal, sr=target_sample_rate)[0]

In [30]:
"""RMS compute, operates over divided signal
computes rms over each window, return vector of rms values and average rms"""
def computeRMS(divided_stft_signal_mag):
    """Compute the RMS level, in decibels, of every window.

    Parameters:
        divided_stft_signal_mag: 3D ndarray of STFT magnitudes, one window per
                                 entry along the first axis
    Returns:
        rms: ndarray of shape (windows, 1) with the RMS level of each window in dB
        rms_mean: average of the per-window dB values
    """
    floor = 1e-10  # lower bound fed to log10 so a silent window yields -200 dB, not -inf

    levels_db = [
        20 * np.log10(np.maximum(np.sqrt(np.mean(np.square(window_mag))), floor))
        for window_mag in divided_stft_signal_mag
    ]

    rms = np.array(levels_db, dtype=np.float64).reshape(-1, 1)
    rms_mean = sum(levels_db) / divided_stft_signal_mag.shape[0]
    return rms, rms_mean

In [31]:
"""divided dynamic range compute
Computes the dynamic range in each window of the signal"""
def computeDynamicRange(divided_signal, divided_rms):
    """Compute the dynamic range (peak level minus RMS level, in dB) per window.

    Parameters:
        divided_signal: ndarray with one window per entry along the first axis;
                        the peak is taken over the absolute values of each window
        divided_rms: per-window RMS levels in dB, indexable by window
                     (presumably the `rms` array returned by computeRMS)
    Returns:
        dynamic_range: ndarray of shape (windows, 1) with the dynamic range of
                       each window in dB
        dynamic_range_mean: average of the per-window dynamic ranges
    """
    # Same floor as computeRMS: an all-zero window (e.g. silence) would
    # otherwise give log10(0) = -inf and poison the mean.
    epsilon = 1e-10
    dynamic_range = np.zeros((divided_signal.shape[0], 1))
    dynamic_range_mean = 0
    for i in range(divided_signal.shape[0]):
        peak = np.max(np.abs(divided_signal[i]))
        # Working in decibels turns the peak/RMS ratio into a subtraction.
        dynamic_max = 20 * np.log10(np.maximum(peak, epsilon))
        dynamic_range_slice = dynamic_max - divided_rms[i]
        dynamic_range[i] = dynamic_range_slice
        dynamic_range_mean += dynamic_range_slice
    dynamic_range_mean /= divided_signal.shape[0]
    return dynamic_range, dynamic_range_mean

In [32]:
"""spectral centroid 1: averaged spectral centroid compute"""

"""define a helper function for getSpectralCentroid"""
def computeSpectralCentroid(stft_signal, frequencies):
    """Single spectral centroid for a whole window (all frames pooled).

    Centroid = sum(f(n) * x(n)) / sum(x(n)), summed over every bin and frame.
    Note: this notebook defines computeSpectralCentroid again in a later cell;
    whichever cell ran last is the one in effect.

    Parameters:
        stft_signal: 2D array of STFT magnitudes (bins along the first axis)
        frequencies: 1D array with the frequency of each bin
    Returns:
        the magnitude-weighted mean frequency as a single value
    """
    bin_freqs = frequencies[:, None]  # column vector so it broadcasts across frames

    weighted_total = np.sum(bin_freqs * stft_signal)
    magnitude_total = np.sum(stft_signal)

    # A window with no energy would divide by zero; substitute a tiny value.
    safe_total = np.where(magnitude_total == 0, 1e-6, magnitude_total)
    return weighted_total / safe_total

""" spectral centroid code adapted from @bahsoun"""
def computeSpectralCentroidsMean(divided_stft_magnitudes, sampling_rate):
    """One pooled spectral centroid per window plus their average.

    Centroid = sum(f(n) * x(n)) / sum(x(n)); np.fft.rfftfreq supplies f(n) for a
    2048-point FFT. Note: this notebook defines computeSpectralCentroidsMean
    again in a later cell; whichever cell ran last is the one in effect.

    Parameters:
        divided_stft_magnitudes: STFT magnitudes of every window, one window per
                                 entry along the first axis
        sampling_rate: sampling rate used to scale the bin frequencies
    Returns:
        spectral_centroids: ndarray of shape (windows, 1), one centroid per window
        mean_spectral_centroid: average of those centroids
    """
    frequencies = np.fft.rfftfreq(2048, d=1/sampling_rate)
    window_total = divided_stft_magnitudes.shape[0]
    spectral_centroids = np.zeros((window_total, 1))

    running_total = 0
    for idx, window_mag in enumerate(divided_stft_magnitudes):
        centroid = computeSpectralCentroid(window_mag, frequencies)
        spectral_centroids[idx] = centroid
        running_total += centroid

    return spectral_centroids, running_total / window_total

In [33]:
"""spectral centroid 2: computed centroid at each sampling frame"""
"""define a helper function for getSpectralCentroid"""
def computeSpectralCentroid(stft_signal_mag, frequencies):
    """Spectral centroid of every frame in one window.

    Centroid = sum(f(n) * x(n)) / sum(x(n)), evaluated independently per frame.

    Parameters:
        stft_signal_mag: 2D array with the magnitude of a window's STFT
                         (bins along the first axis, frames along the second)
        frequencies: 1D array with the frequency of each bin
    Returns:
        1D array holding one centroid per frame
    """
    bin_freqs = frequencies[:, None]  # column vector so it broadcasts across frames

    weighted_per_frame = np.sum(bin_freqs * stft_signal_mag, axis=0)
    magnitude_per_frame = np.sum(stft_signal_mag, axis=0)

    # Frames without any energy would divide by zero; substitute a tiny value.
    safe_magnitude = np.where(magnitude_per_frame == 0, 1e-6, magnitude_per_frame)
    return weighted_per_frame / safe_magnitude

""" spectral centroid code adapted from @bahsoun"""
def computeSpectralCentroidsMean(divided_stft_magnitudes, sampling_rate):
    """Per-frame spectral centroids for every window, with per-window and overall means.

    Centroid = sum(f(n) * x(n)) / sum(x(n)); np.fft.rfftfreq supplies f(n) for a
    2048-point FFT.

    Parameters:
        divided_stft_magnitudes: 3D ndarray with the STFT magnitudes of every window
        sampling_rate: sampling rate used to scale the bin frequencies
    Returns:
        spectral_centroids: ndarray (windows, frames) with the centroid of each frame
        total_mean_spectral_centroid: average of the per-window means over the track
        mean_spectral_centroids: ndarray (windows, 1) with the mean centroid of each window
    """
    frequencies = np.fft.rfftfreq(2048, d=1/sampling_rate)
    window_total = divided_stft_magnitudes.shape[0]

    # Probe the first window to learn how many values the helper yields per window.
    values_per_window = computeSpectralCentroid(divided_stft_magnitudes[0], frequencies).shape[0]
    spectral_centroids = np.zeros((window_total, values_per_window))
    mean_spectral_centroids = np.zeros((window_total, 1))

    running_total = 0
    for idx, window_mag in enumerate(divided_stft_magnitudes):
        frame_centroids = computeSpectralCentroid(window_mag, frequencies)
        spectral_centroids[idx] = frame_centroids
        # np.mean averages the centroid over the frames of this window
        window_mean = np.mean(frame_centroids)
        running_total += window_mean
        mean_spectral_centroids[idx] = window_mean

    return spectral_centroids, running_total / window_total, mean_spectral_centroids

In [34]:
"""method 2 spectral rolloff compute testing"""
def computeSpectralRolloffFrequency(stft_magnitude, frequencies, percentile):
    """Spectral rolloff frequency of every frame of an STFT magnitude matrix.

    Note: this notebook defines computeSpectralRolloffFrequency again in a later
    cell; whichever cell ran last is the one in effect.

    Parameters:
        stft_magnitude (np.ndarray): STFT magnitudes, bins along axis 0 and
            frames along axis 1
        frequencies: 1D array with the frequency of each bin
        percentile (float): share of the per-frame total that defines the
            rolloff. It is multiplied directly with the total, so it has to be
            a fraction such as 0.85 or 0.95.
    Returns:
        np.ndarray: rolloff frequency of each frame (shape: time_frames)
    """
    # Per-frame total and the share of it that has to be accumulated
    frame_total = np.sum(stft_magnitude, axis=0)
    cutoff = frame_total * percentile

    # Running sum along the frequency axis; the first bin reaching the cutoff is the rolloff bin
    running = np.cumsum(stft_magnitude, axis=0)
    reached = running >= cutoff
    first_hit = np.argmax(reached, axis=0)

    # Frames whose running sum never reaches the cutoff fall back to the highest bin
    never_reached = ~np.any(reached, axis=0)
    first_hit = np.where(never_reached, len(frequencies) - 1, first_hit)

    return frequencies[first_hit]

In [35]:
"""Sampling Frame Spectral Rolloff featurizer

This function computes the frequency range at each sampling frame"""
def computeSpectralRolloffFrequency(stft_magnitude, frequencies, percentile):
    """Two-sided spectral rolloff of every frame, using a cumulative sum.

    Parameters:
        stft_magnitude: 2D ndarray with the magnitudes of a window's STFT
                        (bins along axis 0, frames along axis 1)
        frequencies: 1D array with the frequency of each bin
        percentile: fraction of the per-frame total to trim on each side,
                    e.g. 0.05 keeps the 5%..95% band. A value above 0.5 is
                    mirrored (0.95 behaves like 0.05).
    Returns:
        (upper_frequencies, lower_frequencies): two 1D arrays, one value per
        frame. `upper` is the first bin whose cumulative magnitude reaches the
        upper threshold; `lower` is the first bin whose cumulative magnitude
        exceeds the lower threshold.
    """
    # Two-sided thresholds: e.g. percentile = 0.05 -> lower = 0.05, upper = 0.95
    lower, upper = sorted((percentile, 1 - percentile))

    total_energy = np.sum(stft_magnitude, axis=0)
    cumulative_energy = np.cumsum(stft_magnitude, axis=0)

    # Boolean masks (bins x frames) marking where each threshold has been crossed
    upper_reached = cumulative_energy >= total_energy * upper
    lower_exceeded = cumulative_energy > total_energy * lower

    # argmax on a boolean mask returns the first True along the frequency axis
    upper_indices = np.argmax(upper_reached, axis=0)
    lower_indices = np.argmax(lower_exceeded, axis=0)

    # argmax also returns 0 when a column holds no True at all, so the fallback
    # must be decided per frame from the masks. Testing the index vectors
    # instead collapses to a single flag for the whole window and rewrites every
    # frame to the highest frequency whenever all upper indices happen to be 0
    # (a silent window, or all energy in the lowest bin).
    upper_indices = np.where(np.any(upper_reached, axis=0), upper_indices, len(frequencies) - 1)
    lower_indices = np.where(np.any(lower_exceeded, axis=0), lower_indices, 0)

    return frequencies[upper_indices], frequencies[lower_indices]
    
"""this function calls computeSpectralRolloffFrequency in order to trim each piece of the signal. We then calculate the range 
for the specific interval and average it at the end."""
def computeFrequencyRange(divided_stft_magnitudes, sampling_rate, percentile=0.05):
    """Per-frame frequency range (upper rolloff minus lower rolloff) of every window.

    Parameters:
        divided_stft_magnitudes: 3D ndarray (windows, bins, frames) of STFT magnitudes
        sampling_rate: sampling rate used to scale the bin frequencies of a 2048-point FFT
        percentile: share trimmed on each side, forwarded to
                    computeSpectralRolloffFrequency (default 0.05)
    Returns:
        frequency_ranges: ndarray (windows, frames) with the range of each frame
        mean_frequency_range: average over the windows of each window's mean range
    """
    frequencies = np.fft.rfftfreq(2048, d=1/sampling_rate)
    window_total = divided_stft_magnitudes.shape[0]
    frequency_ranges = np.zeros((window_total, divided_stft_magnitudes.shape[2]))

    running_total = 0
    for idx, window_mag in enumerate(divided_stft_magnitudes):
        upper_freq, lower_freq = computeSpectralRolloffFrequency(window_mag, frequencies, percentile)
        window_range = upper_freq - lower_freq
        frequency_ranges[idx] = window_range
        running_total += np.mean(window_range)

    return frequency_ranges, running_total / window_total

In [36]:
"""Spectral Bandwidth Calculation
    This function is used to compute the spectral bandwidth: it takes in the STFT magnitudes, the frequencies and the spectral centroid.
    The bandwidth is the amplitude-weighted average of the differences between the spectral components and the centroid."""
def computeSpectralBandwidth(stft_magnitude, frequencies, centroid):
    """Element-wise bandwidth term sqrt(mag * (freq - centroid)^2).

    Parameters:
        stft_magnitude: 2D array of STFT magnitudes (bins along the first axis)
        frequencies: 1D array with the frequency of each bin
        centroid: spectral centroid value(s), broadcast against the
                  (bins, frames) grid
    Returns:
        array with the same shape as the broadcast inputs, one term per bin and frame

    NOTE(review): the terms are not summed over the frequency axis, so this is
    a per-bin quantity rather than one bandwidth per frame. Confirm this is intended.
    """
    bin_freqs = frequencies[:, None]  # column vector so it broadcasts across frames
    squared_deviation = np.square(bin_freqs - centroid)
    return np.sqrt(stft_magnitude * squared_deviation)
    


"""This is where we will compute the mean spectral bandwidth"""
def computeSpectralBandwidthMean(divided_stft_magnitudes, spectral_centroids, sampling_rate):
    """Spectral bandwidth terms of every window, with per-window and overall means.

    Parameters:
        divided_stft_magnitudes: STFT magnitudes of every window, one window per
                                 entry along the first axis
        spectral_centroids: centroid values per window, indexed the same way
                            (presumably the first output of computeSpectralCentroidsMean)
        sampling_rate: sampling rate used to scale the bin frequencies of a 2048-point FFT
    Returns:
        spectral_bandwidths: ndarray (windows, x, y) stacking the output of
                             computeSpectralBandwidth for each window
        total_mean_spectral_bandwidths: average of the per-window means
        mean_spectral_bandwidths: ndarray (windows, 1) with the mean of each window
    """
    frequencies = np.fft.rfftfreq(2048, d=1/sampling_rate)
    window_total = divided_stft_magnitudes.shape[0]

    # Probe the first window to learn the shape of one window's result.
    rows, cols = computeSpectralBandwidth(divided_stft_magnitudes[0], frequencies, spectral_centroids[0]).shape
    spectral_bandwidths = np.zeros((window_total, rows, cols))
    mean_spectral_bandwidths = np.zeros((window_total, 1))

    running_total = 0
    for idx in range(window_total):
        window_bandwidth = computeSpectralBandwidth(divided_stft_magnitudes[idx], frequencies, spectral_centroids[idx])
        spectral_bandwidths[idx] = window_bandwidth
        window_mean = np.mean(window_bandwidth)
        running_total += window_mean
        mean_spectral_bandwidths[idx] = window_mean

    return spectral_bandwidths, running_total / window_total, mean_spectral_bandwidths



# # Testing Spectral Bandwidth
# input = "../audio/cafe_wav.wav"
# signal, stft_signal, sampling_rate = processAudio(input, target_sample_rate = 16000)
# bpm = computeBPM(signal, sampling_rate)

# divided_signal, win_size, win_count = divideSignal(signal, bpm, sampling_rate, beats_per_win=1)
# div_stft, div_stft_mag = divideSTFT(divided_signal)

# spec_centroids, total_mean_spec, mean_spec_centroids = computeSpectralCentroidsMean(div_stft_mag, sampling_rate)

# spec_b, tot_mean_b, mean_b = computeSpectralBandwidthMean(div_stft_mag, spec_centroids, sampling_rate)
# # out1 = computeSpectralBandwidth(div_stft_mag[0], np.fft.rfftfreq(2048, d=1/sampling_rate), spec_centroids[0])



In [37]:
"""testing spectral rolloff, frequency range"""
# Named audio_path rather than `input` so the cell does not rebind the builtin input().
audio_path = "../audio/The Weeknd - Out of Time.wav"
signal, stft_signal, sampling_rate = processAudio(audio_path, target_sample_rate = 16000)
bpm = computeBPM(signal, sampling_rate)

# One beat per window, then an STFT of every window.
divided_signal, win_size, win_count = divideSignal(signal, bpm, sampling_rate, beats_per_win=1)
div_stft, div_stft_mag = divideSTFT(divided_signal)

# s = div_stft_mag[0]
# test_any = np.any(s, axis=0)
# freqqs = np.fft.rfftfreq(2048, d=1/sampling_rate)
# upper_freq, lower_freq = computeSpectralRolloffFrequency(s, freqqs, 0.05)
# freq_range = upper_freq - lower_freq
# print(f"freq range at each time frame is: {freq_range}")

# Per-frame frequency range of every window and its track-level mean.
freq_ranges, mean_freq_ranges = computeFrequencyRange(div_stft_mag, sampling_rate)
# print(div_stft_mag[0])
# roll_95 = computeSpectralRolloffFrequency(s, fr, 0.95)
# roll_05 = computeSpectralRolloffFrequency(s, fr, 0.05)
# range = roll_95-roll_05

# print(range)

# import matplotlib.pyplot as plt

# time_frames = np.arange(roll_95.shape[1])
# plt.figure(figsize=(10, 4))
# plt.plot(time_frames, roll_95, label="95% Rolloff")
# plt.plot(time_frames, roll_05, label="5% Rolloff")
# plt.fill_between(time_frames, roll_05, roll_95, alpha=0.2)
# plt.title("Spectral Rolloff Frequency Range")
# plt.xlabel("Time Frame")
# plt.ylabel("Frequency (Hz)")
# plt.legend()
# plt.show()

In [38]:
"""testing spectral centroid"""
# Named audio_path rather than `input` so the cell does not rebind the builtin input().
audio_path = "../audio/The Weeknd - Out of Time.wav"
signal, stft_signal, sampling_rate = processAudio(audio_path, target_sample_rate = 16000)
bpm = computeBPM(signal, sampling_rate)

# One beat per window, then an STFT of every window.
divided_signal, win_size, win_count = divideSignal(signal, bpm, sampling_rate, beats_per_win=1)
div_stft, div_stft_mag = divideSTFT(divided_signal)
# fr= np.fft.rfftfreq(2048, d=1/sampling_rate)

# s = div_stft_mag[0]
# z = computeSpectralCentroid(s, fr)
# t = np.arange(s.shape[1])
# plt.figure(figsize=(10, 4))
# plt.plot(t, z, label="spectral centroid")
# plt.title("Spectral Centroid")
# plt.xlabel("Time Frame")
# plt.ylabel("Centroid (Hz)")
# plt.legend()
# plt.show()
# Per-frame centroids, their overall mean and the per-window means.
spectralcentroids, total_spectralcentroid_mean, mean_spectralcentroids = computeSpectralCentroidsMean(div_stft_mag,sampling_rate)

In [42]:
"""test windowing, rms, dynamic range"""
# Named audio_path rather than `input` so the cell does not rebind the builtin input().
# audio_path = "../audio/Ma Meilleure Ennemie.wav"
# audio_path = "../audio/The Weeknd - Out of Time.wav"
audio_path = "../audio/Time.wav"
signal, stft_signal, sampling_rate = processAudio(audio_path, target_sample_rate = 44100)
# rms = computeRMS(signal)
bpm = computeBPM(signal, sampling_rate)

# beat_duration = 60/bpm
# song_length_seconds = len(signal)/sampling_rate
# beat_count = song_length_seconds/beat_duration

# print(f"Song BPM: {bpm}")
# print(f"Beat duration of the song in seconds: {beat_duration}")
# print(f"Length of the song: {song_length_seconds}")
# print(f"Therefore number of beats for the entire song: {beat_count}")

# beats_per_window = 4
# window_count = int(np.ceil(beat_count/beats_per_window))
# print(f"The window count is: {window_count}, using {beats_per_window} beats per window. This is calculated through beat_count/beats_per_window rounded up.")
# window_size = int(np.ceil(len(signal)/window_count))
# divided_signal_length = window_count * window_size
# print(f"window size at the input sampling rate is therefore: {window_size}")
# print(f"Therefore the entire new reconstructed audio is as such: {divided_signal_length}, which is longer than the original signal: {len(signal)}")
# signal_size_diff = divided_signal_length - len(signal)
# print(f"the difference between the divided signal length and original signal length is: {signal_size_diff} samples")
# plotSpectrogram(stft_signal, sampling_rate)
# print(f"RMS: {rms}dB, BPM: {bpm}")

# Split the loaded signal into one-beat windows and take the STFT of each window.
divided_signal, win_size, win_count = divideSignal(signal, bpm, sampling_rate, beats_per_win=1)
div_stft_s, div_stft_mags = divideSTFT(divided_signal)
# print(f"The shape of the divided signal is as such: {divided_signal.shape}")
# zero_pad_check = divided_signal[divided_signal.shape[0]-1][divided_signal.shape[1]-signal_size_diff-1:divided_signal.shape[1]-1]
# print(f"Check if zero padding: {zero_pad_check}")
# signal2 = np.arange(10)
# div_signal2, _, _ = divideSignal(signal2)
# print(div_signal2[][])
# Per-window RMS level in dB (computed from the STFT magnitudes) and its mean.
div_rms, div_rms_mean = computeRMS(div_stft_mags)
# print(f"div_rms: {div_rms}, div_rms mean: {div_rms_mean}")
# print(div_rms.shape)

# div_dr, div_dr_mean = computeDynamicRange(divided_signal, div_rms)
# print(f"div_dynamic range: {div_dr}, div_dyanmic range mean: {div_dr_mean}")
# print(div_dr.shape)


  root_mean_squared = 20*np.log10(root_mean_squared)


In [None]:
# """@article{spleeter2020,
#   doi = {10.21105/joss.02154},
#   url = {https://doi.org/10.21105/joss.02154},
#   year = {2020},
#   publisher = {The Open Journal},
#   volume = {5},
#   number = {50},
#   pages = {2154},
#   author = {Romain Hennequin and Anis Khlif and Felix Voituret and Manuel Moussallam},
#   title = {Spleeter: a fast and efficient music source separation tool with pre-trained models},
#   journal = {Journal of Open Source Software},
#   note = {Deezer Research}
# }"""

# import os
# import subprocess
# def splitAudio(input_file):
#     output_folder = os.path.splitext(input_file)[0]
    
#     if not os.path.exists(output_folder):
#         os.makedirs(output_folder)

#     command = [
#         'spleeter', 'separate', 
#         '-i', input_file, 
#         '-o', output_folder
#     ]

#     subprocess.run(command)

#     vocal_file = os.path.join(output_folder, 'vocals.wav')
#     instrumental_file = os.path.join(output_folder, 'instrumental.wav')

#     return vocal_file, instrumental_file


# vocal, instrumental = splitAudio("audio/katy_wav.wav")
import librosa
import numpy as np

import spleeter as spl

from spleeter.separator import Separator

def split_and_process_audio(input_file, target_sample_rate=16000):
    """Split audio into vocals and accompaniment with Spleeter, in memory.

    Note: this notebook defines split_and_process_audio again in a later cell;
    whichever cell ran last is the one in effect.

    Parameters:
        input_file: path of the audio file to separate
        target_sample_rate: sampling rate of the returned signals (default 16000)
    Returns:
        (vocal_signal, vocal_stft): peak-normalised mono vocals and their STFT
        (instrumental_signal, instrumental_stft): same for the accompaniment
        target_sample_rate: the sampling rate of the returned signals
    """
    # Spleeter's pretrained models expect 44.1 kHz audio, so separation runs at
    # that rate and the stems are resampled to the target rate afterwards.
    separation_rate = 44100

    # Initialize Spleeter separator (2 stems: vocals + accompaniment)
    separator = Separator('spleeter:2stems')

    waveform, _ = librosa.load(input_file, sr=separation_rate, mono=False)
    if waveform.ndim == 1:
        waveform = np.stack([waveform, waveform])  # duplicate mono into two channels

    # librosa arrays are (channels, samples) while Spleeter works on
    # (samples, channels); transpose on the way in and again on the way out.
    prediction = separator.separate(np.ascontiguousarray(waveform.T))

    def _to_normalized_mono(stem):
        """Down-mix one Spleeter stem, resample it, peak-normalise it and take its STFT."""
        mono = librosa.to_mono(stem.T)
        mono = librosa.resample(mono, orig_sr=separation_rate, target_sr=target_sample_rate)
        # A silent stem has a peak of 0; skip the division instead of producing NaNs.
        peak = np.max(np.abs(mono)) if mono.size else 0.0
        if peak > 0:
            mono = mono / peak
        return mono, librosa.stft(mono, window='hann')

    vocals = _to_normalized_mono(prediction['vocals'])
    instrumental = _to_normalized_mono(prediction['accompaniment'])

    return vocals, instrumental, target_sample_rate

# Usage example:
# The first two results are each a (signal, stft) tuple.
# NOTE(review): other cells in this notebook load from "../audio/"; confirm that
# this relative path ("audio/") is the intended location.
vocal_signal, instrumental_signal, sr = split_and_process_audio("audio/katy_wav.wav")

In [None]:
"""
Audio Separation and Processing Pipeline using Spleeter
"""

import os
import subprocess
import tempfile
import numpy as np
import librosa

def split_and_process_audio(
    input_file: str,
    target_sample_rate: int = 16000,
    model_type: str = 'spleeter:2stems'
) -> tuple:
    """
    Separate audio into vocals/instrumentals via the Spleeter CLI and load the stems.

    The stems are written to a temporary directory that is removed before the
    function returns; only the in-memory signals are kept.

    Args:
        input_file: Path to input audio file
        target_sample_rate: Target sample rate for output signals (default: 16000)
        model_type: Spleeter model variant, forwarded to the CLI with ``-p``
            (default: 'spleeter:2stems'). The chosen model has to produce
            ``vocals.wav`` and ``accompaniment.wav``.

    Returns:
        tuple: (
            (vocal_signal, vocal_stft),
            (instrumental_signal, instrumental_stft),
            sample_rate
        )

    Raises:
        FileNotFoundError: If the input file doesn't exist, or an expected stem
            was not written by Spleeter
        RuntimeError: If the Spleeter process exits with a non-zero status
            (chained from subprocess.CalledProcessError)
    """

    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file not found: {input_file}")

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Run Spleeter separation. model_type is passed through; without it the
        # argument would be silently ignored and the CLI default used.
        # NOTE(review): recent Spleeter releases take input files positionally
        # rather than via -i; confirm against the installed version.
        try:
            subprocess.run(
                [
                    'spleeter', 'separate',
                    '-i', input_file,
                    '-p', model_type,
                    '-o', tmp_dir,
                ],
                check=True,
                capture_output=True
            )
        except subprocess.CalledProcessError as e:
            stderr_text = (e.stderr or b"").decode(errors="replace").strip()
            raise RuntimeError(
                f"Spleeter separation failed: {stderr_text}"
            ) from e

        # Spleeter writes <output>/<input basename>/<stem>.wav
        base_name = os.path.splitext(os.path.basename(input_file))[0]
        vocal_path = os.path.join(tmp_dir, base_name, 'vocals.wav')
        instrumental_path = os.path.join(tmp_dir, base_name, 'accompaniment.wav')

        # Validate output files
        if not os.path.exists(vocal_path):
            raise FileNotFoundError(f"Vocal track not generated at: {vocal_path}")
        if not os.path.exists(instrumental_path):
            raise FileNotFoundError(f"Instrumental track not generated at: {instrumental_path}")

        # Process vocal track (must happen inside the `with`, before the
        # temporary directory is deleted)
        vocal_signal, sr = librosa.load(
            vocal_path,
            sr=target_sample_rate,
            mono=True
        )
        vocal_signal = librosa.util.normalize(vocal_signal)
        vocal_stft = librosa.stft(vocal_signal, window='hann')

        # Process instrumental track
        instrumental_signal, _ = librosa.load(
            instrumental_path,
            sr=target_sample_rate,
            mono=True
        )
        instrumental_signal = librosa.util.normalize(instrumental_signal)
        instrumental_stft = librosa.stft(instrumental_signal, window='hann')

    return (
        (vocal_signal, vocal_stft),
        (instrumental_signal, instrumental_stft),
        sr
    )


# Example run: `vocal` and `instrumental` are each a (signal, stft) tuple.
vocal, instrumental, sr = split_and_process_audio("../audio/katy_wav.wav")
        