In [1]:
#imports
import tomllib
import numpy as np
from pprint import pprint
import librosa
import requests
from pydub import AudioSegment
from io import BytesIO
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import matplotlib.pyplot as plt

In [2]:
# Read the Spotify API credentials from the [spotify] table of cfg.toml.
# Only the parsing needs the open file handle, so the rest happens after the
# `with` block has closed it.
with open("cfg.toml", "rb") as cfg:
    keys = tomllib.load(cfg)["spotify"]

c_id, c_secret = keys["client_id"], keys["client_secret"]

# Client-credentials auth manager, handed to the spotipy client below.
auth_manager = SpotifyClientCredentials(client_id=c_id, client_secret=c_secret)
sp = spotipy.Spotify(auth_manager=auth_manager)

In [27]:
"""taken from matthew baleanu and Mohamad-Hassan Bahsoun
16khz downsample (default 22.05khz, spotify audio is 44.1khz.)"""
# def downsampleAudio(audio_file_path, target_sample_rate=16000):
#     #load the audio and downsample it. 
#     #librosa.load converts the audio file input (.wav, preferrably) into a time series. 
#     signal, sampling_rate = librosa.load(audio_file_path, sr=target_sample_rate, mono=True)
#     #normalize amplitude of the audio signal
#     signal = signal / np.max(np.abs(signal))
#     return signal, sampling_rate

# #draft preprocessor module. Downsamples and STFTs the audio. 
# def processAudio(audio_file_path):
#     signal, sampling_rate = downsampleAudio(audio_file_path)
#     #stft signal for feature computation
#     stft_signal = librosa.stft(signal, window='hann')
#     return signal, stft_signal

"""default sampling rate set to 16khz as per @bahsoun"""
def processAudio(audio_file_path, target_sample_rate=16000):
    """Load an audio file, peak-normalize it and compute its STFT.

    Parameters
    ----------
    audio_file_path : path of the audio file, passed straight to librosa.load.
    target_sample_rate : sampling rate requested from librosa.load
        (defaults to 16 kHz as per @bahsoun).

    Returns
    -------
    signal : mono time series, scaled so that its largest absolute sample is 1.
        A silent (all-zero) or empty signal is returned unscaled.
    stft_signal : STFT of the normalized signal (Hann window, other librosa
        defaults). Probably will be moved out of this function later.
    sampling_rate : sampling rate reported by librosa.load.
    """
    # librosa.load converts the audio file into a time series at the desired rate.
    signal, sampling_rate = librosa.load(audio_file_path, sr=target_sample_rate, mono=True)

    # Normalize the amplitude of the signal. Dividing by a zero peak would turn
    # a silent recording into NaNs, so only scale when there is a non-zero peak.
    peak = np.max(np.abs(signal)) if signal.size else 0.0
    if peak > 0:
        signal = signal / peak

    # STFT of the signal for feature computation.
    stft_signal = librosa.stft(signal, window='hann')
    return signal, stft_signal, sampling_rate

#test function for visualization. Not necessary for any applications, but useful for sanity checking results, I think. 
def plotSpectrogram(stft_signal, target_sampling_rate):
    """Plot the power spectrogram (in dB) of an STFT matrix.

    Visualization helper for sanity checking results; adapted from @baleanu.

    Parameters
    ----------
    stft_signal : STFT matrix, e.g. the second value returned by processAudio.
    target_sampling_rate : sampling rate of the signal the STFT was computed
        from; used by specshow to label the axes.
    """
    # Import the submodule explicitly: some librosa releases do not expose
    # `librosa.display` after a plain `import librosa`.
    import librosa.display

    plt.figure(figsize=(12, 12))

    # Compute the spectrogram power, then convert to dB. The input is a *power*
    # spectrogram, so power_to_db (10*log10) is the right conversion;
    # amplitude_to_db (20*log10) would double every dB value.
    power_spectrogram = np.abs(stft_signal)**2
    spectrogram_db = librosa.power_to_db(power_spectrogram, ref=np.max)

    # Display the spectrogram.
    librosa.display.specshow(spectrogram_db, sr=target_sampling_rate, x_axis='time', y_axis='linear', cmap='viridis')
    plt.colorbar(label='Power (dB)')
    plt.title('Power Spectrogram')
    plt.xlabel('Time (s)')
    plt.ylabel('Frequency (Hz)')
    plt.show()

"""As suggested by MVM, we divide the signal into bins (windows) of a computed window size in order to get the
    variance of our features; this way we can determine how relevant they would be for classification/recommendation."""
def divideSignal(signal, bpm, sampling_rate, beats_per_win = 4):
    """Split a 1-D audio signal into equally sized, beat-based windows.

    The process is:
    1. take the global tempo (bpm),
    2. convert it to a beat count for the whole song,
    3. use that to divide up the song.

    Parameters
    ----------
    signal : 1-D audio time series (array-like).
    bpm : global tempo in beats per minute; must be > 0.
    sampling_rate : samples per second of `signal`; must be > 0.
    beats_per_win : number of beats each window should roughly span; must be > 0.

    Returns
    -------
    divided_signal : array of shape (window_count, window_size); the tail of
        the last window is zero padded.
    window_size : number of samples per window.
    window_count : number of windows.

    Raises
    ------
    ValueError : if `signal` is empty or not 1-D, or if `bpm`, `sampling_rate`
        or `beats_per_win` is not strictly positive.
    """
    signal = np.asarray(signal)
    # Validate up front: these inputs would otherwise surface as an opaque
    # ZeroDivisionError / OverflowError further down.
    if signal.ndim != 1 or signal.size == 0:
        raise ValueError("signal must be a non-empty 1-D array")
    if bpm <= 0:
        raise ValueError("bpm must be positive")
    if sampling_rate <= 0:
        raise ValueError("sampling_rate must be positive")
    if beats_per_win <= 0:
        raise ValueError("beats_per_win must be positive")

    beat_duration = 60/bpm
    song_length_seconds = len(signal)/sampling_rate
    beat_count = song_length_seconds/beat_duration

    # The window count is beat_count / beats_per_win rounded up. The window
    # size is len(signal) / window_count rounded up, so that the windows
    # together contain every sample of the signal.
    window_count = int(np.ceil(beat_count/beats_per_win))
    window_size = int(np.ceil(len(signal)/window_count))

    # window_count * window_size is the total length of the divided signal.
    # Because window_size is rounded up, that total is never shorter than the
    # signal, so the padding length below is always >= 0.
    pad_length = window_count*window_size - len(signal)

    # Zero pad the end once and reshape, instead of concatenating window by
    # window: faster and less convoluted.
    divided_signal = np.pad(signal,
                            (0, pad_length),
                            mode="constant",
                            constant_values=0)
    divided_signal = divided_signal.reshape(window_count, window_size)
    return divided_signal, window_size, window_count

# """calls the other sub featurizer modules"""
# def featurize(divided_signal, divided_stft_signal, target_sampling_rate):
#     pass

In [4]:
#from Mohamad-Hassan Bahsoun
def computeRMS(signal):
    """Return the RMS level of `signal` in decibels.

    RMS is the square root of the mean of the squared samples; the result is
    converted to dB as 20 * log10(rms).
    """
    linear_rms = np.sqrt(np.mean(np.square(signal)))
    return 20*np.log10(linear_rms)

def computeDynamicRange(signal, rms):
    """Return the dynamic range of `signal` in dB, defined here as peak - RMS.

    Peak minus minimum is not used because it would just yield the max.
    `rms` is subtracted from the peak level in dB, so it is expected to be in
    dB already; working in dB replaces the division with a subtraction.
    """
    peak_amplitude = np.abs(signal).max()
    peak_level_db = 20 * np.log10(peak_amplitude)
    return peak_level_db - rms

def computeBPM(signal, target_sample_rate):
    """Temporary global-tempo estimate, delegated to librosa.feature.tempo.

    This is a stand-in: @baleanu's real computeBPM has not been merged into
    this branch yet, and testing the BPM-based window division depends on
    having a tempo value. The real function is expected to take an
    stft_signal rather than the raw signal.

    Returns the first element of the array produced by librosa.feature.tempo.
    """
    tempo_estimates = librosa.feature.tempo(y=signal, sr=target_sample_rate)
    global_bpm = tempo_estimates[0]
    return global_bpm

In [5]:
"""divided signal RMS compute"""
def computeRMS(divided_signal):
    """Compute the RMS of every window and the variance of those RMS values.

    NOTE: this definition reuses the name of the scalar, dB-valued computeRMS
    defined in an earlier cell and replaces it once this cell has run. The
    values returned here are linear (not dB).

    Parameters
    ----------
    divided_signal : array of shape (window_count, window_size), e.g. the
        first value returned by divideSignal. A 1-D signal is treated as a
        single window.

    Returns
    -------
    rms : array of shape (window_count, 1) with the RMS of each window.
    rms_variance : variance of the per-window RMS values.

    Raises
    ------
    ValueError : if `divided_signal` contains no samples.
    """
    windows = np.atleast_2d(np.asarray(divided_signal, dtype=float))
    if windows.size == 0:
        raise ValueError("divided_signal must contain at least one sample")

    # RMS is the square root of the mean of the squared samples, taken along
    # each window (axis 1); keepdims keeps the (window_count, 1) column shape.
    mean_squared = np.mean(np.square(windows), axis=1, keepdims=True)
    rms = np.sqrt(mean_squared)
    rms_variance = np.var(rms)
    return rms, rms_variance


In [37]:
"""test"""
# Named `audio_path` rather than `input` so the builtin input() is not
# shadowed for the rest of the kernel session.
audio_path = "../audio/Ma Meilleure Ennemie.wav"
# audio_path = "../audio/The Weeknd - Out of Time.wav"
signal, stft_signal, sampling_rate = processAudio(audio_path, target_sample_rate = 16000)
# len() of the STFT matrix is the size of its first axis.
print(f"the length of the stft signal is: {len(stft_signal)}")
# rms = computeRMS(signal)
bpm = computeBPM(signal, sampling_rate)

# Step-by-step repeat of the arithmetic done inside divideSignal, so each
# intermediate value can be printed and inspected.
beat_duration = 60/bpm
song_length_seconds = len(signal)/sampling_rate
beat_count = song_length_seconds/beat_duration

print(f"Song BPM: {bpm}")
print(f"Beat duration of the song in seconds: {beat_duration}")
print(f"Length of the song: {song_length_seconds}")
print(f"Therefore number of beats for the entire song: {beat_count}")

beats_per_window = 4
window_count = int(np.ceil(beat_count/beats_per_window))
print(f"The window count is: {window_count}, using {beats_per_window} beats per window. This is calculated through beat_count/beats_per_window rounded up.")
window_size = int(np.ceil(len(signal)/window_count))
divided_signal_length = window_count * window_size
print(f"window size at the input sampling rate is therefore: {window_size}")
print(f"Therefore the entire new reconstructed audio is as such: {divided_signal_length}, which is longer than the original signal: {len(signal)}")
signal_size_diff = divided_signal_length - len(signal)
print(f"the difference between the divided signal length and original signal length is: {signal_size_diff} samples")
# plotSpectrogram(stft_signal, sampling_rate)
# print(f"RMS: {rms}dB, BPM: {bpm}")

divided_signal, divided_window_size, divided_window_count = divideSignal(signal, bpm, sampling_rate)
# The manual walk-through above must agree with what divideSignal computed.
assert (divided_window_size, divided_window_count) == (window_size, window_count)
print(f"The shape of the divided signal is as such: {divided_signal.shape}")
# The padding occupies exactly the last `signal_size_diff` samples of the last
# window. Slice from (row length - diff) to the end: the previous slice was
# shifted by one, so it included the last real sample and missed the final
# padded zero. Written this way, a diff of 0 yields an empty slice.
last_window = divided_signal[-1]
zero_pad_check = last_window[last_window.shape[0] - signal_size_diff:]
print(f"Check if zero padding: {zero_pad_check}")
# signal2 = np.arange(10)
# div_signal2, _, _ = divideSignal(signal2)
# print(div_signal2[][])

the length of the stft signal is: 1025
Song BPM: 117.1875
Beat duration of the song in seconds: 0.512
Length of the song: 148.004
Therefore number of beats for the entire song: 289.0703125
The window count is: 73, using 4 beats per window. This is calculated through beat_count/beats_per_window rounded up.
window size at the input sampling rate is therefore: 32440
Therefore the entire new reconstructed audio is as such: 2368120, which is longer than the original signal: 2368064
the difference between the divided signal length and original signal length is: 56 samples
The shape of the divided signal is as such: (73, 32440)
Check if zero padding: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]


In [7]:
# Pure-Python prototype of the windowing: cut the list into chunks of
# `beats_per_window` items and fill the last chunk with trailing zeroes.
signal = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
beats_per_window = 4
window_count = int(np.ceil(len(signal)/beats_per_window))
print(window_count)

div_signal = []
for v in range(window_count):
    # [i, j) are the bounds of window number v
    i, j = v * beats_per_window, (v + 1) * beats_per_window
    window = signal[i:j]
    shortfall = j - len(signal)
    if shortfall > 0:  # insert trailing zeroes
        window = window + [0] * shortfall
    div_signal.append(window)
print(div_signal)

3
[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 0, 0]]


'using numpy i can make the divisi'

In [30]:
# numpy prototype of the windowing: zero pad the end of the signal up to a
# multiple of the window size, then reshape into (win_count, win_size).
signal = np.arange(10)
win_size = 2
win_count = int(np.ceil(signal.shape[0]/win_size))

# Report the window size actually in use (this used to print a hard-coded 4).
print(f"win_count = {win_count}, win_size = {win_size}")
total_samples = win_count * win_size
# win_count is rounded up, so total_samples >= len(signal) and the padding
# length is never negative.
pad_width = (0, total_samples - len(signal))
padded_signal = np.pad(
    signal,
    pad_width,
    mode='constant',
    constant_values=0)
print(f"midline: {pad_width}")
print(padded_signal)
reshaped = padded_signal.reshape(win_count, win_size)
print(reshaped)


win_count = 5, beats per win = 4
midline: (0, 0)
[0 1 2 3 4 5 6 7 8 9]
[[0 1]
 [2 3]
 [4 5]
 [6 7]
 [8 9]]
