In [9]:
import os
import librosa
import librosa.display
import numpy as np
import subprocess
import skimage.io
from skimage.transform import resize

# Root directory that audio paths passed to audio_to_spec are joined onto.
# Set the APS360_PROJECT_PATH environment variable to point at your own
# checkout; when it is unset the original hard-coded location is used.
project_path = os.environ.get('APS360_PROJECT_PATH', 'C:/Users/arwin/Documents/dev/APS360-PROJECT/')


In [10]:

def audio_to_spec(audio_path, out_fname="temp.png", sz = (224, 3*224)):
    """Render an audio file as a 3-channel mel-spectrogram PNG.

    The audio is loaded, converted to a mel spectrogram, resized to ``sz``,
    cut into three equal-width slices along the width axis, and each slice
    is written to one of the R, G, B channels of the saved image.

    Args:
        audio_path: Path of the audio file; it is joined onto the
            module-level ``project_path``.
        out_fname: Output image path, used as given (it is NOT joined onto
            ``project_path``). ``.png`` is appended unless the name already
            ends with it (case-insensitive).
        sz: ``(height, width)`` the spectrogram is resized to before being
            sliced. Each channel receives ``width // 3`` columns; any
            leftover columns on the right are dropped. The default yields
            three 224x224 channels.

    Raises:
        ValueError: If ``sz[1]`` is smaller than 3, so no channel would get
            a column.
    """
    # Fail fast, before the (slow) audio decode, if the width cannot be split.
    channel_sz = sz[1] // 3
    if channel_sz < 1:
        raise ValueError(f"sz width must be at least 3 to fill three channels, got {sz[1]}")

    # 1. CONVERT AUDIO FILE TO SPEC
    # sr=None asks librosa to keep the file's own sampling rate (see librosa.load docs).
    song_data, song_sr = librosa.load(os.path.join(project_path, audio_path), sr=None)
    song_data = librosa.feature.melspectrogram(y=song_data, sr=song_sr, n_mels=256, n_fft=2048, hop_length=512*25)

    # 2. Resize to the requested dims (default: height = 224, width = 224*3)
    song_data = resize(song_data, sz, order=0, mode='reflect', anti_aliasing=False)

    # 3. Split image into 3 slices by width and insert into each R G B channel
    r = song_data[:, :channel_sz]
    g = song_data[:, channel_sz : 2 * channel_sz]
    b = song_data[:, 2 * channel_sz : 3 * channel_sz]

    # 4. a - LOG, b - NORM, c - REFLECT, d - INVERT
    def norm_array(X, lo=0.0, hi=1.0):
        """Linearly rescale X so its minimum maps to lo and its maximum to hi."""
        x_min = X.min()
        span = X.max() - x_min
        if span == 0:
            # Constant input (e.g. a silent slice): avoid 0/0 -> NaN and map
            # everything to the lower bound instead.
            return np.full(X.shape, lo, dtype=float)
        return (X - x_min) / span * (hi - lo) + lo

    def prepare_array(arr):
        """Turn one spectrogram slice into an inverted, flipped uint8 channel."""
        arr = np.log(arr + 1e-9)  # small offset keeps log() finite at zero
        arr = norm_array(arr, 0, 255).astype(np.uint8)
        # Flip rows top-to-bottom; presumably so low mel bins end up at the
        # bottom of the image -- confirm against librosa's row ordering.
        arr = np.flip(arr, axis=0)
        arr = 255 - arr
        return arr

    # Each slice is normalised on its own, so the channels do not share a scale.
    img = np.stack([prepare_array(r), prepare_array(g), prepare_array(b)], axis=-1)

    # 5. Save to img
    out_fname = os.fspath(out_fname)
    if not out_fname.lower().endswith(".png"):
        # Only add the extension when it is missing, so the default
        # "temp.png" is not written as "temp.png.png".
        out_fname = out_fname + ".png"
    skimage.io.imsave(out_fname, img)



In [29]:

# Render the spectrogram image for a single inference song.
audio_to_spec(
    audio_path='evaluate/inference_songs/DJ Perfekt - Somani_Money.mp3',
    out_fname='inference_specs/dj perfekt_somani money',
)

