In [1]:
"""
Transformation Inspired From:
https://stackoverflow.com/questions/56719138/how-can-i-save-a-librosa-spectrogram-plot-as-a-specific-sized-image/57204349#57204349
"""

import os

# Absolute path of the project root; every data path in this notebook is built from it.
# Set the APS360_PROJECT_PATH environment variable to point at your own checkout
# instead of editing this cell; the literal below is only the fallback default.
project_path = os.environ.get(
    'APS360_PROJECT_PATH',
    'C:/Users/Arwin/Documents/dev/APS360-PROJECT',
)

In [2]:
import matplotlib.pyplot as plt
import librosa
import librosa.display
import numpy as np
import pandas as pd
import os
import subprocess
import skimage.io
from skimage.transform import resize
from PIL import Image

In [14]:
def audio_to_spec(audio_path, out_fname="temp.png", sz = (224, 224*3)):
    """Render an audio file as an RGB mel-spectrogram PNG.

    The mel spectrogram is resized to ``sz``, cut along its width into three
    equal slices, and the slices become the R, G and B channels of a single
    image. The saved image is therefore ``sz[0]`` pixels high and
    ``sz[1] // 3`` pixels wide (224 x 224 with the defaults, the ResNet18
    input size).

    Args:
        audio_path: Path of the audio file; loaded with librosa at its native
            sample rate (``sr=None``).
        out_fname: Output image path. ``".png"`` is appended only when the
            name does not already end with it, so both ``"song"`` and
            ``"song.png"`` are written to ``song.png``.
        sz: ``(height, width)`` the spectrogram is resized to before it is
            split. Columns beyond ``3 * (width // 3)`` are dropped.

    Returns:
        None. The image is written to disk as a side effect.

    Raises:
        ValueError: If the resized spectrogram is narrower than 3 columns, so
            it cannot be split into three channels.
    """

    # 1. CONVERT AUDIO FILE TO SPEC
    song_data, song_sr = librosa.load((audio_path), sr=None)
    song_data = librosa.feature.melspectrogram(y=song_data, sr=song_sr, n_mels=256, n_fft=2048, hop_length=512*4)

    # 2. Resize to the requested dims (default: height = 224, width = 224*3).
    #    order=0 is nearest-neighbour, so no new intensity values are interpolated.
    song_data = resize(song_data, sz, order=0, mode='reflect', anti_aliasing=False)

    # 3. Split image into 3 slices by width and insert into each R G B channel
    channel_sz = song_data.shape[1] // 3
    if channel_sz < 1:
        raise ValueError(
            "sz width must be at least 3 to split into R, G, B slices, got sz=%r" % (sz,)
        )
    r = song_data[:, :channel_sz]
    g = song_data[:, channel_sz : 2 * channel_sz]
    b = song_data[:, 2 * channel_sz : 3 * channel_sz]

    # 4. a - LOG, b - NORM, c - REFLECT, d - INVERT
    def norm_array(X, lo=0.0, hi=1.0):
        """Min-max scale X into [lo, hi]; a constant X maps to lo everywhere."""
        x_min = X.min()
        span = X.max() - x_min
        if span == 0:
            # A flat slice (e.g. digital silence) would otherwise divide by
            # zero and push NaNs into the uint8 cast below.
            return np.full_like(X, lo)
        return (X - x_min) / span * (hi - lo) + lo

    def prepare_array(arr):
        """Turn one spectrogram slice into an inverted 8-bit channel."""
        arr = np.log(arr + 1e-9)  # the epsilon keeps log() finite for zero-energy bins
        arr = norm_array(arr, 0, 255).astype(np.uint8)  # each slice is scaled on its own min/max
        arr = np.flip(arr, axis=0)  # put row 0 (lowest mel bin) at the bottom of the image
        arr = 255 - arr  # invert so that high energy is dark
        return arr

    img = np.stack([prepare_array(r), prepare_array(g), prepare_array(b)], axis=-1)

    # 5. Save to img. Callers may pass a name that already carries the
    #    extension; appending unconditionally would produce "name.png.png".
    if not out_fname.lower().endswith(".png"):
        out_fname = out_fname + ".png"
    skimage.io.imsave(out_fname, img)


MAIN LOOP

In [16]:
# Convert every audio file under <project>/data/gtzan/genres_original/<genre>/
# into a spectrogram PNG under <project>/data/gtzan/genre_specs/<genre>/.
spec_path = project_path + '/data/gtzan/genre_specs/'
audio_path = project_path + '/data/gtzan/genres_original/'

# When True, every spectrogram is regenerated even if a PNG is already on disk.
overwrite_all = True
# An existing PNG smaller than this many bytes is regenerated rather than kept
# (presumably such files are truncated or failed writes -- confirm the threshold).
min_valid_png_bytes = 1024
# (audio file path, error description) for every file that could not be converted.
failed_conversions = []

# sorted() keeps the processing order, and therefore the log, reproducible.
for gtzan_genre in sorted(os.listdir(audio_path)):
    genre_audio_dir = os.path.join(audio_path, gtzan_genre)
    genre_spec_dir = os.path.join(spec_path, gtzan_genre)

    # Skip stray non-directory entries in the dataset root instead of crashing on them.
    if not os.path.isdir(genre_audio_dir):
        continue
    # Create the output folder on demand so a fresh checkout needs no manual setup.
    os.makedirs(genre_spec_dir, exist_ok=True)

    for audio_file in sorted(os.listdir(genre_audio_dir)):
        print(audio_file)

        # audio_to_spec adds the ".png" extension itself, so it is given the
        # extension-less stem; img_path is the file that actually ends up on disk.
        img_stem = os.path.join(genre_spec_dir, os.path.splitext(audio_file)[0]).replace('\\', '/')
        img_path = img_stem + '.png'

        # check if img alr exists
        if (
            not overwrite_all
            and os.path.exists(img_path)
            and os.path.getsize(img_path) >= min_valid_png_bytes
        ):
            print(img_path + ' exists, skipping...')
            continue

        src_path = genre_audio_dir + '/' + audio_file
        try:
            audio_to_spec(
                audio_path=src_path,
                out_fname=img_stem
            )
            print(img_path)
        except Exception as e:
            # Best effort: one unreadable file must not stop the whole batch.
            # Some exceptions carry an empty message, so the type and the file
            # name are logged to keep the failure identifiable.
            description = "%s: %s" % (type(e).__name__, e)
            failed_conversions.append((src_path, description))
            print("error converting " + audio_file + " to png: ", description)

if failed_conversions:
    print("%d file(s) could not be converted:" % len(failed_conversions))
    for failed_path, description in failed_conversions:
        print("  " + failed_path + " -> " + description)
    


blues.00000.wav
C:/Users/Arwin/Documents/dev/APS360-PROJECT/data/gtzan/genre_specs/blues/blues.00000.png
blues.00001.wav
C:/Users/Arwin/Documents/dev/APS360-PROJECT/data/gtzan/genre_specs/blues/blues.00001.png
blues.00002.wav
C:/Users/Arwin/Documents/dev/APS360-PROJECT/data/gtzan/genre_specs/blues/blues.00002.png
blues.00003.wav
C:/Users/Arwin/Documents/dev/APS360-PROJECT/data/gtzan/genre_specs/blues/blues.00003.png
blues.00004.wav
C:/Users/Arwin/Documents/dev/APS360-PROJECT/data/gtzan/genre_specs/blues/blues.00004.png
blues.00005.wav
C:/Users/Arwin/Documents/dev/APS360-PROJECT/data/gtzan/genre_specs/blues/blues.00005.png
blues.00006.wav
C:/Users/Arwin/Documents/dev/APS360-PROJECT/data/gtzan/genre_specs/blues/blues.00006.png
blues.00007.wav
C:/Users/Arwin/Documents/dev/APS360-PROJECT/data/gtzan/genre_specs/blues/blues.00007.png
blues.00008.wav
C:/Users/Arwin/Documents/dev/APS360-PROJECT/data/gtzan/genre_specs/blues/blues.00008.png
blues.00009.wav
C:/Users/Arwin/Documents/dev/APS360-PRO

  song_data, song_sr = librosa.load((audio_path), sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


error converting to png:  
jazz.00055.wav
C:/Users/Arwin/Documents/dev/APS360-PROJECT/data/gtzan/genre_specs/jazz/jazz.00055.png
jazz.00056.wav
C:/Users/Arwin/Documents/dev/APS360-PROJECT/data/gtzan/genre_specs/jazz/jazz.00056.png
jazz.00057.wav
C:/Users/Arwin/Documents/dev/APS360-PROJECT/data/gtzan/genre_specs/jazz/jazz.00057.png
jazz.00058.wav
C:/Users/Arwin/Documents/dev/APS360-PROJECT/data/gtzan/genre_specs/jazz/jazz.00058.png
jazz.00059.wav
C:/Users/Arwin/Documents/dev/APS360-PROJECT/data/gtzan/genre_specs/jazz/jazz.00059.png
jazz.00060.wav
C:/Users/Arwin/Documents/dev/APS360-PROJECT/data/gtzan/genre_specs/jazz/jazz.00060.png
jazz.00061.wav
C:/Users/Arwin/Documents/dev/APS360-PROJECT/data/gtzan/genre_specs/jazz/jazz.00061.png
jazz.00062.wav
C:/Users/Arwin/Documents/dev/APS360-PROJECT/data/gtzan/genre_specs/jazz/jazz.00062.png
jazz.00063.wav
C:/Users/Arwin/Documents/dev/APS360-PROJECT/data/gtzan/genre_specs/jazz/jazz.00063.png
jazz.00064.wav
C:/Users/Arwin/Documents/dev/APS360-PROJ