In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt
import os
from datetime import datetime
import itertools
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns',None)

In [2]:
output_file_date = datetime.now().strftime("%Y-%m-%d")
output_file_date

'2023-08-07'

In [3]:
def get_mfcc(wav_file_path):
  y, sr = librosa.load(wav_file_path, offset=0, duration=30)
  mfcc = np.array(librosa.feature.mfcc(y=y, sr=sr))
  return mfcc
    
def get_melspectrogram(wav_file_path):
  y, sr = librosa.load(wav_file_path, offset=10, duration=240)
  melspectrogram = np.array(librosa.feature.melspectrogram(y=y, sr=sr))
  return melspectrogram

def get_chroma_vector(wav_file_path):
  y, sr = librosa.load(wav_file_path)
  chroma = np.array(librosa.feature.chroma_stft(y=y, sr=sr))
  return chroma

def get_tonnetz(wav_file_path):
  y, sr = librosa.load(wav_file_path)
  tonnetz = np.array(librosa.feature.tonnetz(y=y, sr=sr))
  return tonnetz

def get_feature(file_path):
  # Extracting MFCC feature
  mfcc = get_mfcc(file_path)
  mfcc_mean = mfcc.mean(axis=1)
  mfcc_min = mfcc.min(axis=1)
  mfcc_max = mfcc.max(axis=1)
  mfcc_feature = np.concatenate( (mfcc_mean, mfcc_min, mfcc_max) )

  # Extracting Mel Spectrogram feature
  melspectrogram = get_melspectrogram(file_path)
  melspectrogram_mean = melspectrogram.mean(axis=1)
  melspectrogram_min = melspectrogram.min(axis=1)
  melspectrogram_max = melspectrogram.max(axis=1)
  melspectrogram_feature = np.concatenate( (melspectrogram_mean, melspectrogram_min, melspectrogram_max) )

  # Extracting chroma vector feature
  chroma = get_chroma_vector(file_path)
  chroma_mean = chroma.mean(axis=1)
  chroma_min = chroma.min(axis=1)
  chroma_max = chroma.max(axis=1)
  chroma_feature = np.concatenate( (chroma_mean, chroma_min, chroma_max) )

  # Extracting tonnetz feature
  tntz = get_tonnetz(file_path)
  tntz_mean = tntz.mean(axis=1)
  tntz_min = tntz.min(axis=1)
  tntz_max = tntz.max(axis=1)
  tntz_feature = np.concatenate( (tntz_mean, tntz_min, tntz_max) ) 
  
  feature = np.concatenate( (chroma_feature, melspectrogram_feature, mfcc_feature, tntz_feature) )
  return feature

In [4]:
# Need to read in Kaggle data for reverse lookup
# IE: Once we have the song recommendations, we want to look up the information so that we can present that to the user
kaggle = pd.read_csv("../data/SpotifyFeatures.csv")

In [6]:
data_dir = Path('../data/mp3s/')
path_glob = data_dir.rglob('*.mp3')
file_paths = []
for file_path in path_glob:
    file_paths.append(file_path) # creates a list for repeated iteration
    # if this is not done, the .rglob command above has to be repeated to regenerate iterator
len(file_paths) # number of mp3s in directory

11573

In [None]:
tracks,track_ids = [], []
count = 1
for file_path in file_paths:
    # print(file_path)
    print(f'{count}. FILE PATH: \n', f'{file_path} \n')
    path_split = str(file_path).split('/')
    track_ids.append(path_split[3])
    array = get_feature(file_path)
    tracks.append(array)
    count+=1
vectorized_df = pd.DataFrame(tracks, index = track_ids)
# This implementation also scales everything to 498 dimensions

1. FILE PATH: 
 ../data/mp3s/1ZB2qWsheGabSEYvBYxjKn/Take on Me/Weezer - Take on Me.mp3 

2. FILE PATH: 
 ../data/mp3s/5V9H9J5GcUGY5ig029g5OU/Shkleepy/Manwolves - Shkleepy.mp3 

3. FILE PATH: 
 ../data/mp3s/34FsCOAQ0U99vAh3uoiLmm/Bandana (feat. Young Buck)/Dirty Audio, BL3R, Young Buck - Bandana (feat. Young Buck).mp3 

4. FILE PATH: 
 ../data/mp3s/25mldAmMHYzXhDXCxTpTHy/Chloroform/Phoenix - Chloroform.mp3 

5. FILE PATH: 
 ../data/mp3s/1YaOBTTdptDf4vYKpFy56T/Mawaranai Toe Shoes/majiko - Mawaranai Toe Shoes.mp3 

6. FILE PATH: 
 ../data/mp3s/2RbDFTlqdIdiZwO4GaTxi2/moonwalking/Good Scott - moonwalking.mp3 

7. FILE PATH: 
 ../data/mp3s/2lZkIlYXN5SR0UWFgljDCd/Because You Love Me/Jo Dee Messina - Because You Love Me.mp3 

8. FILE PATH: 
 ../data/mp3s/58dSdjfEYNSxte1aNVxuNf/Easy/Mac Ayres - Easy.mp3 

9. FILE PATH: 
 ../data/mp3s/6wulmNhwuptjQkNzWHZ7Ym/Don Palabras/Maldita Vecindad Y Los Hijos Del 5to. Patio - Don Palabras.mp3 

10. FILE PATH: 
 ../data/mp3s/6vLNSXMZ08nlGMgcQ17cib/Landing O

In [None]:
vectorized_df.head()

In [None]:
vectorized_df.shape

In [None]:
vectorized_df.to_parquet(f"{output_file_date}_vectorized_audio_data.parquet")

In [None]:
vectorized_df.to_csv(f"{output_file_date}_vectorized_audio_data.parquet")

### Test Code

In [38]:
# Test to see if it could run through 5 songs
tracks,track_ids = [], []
count = 0
for file_path in file_paths:
    if count <= 4:
        # print(file_path)
        print('FILE PATH: \n', f'{file_path} \n')
        x = str(file_path).split('/')
        print(x)
        track_ids.append(x[3])
        array = get_feature(file_path)
        tracks.append(array)
        count+=1
    else:
        break
df = pd.DataFrame(tracks, index = track_ids)
# This implementation also scales everything to 498 dimensions

FILE PATH: 
 (PosixPath('../data/mp3s/34FsCOAQ0U99vAh3uoiLmm/Bandana (feat. Young Buck)/Dirty Audio, BL3R, Young Buck - Bandana (feat. Young Buck).mp3'),) 

["(PosixPath('..", 'data', 'mp3s', '34FsCOAQ0U99vAh3uoiLmm', 'Bandana (feat. Young Buck)', "Dirty Audio, BL3R, Young Buck - Bandana (feat. Young Buck).mp3'),)"]


TypeError: Invalid file: (PosixPath('../data/mp3s/34FsCOAQ0U99vAh3uoiLmm/Bandana (feat. Young Buck)/Dirty Audio, BL3R, Young Buck - Bandana (feat. Young Buck).mp3'),)

In [7]:
df.shape

(5, 498)