In [2]:
from pathlib import Path
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt
import os
from datetime import datetime
# Lift pandas' display truncation so frames render with every row and column.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [3]:
# Date stamp (YYYY-MM-DD) used when naming output files.
# For minute resolution, use the format "%Y-%m-%d %H:%M" instead.
output_file_date = f"{datetime.now():%Y-%m-%d}"
output_file_date

'2023-08-08'

In [4]:
def get_mfcc(wav_file_path, offset=10, duration=60):
  """Compute the MFCC matrix for a window of an audio file.

  Parameters
  ----------
  wav_file_path : str or path-like
      Audio file handed straight to ``librosa.load``.
  offset : float, default 10
      Seconds to skip from the start of the file before reading.
  duration : float or None, default 60
      Seconds of audio to read after ``offset``; ``None`` reads to the end.

  Returns
  -------
  numpy.ndarray
      The result of ``librosa.feature.mfcc`` for the loaded window.

  Notes
  -----
  NOTE(review): a file shorter than ``offset`` presumably loads as empty
  audio and fails downstream -- the notebook records that a 4-second mp3
  had to be removed. Confirm before relying on short clips.
  """
  y, sr = librosa.load(wav_file_path, offset=offset, duration=duration)
  return np.array(librosa.feature.mfcc(y=y, sr=sr))
    
def get_melspectrogram(wav_file_path, offset=10, duration=60):
  """Compute the mel spectrogram for a window of an audio file.

  Parameters
  ----------
  wav_file_path : str or path-like
      Audio file handed straight to ``librosa.load``.
  offset : float, default 10
      Seconds to skip from the start of the file before reading.
  duration : float or None, default 60
      Seconds of audio to read after ``offset``; ``None`` reads to the end.

  Returns
  -------
  numpy.ndarray
      The result of ``librosa.feature.melspectrogram`` for the loaded window.
  """
  y, sr = librosa.load(wav_file_path, offset=offset, duration=duration)
  return np.array(librosa.feature.melspectrogram(y=y, sr=sr))

def get_chroma_vector(wav_file_path, offset=0.0, duration=None):
  """Compute the chromagram (``chroma_stft``) of an audio file.

  Parameters
  ----------
  wav_file_path : str or path-like
      Audio file handed straight to ``librosa.load``.
  offset : float, default 0.0
      Seconds to skip from the start of the file before reading.
  duration : float or None, default None
      Seconds of audio to read after ``offset``; ``None`` reads to the end.

  Returns
  -------
  numpy.ndarray
      The result of ``librosa.feature.chroma_stft`` for the loaded audio.

  Notes
  -----
  The defaults read the whole file, which is what this function has always
  done. NOTE(review): ``get_mfcc`` and ``get_melspectrogram`` default to a
  60-second window starting at 10 s, so the four feature groups do not
  describe the same audio span. Pass ``offset=10, duration=60`` to align
  them -- confirm whether the mismatch is intentional first, since doing so
  changes the feature values.
  """
  y, sr = librosa.load(wav_file_path, offset=offset, duration=duration)
  return np.array(librosa.feature.chroma_stft(y=y, sr=sr))

def get_tonnetz(wav_file_path, offset=0.0, duration=None):
  """Compute the tonnetz (tonal centroid) features of an audio file.

  Parameters
  ----------
  wav_file_path : str or path-like
      Audio file handed straight to ``librosa.load``.
  offset : float, default 0.0
      Seconds to skip from the start of the file before reading.
  duration : float or None, default None
      Seconds of audio to read after ``offset``; ``None`` reads to the end.

  Returns
  -------
  numpy.ndarray
      The result of ``librosa.feature.tonnetz`` for the loaded audio.

  Notes
  -----
  The defaults read the whole file, which is what this function has always
  done. NOTE(review): ``get_mfcc`` and ``get_melspectrogram`` default to a
  60-second window starting at 10 s; pass ``offset=10, duration=60`` here
  to analyse the same span -- confirm whether the mismatch is intentional.
  """
  y, sr = librosa.load(wav_file_path, offset=offset, duration=duration)
  return np.array(librosa.feature.tonnetz(y=y, sr=sr))

def _summarize_rows(feature_matrix):
  """Reduce a 2-D matrix along axis 1 to one vector: [row means, row mins, row maxes]."""
  return np.concatenate((
    feature_matrix.mean(axis=1),
    feature_matrix.min(axis=1),
    feature_matrix.max(axis=1),
  ))

def get_feature(file_path):
  """Build a single flat feature vector for one audio file.

  Each of the four feature matrices (MFCC, mel spectrogram, chroma, tonnetz)
  is reduced along axis 1 to its per-row mean, min and max. The reduced
  vectors are then concatenated in the order chroma, mel spectrogram, MFCC,
  tonnetz.

  Parameters
  ----------
  file_path : str or path-like
      Audio file forwarded unchanged to each ``get_*`` extractor.

  Returns
  -------
  numpy.ndarray
      1-D array; the recorded notebook run shows a length of 498.

  Notes
  -----
  Every extractor loads the file on its own, so the audio is decoded four
  times per call.
  """
  # Extraction order (MFCC, mel, chroma, tonnetz) is kept as-is; only the
  # final concatenation order determines the layout of the output vector.
  mfcc_feature = _summarize_rows(get_mfcc(file_path))
  melspectrogram_feature = _summarize_rows(get_melspectrogram(file_path))
  chroma_feature = _summarize_rows(get_chroma_vector(file_path))
  tntz_feature = _summarize_rows(get_tonnetz(file_path))

  return np.concatenate( (chroma_feature, melspectrogram_feature, mfcc_feature, tntz_feature) )

In [5]:
# Need to read in Kaggle data for reverse lookup
# IE: Once we have the song recommendations, we want to look up the information so that we can present that to the user
# kaggle = pd.read_csv("../data/SpotifyFeatures.csv")

In [6]:
data_dir = Path('../data/mp3s/')
output_dir = Path('../data/vectorized_mp3s/')
path_glob = data_dir.rglob('*.mp3')
# rglob returns a one-shot iterator; materialise it into a list so the paths
# can be iterated repeatedly without re-running the glob.
file_paths = list(path_glob)
len(file_paths) # number of mp3s in directory

11572

In [7]:
# for tuple in enumerate(file_paths):
#     print(tuple) # used to find tracks that fail to convert

In [8]:
# 11572/500

23.144

In [9]:
# Had to delete an mp3 as it was only 4 seconds long. 
# I can't explain how that happened, but the code didn't have any issue with the 
# other file paths.

In [11]:
# downloaded_track_ids = []
# for file_path in file_paths:
#     # print(file_path)
#     print(f'{count}. FILE PATH: \n', f'{file_path} \n')
#     path_split = str(file_path).split('/')
#     track_ids.append(path_split[3])

In [12]:
# Track ids that already have a vectorized output in output_dir.
# Outputs are written as "<track_id>.parquet" (plus a matching .csv), so the
# id is the parquet file's stem. Globbing '*.mp3' here could never match
# anything, because no mp3 is ever written to the output directory.
vectorized_track_ids = [parquet_path.stem for parquet_path in output_dir.rglob('*.parquet')]
len(vectorized_track_ids)

0

In [None]:
# Vectorize every downloaded mp3 and write one parquet + one csv per track.
# Re-running the cell resumes where it left off: tracks that already have a
# parquet file in output_dir are skipped.
data_dir = Path('../data/mp3s/')
output_dir = Path('../data/vectorized_mp3s/')

file_paths = list(data_dir.rglob('*.mp3'))
print('Number of MP3 Files: ', len(file_paths),'\n')

# A set gives O(1) membership checks; an empty set simply matches nothing,
# so no separate length guard is needed.
vectorized_track_ids = {parquet_path.stem for parquet_path in output_dir.rglob('*.parquet')}

tracks = []  # feature vectors computed during this run, kept for inspection
count = 1
for file_path in file_paths:
    print(f'{count}. FILE PATH: \n', f'{file_path}')
    count += 1
    # The first directory below data_dir is the track id
    # (layout: <data_dir>/<track_id>/.../<file>.mp3). Using path parts avoids
    # depending on '/' as the separator.
    track_id = file_path.relative_to(data_dir).parts[0]
    if track_id in vectorized_track_ids:
        print(f'{track_id} has already been vectorized...skipping...')
        continue

    array = get_feature(file_path)
    tracks.append(array)
    # Build the frame from THIS track's vector only. Building it from the
    # accumulated `tracks` list wrote every earlier track's row into each
    # file, all labelled with the current track_id.
    vectorized_df = pd.DataFrame([array])
    vectorized_df.columns = vectorized_df.columns.astype(str)
    vectorized_df['track_id'] = track_id
    vectorized_df.to_parquet(output_dir / f"{track_id}.parquet")
    vectorized_df.to_csv(output_dir / f"{track_id}.csv")
    # if count % 500 == 0:
    #     vectorized_df = pd.DataFrame(tracks, index = track_ids)
    #     output_file_date = datetime.now().strftime("%Y-%m-%d %H:%M")
    #     vectorized_df.to_parquet(f"../data/vectorized_mp3s/{output_file_date}_vectorized_audio_data.parquet")
    #     vectorized_df.to_csv(f"../data/vectorized_mp3s/{output_file_date}_vectorized_audio_data.csv")
    #     continue
# vectorized_df = pd.DataFrame(tracks, index = track_ids)
# This implementation also scales everything to 498 dimensions
# This loop should write out every file as it moves through the data.
# Each one will be 500 rows except the last one, which should be 73
# This should create 24 files

In [None]:
# vectorized_df.head()

In [None]:
# vectorized_df.shape

In [None]:
# vectorized_df.to_parquet(f"../data/vectorized_mp3s/{output_file_date}_vectorized_audio_data.parquet")

In [None]:
# vectorized_df.to_csv(f"../data/vectorized_mp3s/{output_file_date}_vectorized_audio_data.csv")

### Test Code

In [None]:
%%timeit
# Test to see if it could run through 5 songs
tracks,track_ids = [], []
count = 0
for file_path in file_paths:
    if count <= 4:
        # print(file_path)
        print('FILE PATH: \n', f'{file_path} \n')
        x = str(file_path).split('/')
        print(x)
        track_ids.append(x[3])
        array = get_feature(file_path)
        tracks.append(array)
        count+=1
    else:
        break
df = pd.DataFrame(tracks, index = track_ids)
# This implementation also scales everything to 498 dimensions

In [10]:
# Shape of the test frame: one row per vectorized track, one column per feature.
df.shape

(5, 498)

In [13]:
# Rough full-run estimate in hours, assuming 9 seconds per 5 songs:
# (tracks / 5 per batch) * 9 s per batch / 3600 s per hour.
# NOTE(review): this uses 11573 tracks while the directory listing reported
# 11572 -- presumably written before the 4-second mp3 was deleted; confirm.
((11573/5)*9)/3600

5.786499999999999

In [11]:
# Single-file check: run the full feature extraction on one known track.
x = get_feature('../data/mp3s/3vOALqMX4c76GqWiJs3mEw/Shopping for Her Gift - Bonus Track/Alonzo Bodden - Shopping for Her Gift - Bonus Track.mp3')

  y, sr = librosa.load(wav_file_path, offset=10, duration=60)
  y, sr = librosa.load(wav_file_path, offset=10, duration=60)


In [13]:
# Length of the flat feature vector produced for a single track.
x.shape

(498,)