In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os,IPython, librosa, mir_eval
from os import listdir
from os.path import isfile, join,isdir
from IPython.display import Audio
from librosa.display import waveplot,specshow
from librosa.onset import onset_strength, onset_detect
from librosa.feature import melspectrogram, mfcc
from librosa import load

from collections import defaultdict,OrderedDict
import sklearn
from sklearn.preprocessing import StandardScaler,LabelEncoder
import scipy
from pandas import HDFStore,DataFrame


# Song Extraction from fma_small

1. Find the path to the song folder on the current machine
2. Retrieve the different genre classifications
3. Identify each song by its full file path, built from its index, to guarantee a one-to-one mapping
4. Sort the songs alphabetically by genre

In [None]:
# Locate the fma_small dataset under <home>/Documents, together with the JSON
# metadata file stored inside that folder.
HOME_DIR = IPython.utils.path.get_home_dir()
temp = join(HOME_DIR, 'Documents')
path_to_small_fma = join(HOME_DIR, 'Documents', 'fma_small')
json_file = join(HOME_DIR, 'Documents', 'fma_small', 'fma_small.json')

# Show the dataset folder, load the metadata into a DataFrame, then show which
# file was read.
print(path_to_small_fma)
df = pd.read_json(json_file)
print(json_file)


In [None]:
# Keep only the top-level genre; it is the label used for classification.
df = df.loc[:, ['top_genre']]

# Make the genre name match the folder name in fma_small (the
# "Oldtime / Historian" conflict): keep only the text before the first slash.
# The slash is part of the metadata string, not a filesystem separator, so the
# split uses a literal "/" -- os.sep is "\\" on Windows and would leave such
# names untouched there.
df['top_genre'] = df['top_genre'].apply(lambda y: y.split('/')[0].strip())

# Fit the label encoder on the distinct genre names so a genre name can later
# be turned into an integer label through `le`.
complete_genre_list = df['top_genre'].unique()
le = LabelEncoder()
encoded_genres = le.fit(complete_genre_list)  # fit() returns the encoder itself
print(complete_genre_list)

# Locate each song by its full path: <fma_small>/<genre>/<index>.mp3, where the
# index is zero-padded to at least two digits.
str_index = ["%.2d" % x for x in df.index]
df['full_path_to_song'] = (
    str(path_to_small_fma) + "/" + df['top_genre'] + "/" + str_index + ".mp3"
)

# Order the rows by genre so that every genre's songs sit next to each other.
df = df.sort_values(by='top_genre')
df.head()


In [None]:
# Every song path as one flat array, in the frame's current row order.
all_songs_path = df.loc[:, 'full_path_to_song'].values

# Number of songs in each genre, with the genres ordered by name.
genre_and_count = df.loc[:, 'top_genre'].value_counts().sort_index()
print(genre_and_count)

In [None]:
# Per-genre song totals kept in two parallel lists that share the order of
# genre_and_count:
#   ordered_genres -> genre names
#   song_counts    -> number of songs of the genre at the same position
# Both are read straight from genre_and_count instead of re-running
# value_counts() over the whole frame once per genre.
ordered_genres = list(genre_and_count.index)
song_counts = [int(count) for count in genre_and_count.values]

# Not filled in this cell; kept so the name stays defined for later cells.
full_song_df = OrderedDict()

num_of_genre = len(genre_and_count)
print("The number of genres is:", num_of_genre)

# Materialised as a list so the (genre, count) pairs can be iterated more than
# once; a bare zip object is exhausted after its first pass.
genre_to_song_zipped = list(zip(ordered_genres, song_counts))

# dictionary with each genre and its corresponding song count
genre_to_song_dict = dict(genre_to_song_zipped)

genre_to_song_dict

In [None]:
num_of_genre = np.shape(genre_and_count)[0]
all_songs_path = df['full_path_to_song'].values

# Slice boundaries into all_songs_path: a leading zero followed by the running
# total of the per-genre counts, so genre i spans
# [cumulative_sum[i], cumulative_sum[i + 1]). The genres have different sizes,
# hence the cumulative sum rather than a fixed stride.
# The zero is prepended to a fresh list instead of being inserted into
# song_counts, so re-running this cell cannot shift the ranges.
# NOTE(review): the slicing relies on df being sorted by genre in the same
# order as ordered_genres -- confirm if the sorting step changes.
cumulative_sum = np.cumsum([0] + list(song_counts), dtype=int)

# creates a dictionary mapping "<genre>_paths" to that genre's song paths
paths_dict = OrderedDict()
for i, genre in enumerate(ordered_genres):
    start, stop = cumulative_sum[i], cumulative_sum[i + 1]
    paths_dict[genre + "_paths"] = all_songs_path[start:stop]

#paths_dict
#{genre_path_name: genre_paths}
print("{'Electronic_paths:[array_of_all_electronic_paths]}")

In [None]:
%%time
num_of_songs = 3
sampling_rate = 44100

genre_signals_dict = OrderedDict()
#creates a dictionary of the signals in a genre and their raw file
for genre_path_name,genre_paths in paths_dict.items():
    str1=genre_path_name[:-5]
    str2 = "signals"
    genre_signals = "".join((str1,str2))       
    try:
        first_three = genre_paths[:num_of_songs]
        genre_signals_dict[genre_signals] = [
        load(p,sr=None)[0] for p in first_three]
    except IOError as exc:
        print("Unable to locate folder")
        #raise IOError("%s: %s" % (genre_paths, exc.strerror))
        
#genre_signals_dict
#{genre_signals_name:genre_signals_paths}
print("{'Electronic_signals:[array_of_all_electronic_paths]}")

In [None]:
def extract_features(signal,sampling_rate,n_mfcc,genre):
    """Summarise one song as a short feature vector plus its genre label.

    Parameters
    ----------
    signal
        Audio time series handed to librosa (presumably the 1-D array
        returned by ``librosa.load`` -- confirm against the caller).
    sampling_rate
        Sampling rate of ``signal``; forwarded to the MFCC computation.
    n_mfcc
        Number of MFCC coefficients to compute.
    genre
        Genre name; it is encoded with the module-level ``LabelEncoder``
        ``le``, so it must be one of the classes ``le`` was fitted on.

    Returns
    -------
    list
        ``[avg_zcr, std_zcr, avg_mfcc, std_mfcc, encoded_genre]``.
    """
    from librosa.feature import zero_crossing_rate

    # The audio is passed by keyword: recent librosa releases accept ``y``
    # only as a keyword argument for mfcc, and older ones accept it too.
    zcr = zero_crossing_rate(y=signal)[0]
    act_mfcc = mfcc(y=signal, sr=sampling_rate, n_mfcc=n_mfcc)

    # Mean and standard deviation are taken over the whole array, so the MFCC
    # matrix (all coefficients, all frames) collapses to one scalar each.
    return [
        np.mean(zcr),
        np.std(zcr),
        np.mean(act_mfcc),
        np.std(act_mfcc),
        le.transform([genre])[0],
    ]

In [None]:


%%time
n_mfcc =12
d={}
song_num = 0
for genre_path_name,genre_paths in paths_dict.items(): 
    song_num=song_num+1
    #for i in range(len(all_songs_path)):
    try:
        for song_path in genre_paths:           
            song_signal = librosa.load(song_path,sr=None)[0]
            curr_song_genre= genre_path_name[:-6]
            d[song_num]= extract_features(song_signal,sampling_rate,n_mfcc,curr_song_genre)
    except IOError as exc:
        print("Unable to locate folder")
            
complete_df = pd.DataFrame(data=d,index =index)

In [None]:
# Swap rows and columns and keep an independent copy of the result. The result
# is stored back under the same name, so running this cell a second time swaps
# the table back again.
complete_df = complete_df.transpose().copy()
complete_df