In [2]:
import sys
import os
import numpy as np
import pandas as pd
from multiprocessing import Process
sys.path.insert(0, '../../')

In [3]:
from library.notebook_api.data_loader import CombinedDataLoader, ModelDataLoader

In [4]:
from library.source_data.feature_extractor import AudioFeatureExtractor

## Load initial Meta

In [6]:
df = CombinedDataLoader().get_combined_df()#.head()
df_files_available = df[df.file_available ==1]
df_genres_available = df[df.label.isnull() ==False]
df_filtered = df_files_available[ df.label.isnull() ==False ]

  df_filtered = df_files_available[ df.label.isnull() ==False ]


In [9]:
print('tracks in meta', 
      len(df), 
      ': with files available: ', 
      len(df_files_available),
      ': with genres available: ', 
      len(df_genres_available),
       ': with both available: ', 
      len(df_filtered),
      
      )

tracks in meta 107574 : with files available:  9000 : with genres available:  50598 : with both available:  9000


In [10]:
df_filtered.count()

track_id          9000
dataset           9000
audio_path        9000
label             9000
fma_genre_top     9000
fma_genres        9000
fma_genres_all    9000
file_available    9000
dtype: int64

## Feature Extraction on full dataset with fma_small

 Extract just the features into memory and then persist to parquet

In [11]:
class AudioParallelProcessor():
    '''allows for parallel extraction'''
    def __init__(self, source_data, version = '003', batch=1, threads = 5):
        self.version = version 
        self.batch = batch
        self.threads = threads
        self.source_data = source_data
        self.input_length = len(self.source_data)
        self.start_thread_size = int(self.input_length/self.threads)

        return
    
    def execute(self):
        self.run_processes(self.get_extract_processes())
    
    def get_extract_processes(self):
        
        #map of index number and the function to run 
        extract_thread_functions = {}
        #map of index number and process to execture 
        extract_processes = {}
        #1 based indexes for the threads 
        thread_indexes = list(range(1,self.threads+1))
   
        #Instantiate the index locations for getting rows from data frame 
        start_record = 0
        end_record = self.start_thread_size 

        #for reach thread build out the processes for a subset of the data 
        for thread in thread_indexes:
            if thread == 2:
                start_record +=1
            if end_record == self.input_length - self.input_length%self.threads:
                end_record += self.input_length%self.threads
            print(start_record, end_record, len(batch_input.iloc[start_record:end_record]))
            start_record += self.start_thread_size
            end_record += self.start_thread_size 
            #extract_thread_functions[thread] = AudioFeatureExtractor(df[df.label.isnull() ==False].sample(100))
            extract_thread_functions[thread] = AudioFeatureExtractor(self.source_data.iloc[start_record:end_record])
            extract_processes[thread] = Process(target =extract_thread_functions[thread].run_extraction_thread,args=(self.version,self.batch,thread))
        return extract_processes


    def run_processes(self,extract_processes):
        for thread_index, process in extract_processes.items():
            process.start()
        for thread_index, process in extract_processes.items():
            process.join()



In [15]:
batch_input = df_filtered.sample(100).copy()
batch = AudioParallelProcessor(batch_input,version = '006', batch=1, threads = 8)
batch.execute()

0 12 12
13 24 11
25 36 11
37 48 11
49 60 11
61 72 11
73 84 11
85 100 15


	This function was moved to 'librosa.feature.rhythm.tempo' in librosa version 0.10.0.
	This alias will be removed in librosa version 1.0.
  tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)[0]
	This function was moved to 'librosa.feature.rhythm.tempo' in librosa version 0.10.0.
	This alias will be removed in librosa version 1.0.
  tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)[0]
	This function was moved to 'librosa.feature.rhythm.tempo' in librosa version 0.10.0.
	This alias will be removed in librosa version 1.0.
  tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)[0]
	This function was moved to 'librosa.feature.rhythm.tempo' in librosa version 0.10.0.
	This alias will be removed in librosa version 1.0.
  tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)[0]
	This function was moved to 'librosa.feature.rhythm.tempo' in librosa version 0.10.0.
	This alias will be removed in librosa version 1.0.
  tempo = librosa.beat.tempo(onset_envelope=onset_

In [16]:
full_model_data = ModelDataLoader('006')

In [17]:
full_model_data.df.count()

index                                 81
track_id                              81
dataset                               81
audio_path                            81
label                                 81
fma_genre_top                         81
fma_genres                            81
fma_genres_all                        81
file_available                        81
sampling_rate                         81
features                              81
spectral_centroids_mean               81
spectral_centroids_delta_mean         81
spectral_centroids_accelerate_mean    81
spectral_bandwidth_mean               81
spectral_rolloff_mean                 81
zero_crossing_rate_mean               81
rms_mean                              81
chroma_stft_mean                      81
mfccs_mean                            81
onset                                 81
tempo                                 81
contrast                              81
tonnetz                               81
mfccs_min       

## Extraction while saving audio and features on small sample
in this run, we saved the output of librosa load in the parquet as well.  This takes a lot of memory so only did a small example

In [4]:
#instantiate data 
#
gtza_test_data = df_filtered[df_filtered.dataset == 'gtzan']

In [5]:
#instantiate extractor
#commented in alternate return classe in get_audio_data method to include full audio
gtza_extract = AudioFeatureExtractor(gtza_test_data.head(5))

In [6]:
#run extraction
#print statement uncommented 
gtza_extract.add_audio_data_to_df()

Processing File  /project_data_source/gtzan_dataset/Data/genres_original/blues/blues.00000.wav
Run Librosa Load
extracting_features
Processing File  /project_data_source/gtzan_dataset/Data/genres_original/blues/blues.00001.wav
Run Librosa Load
extracting_features
Processing File  /project_data_source/gtzan_dataset/Data/genres_original/blues/blues.00002.wav
Run Librosa Load
extracting_features
Processing File  /project_data_source/gtzan_dataset/Data/genres_original/blues/blues.00003.wav
Run Librosa Load
extracting_features
Processing File  /project_data_source/gtzan_dataset/Data/genres_original/blues/blues.00004.wav
Run Librosa Load
extracting_features
putting features to their own columns


In [7]:
#save the results
gtza_extract.save_results() 

In [10]:
#Load the model data for this version
model_data = ModelDataLoader('000')

In [11]:
#Note full librosa_load output array was also saved in this parquet
model_data.df

Unnamed: 0_level_0,dataset,audio_path,label,fma_genre_top,fma_genres,fma_genres_all,librosa_load,sampling_rate,features,spectral_centroids_mean,spectral_centroids_delta_mean,spectral_centroids_accelerate_mean,spectral_bandwidth_mean,spectral_rolloff_mean,zero_crossing_rate_mean,rms_mean,chroma_stft_mean,mfccs_mean
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
blues.00000.wav,gtzan,/project_data_source/gtzan_dataset/Data/genres...,blues,,,,"[0.0073242188, 0.016601562, 0.0076293945, -0.0...",22050,"[1784.1227, -0.49052292, -0.061881434, 2002.41...",1784.122681,-0.490523,-0.061881,2002.412354,3805.723145,0.083045,0.130184,0.350129,3.12123
blues.00001.wav,gtzan,/project_data_source/gtzan_dataset/Data/genres...,blues,,,,"[0.0034179688, 0.0043029785, 0.001373291, 0.00...",22050,"[1530.2617, 0.6459638, -0.0855879, 2038.9875, ...",1530.261719,0.645964,-0.085588,2038.987549,3550.713623,0.05604,0.095908,0.340849,-0.007777
blues.00002.wav,gtzan,/project_data_source/gtzan_dataset/Data/genres...,blues,,,,"[0.019012451, 0.047698975, 0.029418945, -0.018...",22050,"[1552.8325, 0.3400274, 0.025039379, 1747.754, ...",1552.83252,0.340027,0.025039,1747.754028,3042.410156,0.076291,0.175473,0.363538,3.36407
blues.00003.wav,gtzan,/project_data_source/gtzan_dataset/Data/genres...,blues,,,,"[-0.013000488, -0.03060913, -0.036071777, -0.0...",22050,"[1070.1534, 0.25125474, 0.07936938, 1596.4226,...",1070.153442,0.251255,0.079369,1596.422607,2184.87915,0.033309,0.14104,0.404854,0.86687
blues.00004.wav,gtzan,/project_data_source/gtzan_dataset/Data/genres...,blues,,,,"[-0.0063171387, -0.009277344, -0.008331299, -0...",22050,"[1835.1285, -0.17207043, -0.24526799, 1748.410...",1835.12854,-0.17207,-0.245268,1748.410767,3579.95752,0.101461,0.091501,0.308526,-9.653034
