# Design: How to Combine the Data Sources


In [83]:
import logging
import os

import librosa
import pandas as pd

import fma_modules.utils as fma_utils


### Data Combining
We should have a single data source with harmonized IDs, metadata, and labels for each audio file.

In [180]:
class DataSource():
    '''Common interface shared by every dataset wrapper.

    Attributes:
        metadata_path: location of the metadata csv files
        audio_path: location of the audio files
        columns: ordered column names of the harmonized dataframe
    '''
    def __init__(self, metadata_path, audio_path):
        self.metadata_path = metadata_path
        self.audio_path = audio_path
        # Harmonized schema: generic columns first, FMA-specific ones last.
        generic_columns = ['dataset', 'audio_path', 'label']
        fma_columns = ['fma_genre_top', 'fma_genres', 'fma_genres_all']
        self.columns = generic_columns + fma_columns

    def get_file_meta(self):
        '''Placeholder for the harmonized dataframe; the base class yields None.'''
        return None

    def get_audio_paths(self):
        '''Placeholder for the series of audio paths; the base class yields None.'''
        return None

        
class FreeMusicArchive(DataSource):
    '''Free Music Archive (FMA) data source.

    Attributes:
        tracks: the object returned by fma_utils.load for tracks.csv found
            under metadata_path (presumably a dataframe indexed by track id
            with grouped columns -- TODO confirm against fma_utils)

    NOTE(review): audio paths are generated for every track id present in the
    metadata, without checking that the file exists under audio_path.
    '''

    def __init__(self, metadata_path, audio_path):
        super().__init__(metadata_path, audio_path)
        # os.path.join works whether or not metadata_path ends with a separator
        self.tracks = fma_utils.load(os.path.join(self.metadata_path, 'tracks.csv'))

    def get_file_meta(self):
        '''Returns the harmonized dataframe for FMA.

        The genre columns of the 'track' group are renamed with an 'fma_'
        prefix, the top-level genre is reused as the shared 'label', and the
        result is restricted to self.columns.
        '''
        track_meta = self.tracks['track']
        id_and_labels = (track_meta[['genre_top', 'genres', 'genres_all']]
                         .rename(columns={'genre_top': 'fma_genre_top',
                                          'genres': 'fma_genres',
                                          'genres_all': 'fma_genres_all'})
                         )
        id_and_labels['dataset'] = 'fma'
        # Assignment aligns on the index, which both objects take from self.tracks
        id_and_labels['audio_path'] = self.get_audio_paths()
        id_and_labels['label'] = id_and_labels['fma_genre_top']

        return id_and_labels[self.columns]

    def get_audio_paths(self):
        '''Returns a series, indexed like self.tracks, of audio file paths
        built by fma_utils.get_audio_path from audio_path and each index value.
        '''
        return (self.tracks.index
                    .to_series()
                    .map(lambda track_id: fma_utils.get_audio_path(self.audio_path, track_id))
                )


class GTZAN(DataSource):
    '''GTZAN data source.

    Attributes:
        features_30_sec: dataframe read from features_30_sec.csv found under
            metadata_path; get_file_meta uses its 'filename' and 'label' columns
    '''

    def __init__(self, metadata_path, audio_path):
        super().__init__(metadata_path, audio_path)
        # os.path.join works whether or not metadata_path ends with a separator
        self.features_30_sec = pd.read_csv(os.path.join(metadata_path, 'features_30_sec.csv'))

    def get_file_meta(self):
        '''Returns the harmonized dataframe for GTZAN.

        The result is indexed by 'track_id' (the file name), restricted to
        self.columns, and carries 'n/a' in the FMA-specific columns.
        '''
        # Explicit copy so the column assignments below never touch features_30_sec
        id_and_labels = self.features_30_sec[['filename', 'label']].copy()
        id_and_labels['track_id'] = id_and_labels['filename']
        id_and_labels['dataset'] = 'gtzan'
        # Audio files are laid out as <audio_path>/<label>/<filename>
        id_and_labels['audio_path'] = (self.audio_path + '/' + id_and_labels['label']
                                       + '/' + id_and_labels['filename'])

        harmonized = id_and_labels.set_index('track_id')
        # GTZAN has no FMA genre information; fill the shared schema with a marker
        for fma_column in ('fma_genre_top', 'fma_genres', 'fma_genres_all'):
            harmonized[fma_column] = 'n/a'

        return harmonized[self.columns]
    

class CombinedDataLoader():
    '''Loads each data source and provides access to the unioned result.

    Args:
        fma_metadata_path: directory holding the FMA metadata csv files
        fma_audio_path: directory holding the FMA audio files
        gtzan_metadata_path: directory holding the GTZAN metadata csv files
        gtzan_audio_path: directory holding the GTZAN audio files
        All four default to the project's standard layout.

    Attributes:
        FMA_MEATADATA_PATH, FMA_AUDIO_PATH: the FMA paths in use
        GTZAN_MEATADATA_PATH, GTZAN_AUDIO_PATH: the GTZAN paths in use
        fma: the FreeMusicArchive data source
        gtzan: the GTZAN data source
        df: The unioned result of common columns of fma and gtzan datasources
    '''
    def __init__(self,
                 fma_metadata_path="project_data_source/free_music_archive/fma_metadata/",
                 fma_audio_path="project_data_source/free_music_archive/fma_small/",
                 gtzan_metadata_path="project_data_source/gtzan_dataset/Data/",
                 gtzan_audio_path="project_data_source/gtzan_dataset/Data/genres_original"):
        # Attribute names (including the "MEATADATA" spelling) are kept as they
        # were so that existing code reading them keeps working.
        self.FMA_MEATADATA_PATH = fma_metadata_path
        self.FMA_AUDIO_PATH = fma_audio_path
        self.fma = FreeMusicArchive(self.FMA_MEATADATA_PATH, self.FMA_AUDIO_PATH)
        self.GTZAN_MEATADATA_PATH = gtzan_metadata_path
        self.GTZAN_AUDIO_PATH = gtzan_audio_path
        self.gtzan = GTZAN(self.GTZAN_MEATADATA_PATH, self.GTZAN_AUDIO_PATH)
        self.df = self.get_combined_df()

    def get_combined_df(self):
        '''Returns the row-wise concatenation of each source's harmonized
        dataframe, FMA rows first and GTZAN rows second.
        '''
        sources = [self.fma, self.gtzan]
        return pd.concat([source.get_file_meta() for source in sources])
    


    

### Audio Extraction Class
The combined dataframe can serve as the source for an audio feature extraction process, in which we load files into libraries like librosa and use those libraries to extract numerical features.
The features could be put back into the dataframe or written out somewhere.

In [174]:
class AudioFeatureExtractor():
    '''Interface for audio feature extraction libraries.

    Takes a dataframe of the source data as input and provides a method for
    adding extracted audio data to that dataframe.

    Attributes:
        df: current state of the dataframe. It starts as a copy of the
            source_data passed to the constructor, which should be based on
            CombinedDataLoader.df (a subset of rows can be passed for testing).
    '''
    def __init__(self, source_data):
        # Work on a copy so that added columns never leak into the caller's frame
        self.df = source_data.copy()

    def get_audio_data(self, file_name):
        '''Loads one audio file with librosa.

        Returns:
            The (y, sr) tuple produced by librosa.load, or the sentinel 0 when
            the file could not be loaded.
        '''
        try:
            y, sr = librosa.load(file_name)
        except Exception as err:
            # Best effort: one unreadable file must not abort a whole run.
            # Catching Exception (not a bare except) keeps KeyboardInterrupt and
            # SystemExit working, so a long extraction can still be stopped.
            logging.getLogger(__name__).warning(
                "could not load audio file %s: %s", file_name, err)
            return 0
        return y, sr

    def add_audio_data_to_df(self):
        '''Adds an 'audio_data' column holding get_audio_data's result for
        every value of the 'audio_path' column. Modifies self.df in place.
        '''
        self.df['audio_data'] = self.df['audio_path'].apply(self.get_audio_data)
    
    

### Bring in Data

In [181]:
# Build the loader; its constructor loads both data sources and builds the combined dataframe
data = CombinedDataLoader()

In [182]:
# The unioned FMA + GTZAN metadata
combined = data.df

In [183]:
# Number of non-null audio paths per source dataset
combined.groupby('dataset')['audio_path'].count()

dataset
fma      106574
gtzan      1000
Name: audio_path, dtype: int64

### Test Extraction With Librosa - FMA dataset

In [190]:
# Take the first 100 rows of the combined frame as a quick test subset
test_data = combined.iloc[0:100]
#test_data = combined[combined.dataset == 'gtzan']

In [191]:
# Instantiate the extractor; it works on its own copy of test_data
afe = AudioFeatureExtractor(test_data)

In [192]:
# View the source dataframe before extraction
afe.df

Unnamed: 0_level_0,dataset,audio_path,label,fma_genre_top,fma_genres,fma_genres_all
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,fma,project_data_source/free_music_archive/fma_sma...,Hip-Hop,Hip-Hop,[21],[21]
3,fma,project_data_source/free_music_archive/fma_sma...,Hip-Hop,Hip-Hop,[21],[21]
5,fma,project_data_source/free_music_archive/fma_sma...,Hip-Hop,Hip-Hop,[21],[21]
10,fma,project_data_source/free_music_archive/fma_sma...,Pop,Pop,[10],[10]
20,fma,project_data_source/free_music_archive/fma_sma...,,,"[76, 103]","[17, 10, 76, 103]"
...,...,...,...,...,...,...
251,fma,project_data_source/free_music_archive/fma_sma...,,,"[12, 76]","[10, 12, 76]"
252,fma,project_data_source/free_music_archive/fma_sma...,Rock,Rock,[12],[12]
253,fma,project_data_source/free_music_archive/fma_sma...,,,"[12, 76]","[10, 12, 76]"
254,fma,project_data_source/free_music_archive/fma_sma...,,,"[12, 76]","[10, 12, 76]"


In [193]:
# Load the audio of every row with librosa and store it in the 'audio_data' column
afe.add_audio_data_to_df()

  y, sr = librosa.load(file_name)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


In [194]:
# Analyze the success rate: get_audio_data returns 0 on failure,
# so the False group holds failed loads and the True group successful ones
afe.df.groupby(afe.df['audio_data'] != 0).count()

Unnamed: 0_level_0,dataset,audio_path,label,fma_genre_top,fma_genres,fma_genres_all,audio_data
audio_data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
False,80,80,67,67,80,80,80
True,20,20,20,20,20,20,20


In [195]:
# Inspect the loaded data (0 marks a file that could not be loaded)
afe.df['audio_data']

track_id
2      ([9.313226e-09, 2.7939677e-09, -3.7252903e-09,...
3                                                      0
5      ([-2.561137e-09, 5.5879354e-09, -5.5879354e-09...
10     ([-2.2351742e-08, 7.450581e-09, -7.450581e-09,...
20                                                     0
                             ...                        
251                                                    0
252                                                    0
253                                                    0
254                                                    0
255    ([-4.656613e-09, -7.450581e-09, 9.313226e-10, ...
Name: audio_data, Length: 100, dtype: object

#### investigate the missing files

In [131]:
# Path generated for the second row of the test subset
afe.df['audio_path'].iloc[1]

'project_data_source/free_music_archive/fma_small/000/000003.mp3'

In [133]:
# Rebuild by hand the "<first three digits>/<six-digit id>" path stem for track id 3
tid_str = '{:06d}'.format(3)

tid_str[:3]+'/'+tid_str

'000/000003'

### Test Extraction With Librosa - GTZAN dataset

In [198]:
# Select only the GTZAN rows of the combined frame
gtza_test_data = combined[combined.dataset == 'gtzan']

# Instantiate the extractor on its own copy of the GTZAN rows
afe_gtza = AudioFeatureExtractor(gtza_test_data)
# Run the extraction, filling the 'audio_data' column
afe_gtza.add_audio_data_to_df()

  y, sr = librosa.load(file_name)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


In [200]:
# Evaluate results: False = failed loads (sentinel 0), True = successful loads
afe_gtza.df.groupby(afe_gtza.df['audio_data'] != 0).count()

Unnamed: 0_level_0,dataset,audio_path,label,fma_genre_top,fma_genres,fma_genres_all,audio_data
audio_data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
False,1,1,1,1,1,1,1
True,999,999,999,999,999,999,999


In [203]:
# Show only the extracted audio column
afe_gtza.df[['audio_data']]

Unnamed: 0_level_0,audio_data
track_id,Unnamed: 1_level_1
blues.00000.wav,"([0.0073242188, 0.016601562, 0.0076293945, -0...."
blues.00001.wav,"([0.0034179688, 0.0043029785, 0.001373291, 0.0..."
blues.00002.wav,"([0.019012451, 0.047698975, 0.029418945, -0.01..."
blues.00003.wav,"([-0.013000488, -0.03060913, -0.036071777, -0...."
blues.00004.wav,"([-0.0063171387, -0.009277344, -0.008331299, -..."
...,...
rock.00095.wav,"([-0.0826416, -0.12426758, -0.09277344, -0.074..."
rock.00096.wav,"([0.07272339, 0.10369873, 0.10211182, 0.107116..."
rock.00097.wav,"([-0.03414917, -0.038360596, 0.0030822754, 0.0..."
rock.00098.wav,"([0.0859375, 0.14483643, 0.14294434, 0.1555481..."
