# Design: How to Combine the Data Sources


In [15]:
import pandas as pd
import fma_modules.utils as fma_utils
import librosa


### Data Combining
We should have a single data source with harmonized IDs and metadata for each audio file, as well as labels.

In [44]:
class DataSource():
    '''Common base for one audio dataset's metadata and file locations.

    Attributes:
        metadata_path: the path to the dataset's metadata csv files
        audio_path: the path to the dataset's audio files
        columns: ordered list of column names included in the harmonized dataframe
    '''

    def __init__(self, metadata_path, audio_path):
        self.metadata_path, self.audio_path = metadata_path, audio_path
        # Every source reports the same column layout; a fresh list per instance.
        self.columns = [
            'dataset', 'audio_path', 'label',
            'fma_genre_top', 'fma_genres', 'fma_genres_all',
        ]

    def get_file_meta(self):
        '''Placeholder for the standardized / harmonized dataframe.

        The base implementation returns None; subclasses override it.
        '''
        return None

    def get_audio_paths(self):
        '''Placeholder for the series of audio paths; the base implementation returns None.'''
        return None

        
class FreeMusicArchive(DataSource):
    '''Specifics of Free Music Archive Data Source

    Attributes:
        tracks: the result of fma_utils.load on tracks.csv under metadata_path
    '''

    def __init__(self, metadata_path, audio_path):
        super().__init__(metadata_path, audio_path)
        # metadata_path is concatenated as-is, so it has to end with a path separator.
        self.tracks = fma_utils.load(self.metadata_path + 'tracks.csv')

    def get_file_meta(self):
        '''Returns the FMA track metadata in the harmonized column layout.

        The genre columns are renamed with an fma_ prefix, every row is tagged with
        dataset 'fma', and label is the top genre lower-cased with '-' removed
        (e.g. 'Hip-Hop' -> 'hiphop'). Rows without a top genre keep a missing label.
        '''
        # Genre fields live under the 'track' group of the loaded columns.
        track_meta = self.tracks['track']
        id_and_labels = (track_meta[['genre_top', 'genres', 'genres_all']]
                         .rename(columns={'genre_top': 'fma_genre_top',
                                          'genres': 'fma_genres',
                                          'genres_all': 'fma_genres_all'})
                         )
        id_and_labels['dataset'] = 'fma'
        # Assignment aligns on the track index shared with self.tracks.
        id_and_labels['audio_path'] = self.get_audio_paths()

        # lower case and drop '-'; regex=False makes the replacement literal
        # regardless of the pandas version's default for `regex`.
        id_and_labels['label'] = (id_and_labels['fma_genre_top']
                                  .str.lower()
                                  .str.replace('-', '', regex=False))

        return id_and_labels[self.columns]

    def get_audio_paths(self):
        '''Returns a series, indexed like self.tracks, of fma_utils.get_audio_path results.

        NOTE(review): a path is produced for every row of tracks.csv, whether or not
        the file exists under audio_path - confirm the metadata matches the audio subset.
        '''
        return (self.tracks.index
                    .to_series()
                    .map(lambda index: fma_utils.get_audio_path(self.audio_path, index))
                )


class GTZAN(DataSource):
    '''GTZAN-specific loading and harmonization.

    Attributes:
        features_30_sec: dataframe read from features_30_sec.csv under metadata_path
    '''

    def __init__(self, metadata_path, audio_path):
        DataSource.__init__(self, metadata_path, audio_path)
        features_csv = metadata_path + 'features_30_sec.csv'
        self.features_30_sec = pd.read_csv(features_csv)

    def get_file_meta(self):
        '''Returns the GTZAN rows in the harmonized column layout, indexed by filename as track_id.'''
        base = self.features_30_sec[['filename', 'label']].reset_index()

        # Files sit in one folder per original GTZAN label, so the path is built
        # before the labels are consolidated below.
        file_paths = self.audio_path + '/' + base['label'] + '/' + base['filename']
        harmonized = base.assign(
            track_id=base['filename'],
            dataset='gtzan',
            audio_path=file_paths,
        ).set_index('track_id')

        # The FMA-only genre columns get a constant placeholder.
        for fma_column in ('fma_genre_top', 'fma_genres', 'fma_genres_all'):
            harmonized[fma_column] = 'n/a'

        # consolidate subgenres to match FMA genres
        harmonized['label'] = harmonized['label'].replace({
            'metal': 'rock',
            'disco': 'soulrnb',
            'reggae': 'international',
        })

        return harmonized[self.columns]
    

class CombinedDataLoader():
    '''Loads each data source and provides access to the unioned result.

    Args:
        fma_metadata_path: folder holding the FMA metadata csv files (with trailing separator)
        fma_audio_path: folder holding the FMA audio files
        gtzan_metadata_path: folder holding the GTZAN csv files (with trailing separator)
        gtzan_audio_path: folder holding the GTZAN genre sub-folders

    Attributes:
        fma: the FreeMusicArchive source
        gtzan: the GTZAN source
        df: the unioned result of the common columns of the fma and gtzan data sources
        FMA_MEATADATA_PATH, FMA_AUDIO_PATH, GTZAN_MEATADATA_PATH, GTZAN_AUDIO_PATH:
            the paths in use; the "MEATADATA" spelling is kept so existing code that
            reads these attributes keeps working
    '''

    def __init__(self,
                 fma_metadata_path="project_data_source/free_music_archive/fma_metadata/",
                 fma_audio_path="project_data_source/free_music_archive/fma_small/",
                 gtzan_metadata_path="project_data_source/gtzan_dataset/Data/",
                 gtzan_audio_path="project_data_source/gtzan_dataset/Data/genres_original"):
        self.FMA_MEATADATA_PATH = fma_metadata_path
        self.FMA_AUDIO_PATH = fma_audio_path
        self.fma = FreeMusicArchive(self.FMA_MEATADATA_PATH, self.FMA_AUDIO_PATH)

        self.GTZAN_MEATADATA_PATH = gtzan_metadata_path
        self.GTZAN_AUDIO_PATH = gtzan_audio_path
        self.gtzan = GTZAN(self.GTZAN_MEATADATA_PATH, self.GTZAN_AUDIO_PATH)

        self.df = self.get_combined_df()

    def get_combined_df(self):
        '''Returns the harmonized frames of fma and gtzan stacked row-wise, fma first.

        Each source's own index is kept, so the result's index holds whatever
        track ids the two sources use.
        '''
        sources = (self.fma, self.gtzan)
        return pd.concat([source.get_file_meta() for source in sources])
    


    

### Audio Extraction Class
The combined dataframe can be the source for an audio feature extraction process in which we load the files with libraries like librosa and use those libraries to extract numerical features.
The features could be put back into the dataframe or written out somewhere.

In [45]:
class AudioFeatureExtractor():
    '''Interface for audio feature extraction libraries.

    Takes a dataframe of the source data as input and provides a method for adding
    the extracted audio data to it.

    Attributes:
        df: working copy of source_data (which should be based on CombinedDataLoader.df;
            a subset of rows can be passed for testing); extraction results are
            added to it as columns
    '''

    def __init__(self, source_data):
        # Copy so that adding columns never writes into the caller's frame or a slice of it.
        self.df = source_data.copy()

    def get_audio_data(self, file_name):
        '''Loads one audio file with librosa.

        Args:
            file_name: path of the audio file

        Returns:
            the (y, sr) tuple from librosa.load, or the sentinel 0 when loading failed
        '''
        try:
            y, sr = librosa.load(file_name)
        except Exception:
            # Best effort: a file that cannot be loaded is marked with 0 instead of
            # aborting the whole run. Only Exception is caught so KeyboardInterrupt
            # and SystemExit can still stop a long extraction.
            return 0
        return y, sr

    def add_audio_data_to_df(self):
        '''Adds an audio_data column holding get_audio_data(path) for every audio_path in df.'''
        self.df['audio_data'] = self.df['audio_path'].apply(self.get_audio_data)
    
    

### Bring in Data

In [46]:
# Build both sources from their default paths; the unioned frame ends up in data.df.
data = CombinedDataLoader()

In [47]:
# Shorthand for the unioned FMA + GTZAN metadata frame.
combined = data.df

In [20]:
# Number of rows with a non-null audio_path per dataset.
combined.groupby('dataset')['audio_path'].count()

dataset
fma      106574
gtzan      1000
Name: audio_path, dtype: int64

### Test Extraction With Librosa - FMA dataset

In [21]:
# Take the first 100 rows as a small subset for testing.
# (Alternative subset, GTZAN only: combined[combined.dataset == 'gtzan'])
test_data = combined.iloc[:100]

In [22]:
# Instantiate the extractor; it works on its own copy of test_data.
afe = AudioFeatureExtractor(test_data)

In [23]:
# View the source rows before extraction.
afe.df

Unnamed: 0_level_0,dataset,audio_path,label,fma_genre_top,fma_genres,fma_genres_all
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,fma,project_data_source/free_music_archive/fma_sma...,Hip-Hop,Hip-Hop,[21],[21]
3,fma,project_data_source/free_music_archive/fma_sma...,Hip-Hop,Hip-Hop,[21],[21]
5,fma,project_data_source/free_music_archive/fma_sma...,Hip-Hop,Hip-Hop,[21],[21]
10,fma,project_data_source/free_music_archive/fma_sma...,Pop,Pop,[10],[10]
20,fma,project_data_source/free_music_archive/fma_sma...,,,"[76, 103]","[17, 10, 76, 103]"
...,...,...,...,...,...,...
251,fma,project_data_source/free_music_archive/fma_sma...,,,"[12, 76]","[10, 12, 76]"
252,fma,project_data_source/free_music_archive/fma_sma...,Rock,Rock,[12],[12]
253,fma,project_data_source/free_music_archive/fma_sma...,,,"[12, 76]","[10, 12, 76]"
254,fma,project_data_source/free_music_archive/fma_sma...,,,"[12, 76]","[10, 12, 76]"


In [24]:
# Load every file in audio_path with librosa and store the result in the audio_data column.
afe.add_audio_data_to_df()

  y, sr = librosa.load(file_name)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


In [25]:
# Analyze the success rate: audio_data is 0 when loading failed, so the
# False group counts the failures and the True group the successful loads.
afe.df.groupby(afe.df['audio_data'] != 0).count()

Unnamed: 0_level_0,dataset,audio_path,label,fma_genre_top,fma_genres,fma_genres_all,audio_data
audio_data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
False,80,80,67,67,80,80,80
True,20,20,20,20,20,20,20


In [26]:
# Inspect the loaded data: a (samples, sample rate) tuple on success, 0 on failure.
afe.df['audio_data']

track_id
2      ([3.7252903e-09, 4.1909516e-09, 9.313226e-10, ...
3                                                      0
5      ([-2.3283064e-09, -6.519258e-09, 0.0, -9.31322...
10     ([3.7252903e-09, 7.450581e-09, 0.0, 7.450581e-...
20                                                     0
                             ...                        
251                                                    0
252                                                    0
253                                                    0
254                                                    0
255    ([2.7939677e-09, -1.21071935e-08, 1.8626451e-0...
Name: audio_data, Length: 100, dtype: object

#### investigate the missing files

In [27]:
# Path built for the second row of the test subset, one of the rows that failed to load.
afe.df['audio_path'].iloc[1]

'project_data_source/free_music_archive/fma_small/000/000003.mp3'

In [28]:
# Rebuild the relative path for track id 3 by hand: the id is zero-padded to six
# digits and its first three digits name the sub-folder.
tid_str = '{:06d}'.format(3)

'/'.join((tid_str[:3], tid_str))

'000/000003'

### Test Extraction With Librosa - GTZAN dataset

In [29]:
# Keep only the GTZAN rows of the combined frame.
gtza_test_data = combined[combined.dataset == 'gtzan']

# Instantiate an extractor on its own copy of the GTZAN rows.
afe_gtza = AudioFeatureExtractor(gtza_test_data)
# Run the extraction: adds the audio_data column (0 marks a file that failed to load).
afe_gtza.add_audio_data_to_df()

  y, sr = librosa.load(file_name)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


In [30]:
# Evaluate results: False counts the files that failed to load (audio_data == 0), True the loaded ones.
afe_gtza.df.groupby(afe_gtza.df['audio_data'] != 0).count()

Unnamed: 0_level_0,dataset,audio_path,label,fma_genre_top,fma_genres,fma_genres_all,audio_data
audio_data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
False,1,1,1,1,1,1,1
True,999,999,999,999,999,999,999


In [31]:
# Show only the extracted audio_data column for the GTZAN rows.
afe_gtza.df[['audio_data']]

Unnamed: 0_level_0,audio_data
track_id,Unnamed: 1_level_1
blues.00000.wav,"([0.0073242188, 0.016601562, 0.0076293945, -0...."
blues.00001.wav,"([0.0034179688, 0.0043029785, 0.001373291, 0.0..."
blues.00002.wav,"([0.019012451, 0.047698975, 0.029418945, -0.01..."
blues.00003.wav,"([-0.013000488, -0.03060913, -0.036071777, -0...."
blues.00004.wav,"([-0.0063171387, -0.009277344, -0.008331299, -..."
...,...
rock.00095.wav,"([-0.0826416, -0.12426758, -0.09277344, -0.074..."
rock.00096.wav,"([0.07272339, 0.10369873, 0.10211182, 0.107116..."
rock.00097.wav,"([-0.03414917, -0.038360596, 0.0030822754, 0.0..."
rock.00098.wav,"([0.0859375, 0.14483643, 0.14294434, 0.1555481..."


### Genre name exploration 

In [188]:
# Display the combined frame to compare label against the original FMA genre columns.
combined

Unnamed: 0_level_0,dataset,audio_path,label,fma_genre_top,fma_genres,fma_genres_all
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,fma,project_data_source/free_music_archive/fma_sma...,hiphop,Hip-Hop,[21],[21]
3,fma,project_data_source/free_music_archive/fma_sma...,hiphop,Hip-Hop,[21],[21]
5,fma,project_data_source/free_music_archive/fma_sma...,hiphop,Hip-Hop,[21],[21]
10,fma,project_data_source/free_music_archive/fma_sma...,pop,Pop,[10],[10]
20,fma,project_data_source/free_music_archive/fma_sma...,,,"[76, 103]","[17, 10, 76, 103]"
...,...,...,...,...,...,...
rock.00095.wav,gtzan,project_data_source/gtzan_dataset/Data/genres_...,rock,,,
rock.00096.wav,gtzan,project_data_source/gtzan_dataset/Data/genres_...,rock,,,
rock.00097.wav,gtzan,project_data_source/gtzan_dataset/Data/genres_...,rock,,,
rock.00098.wav,gtzan,project_data_source/gtzan_dataset/Data/genres_...,rock,,,


In [189]:
# Row count per harmonized label, most frequent first.
combined.value_counts(['label'])

label             
rock                  14282
experimental          10608
electronic             9372
hiphop                 3652
folk                   2803
pop                    2432
instrumental           2079
international          1389
classical              1330
jazz                    671
oldtime / historic      554
spoken                  423
country                 294
blues                   210
soulrnb                 175
metal                   100
disco                   100
reggae                  100
easy listening           24
dtype: int64

In [190]:
# Share of each label in percent, among the rows that have a label.
# Classes are not well balanced (rock ~28% vs. classical ~2.6%).
combined['label'].value_counts(normalize=True) * 100

rock                  28.226412
experimental          20.965256
electronic            18.522471
hiphop                 7.217677
folk                   5.539745
pop                    4.806514
instrumental           4.108858
international          2.745168
classical              2.628562
jazz                   1.326139
oldtime / historic     1.094905
spoken                 0.836001
country                0.581051
blues                  0.415036
soulrnb                0.345863
disco                  0.197636
metal                  0.197636
reggae                 0.197636
easy listening         0.047433
Name: label, dtype: float64

In [191]:
# Count the rows whose label is missing.
print(combined['label'].isna().sum())

# Fraction of rows with a missing label -> dropping them would lose ~53% of the data.
combined['label'].isna().sum() /len(combined)

56976


0.5296447096882146

### multiple genres

In [200]:
# Name the first two genres of the FMA tracks that have no harmonized label.
genres_ids = pd.read_csv('project_data_source/free_music_archive/fma_metadata/genres.csv')

# FMA rows whose label is missing.
label_na_df = combined[combined['label'].isna()]
label_na_df_fma = label_na_df[label_na_df['dataset'] == 'fma']
g = label_na_df_fma[['fma_genres']]

# Spread each genre-id list over positional columns and keep the first two.
# reindex (instead of column selection) also works when no track has a second
# genre: the missing column is created and filled with NaN.
# NOTE: the result is indexed 0..n-1, not by track_id.
genres_df = (pd.DataFrame(g.fma_genres.values.tolist())
             .add_prefix('genre_')
             .reindex(columns=['genre_0', 'genre_1']))

# genre_id -> title lookup (used instead of a merge whose result was discarded).
lookup_dict = dict(zip(genres_ids['genre_id'], genres_ids['title']))

genres_df['genre_0_title'] = genres_df['genre_0'].map(lookup_dict)
genres_df['genre_1_title'] = genres_df['genre_1'].map(lookup_dict)
# A row missing either title gets a missing combined_genre.
genres_df['combined_genre'] = genres_df['genre_0_title'] + ' / ' + genres_df['genre_1_title']

In [202]:
# Display the first two genre ids, their titles and the combined title per unlabeled FMA track.
genres_df

Unnamed: 0,genre_0,genre_1,genre_0_title,genre_1_title,combined_genre
0,76.0,103.0,Experimental Pop,Singer-Songwriter,Experimental Pop / Singer-Songwriter
1,76.0,103.0,Experimental Pop,Singer-Songwriter,Experimental Pop / Singer-Songwriter
2,76.0,103.0,Experimental Pop,Singer-Songwriter,Experimental Pop / Singer-Songwriter
3,76.0,103.0,Experimental Pop,Singer-Songwriter,Experimental Pop / Singer-Songwriter
4,76.0,103.0,Experimental Pop,Singer-Songwriter,Experimental Pop / Singer-Songwriter
...,...,...,...,...,...
56971,15.0,32.0,Electronic,Noise,Electronic / Noise
56972,15.0,32.0,Electronic,Noise,Electronic / Noise
56973,42.0,107.0,Ambient Electronic,Ambient,Ambient Electronic / Ambient
56974,,,,,
