In [3]:
import librosa
import pandas as pd
from tqdm import tqdm_notebook

### Feature Extraction
This is the feature extraction used for instrument identification, the features used were selected through reading of articles and analyzing results of the sources below. The file's features are extracted and returned in an array.

#### Sources:
[MFCC, Mel Spec, Chroma Constant Q](https://www2.ak.tu-berlin.de/~akgroup/ak_pub/abschlussarbeiten/2018/Seipel_MasA.pdf)

[MFCC, Constant Q Transform](https://jamesowers.github.io/files/thesis.pdf)

[Analysis of MFCC Coefficients](http://www.haskins.yale.edu/sr/SR061/SR061_14.pdf)

[General Feature Extraction](https://medium.com/@nadimkawwa/can-we-guess-musical-instruments-with-machine-learning-afc8790590b8)

In [4]:
"""
Name: feature_extract
Input: String name of file to analyze
Returns: 
Array of:
    y_harmonic
    y_percussive 
    chroma_cens 
    mfcc
    mel_spec
    spec_contrast
Note: 
    MFCC and Chroma_cens are arrays of 12
    Mel_spec and spec_contrast are also arrays of 1xN.
"""
def feature_extract(file):
    y, sr = librosa.load(file, sr=None)
    
    hop_length = 512
    
    # Separate harmonics and percussives into two waveforms
    y_harmonic, y_percussive = librosa.effects.hpss(y)    
    
    #Chroma Energy Normalized (CENS)
    chroma_cens = librosa.feature.chroma_cens(y=y, sr=sr)
    
    #Mel Spectrogram
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, 
                                                 fmax = 8000)
    #Mel-Frequency Cepstral Coefficients (MFCC) features from the raw signal
    mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length, n_mfcc=13)
    
    #Spectral Contrast
    spec_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    
    y_harmonic = np.mean(y_harmonic)
    y_percussive = np.mean(y_percussive)
    mel_spec = np.mean(mel_spec, axis=1)
    mfcc = np.mean(mfcc, axis=1)
    chroma_cens = np.mean(chroma_cens,axis=1)
    spec_contrast = np.mean(spec_contrast, axis=1)
    
    return [y_harmonic, y_percussive, chroma_cens, mfcc, mel_spec, 
            spec_contrast]

### Build Dataset 
Builds the dataset extracting all features to be used for prediction.
Extracts the features into csv files for train, test and valid NSynth data folders.

Note: Uses NSynth Qualities that are in the annotated JSON files that accompany the datasets. 
If you are training with NSynth and predicting on non NSynth, either: 
- Create your own Qualities
- Use the build_dataset_noQualities function

In [6]:
"""
Name: build_dataset
Input: 
    data_group: Train, valid, test set name
    source: Acoustic, electronic, synthetic
    class1: First instrument to classify
    class2: second instrument to classify
Returns: 
Array of:
    y_harmonic
    y_percussive 
    mel_spec
    mfcc_0-12
    chroma_0-12
    spec_contrast
"""
def build_dataset(data_group, source, class1, class2):
    
    new_dir='Dataset/nsynth-'+data_group+'/audio/'     #set the audio directory (test, train, etc)
    dataframe_raw = pd.read_json(path_or_buf='Dataset/nsynth-'+data_group+'/examples.json', orient='index') #read all instruments from examples.json
    dataframe_specific = dataframe_raw.loc[(dataframe_raw['instrument_family_str'] == class2) | (dataframe_raw['instrument_family_str'] == class1)]           #narrow down by family (strings, etc)
    dataframe_specific = dataframe_specific.loc[dataframe_specific['instrument_source_str'] == source]     #narrow down by source (acoustic, etc)

   
    Y_target_class = dataframe_specific.instrument_family_str.replace(to_replace=[class2, class1], value=[0, 1])
    filenames = dataframe_specific.index.tolist()     #get filenames from our dataframe, put into list
    
    dictionary = {}
    #Create the dictionary of files.
    #Note: TQDM is a loading bar
    for file in tqdm_notebook(filenames):           
        features = feature_extract((new_dir+file+'.wav'))
        dictionary[file] = features
    
    feature_dataframe = pd.DataFrame.from_dict(dictionary, orient='index',
                                       columns=['y_harmonic', 'y_percussive', 'chroma_cens', 
                                                'mfcc', 'mel_spec', 'spec_contrast'])
    
    #Take averages of each coefficient etc and create their own feature
    mel_spec_data = pd.DataFrame(feature_dataframe.mel_spec.values.tolist(), 
                                 index=feature_dataframe.index)
    mel_spec_data = mel_spec_data.add_prefix('Mel_Spec_')
    
    mfcc_data = pd.DataFrame(feature_dataframe.mfcc.values.tolist(), 
                             index=feature_dataframe.index)
    mfcc_data = mfcc_data.add_prefix('MFCC_')
    
    chroma_data = pd.DataFrame(feature_dataframe.chroma_cens.values.tolist(), 
                               index=feature_dataframe.index)
    chroma_data = chroma_data.add_prefix('Chroma_')
    
    spec_contrast_data = pd.DataFrame(feature_dataframe.spec_contrast.values.tolist(), 
                                      index=feature_dataframe.index)
    spec_contrast_data = spec_contrast_data.add_prefix('Spec_Contrast_')
    
    #Drop the old feature columns
    feature_dataframe = feature_dataframe.drop(
        labels=['mel_spec', 'mfcc',
                'chroma_cens', 'spec_contrast'],
                                       axis=1)
    #Add the extracted features
    feature_dataframe = pd.concat([feature_dataframe, mel_spec_data, mfcc_data, 
                           chroma_data, spec_contrast_data],
                         axis = 1, join='inner')
    
    qualities =  pd.DataFrame(dataframe_specific.qualities.values.tolist(), 
                         index = dataframe_specific.index)
    qualities = qualities.add_prefix('NSynth_Quality_')
    dataframe_specific = dataframe_specific.drop(labels=['instrument', 'instrument_family',
                                                          'instrument_family_str', 'instrument_source',
                                                          'instrument_source_str', 'instrument_str',
                                                          'note', 'note_str', 'pitch',
                                                          'qualities_str', 'sample_rate',
                                                         'qualities'], axis=1)
    dataframe_specific.drop(dataframe_specific.columns[0], axis=1, inplace=True)
    feature_final = pd.concat([dataframe_specific, feature_dataframe, qualities], axis=1, sort=False)
    feature_final['target'] = Y_target_class
    feature_final.to_csv('./'+data_group+'.csv')
    
    #returns dataframe of features
    return feature_final 

In [5]:
# get string members from dataset
build_dataset('train', 'acoustic', 'keyboard','string')
build_dataset('valid', 'acoustic', 'keyboard','string')
build_dataset('test', 'acoustic', 'keyboard','string')


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=27458.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=1135.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=425.0), HTML(value='')))




Unnamed: 0,y_harmonic,y_percussive,Mel_Spec_0,Mel_Spec_1,Mel_Spec_2,Mel_Spec_3,Mel_Spec_4,Mel_Spec_5,Mel_Spec_6,Mel_Spec_7,...,NSynth_Quality_1,NSynth_Quality_2,NSynth_Quality_3,NSynth_Quality_4,NSynth_Quality_5,NSynth_Quality_6,NSynth_Quality_7,NSynth_Quality_8,NSynth_Quality_9,target
string_acoustic_056-047-075,2.276689e-06,-2.910399e-06,0.001254,0.000437,0.007593,0.132260,3.299507,1.221329,0.887105,0.396326,...,0,0,1,0,0,0,0,1,0,0
string_acoustic_014-063-050,-6.387113e-06,3.891315e-06,0.004674,0.012480,0.172364,0.511060,1.016936,0.569358,0.130581,0.286367,...,0,0,1,0,0,0,1,1,0,0
keyboard_acoustic_004-058-127,-3.420692e-06,-5.919131e-06,0.146179,0.369565,0.398131,0.961064,0.372122,0.310898,0.396792,1.115348,...,0,0,0,0,0,0,0,1,0,1
keyboard_acoustic_004-102-100,-8.896791e-07,7.785799e-07,0.198365,0.214118,0.199922,0.367875,0.134841,0.071007,0.038443,0.126842,...,0,0,0,0,0,0,1,1,0,1
string_acoustic_071-035-100,6.045782e-07,1.035758e-05,0.158858,1063.997437,1872.624268,3.170357,205.077103,79.005653,1.098996,7.933605,...,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
string_acoustic_056-071-100,2.481437e-07,8.309302e-07,0.000416,0.000115,0.000033,0.000027,0.000031,0.000283,0.002327,0.005635,...,0,0,1,0,0,0,1,1,0,0
string_acoustic_057-079-025,1.537761e-06,-5.174143e-06,0.005490,0.011117,0.006833,0.004046,0.002382,0.001289,0.001755,0.001601,...,0,0,0,0,0,1,0,1,0,0
keyboard_acoustic_004-105-025,1.393454e-06,-7.359323e-05,4.365399,6.370022,6.462551,14.129640,6.955965,3.840810,2.327511,4.322537,...,0,0,0,0,0,0,0,1,0,1
string_acoustic_071-027-127,6.190578e-07,7.037017e-06,0.365868,8.830516,97.897964,41.824306,68.299484,6.137504,3.585146,2.461521,...,0,0,0,1,0,0,0,1,0,0


### Build Dataset No NSynth Qualities
Builds the dataset extracting all features to be used for prediction.
Extracts the features into csv files for train, test and valid NSynth data folders.

Note: Does NOT use NSynth Qualities that are in the annotated JSON files that accompany the datasets. 

In [8]:

"""
Name: build_dataset_noQualities
Input: 
    data_group: Train, valid, test set name
    source: Acoustic, electronic, synthetic
    class1: First instrument to classify
    class2: second instrument to classify
Returns: 
Array of:
    y_harmonic
    y_percussive 
    mel_spec
    mfcc_0-12
    chroma_0-12
    spec_contrast
    
NOTE: 
In this version of get_dataset, NSynth Qualities are not used.
This is because the RACK dataset does not include the annotations,
and therefore cannot be used for model building or predicting.
"""
def build_dataset_noQualities(data_group, source, class1, class2):
    
    new_dir='Dataset/'+data_group+'/audio/'     #set the audio directory (test, train, etc)
    #read all instruments from examples.json
    dataframe_raw = pd.read_json(path_or_buf='Dataset/'+data_group+'/examples.json', orient='index') 
    #narrow down by family (strings, etc)
    dataframe_specific = dataframe_raw.loc[(dataframe_raw['instrument_family_str'] == class1) 
                                           | (dataframe_raw['instrument_family_str'] == class2)]
    #narrow down by source (acoustic, etc)
    dataframe_specific = dataframe_specific.loc[dataframe_specific['instrument_source_str'] == source]     

    Y_target_class = dataframe_specific.instrument_family_str.replace(to_replace=[class1, class2], value=[0, 1])
    filenames = dataframe_specific.index.tolist()     #get filenames from our dataframe, put into list
    
    dictionary = {}
    #Create the dictionary of files.
    #Note: TQDM is a loading bar
    for file in tqdm_notebook(filenames):           
        features = feature_extract((new_dir+file+'.wav'))
        dictionary[file] = features
    
    feature_dataframe = pd.DataFrame.from_dict(dictionary, orient='index',
                                       columns=['y_harmonic', 'y_percussive', 'chroma_cens', 
                                                'mfcc', 'mel_spec', 'spec_contrast'])
    
    #Take averages of each coefficient etc and create their own feature
    mel_spec_data = pd.DataFrame(feature_dataframe.mel_spec.values.tolist(), 
                                 index=feature_dataframe.index)
    mel_spec_data = mel_spec_data.add_prefix('Mel_Spec_')
    
    mfcc_data = pd.DataFrame(feature_dataframe.mfcc.values.tolist(), 
                             index=feature_dataframe.index)
    mfcc_data = mfcc_data.add_prefix('MFCC_')
    
    chroma_data = pd.DataFrame(feature_dataframe.chroma_cens.values.tolist(), 
                               index=feature_dataframe.index)
    chroma_data = chroma_data.add_prefix('Chroma_')
    
    spec_contrast_data = pd.DataFrame(feature_dataframe.spec_contrast.values.tolist(), 
                                      index=feature_dataframe.index)
    spec_contrast_data = spec_contrast_data.add_prefix('Spec_Contrast_')
    
    #Drop the old feature columns
    feature_dataframe = feature_dataframe.drop(
        labels=['mel_spec', 'mfcc',
                'chroma_cens', 'spec_contrast'],
                                       axis=1)
    #Add the extracted features
    feature_dataframe = pd.concat([feature_dataframe, mel_spec_data, mfcc_data, 
                           chroma_data, spec_contrast_data],
                         axis = 1, join='inner')
    
    feature_dataframe['target'] = Y_target_class
    feature_dataframe.to_csv('./RACK_'+data_group+'.csv')
    
    #returns dataframe of features
    return feature_dataframe 

In [15]:
# get string members from dataset
#Valid will be used to train, test for validation and rack for test prediction
build_dataset_noQualities('nsynth-valid', 'acoustic', 'keyboard','string')
build_dataset_noQualities('nsynth-test', 'acoustic', 'keyboard','string')
build_dataset_noQualities('rack', 'acoustic', 'keyboard','string')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=1135.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=425.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=91.0), HTML(value='')))




Unnamed: 0,y_harmonic,y_percussive,Mel_Spec_0,Mel_Spec_1,Mel_Spec_2,Mel_Spec_3,Mel_Spec_4,Mel_Spec_5,Mel_Spec_6,Mel_Spec_7,...,Chroma_3,Chroma_4,Chroma_5,Chroma_6,Chroma_7,Chroma_8,Chroma_9,Chroma_10,Chroma_11,target
piano_C3,-3.013500e-05,-4.806918e-05,0.010675,0.020180,0.007597,0.011244,0.275979,0.390320,0.038722,0.011710,...,0.056698,0.080517,0.076741,0.284282,0.419057,0.263463,0.053512,0.082871,0.379030,0
piano_C#3,-4.532198e-05,-3.599460e-05,0.004827,0.002378,0.004896,0.005988,0.330815,2.059720,0.107522,0.011353,...,0.067539,0.125573,0.169934,0.069656,0.064145,0.231651,0.077618,0.120991,0.135158,0
piano_D3,-3.923611e-05,-4.110286e-05,0.004261,0.002287,0.007585,0.013103,0.024830,2.144192,0.747066,0.012003,...,0.339147,0.094621,0.123334,0.155904,0.083371,0.239996,0.371104,0.317413,0.164585,0
piano_D#3,-4.894353e-05,-3.760510e-05,0.003800,0.001841,0.002671,0.005933,0.008173,2.686152,3.627953,0.052367,...,0.579025,0.397959,0.091141,0.117771,0.154132,0.088176,0.060912,0.240429,0.119155,0
piano_E3,-4.348458e-05,-3.993408e-05,0.003977,0.001781,0.004463,0.014611,0.010162,0.714662,7.354862,0.469965,...,0.412445,0.553940,0.391922,0.080273,0.080177,0.081519,0.072212,0.235855,0.338749,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
violin_short_G#4,2.499617e-06,1.922243e-06,0.000943,0.000878,0.000378,0.000188,0.000393,0.000674,0.001414,0.001356,...,0.249586,0.336175,0.420468,0.334355,0.287580,0.232396,0.223956,0.238590,0.219141,1
violin_short_G#5,1.450308e-05,4.068461e-07,0.001391,0.000438,0.000369,0.000332,0.000263,0.000777,0.001781,0.001617,...,0.308798,0.273399,0.299165,0.405413,0.329210,0.247434,0.215524,0.238310,0.207673,1
violin_short_G3,-7.049761e-07,4.481533e-06,0.004598,0.002277,0.000668,0.000447,0.000457,0.000651,0.001624,0.119252,...,0.219663,0.209422,0.203364,0.416846,0.465160,0.328932,0.084750,0.128130,0.145641,1
violin_short_G4,3.747064e-06,1.747863e-06,0.000805,0.000335,0.000368,0.000228,0.000419,0.000634,0.001343,0.001393,...,0.252244,0.237999,0.306162,0.407312,0.326623,0.235803,0.222374,0.276341,0.215604,1


### Build Dataset No Spec Contrast OR NSynth
Builds the dataset extracting all features to be used for prediction.
Extracts the features into csv files for train, test and valid NSynth data folders.

Note: Does NOT use Spec Contrast 

In [13]:
"""
Name: build_dataset_noSpec
Input: 
    data_group: Train, valid, test set name
    source: Acoustic, electronic, synthetic
    class1: First instrument to classify
    class2: second instrument to classify
Returns: 
Array of:
    y_harmonic
    y_percussive 
    mel_spec
    mfcc_0-12
    chroma_0-12
    
NOTE: 
In this version of get_dataset, NSynth Qualities are not used.
This is because the RACK dataset does not include the annotations,
and therefore cannot be used for model building or predicting.
"""
def build_dataset_noSpec(data_group, source, class1, class2):
    
    new_dir='Dataset/'+data_group+'/audio/'     #set the audio directory (test, train, etc)
    #read all instruments from examples.json
    dataframe_raw = pd.read_json(path_or_buf='Dataset/'+data_group+'/examples.json', orient='index') 
    #narrow down by family (strings, etc)
    dataframe_specific = dataframe_raw.loc[(dataframe_raw['instrument_family_str'] == class1) 
                                           | (dataframe_raw['instrument_family_str'] == class2)]
    #narrow down by source (acoustic, etc)
    dataframe_specific = dataframe_specific.loc[dataframe_specific['instrument_source_str'] == source]     

    Y_target_class = dataframe_specific.instrument_family_str.replace(to_replace=[class1, class2], value=[0, 1])
    filenames = dataframe_specific.index.tolist()     #get filenames from our dataframe, put into list
    
    dictionary = {}
    #Create the dictionary of files.
    #Note: TQDM is a loading bar
    for file in tqdm_notebook(filenames):           
        features = feature_extract((new_dir+file+'.wav'))
        dictionary[file] = features
    
    #Still retrieves spec contrast but doesn't use it
    feature_dataframe = pd.DataFrame.from_dict(dictionary, orient='index',
                                       columns=['y_harmonic', 'y_percussive', 'chroma_cens', 
                                                'mfcc', 'mel_spec', 'spec_contrast'])
    
    #Take averages of each coefficient etc and create their own feature
    mel_spec_data = pd.DataFrame(feature_dataframe.mel_spec.values.tolist(), 
                                 index=feature_dataframe.index)
    mel_spec_data = mel_spec_data.add_prefix('Mel_Spec_')
    
    mfcc_data = pd.DataFrame(feature_dataframe.mfcc.values.tolist(), 
                             index=feature_dataframe.index)
    mfcc_data = mfcc_data.add_prefix('MFCC_')
    
    chroma_data = pd.DataFrame(feature_dataframe.chroma_cens.values.tolist(), 
                               index=feature_dataframe.index)
    chroma_data = chroma_data.add_prefix('Chroma_')

    #Drop the old feature columns
    feature_dataframe = feature_dataframe.drop(
        labels=['mel_spec', 'mfcc',
                'chroma_cens', 'spec_contrast'],
                                       axis=1)
    #Add the extracted features
    feature_dataframe = pd.concat([feature_dataframe, mel_spec_data, mfcc_data, 
                           chroma_data],
                         axis = 1, join='inner')
    
    feature_dataframe['target'] = Y_target_class
    feature_dataframe.to_csv('./No_Spec_'+data_group+'.csv')
    
    #returns dataframe of features
    return feature_dataframe 

In [14]:
# get string members from dataset
build_dataset_noSpec('nsynth-train', 'acoustic', 'keyboard','string')
build_dataset_noSpec('nsynth-valid', 'acoustic', 'keyboard','string')
build_dataset_noSpec('nsynth-test', 'acoustic', 'keyboard','string')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=27458.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1135.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=425.0), HTML(value='')))




Unnamed: 0,y_harmonic,y_percussive,Mel_Spec_0,Mel_Spec_1,Mel_Spec_2,Mel_Spec_3,Mel_Spec_4,Mel_Spec_5,Mel_Spec_6,Mel_Spec_7,...,Chroma_3,Chroma_4,Chroma_5,Chroma_6,Chroma_7,Chroma_8,Chroma_9,Chroma_10,Chroma_11,target
string_acoustic_056-047-075,2.276689e-06,-2.910399e-06,0.001254,0.000437,0.007593,0.132260,3.299507,1.221329,0.887105,0.396326,...,0.041453,0.073973,0.101020,0.121887,0.165693,0.052606,0.126301,0.395618,0.512497,1
string_acoustic_014-063-050,-6.387113e-06,3.891315e-06,0.004674,0.012480,0.172364,0.511060,1.016936,0.569358,0.130581,0.286367,...,0.247231,0.311355,0.258811,0.260765,0.232921,0.191741,0.020246,0.016653,0.052146,1
keyboard_acoustic_004-058-127,-3.420692e-06,-5.919131e-06,0.146179,0.369565,0.398131,0.961064,0.372122,0.310898,0.396792,1.115348,...,0.097787,0.106359,0.096473,0.059277,0.032449,0.032214,0.458635,0.633814,0.473030,0
keyboard_acoustic_004-102-100,-8.896791e-07,7.785799e-07,0.198365,0.214118,0.199922,0.367875,0.134841,0.071007,0.038443,0.126842,...,0.227324,0.222901,0.408542,0.391293,0.223255,0.100572,0.052492,0.079999,0.222173,0
string_acoustic_071-035-100,6.045782e-07,1.035758e-05,0.158858,1063.997437,1872.624268,3.170357,205.077103,79.005653,1.098996,7.933605,...,0.000000,0.000000,0.000000,0.001142,0.001142,0.001142,0.007257,0.464621,0.681546,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
string_acoustic_056-071-100,2.481437e-07,8.309302e-07,0.000416,0.000115,0.000033,0.000027,0.000031,0.000283,0.002327,0.005635,...,0.159077,0.075835,0.065218,0.154160,0.092104,0.146989,0.033909,0.215096,0.259627,1
string_acoustic_057-079-025,1.537761e-06,-5.174143e-06,0.005490,0.011117,0.006833,0.004046,0.002382,0.001289,0.001755,0.001601,...,0.072011,0.047355,0.056312,0.463472,0.652449,0.502598,0.071499,0.012553,0.049475,1
keyboard_acoustic_004-105-025,1.393454e-06,-7.359323e-05,4.365399,6.370022,6.462551,14.129640,6.955965,3.840810,2.327511,4.322537,...,0.398648,0.496591,0.417117,0.299458,0.181825,0.123395,0.033306,0.058485,0.110097,0
string_acoustic_071-027-127,6.190578e-07,7.037017e-06,0.365868,8.830516,97.897964,41.824306,68.299484,6.137504,3.585146,2.461521,...,0.585254,0.376469,0.002584,0.000000,0.019717,0.014820,0.238283,0.416757,0.293561,1
