# UMAP for bird-songs  
---

In [None]:
## Package installs needed when running on Coursera
# conda install -c conda-forge librosa
# conda install -c conda-forge umap-learn
# conda install -c plotly plotly=4.14.3

In [2]:
from umap import UMAP
from sklearn.datasets import load_digits
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'iframe'

import os
import numpy as np
from matplotlib import pyplot as plt
import IPython.display as ipd
import librosa
import librosa.display
import pandas as pd
import glob
# import ffmpeg
%matplotlib inline

# Set general font size
plt.rcParams['font.size'] = '14'

import warnings
warnings.filterwarnings("ignore")

from tqdm.notebook import tqdm


In [3]:
def get_audio_filenames(audio_folder):
    '''Return a list of every path matching the glob pattern "<audio_folder>*".

    The pattern is built by plain string concatenation, so `audio_folder`
    has to end with a path separator (e.g. 'audio/') for the match to be
    restricted to entries inside that folder. Matches are not filtered by
    file extension and are returned in whatever order `glob` yields them.
    '''

    pattern = f"{audio_folder}*"
    return list(glob.glob(pattern))

# Collect the paths of everything stored under audio/
audio_filenames = get_audio_filenames(audio_folder='audio/')
    

In [4]:
def _length_to_seconds(length):
    '''Convert a colon-separated duration ('m:ss', 'mm:ss', 'h:mm:ss') to whole seconds'''
    seconds = 0
    # Each field is worth 60x the one to its right, so fold left-to-right.
    for part in str(length).strip().split(':'):
        seconds = seconds * 60 + int(part)
    return seconds


def create_df_xeno_canto(csv_file, audio_folder, min_dur, max_dur):
    '''Create pandas dataframe with features provided by xeno-canto and append file name of audio file

    Parameters
    ----------
    csv_file : path of a header-less CSV with exactly the ten columns
        id, genus, species, subspecies, name, location, type, quality,
        length, bird-seen (in that order).
    audio_folder : folder holding the recordings; each row's file name is
        built as "<audio_folder>/<id>.mp3".
    min_dur, max_dur : inclusive bounds, in seconds, on the recording
        duration. Rows outside the range are dropped.

    Returns
    -------
    DataFrame indexed by 'id', with the original columns plus 'filename'
    and 'seconds' (the 'length' column converted to whole seconds).
    '''

    df = pd.read_csv(csv_file, header=None)

    df.columns = ['id', 'genus', 'species', 'subspecies', 'name', 'location',
                  'type', 'quality', 'length', 'bird-seen']

    # Build the path from the audio_folder argument rather than a hard-coded
    # 'audio/' prefix, so the parameter is actually honoured.
    df['filename'] = df['id'].map(
        lambda rec_id: os.path.join(audio_folder, f'{rec_id}.mp3'),
        na_action='ignore',
    )

    # Parse every colon-separated field; reading only the first character as
    # minutes would turn '12:34' into 94 seconds instead of 754.
    df['seconds'] = df['length'].map(_length_to_seconds)

    in_range = (df['seconds'] >= min_dur) & (df['seconds'] <= max_dur)

    # Non-inplace set_index returns a fresh frame instead of mutating a slice.
    return df[in_range].set_index('id')

# Keep only recordings lasting between 2 and 10 seconds (inclusive)
df_xeno_canto = create_df_xeno_canto(
    csv_file='features.csv',
    audio_folder='audio/',
    min_dur=2,
    max_dur=10,
)

# df_xeno_canto.sort_values(by='seconds', ascending=True).head()
df_xeno_canto.head()

Unnamed: 0_level_0,genus,species,subspecies,name,location,type,quality,length,bird-seen,filename,seconds
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
11500,Branta,canadensis,maxima or interior,Canada Goose,United States,call,C,0:07,unknown,audio/11500.mp3,7
298756,Cygnus,buccinator,,Trumpeter Swan,United States,call,B,0:10,yes,audio/298756.mp3,10
298754,Cygnus,buccinator,,Trumpeter Swan,United States,call,B,0:02,yes,audio/298754.mp3,2
11848,Cygnus,columbianus,,Tundra Swan,United States,Call,B,0:04,unknown,audio/11848.mp3,4
298758,Anas,platyrhynchos,,Mallard,United States,call,B,0:09,yes,audio/298758.mp3,9


In [30]:
def create_mfcc_spectral_features(number_mfcc, sample_rate=22050, max_seconds=5, filenames=None):
    '''Summarise each recording by the mean and standard deviation of its MFCCs

    Parameters
    ----------
    number_mfcc : number of MFCC coefficients to compute per recording.
    sample_rate : sampling rate passed to `librosa.load` (audio is loaded
        as mono).
    max_seconds : only the first `max_seconds` seconds of each recording
        are analysed. With the defaults this is 5 * 22050 = 110250 samples.
    filenames : iterable of audio file paths. When omitted, the 'filename'
        column of the notebook-level `df_xeno_canto` frame is used.

    Returns
    -------
    DataFrame indexed by file name (index name 'filename') with columns
    mfcc_avg1..mfcc_avgN followed by mfcc_std1..mfcc_stdN, N = number_mfcc.
    '''

    if filenames is None:
        # Backward-compatible default: fall back to the notebook-level frame.
        filenames = df_xeno_canto['filename']
    filenames = list(filenames)

    columns = (['mfcc_avg' + str(item + 1) for item in range(number_mfcc)] +
               ['mfcc_std' + str(item + 1) for item in range(number_mfcc)])

    # Collect one row per file and build the frame once at the end; filling an
    # empty frame row-by-row with .loc leaves every column as dtype object.
    rows = []
    for audio_filename in tqdm(filenames):
        y, sr = librosa.load(audio_filename, sr=sample_rate, mono=True)

        # Derive the clip length from the actual sampling rate so that the
        # analysed window stays max_seconds long when sample_rate changes.
        max_samples = int(max_seconds * sr)
        mfccs = librosa.feature.mfcc(y=y[:max_samples], sr=sr, n_mfcc=number_mfcc, hop_length=2048)

        # One mean and one standard deviation per coefficient (row of mfccs).
        mfcc_means = [np.mean(item) for item in mfccs]
        mfcc_stds = [np.std(item) for item in mfccs]
        rows.append(mfcc_means + mfcc_stds)

    return pd.DataFrame(rows, columns=columns, index=pd.Index(filenames, name='filename'))


# Number of MFCC coefficients extracted per recording
number_mfcc = 12
df_mfcc = create_mfcc_spectral_features(number_mfcc=number_mfcc)


  0%|          | 0/143 [00:00<?, ?it/s]

In [31]:
def create_df_umap(df1, df2):
    '''Left-join df2 onto df1, matching df1's 'filename' column against df2's index

    Every row of df1 is kept; columns of df2 are NaN where no index entry
    of df2 equals the row's 'filename'.
    '''

    return pd.merge(df1, df2, how='left', left_on='filename', right_index=True)

# Attach the MFCC summary columns to the xeno-canto metadata
df_umap = create_df_umap(df1=df_xeno_canto, df2=df_mfcc)


In [32]:
def create_umap(df, num_features=20, color_feature='species'):
    '''Embed the trailing feature columns of df in 2-D with UMAP and plot them

    Parameters
    ----------
    df : DataFrame whose LAST `num_features` columns hold the numeric
        features to embed; it must also contain the `color_feature` column.
    num_features : number of trailing columns passed to UMAP.
    color_feature : column used to colour the points (cast to str) and to
        label the colour legend.

    Returns
    -------
    The plotly express scatter figure of the 2-D embedding.
    '''

    # Features are selected by position, so they must be the final columns.
    features = df.iloc[:, -num_features:].values

    # Fixed random_state keeps the embedding reproducible; fit_transform
    # avoids a redundant second pass over the training data.
    projections = UMAP(random_state=0).fit_transform(features)

    fig = px.scatter(
        projections, x=0, y=1,
        color=list(df[color_feature].astype(str)),
        # Label the legend with the column actually used for colouring
        # instead of a hard-coded 'genus'.
        labels={'color': color_feature},
        title = f"UMAP Plot of {len(df)} bird audio samples colored by {color_feature}"
    )

    return fig


# Each recording contributes number_mfcc means plus number_mfcc standard deviations
create_umap(df_umap, num_features=2 * number_mfcc, color_feature='genus')
