# UMAP Visualization of Bird Songs and Calls
---

In [1]:
## Installs for Coursera (on terminal only)
# conda install -c conda-forge librosa umap-learn
# conda install -c plotly plotly=4.14.3

In [2]:
## Installs for CoLab (run from notebook)
# !pip install umap-learn
# !pip install plotly==4.14.3

In [3]:
# # Mount Google Drive if running on CoLab
# # 4/1AY0e-g6CTocvBK45PWlJu2ycBuOMTgu36b-VZgpnCXitW_Vy3ckpGGBzur8
# from google.colab import drive
# drive.mount('content')
# # /content/content/MyDrive/bird-songs/audio

## 1. Import Libraries

In [4]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
import os
from collections import OrderedDict
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")

# Libraries for audio processing
import librosa
import librosa.display

# Libraries for UMAP and t-SNE
from umap import UMAP
from sklearn.manifold import TSNE

# Libraries for visualization
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'iframe'
from matplotlib import pyplot as plt
%matplotlib inline
# Set general font size for matplotlib
plt.rcParams['font.size'] = '14'


# Check whether google drive mounted
path = '/content/content/MyDrive/bird-songs/'
if os.path.isdir(path) == True:
    print('Google Drive Mounted')
    run_on_colab = True
else:
    print('Using local drive')
    run_on_colab = False

Using local drive


## 2. Define Global Variables for Analysis

In [5]:
SAMPLE_RATE = 22050 # Hz
SUBCLIP_SEC = 1.5 # sec
NUMBER_MEL = 20
FMIN = 4000 # Minimum Hz
N_FFT = 512 # Choose 2^n where n is integer
HOP_LENGTH = 256 # Choose equal, half, or quarter of N_FFT

## 3. Set up Dataframe to Import and Label Audio Files

In [6]:
def audio_path_filename(fileid):
    '''Helper function to set path and filename for each audio file, based on file id from the xeno canto
    database and whether this notebook is run locally or on CoLab'''
    
    if run_on_colab == True:
        path_filename = '/content/content/MyDrive/bird-songs/audio_8sec/' + str(fileid) + '.wav'
                                
    else:
        path_filename = 'audio/audio_8sec/' + str(fileid) + '.wav'
    
    return path_filename


def load_xeno_canto_data():
    '''Create dataframe that holds complete set of audio files that will be used for subsequent UMAP analysis.
    Each row in the returned dataframe will contain the file id as its index, the english name of the bird species
    (en), the type of recording (type), label number corresponding to species (label), and the complete path to the
    file (filename)
    '''

    if run_on_colab == True:
        df = pd.read_csv('/content/content/MyDrive/bird-songs/features_filtered.csv', encoding='latin')
    else:
        df = pd.read_csv('features_filtered.csv', encoding='latin')
        
    df.rename(columns={"id": "fileid"}, inplace=True)

    # These are the bird species we selected for our analysis
    selected_species = ['Red-winged Blackbird', 'Common Yellowthroat', 'Northern Cardinal', 
                        'Carolina Wren', 'Red Crossbill', 'Spotted Towhee']

    # We only need the fileid, labels, and type
    df = df[df.en.isin(selected_species)][['fileid', 'en', 'type']]
    df['type'] = df['type'].str.lower()
    
    # For convenience, create a label column holding integers corresponding to each bird species
    labels = {"Red-winged Blackbird": 0, 'Common Yellowthroat': 1, 'Northern Cardinal': 2,
              'Carolina Wren': 3,'Red Crossbill': 4, 'Spotted Towhee': 5}
    df['label'] = df.apply(lambda row: labels[row.en], axis=1)
    # Add filename and path
    df['filename'] = df['fileid'].apply(lambda x: audio_path_filename(x))
    df.set_index('fileid', inplace=True)
    
    print(f'The dataframe has a length of {len(df)} rows')
    
    return df


df_xeno_canto_import = load_xeno_canto_data()
display(df_xeno_canto_import.head(3))

The dataframe has a length of 2371 rows


Unnamed: 0_level_0,en,type,label,filename
fileid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
564895,Carolina Wren,"adult, sex uncertain, song",3,audio/audio_8sec/564895.wav
545775,Carolina Wren,call,3,audio/audio_8sec/545775.wav
540857,Carolina Wren,song,3,audio/audio_8sec/540857.wav


## 4. Find Best Sub-clip of each Audio File

In [7]:
def find_best_subclip(df, subclip_sec=1, sample_rate=22050):
    '''
    This function will find the best portion of an audio waveform for the requested subclip length. The best
    portion is defined as the loudest slice of the amplitdue vs time audio waveform (a one dimensional series of amplitude values at
    the requested sampling rate).
    The loudest portion is found by taking the area of the positve half of the audio waveform. The function returns
    the start and stop index values of the slice of the audio array for convenient further processing.
    
    :param df: dataframe containg list of audio filenames, created by function load_xeno_canto_data()
    :param subclip_sec: duration of desired best sub_clip, in seconds.
    :return: dataframe with start and end index values the best sub clip of each audio file.
    
    '''
    
    if subclip_sec < 0 or subclip_sec > 30:
        print("Error: You must specify a subclip between 0 and 30 seconds")
        return pd.DataFrame()
    
    for fileid in tqdm(list(df.index)[:]):
        
        y, sr = librosa.load(df.loc[fileid, 'filename'], sr=sample_rate, mono=True)

        ### Calc length of audio clip and sub-clip in samples (i.e. not seconds)
        audio_length = len(y)
        subclip_length = int(subclip_sec * sample_rate)
        
        ### Plot original audio file before any clipping or repeating/extending
        # fig, ax = plt.subplots(figsize=(8, 3.5))
        # librosa.display.waveplot(y, sr, x_axis='s')
        # plt.title(f'Original file id {fileid}')
        # plt.ylabel('Amplitude')
        
        ### Here check if y is shorter than the subclip_sec and if so, wrap y until length = subclip_sec
        if audio_length < subclip_length:
            number_repeats = subclip_length // audio_length
            remaining_samples = subclip_length % audio_length

            # Create new audio clip y by repeat
            y_new_repeat = np.tile(y,number_repeats)
            if remaining_samples > 0:
                y_new_remaining = y[:remaining_samples]
                y_new = np.append(y_new_repeat, y_new_remaining, axis=0)
            else:
                y_new = y_new_repeat
           
            #  Plot repeat audio file
            # fig, ax = plt.subplots(figsize=(8, 3.5))
            # librosa.display.waveplot(y_new, sr, x_axis='s')
            # plt.ylabel('Amplitude')
            # plt.title(f'File id {fileid} repeated to equal sub-clip length of {subclip_sec} sec')
            # print(f"The length of the new sample is {len(y_new)}")
            
            # Update start and stop index of repeated audio file
            df.loc[fileid,'start'] = 0
            df.loc[fileid,'end'] = len(y_new) + 1
            
            continue
        
        ### Calculate area of window for each hop along the audio waveform
        # print(f'length of audio clip is {int(audio_length/sample_rate)} sec, or {audio_length} samples')
        # print(f'The subclip length is {subclip_sec} sec, or {subclip_length} samples')
        hop_stride = int(min(subclip_length*sample_rate/5, audio_length/20))
        # print(f'Each window hop is {hop_stride/sample_rate} sec, or {hop_stride} samples')
        total_hops = int(audio_length / hop_stride)
        # print(f'There are {total_hops} total hops')
        
        # Store data on each hop
        hop_data = OrderedDict()
        hop_window_start = 0
        hop_window_end = subclip_length + 1
        hop=0
        # Clip negative amplitude values for area calculation for each hop
        y_positive = y.clip(min=0)
        
        # Keep hopping until just before a hop would overlap past the end of the audio file
        while hop_window_end <= audio_length:
            y_window = y_positive[hop_window_start:hop_window_end]
            hop_window_area = np.trapz(y_window, dx = 1/sample_rate, axis=0)
            hop_data[hop] = [hop_window_start, hop_window_end, hop_window_area]
            # print(f"hop {hop} with start:end of {hop_data[hop][0]}:{hop_data[hop][1]} samples",
            #       f"{hop_data[hop][0]/sample_rate:.2f}:{(hop_data[hop][1]-1)/sample_rate:.2f} sec",
            #       f"and area of {hop_data[hop][2]:.4f}")
            hop_window_start += hop_stride
            hop_window_end += hop_stride
            hop += 1
            
        # Add one window hop to cover remaining area at end of file if skipped above
        if hop_window_end > audio_length:
            hop_window_start = audio_length - subclip_length
            y_window = y_positive[hop_window_start:]
            hop_window_area = np.trapz(y_window, dx = 1/sample_rate, axis=0)
            hop_data[hop] = [hop_window_start, hop_window_end, hop_window_area]
            # print("Additional window to cover last portion of audio clip")
            # print(f"hop {hop} with start:end of {hop_data[hop][0]}:{hop_data[hop][1]} samples",
            #       f"{hop_data[hop][0]/sample_rate:.3f}:{(hop_data[hop][1]-1)/sample_rate:.3f} sec",
            #       f"and area of {hop_data[hop][2]:.4f}")
        
        ### Find hop with maximum area under the waveform. 
        maxhop = max(hop_data, key= lambda x: hop_data[x][-1])
        # print(f"\nHop with max area is hop {maxhop} with area of {hop_data[maxhop][2]:.4f}\n\n")
        max_y_window = y_positive[hop_data[maxhop][0]:hop_data[maxhop][1]]
        max_start_window_seconds = hop_data[maxhop][0] / sample_rate

        # Plot sub-clip of max area
        # fig, ax = plt.subplots(figsize=(8, 3.5))
        # original_x_ticks =  list(np.arange(0, len(max_y_window) / sample_rate))
        # librosa.display.waveplot(max_y_window, sr, offset = max_start_window_seconds, x_axis='s')
        # plt.ylabel('Amplitude')
        # plt.title(f'Best sub-clip of file id {fileid}')
        # plt.show()
        # Store start and stop of sub-clip with max area
        df.loc[fileid,'start'] = hop_data[maxhop][0]
        df.loc[fileid,'end'] = hop_data[maxhop][1] - 1
    
    # Uncomment only if running through all files otherwise will throw an error
    df = df.astype({'start': 'int32', 'end': 'int32'})
  
    return df

df_xeno_canto = find_best_subclip(df=df_xeno_canto_import, subclip_sec=SUBCLIP_SEC, sample_rate=SAMPLE_RATE)
df_xeno_canto.head(3)

  0%|          | 0/2371 [00:00<?, ?it/s]

Unnamed: 0_level_0,en,type,label,filename,start,end
fileid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
564895,Carolina Wren,"adult, sex uncertain, song",3,audio/audio_8sec/564895.wav,61740,94815
545775,Carolina Wren,call,3,audio/audio_8sec/545775.wav,61740,94815
540857,Carolina Wren,song,3,audio/audio_8sec/540857.wav,44100,77175


## 5. Create Mel Spectral Features

In [8]:
def create_mel_features_umap(df, n_mels=128, sample_rate=22050, best_subclip=False, save=False,
                             n_fft=1012, hop_length=512, fmin=500):
    '''
    Generate mel-spectrograms images without borders
    :param df: dataframe containing list of audio filenames, start and stop index for best clip,
               created by function find_best_subclip()
    :param sample_rate: sample rate to be used by librosa when loading audio file and creating mel spectrogram.
                        Default is 22,050 samples per second.
    :param best_clip: whether to use the best subclip, if true, or the entire file, if false. Default is False.
    :param n_fft: length of fast fourier transform window, used by librosa mel spectrogram. Default is 1,012.
    :param hop_length: number of samples between successive frames, used by librosa mel spectrogram. Default is 512.
    :param fmin: lowest frequency, in Hz, used by mel spectrogram. Default is 500 Hz.
    :return: dataframe containing n_mels mean mel intensities and n_mels standard deviation of mel itensities,
             for every mel frequency. 
             
    '''

    # Create df to hold mel spectral features
    df_mel = pd.DataFrame(columns = ['mean_max' + str(item+1) for item in list(range(n_mels))] +
                                    ['mel_std' + str(item+1) for item in list(range(n_mels))], 
                                    index=df.index)
        

    for fileid in tqdm(list(df.index)[:]):
        
        # Load audio file into librosa
        y, sr = librosa.load(df.loc[fileid, 'filename'], sr=sample_rate, mono=True)
        
        # Normalize audio file
        rms_level_db = 0
        sig = y
        r = 10**(rms_level_db / 20.0)
        a = np.sqrt( (len(sig) * r**2) / np.sum(sig**2) )
        # Normalized amplitude signal - to normalize the amplitude of a signal is based on the RMS amplitude, 
        # we multiply a scaling factor, a, by the sample values in our signal to change the amplitude such that 
        # the result has the desired RMS level, rms_level_db.
        # See https://www.hackaudio.com/digital-signal-processing/amplitude/rms-normalization/ for reference.
        # and https://www.youtube.com/watch?v=zyqb06g51jw
        y_norm = y * a
        
        # Clip file to best subclip, if requested
        if best_subclip==True:
            start = df.loc[fileid, 'start']
            end = df.loc[fileid, 'end']
            y_norm = y_norm[start:end]
        
        # Create mel spectrogram
        mels = librosa.feature.melspectrogram(y=y_norm, sr=sample_rate, n_mels=n_mels, 
                                              n_fft=n_fft, hop_length=hop_length, fmin=fmin)
        # Calculate mean and std dev of intensities of audio clip time t at each mel frequency
        mels_means = [np.mean(item) for item in mels]
        mels_stds = [np.std(item) for item in mels]
        # print("mel means:", mels_maxs, "\nmel std deviations:", mels_stds)
        df_mel.loc[fileid] = mels_means + mels_stds
  
    return df_mel


df_mel = create_mel_features_umap(df_xeno_canto, n_mels=NUMBER_MEL, sample_rate=SAMPLE_RATE, best_subclip=True,
                                  n_fft=N_FFT, hop_length=HOP_LENGTH, fmin=FMIN)
df_mel.head()

  0%|          | 0/2371 [00:00<?, ?it/s]

Unnamed: 0_level_0,mean_max1,mean_max2,mean_max3,mean_max4,mean_max5,mean_max6,mean_max7,mean_max8,mean_max9,mean_max10,...,mel_std11,mel_std12,mel_std13,mel_std14,mel_std15,mel_std16,mel_std17,mel_std18,mel_std19,mel_std20
fileid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
564895,0.004165,0.002527,0.001667,0.001217,0.000931,0.000725,0.000599,0.000529,0.00061,0.000306,...,0.003736,0.011941,0.028529,0.064208,0.066907,0.007948,0.026753,0.002157,0.000538,0.000566
545775,106.219368,70.989456,29.043142,13.224719,14.792793,10.568196,3.021468,1.804507,2.799922,1.374472,...,0.993762,0.567932,0.265743,0.174674,0.110344,0.062624,0.047842,0.040701,0.036344,0.034112
540857,0.042393,0.026865,0.059868,0.111168,0.078637,0.107656,0.19939,0.251003,0.107399,0.00125,...,0.011062,0.00954,0.008289,0.007484,0.006382,0.005897,0.005353,0.00505,0.004853,0.004615
540855,6.020388,7.667218,4.224151,1.592106,0.34318,0.076866,0.018153,0.009517,0.011956,0.005654,...,0.021398,0.021872,0.005196,0.002488,0.002392,0.001813,0.010027,0.012744,0.006896,7e-05
487506,1.612022,1.364514,3.793413,4.105425,3.493034,2.058453,1.414856,1.851382,1.369787,1.176851,...,1.650832,0.946211,3.442681,1.028871,0.542244,0.25409,0.174671,0.022859,0.004142,9.9e-05


## 6. UMAP Visualizations with Mel Spectral Features

In [9]:
def create_df_umap(df1, df2):
    '''Helper function to merge the df_xeno_canto dataframe, which contains the bird species and type information,
    with a dataframe containing spectral features'''
    df_umap_step1 = df1.merge(df2, how ='left', left_index=True, right_index=True)
  
    return df_umap_step1


df_umap_mel = create_df_umap(df_xeno_canto, df_mel)
# display(df_umap_mel.head())

In [10]:
def create_umap_chart(df, feature_start, chart_title='', color_feature='en'):
    '''Create UMAP visualization
    :param df: dataframe containing spectral features of bird audio recordings and info on bird species and types
    :param feature_start: column number that dentos teh beginning of the spectral features in the dataframe
    :param chart_title: chart title
    :param color_feature: feature to color code UMAP chart data points. Select en, type, or label.
    :return: UMAP 2D chart
    
    '''
    
    umap_2d = UMAP(n_components=2, init='random', random_state=42)
    umap_2d.fit(df.iloc[:, feature_start:].values)
    
    projections = umap_2d.transform(df.iloc[:, feature_start:].values)
    
    fig = px.scatter(
        projections, x=0, y=1,
        color=list(df[color_feature].astype(str)), labels={'color': color_feature},
        title = chart_title
    )
    
    return fig

feature_col_start = len(df_xeno_canto.columns)
chart_title = 'UMAP: Mel Spectral Features for Calls and Songs, colored by Species'
create_umap_chart(df_umap_mel, feature_start=feature_col_start, chart_title=chart_title, color_feature='en')

In [11]:
df_umap_mel_calls = df_umap_mel[df_umap_mel['type'].str.contains("call")]
df_umap_mel_calls = df_umap_mel[~df_umap_mel['type'].str.contains("song")]


feature_col_start = len(df_xeno_canto.columns)
chart_title = 'UMAP: Mel Spectral Features for Calls only, colored by Species'
create_umap_chart(df_umap_mel_calls, feature_start=feature_col_start, chart_title=chart_title, color_feature='en')

In [12]:
df_umap_mel_songs = df_umap_mel[df_umap_mel['type'].str.contains("song")]
df_umap_mel_songs = df_umap_mel[~df_umap_mel['type'].str.contains("call")]


feature_col_start = len(df_xeno_canto.columns)
chart_title = 'UMAP: Mel Spectral Features for Songs only, colored by Species'
create_umap_chart(df_umap_mel_songs, feature_start=feature_col_start, chart_title=chart_title, color_feature='en')

## 7. UMAP Visualizations with Spherical K-Means Mel Spectral Features

In [13]:
def load_spherical_kmeans_features(skmean_numpy_file):
    '''Load numpy array of spherical k-means features generate by notebook Spherical_K_Means.ipynb
    '''
    
    if run_on_colab == True:
        skmeans_features = np.load('/content/content/MyDrive/bird-songs/audio_8sec/' + skmean_numpy_file)
    else:
        skmeans_features = np.load(skmean_numpy_file)
    
    # Create df to hold sperical k means features
    df = pd.DataFrame(data = skmeans_features,
                      columns = ['skmeanfeat' + str(item+1) for item in list(range(skmeans_features.shape[1]))], 
                      index=df_xeno_canto.index)

    return df


df_skmean = load_spherical_kmeans_features('s_k_means_mel_features.npy')

# Merge dataframe of spherical k-means features with dataframe of xeno_canto information
df_umap_skmeans = create_df_umap(df_xeno_canto, df_skmean)
display(df_umap_skmeans.head(3))

Unnamed: 0_level_0,en,type,label,filename,start,end,skmeanfeat1,skmeanfeat2,skmeanfeat3,skmeanfeat4,...,skmeanfeat391,skmeanfeat392,skmeanfeat393,skmeanfeat394,skmeanfeat395,skmeanfeat396,skmeanfeat397,skmeanfeat398,skmeanfeat399,skmeanfeat400
fileid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
564895,Carolina Wren,"adult, sex uncertain, song",3,audio/audio_8sec/564895.wav,61740,94815,0.0,0.005161,0.0,0.0,...,0.001472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.970952,0.0
545775,Carolina Wren,call,3,audio/audio_8sec/545775.wav,61740,94815,0.077037,0.0,0.59488,0.0,...,0.092877,0.717385,0.0,0.0,0.0,0.0,0.099724,0.0,0.0,0.0
540857,Carolina Wren,song,3,audio/audio_8sec/540857.wav,44100,77175,0.0,0.004728,0.0,0.0,...,0.000995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.968316,0.0


In [14]:
feature_col_start = len(df_xeno_canto.columns)
chart_title = 'UMAP: Spherical K-Means Features for Songs and Calls, colored by Species'
create_umap_chart(df_umap_skmeans, feature_start=feature_col_start, chart_title=chart_title, color_feature='en')

In [15]:
df_umap_skmeans_calls = df_umap_skmeans[df_umap_skmeans['type'].str.contains("call")]
df_umap_skmeans_calls = df_umap_skmeans[~df_umap_skmeans['type'].str.contains("song")]


feature_col_start = len(df_xeno_canto.columns)
chart_title = 'UMAP: Spherical K-Means Features for Calls only, colored by Species'
create_umap_chart(df_umap_skmeans_calls, feature_start=feature_col_start, chart_title=chart_title, color_feature='en')

In [16]:
df_umap_skmeans_songs = df_umap_skmeans[df_umap_skmeans['type'].str.contains("song")]
df_umap_skmeans_songs = df_umap_skmeans[~df_umap_skmeans['type'].str.contains("call")]

feature_col_start = len(df_xeno_canto.columns)
chart_title = 'UMAP: Spherical K-Means Features for Songs only, colored by Species'
create_umap_chart(df_umap_skmeans_songs, feature_start=feature_col_start, chart_title=chart_title, color_feature='en')