# UMAP for bird-songs  
---

In [1]:
## Installs for Coursera (on terminal only)
# conda install -c conda-forge librosa umap-learn
# conda install -c plotly plotly=4.14.3

In [2]:
## Installs for CoLab
# !pip install umap-learn
# !pip install plotly==4.14.3

In [3]:
# # For Colab only
# # (one-time Google authorization code removed -- paste it at the prompt, never store it in the notebook)
# from google.colab import drive
# drive.mount('content')
# # /content/content/MyDrive/bird-songs/audio

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
import os
from collections import OrderedDict
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")

# Libraries for librosa
import librosa
import librosa.display

# Libraries for plotly charts
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'iframe'

# Libraries for UMAP and t-SNE
from umap import UMAP
from sklearn.manifold import TSNE

# Libraries for matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
# Set general font size
plt.rcParams['font.size'] = '14'


# Detect the Colab Google Drive mount: the project folder only exists there
path = '/content/content/MyDrive/bird-songs/'
run_on_colab = os.path.isdir(path)
print('Google Drive Mounted' if run_on_colab else 'Using local drive')

Using local drive


In [86]:
# Define global variables for analysis
# NOTE(review): SUBCLIP_SEC is not referenced by the cells visible in this notebook;
# find_best_subclip is called with a literal subclip_sec=2 instead.

SAMPLE_RATE = 22050 # Hz; passed as sample_rate to the audio-loading functions below
SUBCLIP_SEC = 1.5 # sec
NUMBER_MEL = 20 # Passed as n_mels to create_mel_features_umap
FMIN = 4000 # Minimum Hz
N_FFT = 512 # Choose 2^n where n is integer
HOP_LENGTH = 256 # Choose equal, half, or quarter of N_FFT

In [87]:
def audio_path_filename(fileid):
    '''Return the path of the 8-second .wav clip belonging to ``fileid``.

    The Google Drive folder is used when ``run_on_colab`` is true, otherwise
    the local ``audio/audio_8sec/`` folder.
    '''
    if run_on_colab == True:
        audio_dir = '/content/content/MyDrive/bird-songs/audio_8sec/'
    else:
        audio_dir = 'audio/audio_8sec/'

    return audio_dir + str(fileid) + '.wav'


def load_xeno_canto_data():
    '''Load the xeno-canto metadata for the species selected for this analysis.

    Reads ``features_filtered.csv`` (from the mounted Google Drive folder when
    ``run_on_colab`` is true, otherwise from the working directory), keeps only
    the selected species, and adds an integer class label and the audio path.

    Returns:
        pandas.DataFrame indexed by ``fileid`` with the columns ``en`` (English
        species name), ``type`` (lower-cased), ``label`` (integer class id) and
        ``filename`` (path built by ``audio_path_filename``). The frame is
        empty, with the same columns, when no selected species is present.
    '''
    # Selected species -> integer class label (string labels are replaced by
    # integers for the downstream model). This dict is the single source of
    # truth for which species are kept.
    labels = {"Red-winged Blackbird": 0, 'Common Yellowthroat': 1, 'Northern Cardinal': 2,
              'Carolina Wren': 3, 'Red Crossbill': 4, 'Spotted Towhee': 5}

    if run_on_colab == True:
        csv_path = '/content/content/MyDrive/bird-songs/features_filtered.csv'
    else:
        csv_path = 'features_filtered.csv'

    df = pd.read_csv(csv_path, encoding='latin').rename(columns={"id": "fileid"})

    # We only need the fileid, labels, and type. The explicit copy makes the
    # column assignments below write to an independent frame, not a slice.
    df = df.loc[df['en'].isin(list(labels)), ['fileid', 'en', 'type']].copy()
    df['type'] = df['type'].str.lower()

    # Vectorised lookup: unlike a row-wise apply it also works on an empty frame
    df['label'] = df['en'].map(labels).astype('int64')
    # Add filename and path
    df['filename'] = df['fileid'].map(audio_path_filename)

    return df.set_index('fileid')


# Load the filtered metadata, preview the first rows and report the row count
df_xeno_canto_import = load_xeno_canto_data()
display(df_xeno_canto_import.head())
print(df_xeno_canto_import.shape[0])

Unnamed: 0_level_0,en,type,label,filename
fileid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
564895,Carolina Wren,"adult, sex uncertain, song",3,audio/audio_8sec/564895.wav
545775,Carolina Wren,call,3,audio/audio_8sec/545775.wav
540857,Carolina Wren,song,3,audio/audio_8sec/540857.wav
540855,Carolina Wren,song,3,audio/audio_8sec/540855.wav
487506,Carolina Wren,song,3,audio/audio_8sec/487506.wav


2371


In [88]:
# Scratch check of the numpy calls used in find_best_subclip to wrap short clips
a = np.array([1,2,3])
b = np.tile(a,2)  # repeats the whole array twice
c = np.array([7,9])
np.append(b,c, axis=0)  # returns a new array; the result is not stored here

# -0 is just 0, so this slice is the whole array rather than an empty tail
c[-0:]

array([7, 9])

In [89]:
def _best_window_start(y_positive, subclip_length, hop_stride, sample_rate):
    '''Return the start sample of the window with the largest area under ``y_positive``.

    Windows are ``subclip_length`` samples long and start every ``hop_stride``
    samples; one extra window flush with the end of the signal is added when
    the regular hops do not already land there. Ties go to the earliest window.
    '''
    # np.trapz was renamed np.trapezoid in NumPy 2.0; use whichever is present
    integrate = getattr(np, 'trapezoid', None) or np.trapz

    last_start = len(y_positive) - subclip_length
    window_starts = list(range(0, last_start + 1, hop_stride))
    if window_starts[-1] != last_start:
        window_starts.append(last_start)

    areas = [integrate(y_positive[start:start + subclip_length], dx=1 / sample_rate)
             for start in window_starts]
    return window_starts[int(np.argmax(areas))]


def find_best_subclip(df, subclip_sec=1, sample_rate=22050):
    '''Find, for every recording, the sub-clip with the most positive-amplitude area.

    Each file in ``df['filename']`` is loaded with librosa, RMS-normalised, and
    scanned with a sliding window of ``subclip_sec`` seconds. The window whose
    positive half of the waveform has the largest integral is selected.

    Args:
        df: DataFrame with a ``filename`` column holding audio paths. It is not
            modified; a new frame is returned.
        subclip_sec: Sub-clip length in seconds; must be > 0 and <= 50.
        sample_rate: Rate in Hz that every file is resampled to on load.

    Returns:
        A copy of ``df`` with int32 columns ``start`` and ``end``: the sample
        range ``[start, end)`` of the best sub-clip, where
        ``end - start == int(subclip_sec * sample_rate)``. For a recording
        shorter than the sub-clip, ``start`` is 0 and ``end`` is the sub-clip
        length (the length the clip would have once wrapped); slicing the
        original signal with that range simply returns the whole recording.
        An empty DataFrame is returned (after printing an error) when
        ``subclip_sec`` is out of range.
    '''
    subclip_length = int(subclip_sec * sample_rate)
    # A zero-length window would give a zero hop stride, so reject it up front
    if subclip_sec <= 0 or subclip_sec > 50 or subclip_length < 1:
        print("Error: You must specify a subclip between 0 and 50 seconds")
        return pd.DataFrame()

    # Linear rms level used for normalisation
    # https://www.youtube.com/watch?v=zyqb06g51jw
    rms_level_db = 0
    target_rms = 10 ** (rms_level_db / 20.0)

    starts, ends = [], []
    for filename in tqdm(list(df['filename'])):

        y, _ = librosa.load(filename, sr=sample_rate, mono=True)
        audio_length = len(y)

        # Recording shorter than the sub-clip: it would have to be wrapped to
        # reach subclip_length samples, so the "best" clip is the whole file.
        ### TO DO: Save the wrapped file??
        if audio_length < subclip_length:
            starts.append(0)
            ends.append(subclip_length)
            continue

        # Normalized amplitude signal; a silent file is left unscaled instead
        # of dividing by zero energy
        energy = np.sum(y ** 2)
        scale = np.sqrt((audio_length * target_rms ** 2) / energy) if energy > 0 else 1.0
        # Clip negative amplitude values for the area calculation
        y_norm_positive = (y * scale).clip(min=0)

        # Hop a fifth of the sub-clip or a twentieth of the file, whichever is
        # smaller (both in samples), and never less than one sample
        hop_stride = max(1, int(min(subclip_length / 5, audio_length / 20)))

        best_start = _best_window_start(y_norm_positive, subclip_length, hop_stride, sample_rate)
        starts.append(best_start)
        ends.append(best_start + subclip_length)

    # assign() returns a new frame, so the caller's DataFrame is left untouched
    return df.assign(start=np.asarray(starts, dtype='int32'),
                     end=np.asarray(ends, dtype='int32'))

# NOTE(review): the window length is the literal 2 s here; the SUBCLIP_SEC
# constant from the configuration cell is not used.
df_xeno_canto = find_best_subclip(
    df=df_xeno_canto_import,
    subclip_sec=2,
    sample_rate=SAMPLE_RATE,
)
df_xeno_canto.head()

  0%|          | 0/2371 [00:00<?, ?it/s]

Unnamed: 0_level_0,en,type,label,filename,start,end
fileid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
564895,Carolina Wren,"adult, sex uncertain, song",3,audio/audio_8sec/564895.wav,52920,97020
545775,Carolina Wren,call,3,audio/audio_8sec/545775.wav,52920,97020
540857,Carolina Wren,song,3,audio/audio_8sec/540857.wav,44100,88200
540855,Carolina Wren,song,3,audio/audio_8sec/540855.wav,70560,114660
487506,Carolina Wren,song,3,audio/audio_8sec/487506.wav,8820,52920


In [100]:
def create_mel_features_umap(df, n_mels=128, sample_rate=22050, best_subclip=False, save=False,
                             n_fft=1012, hop_length=512, fmin=500):
    '''Summarise each recording by the mean and std of its mel spectrogram bands.

    Every file in ``df['filename']`` is loaded with librosa, RMS-normalised,
    optionally cut to its best sub-clip, and converted to a mel spectrogram.
    Each mel band is then reduced over time to its mean and standard deviation.

    Args:
        df: DataFrame with a ``filename`` column and, when ``best_subclip`` is
            true, ``start``/``end`` sample columns.
        n_mels: Number of mel bands.
        sample_rate: Rate in Hz that every file is resampled to on load.
        best_subclip: When true, keep only samples ``start:end`` of each file.
        save: Currently unused; kept for backward compatibility.
        n_fft, hop_length, fmin: Passed to ``librosa.feature.melspectrogram``.
            NOTE(review): the default n_fft=1012 is not a power of two --
            possibly meant to be 1024; confirm before relying on the default.

    Returns:
        Float DataFrame indexed like ``df`` with ``2 * n_mels`` columns:
        ``mean_max1..N`` (per-band means) followed by ``mel_std1..N``
        (per-band standard deviations).
    '''
    columns = (['mean_max' + str(band + 1) for band in range(n_mels)] +
               ['mel_std' + str(band + 1) for band in range(n_mels)])

    # Pre-allocated float matrix, so the resulting frame is numeric rather
    # than object dtype
    features = np.full((len(df), 2 * n_mels), np.nan, dtype=np.float64)

    filenames = list(df['filename'])
    if best_subclip:
        bounds = list(zip(df['start'], df['end']))

    # Linear rms level used for normalisation
    # https://www.youtube.com/watch?v=zyqb06g51jw
    rms_level_db = 0
    target_rms = 10 ** (rms_level_db / 20.0)

    for row, filename in enumerate(tqdm(filenames)):
        # Load audio file into librosa
        y, _ = librosa.load(filename, sr=sample_rate, mono=True)

        # Normalized amplitude signal; a silent file is left unscaled instead
        # of dividing by zero energy (which would fill the row with NaN)
        energy = np.sum(y ** 2)
        scale = np.sqrt((len(y) * target_rms ** 2) / energy) if energy > 0 else 1.0
        y_norm = y * scale

        # Clip file to best subclip, if requested
        if best_subclip:
            start, end = bounds[row]
            y_norm = y_norm[int(start):int(end)]

        # Create mel spectrogram
        mels = librosa.feature.melspectrogram(y=y_norm, sr=sample_rate, n_mels=n_mels,
                                              n_fft=n_fft, hop_length=hop_length, fmin=fmin)

        # Mean and std dev over time of the intensity in each mel band
        features[row, :n_mels] = np.mean(mels, axis=1)
        features[row, n_mels:] = np.std(mels, axis=1)

    return pd.DataFrame(features, columns=columns, index=df.index)


# Mel statistics of every recording's best sub-clip, using the configuration constants
df_mel = create_mel_features_umap(
    df_xeno_canto,
    n_mels=NUMBER_MEL,
    sample_rate=SAMPLE_RATE,
    best_subclip=True,
    n_fft=N_FFT,
    hop_length=HOP_LENGTH,
    fmin=FMIN,
)
df_mel.head(5)

  0%|          | 0/2371 [00:00<?, ?it/s]

Unnamed: 0_level_0,mean_max1,mean_max2,mean_max3,mean_max4,mean_max5,mean_max6,mean_max7,mean_max8,mean_max9,mean_max10,...,mel_std11,mel_std12,mel_std13,mel_std14,mel_std15,mel_std16,mel_std17,mel_std18,mel_std19,mel_std20
fileid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
564895,6.4e-05,6.3e-05,5.3e-05,5e-05,5.3e-05,5.9e-05,6e-05,5.9e-05,0.000599,0.000471,...,0.003839,0.009613,0.022407,0.067722,0.049972,0.009803,0.039401,0.002714,8.9e-05,1.9e-05
545775,96.61084,66.104393,27.486958,12.580145,12.523952,8.638211,2.597655,2.283633,2.874307,1.082525,...,0.819352,0.481051,0.21368,0.147555,0.069599,0.01972,0.002185,0.001114,0.000243,0.0
540857,0.027131,0.016906,0.041497,0.081649,0.056583,0.079128,0.148193,0.187521,0.079785,9.7e-05,...,0.000339,0.001901,0.000309,0.000192,0.000181,0.000148,0.000121,0.00014,0.000123,9.8e-05
540855,6.255861,6.370545,3.421864,1.582147,0.295339,0.058947,0.012375,0.006532,0.007928,0.004679,...,0.020525,0.019157,0.004835,0.002542,0.004107,0.00195,0.008726,0.0115,0.005413,7e-06
487506,1.809918,1.737512,4.191457,4.896708,5.371802,3.192464,1.274053,1.510069,1.070158,0.905797,...,1.439245,0.829773,3.006304,0.913569,0.479613,0.227663,0.156404,0.020287,0.003636,4e-06


## UMAP: Mel Spectrogram Features

In [110]:
def create_df_umap(df1, df2):
    '''Left-join ``df2`` onto ``df1`` by index and return the combined frame.

    Every row of ``df1`` is kept; rows without a match in ``df2`` get NaN in
    the columns that come from ``df2``.
    '''
    combined = pd.merge(df1, df2, how='left', left_index=True, right_index=True)
    return combined


# Metadata columns first, mel features after them
df_umap_mel = create_df_umap(df1=df_xeno_canto, df2=df_mel)

# display(df_umap_mel.head())

In [111]:
def create_umap_chart(df, color_feature='label', n_meta_cols=6):
    '''Project the feature columns of ``df`` to 2-D with UMAP and plot them.

    Args:
        df: DataFrame whose first ``n_meta_cols`` columns are metadata and
            whose remaining columns are numeric features.
        color_feature: Name of the column used to colour the points; its
            values are converted to strings so they are treated as categories.
        n_meta_cols: Number of leading non-feature columns to skip. The
            default of 6 is the offset this function previously hard-coded.

    Returns:
        The plotly figure produced by ``px.scatter``.
    '''
    # Explicit float matrix: feature columns may arrive as object dtype
    features = df.iloc[:, n_meta_cols:].to_numpy(dtype=float)

    # Fixed random_state keeps the layout reproducible between runs
    umap_2d = UMAP(n_components=2, init='random', random_state=42)
    # One fit_transform call instead of fit() followed by transform() on the
    # same data, so the embedding is only computed once
    projections = umap_2d.fit_transform(features)

    fig = px.scatter(
        projections, x=0, y=1,
        color=list(df[color_feature].astype(str)), labels={'color': color_feature},
        title = f"UMAP Plot of bird audio samples colored by {color_feature}"
    )

    return fig


# All recordings, coloured by English species name
create_umap_chart(df=df_umap_mel, color_feature='en')

In [112]:
# Calls only: the type mentions "call" and does not mention "song".
# Previously the "call" filter was overwritten by the "not song" filter, so
# every non-song recording was kept whether or not it was a call.
mel_type_has_call = df_umap_mel['type'].str.contains("call", na=False)
mel_type_has_song = df_umap_mel['type'].str.contains("song", na=False)
df_umap_mel_calls = df_umap_mel[mel_type_has_call & ~mel_type_has_song]
# display(df_umap_mel_calls)

create_umap_chart(df_umap_mel_calls, color_feature='en')

In [113]:
# Songs only: the type mentions "song" and does not mention "call".
# Previously the "song" filter was overwritten by the "not call" filter, so
# every non-call recording was kept whether or not it was a song.
mel_type_is_song = df_umap_mel['type'].str.contains("song", na=False)
mel_type_is_call = df_umap_mel['type'].str.contains("call", na=False)
df_umap_mel_songs = df_umap_mel[mel_type_is_song & ~mel_type_is_call]
# display(df_umap_mel_songs)

create_umap_chart(df_umap_mel_songs, color_feature='en')

## UMAP: Spherical K-Means Features

In [117]:
def load_spherical_kmeans_features(file, index=None):
    '''Load a saved spherical k-means feature matrix into a DataFrame.

    Args:
        file: Name/path of the ``.npy`` file. On Colab it is looked up inside
            the Google Drive ``audio_8sec`` folder; otherwise it is opened as
            given.
        index: Row labels for the result, one per row of the matrix. Defaults
            to ``df_xeno_canto.index``, the notebook-level frame this function
            previously always used.

    Returns:
        DataFrame with one row per recording and columns ``skmeanfeat1..N``.

    Raises:
        ValueError: if the matrix row count differs from ``len(index)``.
    '''
    if run_on_colab == True:
        skmeans_features = np.load('/content/content/MyDrive/bird-songs/audio_8sec/' + file)
    else:
        skmeans_features = np.load(file)

    if index is None:
        # Backward-compatible default: align rows with the notebook's metadata frame
        index = df_xeno_canto.index

    n_rows, n_features = skmeans_features.shape[0], skmeans_features.shape[1]
    if len(index) != n_rows:
        raise ValueError(
            f"Feature matrix has {n_rows} rows but the index has {len(index)} entries"
        )

    # Create df to hold spherical k-means features
    columns = ['skmeanfeat' + str(position + 1) for position in range(n_features)]
    return pd.DataFrame(data=skmeans_features, columns=columns, index=index)


# Pre-computed spherical k-means features, one row per recording
df_skmean = load_spherical_kmeans_features(file='s_k_means_mel_features.npy')
df_skmean.head(5)

Unnamed: 0_level_0,skmeanfeat1,skmeanfeat2,skmeanfeat3,skmeanfeat4,skmeanfeat5,skmeanfeat6,skmeanfeat7,skmeanfeat8,skmeanfeat9,skmeanfeat10,...,skmeanfeat391,skmeanfeat392,skmeanfeat393,skmeanfeat394,skmeanfeat395,skmeanfeat396,skmeanfeat397,skmeanfeat398,skmeanfeat399,skmeanfeat400
fileid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
564895,0.0,0.005161,0.0,0.0,0.041198,0.05214,0.0,0.0,0.0,0.0,...,0.001472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.970952,0.0
545775,0.077037,0.0,0.59488,0.0,0.0,0.0,0.0,0.56496,0.0,0.189075,...,0.092877,0.717385,0.0,0.0,0.0,0.0,0.099724,0.0,0.0,0.0
540857,0.0,0.004728,0.0,0.0,0.040887,0.051217,0.0,0.0,0.0,0.0,...,0.000995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.968316,0.0
540855,0.0,0.001463,0.0,0.0,0.083723,0.049234,0.003468,0.0,0.037467,0.0,...,0.0,0.0,0.007735,0.0,0.0,0.0,0.047813,0.0,0.923891,0.060223
487506,0.0,0.001838,0.0,0.0,0.027714,0.033537,0.218872,0.0,0.0,0.0,...,0.0,0.0,0.106992,0.004156,0.0,0.037638,0.0,0.015705,0.834948,0.010631


In [118]:
# Metadata columns first, spherical k-means features after them
df_umap_skmeans = create_df_umap(df1=df_xeno_canto, df2=df_skmean)

# display(df_umap_skmeans.head())

In [119]:
# All recordings, coloured by English species name
create_umap_chart(df=df_umap_skmeans, color_feature='en')

In [120]:
# Calls only: the type mentions "call" and does not mention "song".
# Previously the "call" filter was overwritten by the "not song" filter, so
# every non-song recording was kept whether or not it was a call.
skmeans_type_has_call = df_umap_skmeans['type'].str.contains("call", na=False)
skmeans_type_has_song = df_umap_skmeans['type'].str.contains("song", na=False)
df_umap_skmeans_calls = df_umap_skmeans[skmeans_type_has_call & ~skmeans_type_has_song]
# display(df_umap_skmeans_calls)

create_umap_chart(df_umap_skmeans_calls, color_feature='en')

In [121]:
# Songs only: the type mentions "song" and does not mention "call".
# Previously the "song" filter was overwritten by the "not call" filter, so
# every non-call recording was kept whether or not it was a song.
skmeans_type_is_song = df_umap_skmeans['type'].str.contains("song", na=False)
skmeans_type_is_call = df_umap_skmeans['type'].str.contains("call", na=False)
df_umap_skmeans_songs = df_umap_skmeans[skmeans_type_is_song & ~skmeans_type_is_call]
# display(df_umap_skmeans_songs)

create_umap_chart(df_umap_skmeans_songs, color_feature='en')