<a href="https://colab.research.google.com/github/21holden21/Analyzing-Cricket-Songs-with-Machine-Learning/blob/main/Cricket_Song_Analysis_and_Classification_Dimensionality_Reduction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initial Setup/Loading the Audio Files into Waveforms

In [None]:
##mount to Google Drive to read in audio files from shared drive/create mel spectrograms/MFCCs/magnitude power spectrums and put inside shared drive
# Colab-only step: every path used later in this notebook lives under /content/drive,
# so this cell has to run before any cell that reads audio or saves figures.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import librosa #to extract audio features from raw audio .wav files
import librosa.display #to display extracted audio features
import numpy as np #for matrix manipulations
import matplotlib.pyplot as plt #to visualize audio features
import matplotlib as mpl #for color mapping
import os #to locate files
from sklearn.manifold import TSNE #to use t-sne DR technique
from sklearn.decomposition import PCA #PCA'd results are pushed into t-SNE algorithm

In [None]:
##Loading the cricket audio .wav files into waveform list
# Builds two parallel lists, in sorted file-name order:
#   waveform_list[i]  -> samples of the i-th audio file
#   cricket_names[i]  -> its label, formatted "Genus species MCL" or "Genus species SINA"
wav_dir = "/content/drive/Shareddrives/Analyzing Cricket Songs with Machine Learning/.wav Files/"

waveform_list = []
cricket_names = [] ##parallel to waveform_list
num_iter = 0

for wav_file in sorted(os.listdir(wav_dir)):
  num_iter += 1
  print("ITERATION NUMBER: ", str(num_iter))

  #any file name containing "MCL" is treated as an MCL recording, everything else as SINA
  source_tag = "MCL" if "MCL" in wav_file else "SINA"

  if "Xenogryllus" in wav_file and "(MCL)" in wav_file:
    ##Xenogryllus MCL files carry no species name; they are assumed to be Xenogryllus unipartitus
    label = "Xenogryllus" + " " + "unipartitus" + " " + "MCL"
  else:
    #the first two space-separated tokens of the file name are the genus and the species
    split_file = wav_file.split(" ")
    label = " ".join([split_file[0], split_file[1], source_tag])
  cricket_names.append(label)

  #create waveform from .wav file (no sr argument, so librosa's default sample rate applies;
  #later cells reuse the `sr` left over from the last file loaded here)
  waveform, sr = librosa.load(wav_dir + wav_file)
  waveform_list.append(waveform)

In [None]:
# Show the labels built by the loading cell: one "Genus species SOURCE" string per audio file, in sorted file order.
print(cricket_names)

['Acheta domesticus SINA', 'Acheta domesticus MCL', 'Acheta domesticus MCL', 'Acheta domesticus MCL', 'Acheta domesticus MCL', 'Allonemobius allardi SINA', 'Allonemobius allardi MCL', 'Allonemobius allardi MCL', 'Allonemobius allardi MCL', 'Allonemobius allardi MCL', 'Allonemobius fasciatus SINA', 'Allonemobius fasciatus MCL', 'Allonemobius fasciatus MCL', 'Allonemobius fasciatus MCL', 'Allonemobius fasciatus MCL', 'Allonemobius fultoni SINA', 'Allonemobius fultoni MCL', 'Allonemobius fultoni MCL', 'Allonemobius fultoni MCL', 'Allonemobius griseus SINA', 'Allonemobius griseus MCL', 'Allonemobius griseus MCL', 'Allonemobius griseus MCL', 'Allonemobius maculatus SINA', 'Allonemobius maculatus MCL', 'Allonemobius maculatus MCL', 'Allonemobius maculatus MCL', 'Allonemobius shalontaki MCL', 'Allonemobius shalontaki SINA', 'Allonemobius socius SINA', 'Allonemobius socius MCL', 'Allonemobius socius MCL', 'Allonemobius socius MCL', 'Allonemobius socius MCL', 'Allonemobius sparsalus SINA', 'All

In [None]:
##high pass filter to zero out low entries below a "cutoff"
from scipy import signal
def butter_highpass(cutoff, fs, order=5):
    """Design a digital Butterworth high-pass filter.

    cutoff -- cutoff frequency, in the same units as `fs`
    fs     -- sampling rate of the signal the filter will be applied to
    order  -- filter order (default 5)

    Returns the (b, a) numerator/denominator coefficient arrays from
    scipy.signal.butter.
    """
    # scipy expects the cutoff as a fraction of the Nyquist frequency (half the sampling rate)
    nyquist = fs / 2
    return signal.butter(order, cutoff / nyquist, btype='high', analog=False)

def butter_highpass_filter(data, cutoff, fs, order=5):
    """High-pass filter `data` with a Butterworth filter of the given order.

    The coefficients come from butter_highpass(cutoff, fs, order) and are
    applied with scipy.signal.filtfilt (forward-backward filtering).
    Returns the filtered signal.
    """
    numerator, denominator = butter_highpass(cutoff, fs, order=order)
    return signal.filtfilt(numerator, denominator, data)

# Creating Mel Spectrograms

In [None]:
##generate mel specs for t-sne: output --> numerical matrix, which is flattened into 1D for our dimensionality reduction algorithms
# Every file that is long enough contributes one row: a 5-second log-mel window, flattened.
# Files that are too short are skipped (zero padding could distort how the data clusters),
# so mel_spec_names records which labels survived, row-for-row with mel_spec_list.
num_iter = 0

#window sizes in spectrogram frames; sr is passed so the frame counts follow the rate the audio was loaded at
audio_length = librosa.time_to_frames(5, sr=sr) #take only 5 seconds from melspectrograms
jump_about_5_seconds = librosa.time_to_frames(5.2, sr=sr) ##MCL files: jump a tad bit over 5 seconds to not get voiceover data
num_mel_bands = 128

mel_spec_list = []
mel_spec_names = [] #label of each kept row (same order as mel_spec_list)

for waveform in waveform_list: ##go through waveforms for each of the audio files and create a mel spectrogram from them

  print("ITERATION NUMBER: ", str(num_iter+1))

  #drop everything below 2 kHz, then build the mel spec for the entire filtered waveform
  #(the audio is passed as y=... because recent librosa releases only accept it by keyword)
  filtered_waveform = butter_highpass_filter(waveform, 2000, sr)
  mel_spec = librosa.feature.melspectrogram(y=filtered_waveform, sr=sr, n_mels=num_mel_bands)
  log_mel_spec = librosa.power_to_db(mel_spec) ##convert power to log scale to perceptually represent intensity of the sound
  num_frames = mel_spec.shape[1] #number of frames for particular file

  cricket_name = cricket_names[num_iter]
  #SINA: first 5 seconds (no voiceover at the beginning)
  #MCL:  seconds 5.2-10.2 (voiceover of roughly 5 seconds at the beginning)
  window_start = 0 if "SINA" in cricket_name else jump_about_5_seconds
  window_end = window_start + audio_length

  if num_frames >= window_end: #shorter files are disregarded rather than 0 padded
    mel_spec_list.append(log_mel_spec[:, window_start:window_end].flatten())
    mel_spec_names.append(cricket_name)

  num_iter += 1

mel_spec_list = np.array(mel_spec_list)
print(mel_spec_list.shape)

In [None]:
#plot mel specs function --> this function is just used to visualize our mel specs in image format -- it's not used for the actual dimensionality reduction
def plot_mel_spec(folder, data, title, id):
  """Draw one log-mel spectrogram and save it as a .png.

  folder -- output directory prefix; it is concatenated directly with the file name
  data   -- 2D spectrogram matrix handed to librosa.display.specshow
  title  -- label used in the plot title and in the file name
  id     -- value appended to the file name after `title`

  Reads the sample rate from the notebook-level `sr`.
  """
  figure = plt.figure(figsize=(25,10))
  librosa.display.specshow(
      data,
      sr=sr,
      x_axis="time",
      y_axis="mel",
      cmap='magma' #same colormap for every image so they are comparable
  )
  plt.xlim([0, 5])
  plt.clim(-100, 35) #fixed lower/upper decibel limits for the color mapping
  plt.title("Mel spectrogram for " + title)
  plt.xlabel("time")
  plt.ylabel("mel")
  plt.savefig(folder + title + str(id) + '.png')
  plt.close(figure) #release the figure; this function is called once per file

In [None]:
##generate mel specs images for VISUALIZATION PURPOSES ONLY: output --> .png files
# Uses the same filtering and 5-second windowing as the mel spectrogram dataset cell, but saves
# each window as an image instead of adding it to a dataset.
mel_specs_folder = "/content/drive/Shareddrives/Analyzing Cricket Songs with Machine Learning/mel specs for SINA and MCL/"
num_iter = 0

#window sizes in spectrogram frames; sr is passed so the frame counts follow the rate the audio was loaded at
audio_length = librosa.time_to_frames(5, sr=sr) #take only 5 seconds from melspectrograms
jump_about_5_seconds = librosa.time_to_frames(5.2, sr=sr) ##MCL files: jump a tad bit over 5 seconds to not get voiceover data
num_mel_bands = 128

for waveform in waveform_list: ##go through waveforms for each of the audio files and create a mel spectrogram from them

  print("ITERATION NUMBER: ", str(num_iter+1))

  #drop everything below 2 kHz, then build the mel spec for the entire filtered waveform
  #(the audio is passed as y=... because recent librosa releases only accept it by keyword)
  filtered_waveform = butter_highpass_filter(waveform, 2000, sr)
  mel_spec = librosa.feature.melspectrogram(y=filtered_waveform, sr=sr, n_mels=num_mel_bands)
  log_mel_spec = librosa.power_to_db(mel_spec) ##convert power to log scale to perceptually represent intensity of the sound
  num_frames = mel_spec.shape[1] #number of frames for particular file

  cricket_name = cricket_names[num_iter]
  #SINA: first 5 seconds (no voiceover at the beginning)
  #MCL:  seconds 5.2-10.2 (voiceover of roughly 5 seconds at the beginning)
  window_start = 0 if "SINA" in cricket_name else jump_about_5_seconds
  window_end = window_start + audio_length

  if num_frames >= window_end: #shorter files are not part of the dataset, so there is nothing to visualize
    plot_mel_spec(mel_specs_folder, log_mel_spec[:, window_start:window_end], cricket_name, num_iter)

  num_iter += 1

In [None]:
# (number of kept files, flattened length of one mel-spectrogram window)
mel_spec_list.shape

(433, 27520)

# Creating MFCCs

In [None]:
##generate mfccs for t-sne: output --> numerical matrix, which is flattened into 1D for our dimensionality reduction algorithms
# Every file that is long enough contributes one row: 13 MFCCs + 13 delta + 13 delta-delta
# coefficients over a 5-second window, flattened. Files that are too short are skipped
# (zero padding could distort how the data clusters), so mfccs_names records which labels
# survived, row-for-row with mfccs_list.
# NOTE(review): unlike the mel spectrogram and magnitude spectrum cells, no high-pass filter
# is applied to the waveform here -- confirm that this is intentional.

#window sizes in MFCC frames; sr is passed so the frame counts follow the rate the audio was loaded at
audio_length = librosa.time_to_frames(5, sr=sr) #take only 5 seconds of MFCCs
jump_about_5_seconds = librosa.time_to_frames(5.2, sr=sr) ##MCL files: jump a tad bit over 5 seconds to not get voiceover data

mfcc_rows = [] #collected in a plain list and stacked once at the end (re-stacking inside the loop is quadratic)
mfccs_names = [] #label of each kept row (same order as mfccs_list)
num_iter = 0

for waveform in waveform_list: ##go through waveforms for each of the audio files and generate MFCCs for them

  num_iter += 1
  print("ITERATION NUMBER: ", str(num_iter))

  #extract MFCCs (the audio is passed as y=... because recent librosa releases only accept it by keyword)
  mfccs = librosa.feature.mfcc(y=waveform, n_mfcc=13, sr=sr)
  num_frames = mfccs.shape[1]

  cricket_name = cricket_names[num_iter-1]
  #SINA: first 5 seconds (no voiceover at the beginning)
  #MCL:  seconds 5.2-10.2 (voiceover of roughly 5 seconds at the beginning)
  window_start = 0 if "SINA" in cricket_name else jump_about_5_seconds
  window_end = window_start + audio_length

  if num_frames < window_end:
    continue #disregard: the file is too short and we don't want to 0 pad

  mfccs = mfccs[:, window_start:window_end]
  delta_mfccs = librosa.feature.delta(mfccs) ##number of delta MFCCs = 13
  delta_delta_mfccs = librosa.feature.delta(mfccs, order=2) ##number of delta delta MFCCs = 13

  #stack the three 13-row matrices on top of each other: (13, frames) x 3 --> (39, frames)
  comprehensive_mfccs = np.concatenate((mfccs, delta_mfccs, delta_delta_mfccs))

  #one flattened row per kept file
  mfcc_rows.append(comprehensive_mfccs.flatten())
  mfccs_names.append(cricket_name)

#always 2D when at least one file was kept, even if only one file qualified
mfccs_list = np.array(mfcc_rows)

print(mfccs_list)
  

In [None]:
# (number of kept files, flattened length of one MFCC + delta + delta-delta window)
print(mfccs_list.shape)

(433, 8385)


In [None]:
#plot MFCCs function --> this function is just used to visualize our MFCCs in image format -- it's not used for the actual dimensionality reduction
def plot_mfcc(folder, data, title, id):
  """Draw one MFCC matrix and save it as a .png.

  folder -- output directory prefix; it is concatenated directly with the file name
  data   -- 2D coefficient matrix handed to librosa.display.specshow
  title  -- label used in the plot title and in the file name
  id     -- value appended to the file name after `title`

  Reads the sample rate from the notebook-level `sr`.
  """
  figure = plt.figure(figsize=(25,10))
  librosa.display.specshow(
      data,
      sr=sr,
      x_axis="time",
      cmap='magma' #same colormap for every image so they are comparable
  )
  plt.xlim([0, 5])
  plt.title("MFCCs for " + title)
  plt.xlabel("time")
  plt.ylabel("coefficients")
  plt.savefig(folder + title + str(id) + '.png')
  plt.close(figure) #release the figure; this function is called once per file

In [None]:
##generate MFCCs images for VISUALIZATION PURPOSES ONLY: output --> .png files
# Uses the same 5-second windowing and delta stacking as the MFCC dataset cell, but saves each
# stacked matrix as an image instead of adding it to a dataset.
mfccs_folder = "/content/drive/Shareddrives/Analyzing Cricket Songs with Machine Learning/mfccs for SINA and MCL/"
num_iter = 0

#window sizes in MFCC frames; sr is passed so the frame counts follow the rate the audio was loaded at
audio_length = librosa.time_to_frames(5, sr=sr) #take only 5 seconds of MFCCs
jump_about_5_seconds = librosa.time_to_frames(5.2, sr=sr) ##MCL files: jump a tad bit over 5 seconds to not get voiceover data

for waveform in waveform_list: #go through waveforms for each of the audio files and generate MFCCs for them

  num_iter += 1
  print("ITERATION NUMBER: ", str(num_iter))

  #extract MFCCs (the audio is passed as y=... because recent librosa releases only accept it by keyword)
  mfccs = librosa.feature.mfcc(y=waveform, n_mfcc=13, sr=sr)
  num_frames = mfccs.shape[1]

  cricket_name = cricket_names[num_iter-1]
  #SINA: first 5 seconds (no voiceover at the beginning)
  #MCL:  seconds 5.2-10.2 (voiceover of roughly 5 seconds at the beginning)
  window_start = 0 if "SINA" in cricket_name else jump_about_5_seconds
  window_end = window_start + audio_length

  if num_frames < window_end:
    continue #disregard -- this waveform is too short to be part of the dataset, so no need to visualize it

  mfccs = mfccs[:, window_start:window_end]
  delta_mfccs = librosa.feature.delta(mfccs) ##number of delta MFCCs = 13
  delta_delta_mfccs = librosa.feature.delta(mfccs, order=2) ##number of delta delta MFCCs = 13

  #stack the three 13-row matrices on top of each other: (13, frames) x 3 --> (39, frames)
  comprehensive_mfccs = np.concatenate((mfccs, delta_mfccs, delta_delta_mfccs))

  plot_mfcc(mfccs_folder, comprehensive_mfccs, cricket_name, num_iter-1)




# Creating magnitude power spectrums

In [None]:
##generate magnitude power spectrums for t-sne
# Every file that is long enough contributes one row: the FFT magnitudes (first half of the bins)
# of a high-pass-filtered 5-second window. Files that are too short are skipped (zero padding
# could distort how the data clusters), so power_spec_names records which labels survived,
# row-for-row with power_spec_list.
#
# Unlike the mel specs/MFCCs, which are cut in frames, the window here is cut in time samples:
# sr = # samples/sec, so 5 secs * sr = # samples in 5 secs.
audio_length = int(5 * sr)
jump_about_5_seconds = int(5.2 * sr) ##MCL files: jump a tad bit over 5 seconds to not get voiceover data

power_spec_list = []
power_spec_names = [] #label of each kept row (same order as power_spec_list)
num_iter = 0

for waveform in waveform_list: ##go through waveforms for each of the audio files and generate magnitude power spectrums for them

  num_iter += 1
  print("ITERATION NUMBER: ", str(num_iter))

  cricket_name = cricket_names[num_iter-1]
  #SINA: first 5 seconds of samples (no voiceover at the beginning)
  #MCL:  seconds 5.2-10.2 (voiceover of roughly 5 seconds at the beginning)
  window_start = 0 if "SINA" in cricket_name else jump_about_5_seconds
  window_end = window_start + audio_length

  if len(waveform) < window_end:
    continue #disregard: the file is too short and we don't want to 0 pad

  #high-pass filter the whole waveform (cutoff 2 kHz), then cut the window out of the filtered signal
  segment = butter_highpass_filter(waveform, 2000, sr)[window_start:window_end]

  #the Fourier transform is complex valued; its absolute value is the magnitude of each frequency bin
  power_spec = np.abs(np.fft.fft(segment))

  #only keep the first half of the bins (i.e. cut off at the nyquist frequency) --> for a real signal the second half mirrors the first
  num_frequency_bins = len(power_spec) // 2
  power_spec_list.append(power_spec[:num_frequency_bins].copy()) #copy so the full-length spectrum can be freed
  power_spec_names.append(cricket_name)

power_spec_list = np.array(power_spec_list)

In [None]:
# (number of kept files, number of frequency bins kept per file)
power_spec_list.shape

(433, 55125)

In [None]:
#plot magnitude power spectrum function --> this function is just used to visualize our magnitude power spectrums in image format -- it's not used for the actual dimensionality reduction
def plot_power_spec(folder_name, title, id, signal, sr, f_ratio=0.5):
  """Plot the FFT magnitude of `signal` against frequency and save it as a .png.

  folder_name -- output directory prefix; it is concatenated directly with the file name
  title       -- label used in the plot title and in the file name
  id          -- value appended to the file name after `title`
  signal      -- 1D array of time samples
  sr          -- sampling rate of `signal`, used to label the frequency axis
  f_ratio     -- fraction of the FFT bins to draw (default 0.5, i.e. up to the nyquist frequency)
  """
  power_spec = np.abs(np.fft.fft(signal))
  num_samples = len(power_spec)

  #bin k of an N-point FFT sits at k * sr / N; np.linspace(0, sr, N) would place the last bin
  #at sr itself and stretch every frequency label by N / (N - 1)
  frequency = np.arange(num_samples) * (sr / num_samples)
  num_frequency_bins = int(num_samples * f_ratio)

  fig, ax = plt.subplots(figsize=(25,10))
  ax.set_title("Magnitude Power Spectrum for " + title)
  ax.set_xlabel("frequency")
  ax.set_ylabel("magnitude")
  ax.plot(frequency[:num_frequency_bins], power_spec[:num_frequency_bins])
  fig.savefig(folder_name + title + str(id) + '.png', bbox_inches='tight')
  plt.close(fig) #release the figure; this function is called once per file

In [None]:
##generate magnitude power spectrums plots for VISUALIZATION PURPOSES ONLY: output --> .png files
# The window is cut in time samples rather than frames: sr = # samples/sec, so 5 secs * sr = # samples in 5 secs.
mps_plots_folder = "/content/drive/Shareddrives/Analyzing Cricket Songs with Machine Learning/mps for SINA and MCL/"
audio_length = int(5 * sr)
jump_about_5_seconds = int(5.2 * sr) ##MCL files: jump a tad bit over 5 seconds to not get voiceover data
num_iter = 0

for waveform in waveform_list: ##go through waveforms for each of the audio files and generate magnitude power spectrums for them

  num_iter += 1
  print("ITERATION NUMBER: ", str(num_iter))

  cricket_name = cricket_names[num_iter-1]
  if "SINA" in cricket_name:
    window_start = 0 #SINA: first 5 seconds (no voiceover at the beginning)
  else:
    window_start = jump_about_5_seconds #MCL: seconds 5.2-10.2 (voiceover of roughly 5 seconds at beginning)
  window_end = audio_length + window_start

  if len(waveform) >= window_end: #shorter files are not part of the dataset, so there is nothing to visualize
    #high-pass filter the whole waveform, cut the window out of it, and let plot_power_spec transform and draw it
    filtered_segment = butter_highpass_filter(waveform, 2000, sr)[window_start:window_end]
    plot_power_spec(mps_plots_folder, cricket_name, num_iter-1, filtered_segment, sr)

# Applying PCA to Datasets for Mel Spectrograms, MFCCs, and Magnitude Power Spectrums

In [None]:
##pca for mel specs
num_components = 50 ##reduce to 50 dimensions because t-sne only works well with <= 50 dimensions
#random_state pins the result: for a matrix this wide scikit-learn can select a randomized SVD solver,
#and without a seed the components (and everything downstream) could change between runs
pca = PCA(n_components = num_components, random_state = 0)
mel_spec_pca = pca.fit_transform(mel_spec_list)
print("TOTAL EXPLAINED VARIANCE WITH " + str(num_components) 
+ " components = " +  str(sum(pca.explained_variance_ratio_)))

TOTAL EXPLAINED VARIANCE WITH 50 components = 0.8996667571634519


In [None]:
##pca for mfccs
num_components = 50 ##reduce to 50 dimensions because t-sne only works well with <= 50 dimensions
#random_state pins the result: for a matrix this wide scikit-learn can select a randomized SVD solver,
#and without a seed the components (and everything downstream) could change between runs
pca = PCA(n_components = num_components, random_state = 0)
mfccs_pca = pca.fit_transform(mfccs_list)
print("TOTAL EXPLAINED VARIANCE WITH " + str(num_components) 
+ " components = " +  str(sum(pca.explained_variance_ratio_)))

TOTAL EXPLAINED VARIANCE WITH 50 components = 0.9486747398041189


In [None]:
##pca for magnitude power spectrum
num_components = 50 ##reduce to 50 dimensions because t-sne only works well with <= 50 dimensions
#random_state pins the result: for a matrix this wide scikit-learn can select a randomized SVD solver,
#and without a seed the components (and everything downstream) could change between runs
pca = PCA(n_components = num_components, random_state = 0)
mag_power_spec_pca = pca.fit_transform(power_spec_list)
print("TOTAL EXPLAINED VARIANCE WITH " + str(num_components) 
+ " components = " +  str(sum(pca.explained_variance_ratio_)))

TOTAL EXPLAINED VARIANCE WITH 50 components = 0.8100266327674646


# Apply T-SNE to PCA-Reduced Datasets for the 3 Audio Features

In [None]:
##apply t-sne to PCA-reduced mel spectrogram dataset
#t-SNE is stochastic; random_state makes the embedding (and the saved plots) reproducible across runs
#NOTE(review): newer scikit-learn releases rename n_iter to max_iter -- confirm against the installed version
mel_spec_tsne_out = TSNE(
    n_components=2, perplexity=17.0, learning_rate=300.0, ##feel free to tune hyperparameters
    n_iter=15000, n_iter_without_progress=15000,
    random_state=0).fit_transform(mel_spec_pca)



In [None]:
##apply t-sne to PCA-reduced MFCCs dataset
#t-SNE is stochastic; random_state makes the embedding (and the saved plots) reproducible across runs
#NOTE(review): newer scikit-learn releases rename n_iter to max_iter -- confirm against the installed version
mfccs_tsne_out = TSNE(
    n_components=2, perplexity=17.0, learning_rate=300.0, ##feel free to tune hyperparameters
    n_iter=15000, n_iter_without_progress=15000,
    random_state=0).fit_transform(mfccs_pca)



In [None]:
##apply t-sne to PCA-reduced magnitude power spectrums dataset
#t-SNE is stochastic; random_state makes the embedding (and the saved plots) reproducible across runs
#NOTE(review): newer scikit-learn releases rename n_iter to max_iter -- confirm against the installed version
#(the "tnse" spelling of the variable is kept because later cells read it under this name)
mag_power_spec_tnse_out = TSNE(
    n_components=2, perplexity=17.0, learning_rate=300.0, ##feel free to tune hyperparameters
    n_iter=15000, n_iter_without_progress=15000,
    random_state=0).fit_transform(mag_power_spec_pca)



# Filter Out Datapoints Belonging to a Genus with < 30 Files or a Species with < 6 Files

In [None]:
##first, get information on how many files per genus and species
# species_dict: "Genus species" -> number of audio files with that label
# genus_dict:   "Genus"         -> number of audio files with that label
species_dict = {}
genus_dict = {}

for name in cricket_names:
    #name = "Genus species MCL" or "Genus species SINA"
    name_parts = name.split(" ")
    genus_name = name_parts[0]
    species_name = name_parts[0] + " " + name_parts[1]

    #.get(..., 0) starts the counter at 0 the first time a key is seen
    species_dict[species_name] = species_dict.get(species_name, 0) + 1
    genus_dict[genus_name] = genus_dict.get(genus_name, 0) + 1

In [None]:
#only load data in t-sne plot for genuses that have >= min_files for them
def filter_dataset_genus(x, y, min_count, names=None, counts=None):
    """Keep only the points whose genus has at least `min_count` files.

    x, y      -- component 1 / component 2 output from the t-sne alg (same length)
    min_count -- minimum number of files a genus needs for its points to be kept
    names     -- label of each point, "Genus species SOURCE", in the same order as x and y.
                 Defaults to the notebook-level cricket_names. Point i is always labelled
                 with names[i], so if some files were dropped while building the dataset,
                 pass the labels of the files that were actually kept.
    counts    -- mapping genus -> file count; defaults to the notebook-level genus_dict

    Returns (filtered_x, filtered_y) as float arrays.
    """
    if names is None:
        names = cricket_names
    if counts is None:
        counts = genus_dict

    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    #one boolean per point, then a single masked selection (no array re-allocation per point)
    keep = np.array(
        [counts[names[i].split(" ")[0]] >= min_count for i in range(len(x))],
        dtype=bool)
    return (x[keep], y[keep])

In [None]:
#only load data in t-sne plot for species that have >= min_files for them
def filter_dataset_species(x, y, min_count, names=None, counts=None):
    """Keep only the points whose species has at least `min_count` files.

    x, y      -- component 1 / component 2 output from the t-sne alg (same length)
    min_count -- minimum number of files a species needs for its points to be kept
    names     -- label of each point, "Genus species SOURCE", in the same order as x and y.
                 Defaults to the notebook-level cricket_names. Point i is always labelled
                 with names[i], so if some files were dropped while building the dataset,
                 pass the labels of the files that were actually kept.
    counts    -- mapping "Genus species" -> file count; defaults to the notebook-level species_dict

    Returns (filtered_x, filtered_y) as float arrays.
    """
    if names is None:
        names = cricket_names
    if counts is None:
        counts = species_dict

    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    def species_of(label):
        #"Genus species SOURCE" -> "Genus species"
        parts = label.split(" ")
        return parts[0] + " " + parts[1]

    #one boolean per point, then a single masked selection (no array re-allocation per point)
    keep = np.array(
        [counts[species_of(names[i])] >= min_count for i in range(len(x))],
        dtype=bool)
    return (x[keep], y[keep])

In [None]:
#MEL SPECTROGRAM DATASET
# Both filters read the same two t-SNE output columns, so they are extracted once.
x_vals, y_vals = mel_spec_tsne_out[:,0], mel_spec_tsne_out[:,1]

##filtered mel spectrogram datapoints by GENUS (genera with >= 30 files)
x_vals_mel_spec_genus, y_vals_mel_spec_genus = filter_dataset_genus(x_vals, y_vals, 30)

##filtered mel spectrogram datapoints by SPECIES (species with >= 6 files)
x_vals_mel_spec_species, y_vals_mel_spec_species = filter_dataset_species(x_vals, y_vals, 6)

In [None]:
#MFCCS DATASET
# Both filters read the same two t-SNE output columns, so they are extracted once.
x_vals, y_vals = mfccs_tsne_out[:,0], mfccs_tsne_out[:,1]

##filtered mfccs datapoints by GENUS (genera with >= 30 files)
x_vals_mfccs_genus, y_vals_mfccs_genus = filter_dataset_genus(x_vals, y_vals, 30)

##filtered mfccs datapoints by SPECIES (species with >= 6 files)
x_vals_mfccs_species, y_vals_mfccs_species = filter_dataset_species(x_vals, y_vals, 6)

In [None]:
#MAGNITUDE POWER SPECTRUM DATASET
# Both filters read the same two t-SNE output columns, so they are extracted once.
x_vals, y_vals = mag_power_spec_tnse_out[:,0], mag_power_spec_tnse_out[:,1]

##filtered magnitude power spectrum datapoints by GENUS (genera with >= 30 files)
x_vals_mps_genus, y_vals_mps_genus = filter_dataset_genus(x_vals, y_vals, 30)

##filtered magnitude power spectrum datapoints by SPECIES (species with >= 6 files)
x_vals_mps_species, y_vals_mps_species = filter_dataset_species(x_vals, y_vals, 6)

# Creating Genus and Species Color Mappings for the t-SNE Plots

In [None]:
#obtain genus mappings for each data point in the dataset
genus_mapping = [] ##genus id/"map value" of every datapoint whose genus has >= 30 files, in cricket_names order
                   ##this is necessary because we must color code each datapoint according to its genus

min_file_count = 30 #datapoints for genera that have less than 30 files aren't considered/plotted since not a large enough sample size
filtered_genus_names = [] #genus names with >= 30 files; position in this list == genus id/"map value"
genus_ids = {} #genus name -> id, so a genus always gets the same id even if its files are not next to each other

for name in cricket_names:
  genus_name = name.split(" ")[0] #name = "Genus species MCL" or "Genus species SINA"
  if genus_dict[genus_name] < min_file_count:
    continue #not enough files for this genus
  if genus_name not in genus_ids: #first time this genus is seen: give it the next free id
    genus_ids[genus_name] = len(filtered_genus_names)
    filtered_genus_names.append(genus_name)
  genus_mapping.append(genus_ids[genus_name])

In [None]:
# Genera that passed the file-count filter; the position in this list is the genus id used for coloring.
filtered_genus_names

['Allonemobius', 'Anaxipha', 'Cycloptilum', 'Gryllus', 'Oecanthus']

In [None]:
# Sanity check: this count has to equal the number of points returned by the genus filter for the scatter plots to line up.
len(genus_mapping) ##this is the number of datapoints which belong to genera with >= 30 files

281

In [None]:
#one entry was added to filtered_genus_names for every genus id handed out, so its length is the number of ids
#(this also gives 0 instead of raising an IndexError when no genus has enough files)
num_genuses_with_enough_files = len(filtered_genus_names)

In [None]:
# creating genus color map
# start from matplotlib's "jet" map and pull out every color it contains
cmap = plt.cm.jet
cmaplist = list(map(cmap, range(cmap.N)))
# rebuild those colors as a new map
genus_cmap = mpl.colors.LinearSegmentedColormap.from_list('Custom cmap', cmaplist, cmap.N)
# one bin per genus id: boundaries at 0, 1, ..., number of genera
genus_bounds = np.linspace(0, num_genuses_with_enough_files, num=num_genuses_with_enough_files+1)
genus_norm = mpl.colors.BoundaryNorm(genus_bounds, genus_cmap.N)

In [None]:
#obtain species mappings for each data entry
species_mapping = [] ##species id/"map value" of every datapoint whose species has >= 6 files, in cricket_names order
                     ##this is necessary because we must color code each datapoint according to its species
min_file_count = 6 #datapoints for species that have less than 6 files aren't considered/plotted since not a large enough sample size
filtered_species_names = [] #species names with >= 6 files; position in this list == species id/"map value"
species_ids = {} #species name -> id, so a species always gets the same id even if its files are not next to each other

for name in cricket_names:
  species_name = name.split(" ")[0] + " " + name.split(" ")[1] #name = "Genus species MCL" or "Genus species SINA"
  if species_dict[species_name] < min_file_count:
    continue #not enough files for this species
  if species_name not in species_ids: #first time this species is seen: give it the next free id
    species_ids[species_name] = len(filtered_species_names)
    filtered_species_names.append(species_name)
  species_mapping.append(species_ids[species_name])

In [None]:
# Species that passed the file-count filter; the position in this list is the species id used for coloring.
filtered_species_names

['Gryllotalpa major',
 'Gryllus integer',
 'Gryllus ovisopis',
 'Gryllus pennsylvanicus',
 'Gryllus rubens',
 'Hapithus melodius',
 'Neocurtilla hexadactyla']

In [None]:
#one entry was added to filtered_species_names for every species id handed out, so its length is the number of ids
#(this also gives 0 instead of raising an IndexError when no species has enough files)
num_species_with_enough_files = len(filtered_species_names)

In [None]:
# creating species color map
# start from matplotlib's "jet" map and pull out every color it contains
cmap = plt.cm.jet
cmaplist = list(map(cmap, range(cmap.N)))
# rebuild those colors as a new map
species_cmap = mpl.colors.LinearSegmentedColormap.from_list('Custom cmap', cmaplist, cmap.N)
# one bin per species id: boundaries at 0, 1, ..., number of species
species_bounds = np.linspace(0, num_species_with_enough_files, num=num_species_with_enough_files+1)
species_norm = mpl.colors.BoundaryNorm(species_bounds, species_cmap.N)

# Make t-SNE Plots

In [None]:
#output folder for the t-SNE scatter plots
#(assigned on its own: chaining it with mfccs_folder would redirect the MFCC images into this folder on a re-run)
tsne_folder = "/content/drive/Shareddrives/Analyzing Cricket Songs with Machine Learning/t-sne plots/"

In [None]:
##MEL SPECTROGRAM BY GENUS

# make the scatter plot
fig, ax = plt.subplots(1,1, figsize=(20,20))
scat = ax.scatter(x_vals_mel_spec_genus, y_vals_mel_spec_genus, c=genus_mapping, cmap=genus_cmap, norm=genus_norm)
ax.set_title("t-SNE Map of Mel Spectrograms for Cricket Songs Labeled By Genus")
ax.set_xlabel("Dimension 1")
ax.set_ylabel("Dimension 2")

# create the colorbar and label it
# genus id k is colored by the band [k, k+1), so one tick is placed at the middle of each band;
# ticking the band boundaries instead would give one more tick than there are names
genus_tick_positions = genus_bounds[:-1] + 0.5
cb = fig.colorbar(scat, ax=ax, spacing='proportional', ticks=genus_tick_positions)
cb.set_ticklabels(filtered_genus_names)

# save to correct folder in shared drive
fig.savefig(tsne_folder + "mel_spec_genus" + '.png')

In [None]:
##MEL SPECTROGRAM BY SPECIES

# make the scatter plot
fig, ax = plt.subplots(1,1, figsize=(20,20))
scat = ax.scatter(x_vals_mel_spec_species, y_vals_mel_spec_species, c=species_mapping, cmap=species_cmap, norm=species_norm)
ax.set_title("t-SNE Map of Mel Spectrograms for Cricket Songs Labeled By Species")
ax.set_xlabel("Dimension 1")
ax.set_ylabel("Dimension 2")

# create the colorbar and label it
# species id k is colored by the band [k, k+1), so one tick is placed at the middle of each band;
# ticking the band boundaries instead would give one more tick than there are names
species_tick_positions = species_bounds[:-1] + 0.5
cb = fig.colorbar(scat, ax=ax, spacing='proportional', ticks=species_tick_positions)
cb.set_ticklabels(filtered_species_names)

# save to correct folder in shared drive
fig.savefig(tsne_folder + "mel_spec_species" + '.png')

In [None]:
## MFCCS BY GENUS

# make the scatter plot
fig, ax = plt.subplots(1,1, figsize=(20,20))
scat = ax.scatter(x_vals_mfccs_genus, y_vals_mfccs_genus, c=genus_mapping, cmap=genus_cmap, norm=genus_norm)
ax.set_title("t-SNE Map of MFCCs for Cricket Songs Labeled By Genus")
ax.set_xlabel("Dimension 1")
ax.set_ylabel("Dimension 2")

# create the colorbar and label it
# genus id k is colored by the band [k, k+1), so one tick is placed at the middle of each band;
# ticking the band boundaries instead would give one more tick than there are names
genus_tick_positions = genus_bounds[:-1] + 0.5
cb = fig.colorbar(scat, ax=ax, spacing='proportional', ticks=genus_tick_positions)
cb.set_ticklabels(filtered_genus_names)

# save to correct folder in shared drive
fig.savefig(tsne_folder + "mfccs_genus" + '.png')

In [None]:
##MFCCs BY SPECIES

# make the scatter plot
fig, ax = plt.subplots(1,1, figsize=(20,20))
scat = ax.scatter(x_vals_mfccs_species, y_vals_mfccs_species, c=species_mapping, cmap=species_cmap, norm=species_norm)
ax.set_title("t-SNE Map of MFCCs for Cricket Songs Labeled By Species")
ax.set_xlabel("Dimension 1")
ax.set_ylabel("Dimension 2")

# create the colorbar and label it
# species id k is colored by the band [k, k+1), so one tick is placed at the middle of each band;
# ticking the band boundaries instead would give one more tick than there are names
species_tick_positions = species_bounds[:-1] + 0.5
cb = fig.colorbar(scat, ax=ax, spacing='proportional', ticks=species_tick_positions)
cb.set_ticklabels(filtered_species_names)

# save to correct folder in shared drive
fig.savefig(tsne_folder + "mfccs_species" + '.png')

In [None]:
## MAGNITUDE POWER SPECS BY GENUS

# make the scatter plot
fig, ax = plt.subplots(1,1, figsize=(20,20))
scat = ax.scatter(x_vals_mps_genus, y_vals_mps_genus, c=genus_mapping, cmap=genus_cmap, norm=genus_norm)
ax.set_title("t-SNE Map of Magnitude Power Spectrums for Cricket Songs Labeled By Genus")
ax.set_xlabel("Dimension 1")
ax.set_ylabel("Dimension 2")

# create the colorbar and label it
# genus id k is colored by the band [k, k+1), so one tick is placed at the middle of each band;
# ticking the band boundaries instead would give one more tick than there are names
genus_tick_positions = genus_bounds[:-1] + 0.5
cb = fig.colorbar(scat, ax=ax, spacing='proportional', ticks=genus_tick_positions)
cb.set_ticklabels(filtered_genus_names)

# save to correct folder in shared drive
fig.savefig(tsne_folder + "mps_genus" + '.png')

In [None]:
##MAGNITUDE POWER SPECS BY SPECIES

# make the scatter plot
fig, ax = plt.subplots(1,1, figsize=(20,20))
scat = ax.scatter(x_vals_mps_species, y_vals_mps_species, c=species_mapping, cmap=species_cmap, norm=species_norm)
ax.set_title("t-SNE Map of Magnitude Power Spectrums for Cricket Songs Labeled By Species")
ax.set_xlabel("Dimension 1")
ax.set_ylabel("Dimension 2")

# create the colorbar and label it
# species id k is colored by the band [k, k+1), so one tick is placed at the middle of each band;
# ticking the band boundaries instead would give one more tick than there are names
species_tick_positions = species_bounds[:-1] + 0.5
cb = fig.colorbar(scat, ax=ax, spacing='proportional', ticks=species_tick_positions)
cb.set_ticklabels(filtered_species_names)

# save to correct folder in shared drive
fig.savefig(tsne_folder + "mps_species" + '.png')