In [9]:

import os
import re
import glob
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [10]:
import seaborn as sns
from glob import glob 
import librosa.display
import IPython.display as ipd
from itertools import cycle

sns.set_theme(style="white",palette=None)
color_pal = plt.rcParams["axes.prop_cycle"].by_key()["color"]
color_cycle = cycle(plt.rcParams["axes.prop_cycle"].by_key()["color"])

In [11]:
import glob
from collections import Counter

audio_dir = '/Users/shreyajaiswal/Desktop/BirdSong/'

# Set the pattern to match the audio files
audio_pattern = '*.wav'

# Get a list of file paths matching the pattern
audio_files = glob.glob(audio_dir + audio_pattern)

#audio_files.sort()

# Extract species names from the file paths
species_names = [re.search(r"(.*) \d{2}", os.path.basename(file_path)).group(1) for file_path in audio_files]

# Count the frequency of each species
species_counter = Counter(species_names)

# Sort the species names by their frequency in descending order
sorted_species = [item[0] for item in sorted(species_counter.items(), key=lambda item: item[1], reverse=True)]


# Now, sort the audio_files based on the sorted_species
audio_files.sort(key=lambda file_path: sorted_species.index(re.search(r"(.*) \d{2}", os.path.basename(file_path)).group(1)))
# Print the first 10 audio files




In [52]:

features_list = []
spectral_centroid_list = []
spectral_bandwidth_list = []
spectral_rolloff_list = []
chroma_stft_list = []
species_list = []

# Loop through all audio files and extract features
for file_path in audio_files[:400]:  # assuming you want to process first 400 files
    # Load the audio file
    audio_data, sampling_rate = librosa.load(file_path, sr=None)

    # Extract the MFCC features
    mfcc_features = np.mean(librosa.feature.mfcc(y=audio_data, sr=sampling_rate, n_mfcc=20).T,axis=0)

    # Extract the spectral centroid
    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio_data, sr=sampling_rate))

    # Extract the spectral bandwidth
    spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=audio_data, sr=sampling_rate))

    # Extract the spectral rolloff
    spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=audio_data, sr=sampling_rate))

    # Extract the chroma_stft features
    chroma_stft_features = np.mean(librosa.feature.chroma_stft(y=audio_data, sr=sampling_rate), axis=1)

    # Append the features to the lists
    features_list.append(mfcc_features)
    spectral_centroid_list.append(spectral_centroid)
    spectral_bandwidth_list.append(spectral_bandwidth)
    spectral_rolloff_list.append(spectral_rolloff)
    chroma_stft_list.append(chroma_stft_features)

    # Extract the species name from the file path and append to species list
    file_name = os.path.basename(file_path)
    species_name = re.search(r"(.*) \d{2}", file_name).group(1)
    species_list.append(species_name)

# Create a DataFrame from the list of features and species list
df_mfcc = pd.DataFrame(features_list)
df_spectral_centroid = pd.DataFrame(spectral_centroid_list, columns=['spectral_centroid'])
df_spectral_bandwidth = pd.DataFrame(spectral_bandwidth_list, columns=['spectral_bandwidth'])
df_spectral_rolloff = pd.DataFrame(spectral_rolloff_list, columns=['spectral_rolloff'])
df_chroma_stft = pd.DataFrame(chroma_stft_list)
df_species = pd.DataFrame(species_list, columns=['species'])


In [88]:
pca = PCA(n_components=6) # Set the number of components you want to keep
df_chroma_pca = pca.fit_transform(df_chroma_stft)

pca = PCA(n_components=11) # Set the number of components you want to keep
df_mfcc_pca = pca.fit_transform(df_mfcc)

#covert to data frame
df_mfcc_pca = pd.DataFrame(df_mfcc_pca, columns=['P1', 'P2','P3','P4','P5','P6','P7','P8','P9','P10','P11'])
df_chroma_pca = pd.DataFrame(df_chroma_pca,columns=['C1', 'C2','C3','C4','C5','C6'])

# Concatenate all dataframes
df_final = pd.concat([df_mfcc_pca, df_spectral_centroid, df_spectral_bandwidth, df_spectral_rolloff, df_chroma_pca, df_species], axis=1)

# Print first 4 lines of the final DataFrame
print(df_final)


             P1         P2         P3         P4         P5         P6   
0    101.696465 -25.357204 -34.506519   7.781126 -23.723894   1.817387  \
1     39.061710  -2.491984   6.243211  -6.135143 -17.833326   1.374085   
2    -92.350388 -25.242985  22.780188 -27.908342  -0.224662  -1.257373   
3   -149.870239  21.877363   6.021046   2.627782  21.334085  -8.839947   
4     76.680748 -15.968580 -20.725445  17.597298 -35.518772  -3.563405   
..          ...        ...        ...        ...        ...        ...   
395  -26.629295  -0.235400  41.512203  17.428923  -9.282797   6.101490   
396  102.352554 -52.860287 -39.117264  -6.681496   4.113475  24.094801   
397  -73.494255 -14.535129  19.972717  -9.061548  -5.716305   1.912291   
398  -88.391769  -5.337788 -20.167135  -6.387761  -0.422997  -0.140305   
399  -80.870857  15.431854 -16.977528  -3.290994  -1.042109  -0.853286   

            P7         P8        P9       P10  ...  spectral_centroid   
0    -8.227084   6.946238  5.586599  0

In [101]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from imblearn.over_sampling import SMOTE

# Define the SMOTE instance
smote = SMOTE(random_state=42)



# Remove the species column from the features
X = df_final.drop('species', axis=1)

# Use the species column as the target
y = df_final['species']

# Apply SMOTE to your data
X_res, y_res = smote.fit_resample(X, y)


# Split the RESAMPLED data into training and test sets
X_train_res, X_test_res, y_train_res, y_test_res = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Create a random forest classifier
clf = RandomForestClassifier(n_estimators=400, 
                             min_samples_split=2, 
                             min_samples_leaf=1,  
                             max_depth=None, 
                             bootstrap=True,
                             random_state=42)

# Train the model on the RESAMPLED data
clf.fit(X_train_res, y_train_res)

# Make predictions on the test set
y_pred_res = clf.predict(X_test_res)

# Print a classification report
print(classification_report(y_test_res, y_pred_res))


                         precision    recall  f1-score   support

          American Crow       1.00      1.00      1.00         5
     American Goldfinch       0.57      1.00      0.73         8
         American Robin       0.88      0.88      0.88         8
       Baltimore Oriole       0.83      0.83      0.83         6
 Black-capped Chickadee       0.75      0.86      0.80         7
               Blue Jay       0.80      0.67      0.73         6
       Brewer's Sparrow       0.00      0.00      0.00         2
        Dark-eyed Junco       0.00      0.00      0.00         3
     Eastern Meadowlark       1.00      0.86      0.92         7
            Fox Sparrow       1.00      0.67      0.80         9
             House Wren       1.00      0.30      0.46        10
             Marsh Wren       0.60      0.60      0.60         5
      Nashville Warbler       0.50      0.67      0.57         3
          Red Crossbill       0.20      0.25      0.22         4
   Red-winged Blackbird 

In [98]:
from sklearn.neighbors import KNeighborsClassifier

# Create a k-NN classifier
knn = KNeighborsClassifier(n_neighbors=1, weights='uniform',p=1)

# Train the model on the RESAMPLED data
knn.fit(X_train_res, y_train_res)

# Make predictions on the test set
y_pred_knn = knn.predict(X_test_res)

# Print a classification report
print(classification_report(y_test_res, y_pred_knn))


                         precision    recall  f1-score   support

          American Crow       0.60      0.60      0.60         5
     American Goldfinch       0.50      0.25      0.33         8
         American Robin       0.50      0.62      0.56         8
       Baltimore Oriole       0.50      0.67      0.57         6
 Black-capped Chickadee       0.40      0.29      0.33         7
               Blue Jay       0.25      0.33      0.29         6
       Brewer's Sparrow       0.00      0.00      0.00         2
        Dark-eyed Junco       0.00      0.00      0.00         3
     Eastern Meadowlark       0.33      0.29      0.31         7
            Fox Sparrow       0.60      0.33      0.43         9
             House Wren       0.50      0.10      0.17        10
             Marsh Wren       0.33      0.40      0.36         5
      Nashville Warbler       0.14      0.33      0.20         3
          Red Crossbill       0.14      0.25      0.18         4
   Red-winged Blackbird 

In [97]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Define the parameter values that should be searched
k_range = list(range(1, 31))  # We'll tune the number of neighbors from 1 to 30
weight_options = ['uniform', 'distance']  # Two types of weights to be considered
p_values = [1, 2]  # 1 for Manhattan distance and 2 for Euclidean distance

# Create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(n_neighbors=k_range, weights=weight_options, p=p_values)

# Instantiate the grid
knn = KNeighborsClassifier()
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')

# Fit the grid with data
grid.fit(X_train_res, y_train_res)

# View the complete results (list of named tuples)
grid.cv_results_

# Examine the best model
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)


0.4137931034482759
{'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}
KNeighborsClassifier(n_neighbors=1, p=1)


In [99]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400],  # The number of trees in the forest.
    'max_depth': [None, 10, 20, 30, 40, 50],  # The maximum depth of the tree.
    'min_samples_split': [2, 5, 10],  # The minimum number of samples required to split an internal node.
    'min_samples_leaf': [1, 2, 4],  # The minimum number of samples required to be at a leaf node.
    'max_features': ['auto', 'sqrt'],  # The number of features to consider when looking for the best split.
    'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees.
}

# Create a base model
rf = RandomForestClassifier(random_state=42)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                           cv=3, n_jobs=-1, verbose=2, return_train_score=True)

# Fit the grid search to the data
grid_search.fit(X_train_res, y_train_res)

# Get the best parameters
best_params = grid_search.best_params_

print("Best parameters: ", best_params)

# Train and predict using the model with the best parameters
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test_res, y_test_res)

print("Grid accuracy: ", grid_accuracy)


Fitting 3 folds for each of 1080 candidates, totalling 3240 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.5s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.8s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   2.6s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   3.8s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   1.5s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   2.0s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   2.9s
[CV] END bootstrap=True, max_depth=

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   0.3s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   0.3s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   0.3s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.7s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=   1.3s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=   1.8s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=400; total time=   2.5s
[CV] END bootstrap=True, max_depth=10, max_features

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=50; total time=   0.4s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   0.9s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   0.8s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=   1.6s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time=   3.2s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   0.6s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   0.6s
[CV] END bootstrap=True, max_depth=10, max_featur

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   0.6s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   1.2s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=300; total time=   2.1s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=400; total time=   3.1s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time=   2.5s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time=   2.8s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   1.1s
[CV] END bootstrap=True, max_depth=10, max_f

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV] END bootstrap=True, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time=   3.1s
[CV] END bootstrap=True, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   1.3s
[CV] END bootstrap=True, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=   2.7s
[CV] END bootstrap=True, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.6s
[CV] END bootstrap=True, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   1.5s
[CV] END bootstrap=True, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=300; total time=   2.4s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.5s
[CV] END bootstrap=True, max_depth=30, max_fea

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.6s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   1.3s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   2.5s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   1.3s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time=   2.6s
[CV] END bootstrap=True, max_depth=30, max_fe

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   2.1s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   2.8s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   2.0s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   2.7s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   1.3s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   1.9s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END bootstrap=True, max_depth=30, max_feat

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   2.6s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   1.9s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   2.6s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   1.3s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=   1.9s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   0.3s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   0.3s
[CV] END bootstrap=True, max_depth=30, max_featu

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV] END bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   2.1s
[CV] END bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   2.8s
[CV] END bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   1.3s
[CV] END bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   1.9s
[CV] END bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   0.4s
[CV] END bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END bootstrap=True, max_depth=50, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   0.4s
[CV] END bootstrap=True, max_depth=50, max_featur

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.8s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   1.5s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   2.0s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   2.8s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   1.8s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   2.5s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   1.2s
[CV] END bootstrap=True, max_depth=50, max_fea

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=   1.1s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time=   1.8s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time=   2.8s
[CV] END bootstrap=False, max_

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(



[CV] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.7s
[CV] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   1.4s
[CV] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   2.8s
[CV] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.9s
[CV] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   1.6s
[CV] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   2.7s
[CV] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   2.8s
[CV] END bootstrap=False, max_depth=

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   3.5s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.7s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   1.7s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   2.3s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   0.4s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   0.3s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   0.6s
[CV] END bootstrap=False, max_depth=20, m

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=   1.4s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=   2.0s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=400; total time=   4.7s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=300; total time=   7.6s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=400; total time=   5.3s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=   2.3s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time=   3.2s
[CV] END bootstrap=False, max_depth=20,

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=   4.5s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   1.6s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=300; total time=   2.5s
[CV] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.5s
[CV] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.6s
[CV] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.6s
[CV] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.0s
[CV] END bootstrap=False, max_depth=30, ma

  warn(


Best parameters:  {'bootstrap': True, 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}


NameError: name 'evaluate' is not defined

In [102]:
from sklearn.preprocessing import StandardScaler

# Remove the species column from the features
X = df_final.drop('species', axis=1)

# Use the species column as the target
y = df_final['species']

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler to the features and transform
X_scaled = scaler.fit_transform(X)

# Apply SMOTE to your data
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_scaled, y)

# Split the RESAMPLED data into training and test sets
X_train_res, X_test_res, y_train_res, y_test_res = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Create a k-NN classifier
knn = KNeighborsClassifier(n_neighbors=1, weights='uniform', p=1)

# Train the model on the RESAMPLED data
knn.fit(X_train_res, y_train_res)

# Make predictions on the test set
y_pred_knn = knn.predict(X_test_res)

# Print a classification report
print(classification_report(y_test_res, y_pred_knn))


                         precision    recall  f1-score   support

          American Crow       0.83      1.00      0.91         5
     American Goldfinch       0.75      0.75      0.75         8
         American Robin       1.00      0.75      0.86         8
       Baltimore Oriole       1.00      0.67      0.80         6
 Black-capped Chickadee       0.62      0.71      0.67         7
               Blue Jay       0.71      0.83      0.77         6
       Brewer's Sparrow       0.00      0.00      0.00         2
        Dark-eyed Junco       0.25      0.33      0.29         3
     Eastern Meadowlark       0.88      1.00      0.93         7
            Fox Sparrow       0.88      0.78      0.82         9
             House Wren       0.75      0.30      0.43        10
             Marsh Wren       0.60      0.60      0.60         5
      Nashville Warbler       0.67      0.67      0.67         3
          Red Crossbill       0.50      0.75      0.60         4
   Red-winged Blackbird 

In [14]:
import pywt

features_list = []
spectral_centroid_list = []
spectral_bandwidth_list = []
spectral_rolloff_list = []
chroma_stft_list = []
wavelet_features_list = []  # Create a new list to hold the wavelet features
species_list = []

# Loop through all audio files and extract features
for file_path in audio_files[:400]:  # assuming you want to process first 100 files
    # Load the audio file
    audio_data, sampling_rate = librosa.load(file_path, sr=None)

    # Extract the MFCC features
    mfcc_features = np.mean(librosa.feature.mfcc(y=audio_data, sr=sampling_rate, n_mfcc=20).T,axis=0)

    # Extract the spectral centroid
    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio_data, sr=sampling_rate))

    # Extract the spectral bandwidth
    spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=audio_data, sr=sampling_rate))

    # Extract the spectral rolloff
    spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=audio_data, sr=sampling_rate))

    # Extract the chroma_stft features
    chroma_stft_features = np.mean(librosa.feature.chroma_stft(y=audio_data, sr=sampling_rate), axis=1)

    # Perform a Continuous Wavelet Transform (CWT) with a Ricker wavelet (also known as the "Mexican hat wavelet")
    wavelet = 'mexh'
    coeffs, freqs = pywt.cwt(audio_data, np.arange(1, 31), wavelet)
    wavelet_features = np.mean(coeffs, axis=1)  # Extract some feature from the CWT, e.g., the mean of each scale

    # Append the features to the lists
    features_list.append(mfcc_features)
    spectral_centroid_list.append(spectral_centroid)
    spectral_bandwidth_list.append(spectral_bandwidth)
    spectral_rolloff_list.append(spectral_rolloff)
    chroma_stft_list.append(chroma_stft_features)
    wavelet_features_list.append(wavelet_features)  # Append the wavelet features

    # Extract the species name from the file path and append to species list
    file_name = os.path.basename(file_path)
    species_name = re.search(r"(.*) \d{2}", file_name).group(1)
    species_list.append(species_name)

# Create a DataFrame from the list of features and species list
df_mfcc = pd.DataFrame(features_list)
df_spectral_centroid = pd.DataFrame(spectral_centroid_list, columns=['spectral_centroid'])
df_spectral_bandwidth = pd.DataFrame(spectral_bandwidth_list, columns=['spectral_bandwidth'])
df_spectral_rolloff = pd.DataFrame(spectral_rolloff_list, columns=['spectral_rolloff'])
df_chroma_stft = pd.DataFrame(chroma_stft_list)
df_wavelet_features = pd.DataFrame(wavelet_features_list)  # Create a DataFrame for the wavelet features
df_species = pd.DataFrame(species_list, columns=['species'])


In [49]:
pca = PCA(n_components=6) # Set the number of components you want to keep
df_chroma_pca = pca.fit_transform(df_chroma_stft)

pca = PCA(n_components=11) # Set the number of components you want to keep
df_mfcc_pca = pca.fit_transform(df_mfcc)

pca = PCA(n_components= 10) # Set the number of components you want to keep
df_wavelet_pca = pca.fit_transform(df_wavelet_features)

#covert to data frame
df_mfcc_pca = pd.DataFrame(df_mfcc_pca, columns=['P'+str(i+1) for i in range(df_mfcc_pca.shape[1])])
df_chroma_pca = pd.DataFrame(df_chroma_pca,columns=['C'+str(i+1) for i in range(df_chroma_pca.shape[1])])
df_wavelet_pca = pd.DataFrame(df_wavelet_pca,columns=['W'+str(i+1) for i in range(df_wavelet_pca.shape[1])])

# Concatenate all dataframes
df_final = pd.concat([df_mfcc_pca,df_wavelet_pca,df_chroma_pca, df_spectral_centroid, df_spectral_bandwidth, df_spectral_rolloff,df_wavelet_pca, df_species], axis=1)

# Print first 4 lines of the final DataFrame
print(df_final.head(2))

ValueError: Input X contains NaN.
PCA does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from imblearn.over_sampling import SMOTE

# Define the SMOTE instance
smote = SMOTE(random_state=42)



# Remove the species column from the features
X = df_selected.drop('species', axis=1)

# Use the species column as the target
y = df_selected['species']

# Apply SMOTE to your data
X_res, y_res = smote.fit_resample(X, y)


# Split the RESAMPLED data into training and test sets
X_train_res, X_test_res, y_train_res, y_test_res = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Create a random forest classifier
clf = RandomForestClassifier(n_estimators=400, 
                             min_samples_split=2, 
                             min_samples_leaf=1,  
                             max_depth=None, 
                             bootstrap=True,
                             random_state=42)

# Train the model on the RESAMPLED data
clf.fit(X_train_res, y_train_res)

# Make predictions on the test set
y_pred_res = clf.predict(X_test_res)

# Print a classification report
print(classification_report(y_test_res, y_pred_res))


                         precision    recall  f1-score   support

          American Crow       0.62      1.00      0.77         5
     American Goldfinch       0.45      0.62      0.53         8
         American Robin       0.83      0.62      0.71         8
       Baltimore Oriole       0.57      0.67      0.62         6
 Black-capped Chickadee       0.83      0.71      0.77         7
               Blue Jay       0.33      0.50      0.40         6
       Brewer's Sparrow       0.50      1.00      0.67         2
        Dark-eyed Junco       0.00      0.00      0.00         3
     Eastern Meadowlark       0.75      0.86      0.80         7
            Fox Sparrow       0.75      0.33      0.46         9
             House Wren       0.50      0.10      0.17        10
             Marsh Wren       0.60      0.60      0.60         5
      Nashville Warbler       0.33      0.33      0.33         3
          Red Crossbill       0.50      0.75      0.60         4
   Red-winged Blackbird 

In [44]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

# Assuming X and y are your data
X = df_final.drop('species', axis=1)
y = df_final['species']

# Apply SelectKBest class to extract top 10 best features
bestfeatures = SelectKBest(score_func=mutual_info_classif, k=10)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

# Concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  # Naming the dataframe columns

# Print 10 best features
print(featureScores.nlargest(10,'Score'))  



ValueError: Input X contains NaN.
SelectKBest does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [40]:
# Assume pca is your fitted PCA model, and X_pca is your transformed data
from sklearn.decomposition import PCA

pca = PCA(n_components=11) # Set the number of components you want to keep
df_mfcc_pca = pca.fit_transform(df_mfcc_pca)

# Convert the PCA-transformed data into a DataFrame
df_mfcc_pca = pd.DataFrame(data=df_mfcc_pca, columns=['P'+str(i+1) for i in range(df_mfcc_pca.shape[1])])

# Now, you can isolate the columns you're interested in
selected_mfcc_df = df_mfcc_pca[['P1','P2', 'P4', 'P5','P7']]

pca = PCA(n_components= 10) # Set the number of components you want to keep
df_wavelet_pca = pca.fit_transform(df_wavelet_features)

# Convert the PCA-transformed data into a DataFrame
df_wavelet_pca = pd.DataFrame(data=df_wavelet_pca, columns=['W'+str(i+1) for i in range(df_wavelet_pca.shape[1])])

# Now, you can isolate the columns you're interested in
selected_wavelet_df = df_wavelet_pca[['W6','W9']]

# Concatenate all dataframes
df_selected = pd.concat([selected_wavelet_df,selected_mfcc_df, df_spectral_centroid, df_species], axis=1)

# Print first 4 lines of the final DataFrame
print(df_selected.head(5))


             W6            W9          P1         P2         P4         P5   
0 -2.014480e-12  2.474424e-13  101.696449 -25.357132   7.781139 -23.723890  \
1  6.997543e-14 -8.800660e-13   39.061707  -2.491925  -6.135122 -17.833321   
2 -6.213174e-13 -4.780710e-13  -92.350395 -25.243011 -27.908333  -0.224669   
3  8.352304e-14 -1.351431e-13 -149.870239  21.877340   2.627777  21.334093   
4  1.979608e-13 -4.560778e-13   76.680740 -15.968537  17.597319 -35.518776   

          P7  spectral_centroid          species  
0  -8.227090        7589.398711  Dark-eyed Junco  
1  11.028903        6907.514585  Dark-eyed Junco  
2   5.509142        6749.148852  Dark-eyed Junco  
3   2.998488        7441.418947  Dark-eyed Junco  
4  -7.280901        6868.312117  Dark-eyed Junco  


In [50]:
df_wavelet_features

Unnamed: 0,W1,W2,W3,W4,W5,W6,W7,W8,W9,W10,...,W21,W22,W23,W24,W25,W26,W27,W28,W29,W30
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,,,,,,,,,,,...,,,,,,,,,,
396,,,,,,,,,,,...,,,,,,,,,,
397,,,,,,,,,,,...,,,,,,,,,,
398,,,,,,,,,,,...,,,,,,,,,,
