In [None]:
# OpenSoundscape imports
from opensoundscape import Audio, Spectrogram
from opensoundscape.annotations import BoxedAnnotations

# General-purpose packages
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path


from matplotlib import pyplot as plt
# Default figure size (width, height) for every matplotlib figure in this notebook.
plt.rcParams['figure.figsize']=[15,5] #for big visuals
# IPython magic: render inline figures in the 'retina' (high-resolution) format.
%config InlineBackend.figure_format = 'retina'


Loading Raven Files

In [None]:
from glob import glob
import os
import re

def rename_to_timestamp(folder, extension, pattern=r"(\d{8}_\d{6})"):
    """Rename every ``*<extension>`` file in ``folder`` to ``<timestamp><extension>``.

    The timestamp is the first group captured by ``pattern`` in the file name
    (by default eight digits, an underscore, then six digits).

    A file is left untouched, with a message printed, when:
      * its name does not contain the pattern, or
      * another file already occupies the target name. ``os.rename`` would
        otherwise silently replace that file on POSIX systems, losing data.

    Files that already carry the target name are skipped silently, so the
    function can be re-run safely.

    Args:
        folder: directory that holds the files.
        extension: file extension including the dot, e.g. ``".txt"``.
        pattern: regular expression with one capture group for the new stem.

    Returns:
        Sorted list of paths in ``folder`` with ``extension`` after renaming.
    """
    search_glob = os.path.join(folder, "*" + extension)

    # Sorted so that, on a name collision, the file that wins is deterministic.
    for old_file_path in sorted(glob(search_glob)):
        old_file_name = os.path.basename(old_file_path)

        match = re.search(pattern, old_file_name)
        if not match:
            print(f"No match found in filename: {old_file_name}")
            continue

        new_file_path = os.path.join(folder, match.group(1) + extension)

        if new_file_path == old_file_path:
            # Already renamed on a previous run.
            continue

        if os.path.exists(new_file_path) and not os.path.samefile(old_file_path, new_file_path):
            print(f"Skipped (target already exists): {old_file_path} -> {new_file_path}")
            continue

        os.rename(old_file_path, new_file_path)
        print(f"Renamed: {old_file_path} to {new_file_path}")

    return sorted(glob(search_glob))


# Assuming your current working directory is the base path
base_path = os.getcwd()

# Folder name
folder_name = "ravenSelectionTables"

# Absolute path of the folder holding the Raven selection tables
relative_folder_path = os.path.join(base_path, folder_name)

# Selection tables as they are named before renaming
selections = sorted(glob(os.path.join(relative_folder_path, "*.txt")))

# Regular expression pattern to extract the date and time part
pattern = r"(\d{8}_\d{6})" #NOTE THIS MAY NOT WORK FOR ALL RECORDINGS IN OTHER FILES

# Rename the tables so only the date and time part remains, then list the result
updated_selections = rename_to_timestamp(relative_folder_path, ".txt", pattern)
print(updated_selections)




Rename the audio files so that their names match the Raven selection table (.txt) file names

In [None]:
# Create a list of audio files, one corresponding to each Raven file.
# The recordings live in the ravenSelectionTables folder next to the selection
# tables; after renaming, every recording must have EXACTLY the same base name
# as its .txt selection table.
audio_files = sorted(glob(os.path.join(relative_folder_path, "*.wav")))

# Regular expression pattern to extract the date and time part
pattern = r"(\d{8}_\d{6})" #NOTE THIS MAY NOT WORK FOR ALL RECORDINGS IN OTHER FILES

# Iterate through files and rename to include only the date and time part
for old_file_path in audio_files:
    old_file_name = os.path.basename(old_file_path)

    # Use regex to find the date and time part
    match = re.search(pattern, old_file_name)
    if not match:
        print(f"No match found in filename: {old_file_name}")
        continue

    new_file_path = os.path.join(relative_folder_path, match.group(1) + ".wav")

    if new_file_path == old_file_path:
        # Already renamed on a previous run of this cell.
        continue

    # os.rename silently replaces an existing target on POSIX systems, so two
    # recordings sharing a timestamp would destroy one another. Skip instead.
    if os.path.exists(new_file_path) and not os.path.samefile(old_file_path, new_file_path):
        print(f"Skipped (target already exists): {old_file_path} -> {new_file_path}")
        continue

    os.rename(old_file_path, new_file_path)
    print(f"Renamed: {old_file_path} to {new_file_path}")

# List the audio files as they are named now
updated_audio_files = sorted(glob(os.path.join(relative_folder_path, "*.wav")))

# Later cells pass `audio_files` to BoxedAnnotations.from_raven_files, so it has
# to hold the paths that exist after renaming, not the pre-rename names.
audio_files = updated_audio_files

print(updated_audio_files)


In [None]:
import os

def find_matching_pairs(folder_path):
    """Pair every ``.wav`` file in ``folder_path`` with the ``.txt`` file of the same stem.

    Only names ending in lowercase ``.wav`` / ``.txt`` are considered. Two files
    pair up when their names are identical apart from that final extension.

    Warnings are printed for:
      * files whose counterpart is missing (``.wav`` files first, then ``.txt``);
      * names that differ only by letter case, which collide on
        case-insensitive file systems.

    Args:
        folder_path: directory to scan (not recursive).

    Returns:
        List of ``(wav_name, txt_name)`` tuples of bare file names, sorted by stem.
    """
    suffix_length = len(".wav")  # ".txt" has the same length

    all_files = sorted(
        file for file in os.listdir(folder_path) if file.endswith((".wav", ".txt"))
    )

    # Strip only the trailing extension; str.replace would also rewrite a
    # ".wav"/".txt" that appears earlier in the name.
    wav_by_stem = {file[:-suffix_length]: file for file in all_files if file.endswith(".wav")}
    txt_by_stem = {file[:-suffix_length]: file for file in all_files if file.endswith(".txt")}

    shared_stems = sorted(wav_by_stem.keys() & txt_by_stem.keys())
    matching_pairs = [(wav_by_stem[stem], txt_by_stem[stem]) for stem in shared_stems]

    missing_pairs = [wav_by_stem[stem] for stem in sorted(wav_by_stem.keys() - txt_by_stem.keys())]
    missing_pairs += [txt_by_stem[stem] for stem in sorted(txt_by_stem.keys() - wav_by_stem.keys())]

    # A directory listing never repeats an exact name, so look for names that
    # are equal once letter case is ignored.
    names_by_folded = {}
    for file in all_files:
        names_by_folded.setdefault(file.lower(), []).append(file)
    duplicate_files = [
        file for group in names_by_folded.values() if len(group) > 1 for file in group
    ]

    # Print warnings
    if missing_pairs:
        print("Warning: Some files do not have corresponding pairs:")
        for file in missing_pairs:
            print(file)

    if duplicate_files:
        print("Warning: Some files are duplicated:")
        for file in duplicate_files:
            print(file)

    return matching_pairs

# Replace 'folder_path' with the path to your ravenSelectionTables folder
folder_path = './ravenSelectionTables'
# Collect the (wav, txt) file-name pairs; the call prints a warning for any
# file without a counterpart.
matching_pairs = find_matching_pairs(folder_path)
# Uncomment to inspect the pairs:
#for pair in matching_pairs:
    #print(pair)


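It seems like the column "annotation" contains a number, while at the end there is an extra column "Annotation" that isn't supposed to be there. The next cell renames the numeric column to "length" and renames "Annotation" to "annotation".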

In [None]:
from opensoundscape import BoxedAnnotations

# from_raven_files pairs the two lists by position, so build both from the
# verified (wav, txt) pairs: entry i of each list then refers to the same
# recording, and files without a counterpart are left out.
# The names are joined with relative_folder_path (the same directory that
# matching_pairs was scanned from, as long as the working directory has not
# changed) so the audio paths are spelled exactly like the ones globbed from
# relative_folder_path when the clip labels are created.
paired_files = sorted(matching_pairs)
raven_paths = [os.path.join(relative_folder_path, txt_name) for _, txt_name in paired_files]
audio_paths = [os.path.join(relative_folder_path, wav_name) for wav_name, _ in paired_files]

all_annotations = BoxedAnnotations.from_raven_files(
    raven_paths, audio_paths)

# The numeric column arrives as 'annotation' and the label text as 'Annotation':
# move the numbers to 'length' and make the label text the 'annotation' column.
all_annotations.df.rename(
    columns={'annotation': 'length', 'Annotation': 'annotation'},
    inplace=True,
)

all_annotations.df.head(10)




In [None]:
# Number of annotations per label
all_annotations.df['annotation'].value_counts()

In [None]:
# Keep only annotations that say "NOBO".
# This list is passed as `class_subset` when the clip labels are created and as
# `classes` when the CNN is constructed.
class_list = ['NOBO']

Now it is time to one-hot encode the labels for the model


In [None]:
# Create labels for fixed-duration (1 second) clips that overlap by 0.5 seconds.

# The notebook does not run from inside the ravenSelectionTables folder, so the
# recordings are referenced through their full path.
# Sorted: the row order of `labels` follows this list, and the seeded
# train/validation split later in the notebook is only reproducible if that
# order is the same on every run and machine.
audio_location = sorted(glob(os.path.join(relative_folder_path, "*.wav")))

#print(audio_location)

# One row per clip, one column per class in class_list
labels = all_annotations.one_hot_clip_labels(
  clip_duration=1,
  clip_overlap=0.5,
  min_label_overlap=0.1,
  class_subset=class_list,
  audio_files=audio_location
)

# Set the option to display all rows
# NOTE: this is a global pandas setting; it also affects every later cell.
pd.set_option('display.max_rows', None)

labels.head()



In [None]:
# `labels` is indexed by (file, start_time, end_time). Keep the index in the
# CSV: without it the file holds only the label columns and a row can no longer
# be traced back to its clip.
labels.to_csv("labels.csv")

In [None]:
# Names of the label columns
columns_list = list(labels.columns)
print(columns_list)

Training the model

In [None]:
from sklearn.model_selection import train_test_split
from opensoundscape import CNN
from opensoundscape.ml.cnn import load_model

import shutup; shutup.please()

# Split by recording instead of by clip, so that (overlapping) clips of one
# file never end up on both sides of the split.

# Reset the index to convert "file" index into a column
labels_reset_index = labels.reset_index()

# Split the DataFrame based on unique values in the 'file' column
train_files, test_files = train_test_split(labels_reset_index['file'].unique(), test_size=0.2, random_state=1)

# Filter the labels DataFrame based on the split file names
train_df = labels_reset_index[labels_reset_index['file'].isin(train_files)]
test_df = labels_reset_index[labels_reset_index['file'].isin(test_files)]

# Restore the (file, start_time, end_time) index
train_df = train_df.set_index(['file', 'start_time', 'end_time'])
test_df = test_df.set_index(['file', 'start_time', 'end_time'])

# Define constants
SAMPLE_DURATION = 1.0  # seconds; has to be the same as the clip duration of the labels
NUM_EPOCHS = 10
BATCH_SIZE = 32  # slow; a larger batch (e.g. 256) may be faster if memory allows
SAVE_INTERVAL = 1000

# Specify paths
model_path = './bird_training/bobwhite/best.model'
save_path = './bird_training/bobwhite'

if os.path.exists(model_path):
    # A trained model is already on disk: reuse it instead of training again.
    model = load_model(model_path)
else:
    model = CNN('resnet18', classes=class_list, sample_duration=SAMPLE_DURATION)

    model.train(
        train_df=train_df,
        validation_df=test_df,  # the held-out files serve as validation set
        save_path=save_path,
        epochs=NUM_EPOCHS,
        batch_size=BATCH_SIZE,
        save_interval=SAVE_INTERVAL,
        log_interval=720,
        num_workers=4,  # number of worker processes; 0 means only the root process
    )

    # OpenSoundscape's train() stores the best-scoring epoch as "best.model"
    # inside save_path, which is exactly model_path. Saving the in-memory model
    # unconditionally would replace that checkpoint with the final-epoch
    # weights, so only save when no checkpoint was written.
    if not os.path.exists(model_path):
        model.save(model_path)
    model = load_model(model_path)

After training

In [None]:
from glob import glob

def already_processed_files(csv_folder):
    """Collect the values found in the first column of every CSV in ``csv_folder``.

    The prediction CSVs written by this notebook store the recording path in
    their first column, so the returned set tells which recordings already have
    results.

    Args:
        csv_folder: directory holding the prediction CSVs. A missing directory
            is treated as "nothing processed yet".

    Returns:
        Set of the distinct first-column values across all ``*.csv`` files.
    """
    processed = set()
    if not os.path.isdir(csv_folder):
        return processed

    for csv_name in sorted(os.listdir(csv_folder)):
        # Ignore anything that is not a CSV (e.g. hidden system files).
        if not csv_name.lower().endswith(".csv"):
            continue
        csv_file_path = os.path.join(csv_folder, csv_name)
        # Only the first column is needed; it has one row per clip.
        first_column = pd.read_csv(csv_file_path, usecols=[0]).iloc[:, 0]
        processed.update(first_column.tolist())

    return processed


# Search for .wav and .WAV files in all subdirectories of the directory
forest_recordings = glob('./analysis_recording/2022Forest/**/*.[wW][aA][vV]', recursive=True)
grasslandfall_recordings = glob('./analysis_recording/2022GrasslandFall/**/*.[wW][aA][vV]', recursive=True)
grasslandsummer_recordings = glob('./analysis_recording/2022GrasslandSummer/**/*.[wW][aA][vV]', recursive=True)

everything_list = [forest_recordings, grasslandfall_recordings, grasslandsummer_recordings]

# Path to the csv_output folder
csv_output_folder = './csv_output'

# Drop every recording that already has predictions in csv_output
processed_files = already_processed_files(csv_output_folder)
for recording_list in everything_list:
    # Slice assignment filters in place, so forest_recordings etc. remain the
    # very same list objects that everything_list refers to.
    recording_list[:] = [path for path in recording_list if path not in processed_files]

print(everything_list)

In [None]:
def predict(file, name, threshold=0.99, class_name='NOBO'):
    """Score recordings with the global ``model`` and write the result to a CSV.

    The CSV has one row per clip with the columns produced by the model
    (clip index levels first, then the class scores) plus a ``Presence`` column.
    ``start_time`` and ``end_time`` are written as ``HH:MM:SS``.

    Args:
        file: list of audio file paths handed to ``model.predict``.
        name: path of the CSV file to write; its folder is created if missing.
        threshold: a clip is marked ``'Present'`` when its score is strictly
            greater than this value.
        class_name: name of the score column the threshold is applied to.

    Returns:
        None. The result is written to ``name``.
    """
    prediction_scores_df = model.predict(file, activation_layer='sigmoid')

    # Turn the index levels into ordinary columns. This gives the same table
    # that writing the frame to CSV and reading it back produced, without the
    # extra disk round trip.
    results = prediction_scores_df.reset_index()

    # Add a new column 'Presence' based on the score
    results['Presence'] = (
        results[class_name].astype(float).gt(threshold).map({True: 'Present', False: ''})
    )

    # Convert the seconds columns to an "hour:minute:second" format
    for column in ('start_time', 'end_time'):
        results[column] = pd.to_datetime(results[column], unit='s').dt.strftime('%H:%M:%S')

    # to_csv does not create missing folders
    output_folder = os.path.dirname(name)
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    results.to_csv(name, index=False)

In [None]:

# import Audio and Spectrogram classes from OpenSoundscape
from opensoundscape.audio import Audio
from opensoundscape.spectrogram import Spectrogram

# CSV file-name prefix for each entry of everything_list, in the same order.
output_prefixes = [
    "forest_recordings",
    "grasslandfall_recordings",
    "grasslandsummer_recordings",
]

# Make sure the output folder exists before the first CSV is written.
os.makedirs("csv_output", exist_ok=True)

for prefix, recording_list in zip(output_prefixes, everything_list):
    # Each recording is predicted on its own and gets its own CSV.
    for file in recording_list:
        # Extract the base name of the file (excluding path and extension)
        recording_name = os.path.splitext(os.path.basename(file))[0]

        # Construct the CSV file name using the recording name
        # NOTE: recordings with the same base name in different subfolders of
        # one group map to the same CSV name.
        name = f"csv_output/{prefix}_{recording_name}.csv"

        print(f"In progress: {recording_name}")

        predict([file], name)
        
        