In [99]:
# OpenSoundscape imports
from opensoundscape import Audio, Spectrogram
from opensoundscape.annotations import BoxedAnnotations

# General-purpose packages
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path


from matplotlib import pyplot as plt
plt.rcParams['figure.figsize']=[15,5] #for big visuals
%config InlineBackend.figure_format = 'retina'


Loading Raven Files

In [100]:
from glob import glob
import os
import re

# Assuming your current working directory is the base path
base_path = os.getcwd()

# Folder name
folder_name = "ravenSelectionTables"

# Construct the relative path
relative_folder_path = os.path.join(base_path, folder_name)

# Use glob to find all .txt files directly in the folder
selections = glob(os.path.join(relative_folder_path, "*.txt"))
# Regular expression pattern to extract the date and time part
pattern = r"(\d{8}_\d{6})" #NOTE THIS MAY NOT WORK FOR ALL RECORDINGS IN OTHER FILES

# Iterate through files and rename to include only the date and time part
for old_file_path in selections:
    old_file_name = os.path.basename(old_file_path)
    
    # Use regex to find the date and time part
    match = re.search(pattern, old_file_name)
    
    if match:
        new_file_name = match.group(1)
        
        # Construct the new file path
        new_file_path = os.path.join(relative_folder_path, new_file_name + ".txt")
        
        # Rename the file
        os.rename(old_file_path, new_file_path)
        print(f"Renamed: {old_file_path} to {new_file_path}")
    else:
        print(f"No match found in filename: {old_file_name}")

# Optional: Print the updated list of files in ravenSelectionTables
updated_selections = glob(os.path.join(relative_folder_path, "*.txt"))
print(updated_selections)




Renamed: c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221022_060000.txt to c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221022_060000.txt
Renamed: c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221024_060000.txt to c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221024_060000.txt
Renamed: c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221025_060000.txt to c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221025_060000.txt
Renamed: c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221027_060000.txt to c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221027_060000.txt
Renamed: c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221029_060000.txt to c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221029_060000.txt
Renamed: c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221030_060000.txt to c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221030_060000.txt
Renamed: c:\Users\jonat\Docu

Rename audio files to match raven txt selection file names

In [101]:
# create a list of audio files, one corresponding to each Raven file
'''so this is going through the ravenSelectionTables folder and finidng the audio files. 
For now, we manually make sure the audio name is EXACT SAME as the txt file name.'''
audio_files = glob(os.path.join(relative_folder_path, "*.wav"))

# Regular expression pattern to extract the date and time part
pattern = r"(\d{8}_\d{6})" #NOTE THIS MAY NOT WORK FOR ALL RECORDINGS IN OTHER FILES

# Iterate through files and rename to include only the date and time part
for old_file_path in audio_files:
    old_file_name = os.path.basename(old_file_path)
    
    # Use regex to find the date and time part
    match = re.search(pattern, old_file_name)
    
    if match:
        new_file_name = match.group(1)
        
        # Construct the new file path
        new_file_path = os.path.join(relative_folder_path, new_file_name + ".wav")
        
        # Rename the file
        os.rename(old_file_path, new_file_path)
        print(f"Renamed: {old_file_path} to {new_file_path}")
    else:
        print(f"No match found in filename: {old_file_name}")

# Optional: Print the updated list of files in ravenSelectionTables
updated_audio_files = glob(os.path.join(relative_folder_path, "*.wav"))
print(updated_audio_files)


Renamed: c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221022_060000.wav to c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221022_060000.wav
Renamed: c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221024_060000.wav to c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221024_060000.wav
Renamed: c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221025_060000.wav to c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221025_060000.wav
Renamed: c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221027_060000.wav to c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221027_060000.wav
Renamed: c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221029_060000.wav to c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221029_060000.wav
Renamed: c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221030_060000.wav to c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221030_060000.wav
Renamed: c:\Users\jonat\Docu

In [102]:
import os

def find_matching_pairs(folder_path):
    matching_pairs = []
    missing_pairs = []
    duplicate_files = []

    all_files = [file for file in os.listdir(folder_path) if file.endswith((".wav", ".txt"))]
    wav_files = set(file for file in all_files if file.endswith(".wav"))
    txt_files = set(file for file in all_files if file.endswith(".txt"))

    # Check for matching pairs
    for wav_file in wav_files:
        txt_file = wav_file.replace(".wav", ".txt")
        if txt_file in txt_files:
            matching_pairs.append((wav_file, txt_file))
        else:
            missing_pairs.append(wav_file)

    for txt_file in txt_files:
        wav_file = txt_file.replace(".txt", ".wav")
        if wav_file not in wav_files:
            missing_pairs.append(txt_file)

    # Check for duplicates
    file_count = {}
    for file in all_files:
        if file in file_count:
            file_count[file] += 1
        else:
            file_count[file] = 1

    for file, count in file_count.items():
        if count > 1:
            duplicate_files.append(file)

    # Print warnings
    if missing_pairs:
        print("Warning: Some files do not have corresponding pairs:")
        for file in missing_pairs:
            print(file)

    if duplicate_files:
        print("Warning: Some files are duplicated:")
        for file in duplicate_files:
            print(file)

    return matching_pairs

# Replace 'folder_path' with the path to your ravenSelectiontable folder
folder_path = './ravenSelectionTables'
matching_pairs = find_matching_pairs(folder_path)
#for pair in matching_pairs:
    #print(pair)


It seems like the column "annotations" is a number, while at the end there is a column "Annotations" thats not supposed to be there

In [103]:
from opensoundscape import BoxedAnnotations

all_annotations = BoxedAnnotations.from_raven_files(
    updated_selections,audio_files)

#RENAME DUPLICATE ANNOTATION TEST
all_annotations.df.rename(columns={'annotation': 'length'}, inplace=True)
all_annotations.df.rename(columns={'Annotation': 'annotation'}, inplace=True)

all_annotations.df.head(10)




Unnamed: 0,audio_file,annotation_file,length,start_time,end_time,low_f,high_f,Delta Freq (Hz),Selection,Avg Power Density (dB FS/Hz),View,Channel,annotation
0,c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221022_060000.wav,c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221022_060000.txt,0.4319,3.965968,4.397906,875.324,2379.786,1504.462,1,-69.59,Spectrogram 1,1,NOBO?
1,c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221022_060000.wav,c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221022_060000.txt,0.3829,6.253273,6.636125,711.201,2407.14,1695.939,2,-69.65,Spectrogram 1,1,NOBO?
2,c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221022_060000.wav,c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221022_060000.txt,0.4319,7.794502,8.226439,711.201,2407.14,1695.939,3,-71.17,Spectrogram 1,1,NOBO?
3,c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221022_060000.wav,c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221022_060000.txt,0.4319,9.090314,9.522251,656.493,2434.494,1778.001,4,-69.66,Spectrogram 1,1,NOBO?
4,c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221022_060000.wav,c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221022_060000.txt,0.3534,173.404513,173.757917,1340.34,2434.494,1094.154,5,-75.79,Spectrogram 1,1,NOBO?
5,c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221022_060000.wav,c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221022_060000.txt,0.5497,203.125508,203.675246,1312.986,2325.079,1012.093,6,-75.56,Spectrogram 1,1,NOBO?
6,c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221022_060000.wav,c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221022_060000.txt,0.9762,228.023939,229.000169,1340.34,2543.91,1203.57,7,-76.43,Spectrogram 1,1,NOBO?
7,c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221022_060000.wav,c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221022_060000.txt,1.0652,238.451164,239.5164,957.385,2352.433,1395.048,8,-73.78,Spectrogram 1,1,NOBO?
8,c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221022_060000.wav,c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221022_060000.txt,0.9087,381.698337,382.607028,1121.509,2243.017,1121.508,9,-69.67,Spectrogram 1,1,NOBO?
9,c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221022_060000.wav,c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221022_060000.txt,1.0671,389.420798,390.487866,1340.34,2379.786,1039.446,10,-74.7,Spectrogram 1,1,NOBO?


In [104]:
all_annotations.df.annotation.value_counts() 

annotation
NOBO     636
NOBOF    150
NOBO?     21
NOOB       1
Name: count, dtype: int64

In [105]:
#keep only annotations that say "NOBO"
class_list = ['NOBO']

Now time for one hot encoding for the model


In [106]:
# create labels for fixed-duration (2 second) clips 

'''since we aren't in the ravenselectiontable directory since the beginning, 
I ASSUME that i need to reference the audio file path manually under audio_files parameter'''

audio_location = glob(os.path.join(relative_folder_path, "*.wav"))

#print(audio_location)

#change labels
labels = all_annotations.one_hot_clip_labels(
  clip_duration=1,
  clip_overlap=0.5,
  min_label_overlap=0.1,
  class_subset=class_list, #maybe we can add NOBO?
  audio_files= audio_location
)

# Set the option to display all rows
pd.set_option('display.max_rows', None)

labels.head()



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,NOBO
file,start_time,end_time,Unnamed: 3_level_1
c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221027_060000.wav,0.0,1.0,0.0
c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221027_060000.wav,0.5,1.5,0.0
c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221027_060000.wav,1.0,2.0,0.0
c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221027_060000.wav,1.5,2.5,0.0
c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221027_060000.wav,2.0,3.0,0.0


In [107]:
labels.to_csv("labels.csv")

In [108]:
# Count the occurrences of each value in the 'NOBO' column
value_counts = labels['NOBO'].value_counts()

# Get the count of 1s and 0s
count_ones = value_counts.get(1, 0)  # Get the count of 1s, default to 0 if not found
count_zeros = value_counts.get(0, 0)  # Get the count of 0s, default to 0 if not found

print("Number of 1s in 'NOBO':", count_ones)
print("Number of 0s in 'NOBO':", count_zeros)

Number of 1s in 'NOBO': 1066
Number of 0s in 'NOBO': 13321


Training the model

In [109]:
from sklearn.model_selection import train_test_split
import shutup; shutup.please()


# Calculate 20% of unique file names
num_files_to_remove = int(0.1 * labels.index.get_level_values('file').nunique())

# Get a random sample of file names to remove
files_to_remove = labels.index.get_level_values('file').unique().to_series().sample(n=num_files_to_remove, random_state=5)

# Print the filenames that are being removed
print("Files to be removed:")
print(files_to_remove)

# Create a DataFrame containing the files to remove
testing_df = labels[labels.index.get_level_values('file').isin(files_to_remove)]

# Remove the files to be removed from the original DataFrame
labels_filtered = labels[~labels.index.get_level_values('file').isin(files_to_remove)]

# Split the DataFrame based on unique values in the 'file' column
train_files, validation_files = train_test_split(labels_filtered.index.get_level_values('file').unique(), test_size=0.2, random_state=1)

# Filter the labels DataFrame based on the split file names
train_df = labels_filtered[labels_filtered.index.get_level_values('file').isin(train_files)]
validation_df = labels_filtered[labels_filtered.index.get_level_values('file').isin(validation_files)]


Files to be removed:
file
c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221106_060000.wav    c:\Users\jonat\Documents\bobwhite\ravenSelectionTables\20221106_060000.wav
Name: file, dtype: object


In [110]:
print(len(train_df))
print(len(validation_df))

9591
3597


In [111]:

from opensoundscape import CNN
from opensoundscape.ml.cnn import load_model

import shutup; shutup.please()

# just make two different csv of train and validation
# Define constants
SAMPLE_DURATION = 1.0 #what is this; has to be same as clip duration, so 1
NUM_EPOCHS = 5
BATCH_SIZE = 64 #too SLOW, maybe 256, 
SAVE_INTERVAL = 1000

# Specify paths
model_path = './bird_training/bobwhite/best.model'
save_path = './bird_training/bobwhite'

if os.path.exists(model_path):
    model = load_model(model_path)
else:
    model = CNN('resnet34', classes=class_list, sample_duration=SAMPLE_DURATION) #resnet 18 previously

    model.train(
        train_df=train_df,
        validation_df=validation_df,
        save_path=save_path,
        epochs=NUM_EPOCHS,
        batch_size=BATCH_SIZE,
        save_interval=SAVE_INTERVAL,
        log_interval=720,
        num_workers=4, #specify 4 if you have 4 CPU processes, eg; 0 means only the root process i have 8
    )

    model.save(model_path)
    model = load_model(model_path)


Training Epoch 0


  0%|          | 0/150 [00:00<?, ?it/s]

Epoch: 0 [batch 0/150, 0.00%] 
	DistLoss: 0.895
Metrics:
Metrics:
	MAP: 0.156

Validation.


  0%|          | 0/57 [00:00<?, ?it/s]

In [None]:
validation_df.columns

In [None]:
validation_df.to_csv("test.csv")
len(validation_df)
train_df.to_csv("train.csv")


In [None]:
len(train_df)


In [None]:
len(testing_df)

After training

In [None]:
from glob import glob

# Search for .wav and .WAV files in all subdirectories of the directory
forest_recordings = glob('./analysis_recording/2022Forest/**/*.[wW][aA][vV]', recursive=True)
grasslandfall_recordings = glob('./analysis_recording/2022GrasslandFall/**/*.[wW][aA][vV]', recursive=True)
grasslandsummer_recordings = glob('./analysis_recording/2022GrasslandSummer/**/*.[wW][aA][vV]', recursive=True)

everything_list = [forest_recordings, grasslandfall_recordings, grasslandsummer_recordings]

#maybe add code here that says if file name is in csv_output, remove from this list
# Path to the csv_output folder
csv_output_folder = './csv_output'

# Iterate over files in the csv_output folder
for file in os.listdir(csv_output_folder):
    # Read the CSV file
    csv_file_path = os.path.join(csv_output_folder, file)
    df = pd.read_csv(csv_file_path)
    # Get the filenames from the "file" column
    filenames = df.iloc[:, 0].tolist()
    # Iterate over each list in everything_list
    for recording_list in everything_list:
        # Check if any filename matches a value in the "file" column
        for filename in filenames:
            if filename in recording_list:
                # Remove the filename from the list
                recording_list.remove(filename)

print(everything_list)

In [None]:
def predict(file, name, overlap=0):
    prediction_scores_df = model.predict(file, activation_layer='sigmoid', overlap_fraction=overlap)
    
    # Reset index to move index levels to columns
    prediction_scores_df.reset_index(inplace=True)

    # Convert the seconds column to a "hour:minute:second" format
    prediction_scores_df['start_time'] = pd.to_datetime(prediction_scores_df['start_time'], unit='s').apply(lambda x: x.strftime('%H:%M:%S.%f'))
    prediction_scores_df['end_time'] = pd.to_datetime(prediction_scores_df['end_time'], unit='s').apply(lambda x: x.strftime('%H:%M:%S.%f'))


    # Save the DataFrame to a CSV file
    prediction_scores_df.to_csv(name, index=False)
    
from sklearn.mixture import GaussianMixture

def add_presence_column(file_name, cluster=4):
    # Read the CSV file into a pandas DataFrame
    df = pd.read_csv(file_name)

    # Assuming 'NOBO' column contains the scores
    scores = df['NOBO'].values.reshape(-1, 1)

    # Fit GMM model
    gmm = GaussianMixture(n_components=cluster)  # Assuming two clusters, adjust as needed
    gmm.fit(scores)

    # Get the cluster means
    cluster_means = gmm.means_

    # Determine the top cluster (highest mean)
    top_cluster = cluster_means.argmax()

    # Add a new column 'Presence' based on the top cluster
    df['Presence'] = ['Present' if cluster == top_cluster else '' for cluster in gmm.predict(scores)]

    # Save the updated DataFrame to the same CSV file
    df.to_csv(file_name, index=False)


In [None]:

# import Audio and Spectrogram classes from OpenSoundscape
from opensoundscape.audio import Audio
from opensoundscape.spectrogram import Spectrogram

count = 0
     
for recording_list in everything_list:
    
    if recording_list:
        #doesnt work for multiple files, figure out
        if count == 0:
            for file in recording_list:
                # Extract the base name of the file (excluding path and extension)
                recording_name = os.path.splitext(os.path.basename(file))[0]
                
                # Construct the CSV file name using the recording name
                name = f"csv_output/forest_recordings_{recording_name}.csv"
                
                
                if not os.path.exists(name):
                # Perform prediction and save to CSV
                    print(f"In progress: {recording_name}")
                    predict([file], name)
                    
                # Add the 'Presence' column to the CSV file
                add_presence_column(name)
        elif count == 1:
             for file in recording_list:
                # Extract the base name of the file (excluding path and extension)
                recording_name = os.path.splitext(os.path.basename(file))[0]
                
                # Construct the CSV file name using the recording name
                name = f"csv_output/grasslandfall_recordings_{recording_name}.csv"
            
                
                if not os.path.exists(name):
                # Perform prediction and save to CSV
                    print(f"In progress: {recording_name}")
                    predict([file], name)
                    
                # Add the 'Presence' column to the CSV file
                add_presence_column(name)
        elif count == 2:
            for file in recording_list:
                # Extract the base name of the file (excluding path and extension)
                recording_name = os.path.splitext(os.path.basename(file))[0]
                
                # Construct the CSV file name using the recording name
                name = f"csv_output/grasslandsummer_recordings_{recording_name}.csv"
                
                print(f"In progress: {recording_name}")
                
                if not os.path.exists(name):
                # Perform prediction and save to CSV
                    print(f"In progress: {recording_name}")
                    predict([file], name)
                    
                # Add the 'Presence' column to the CSV file
                add_presence_column(name)

    count += 1
        
        

Check metrics

In [None]:
testing_df.to_csv("testing.csv", index=False)

import pandas as pd

# Read the CSV file into a DataFrame
testing_df = pd.read_csv("testing.csv")

# Define the regex pattern
pattern = r"(\d{8}_\d{6})"

# Extract the pattern from filenames and get unique values
unique_patterns = testing_df['file'].str.extract(pattern).dropna().squeeze().unique().tolist()

# Print the unique patterns
print(unique_patterns)

# List to store the total number of rows from each CSV
total_rows = []

for file in unique_patterns:
    filename = f"ravenSelectionTables/{file}.WAV"
    path = f"testing_files/{file}.csv"
    
    if not os.path.exists(path):
    # Perform prediction and save to CSV
        print(f"In progress: {filename}")
        predict([filename], path, overlap=.5)
        
        add_presence_column(path,3)
    
    df = pd.read_csv(path)
    
    # Get the number of rows in the DataFrame
    num_rows = len(df)
    
    # Print the number of rows for the current CSV file
    print(f"Number of rows in {path}: {num_rows}")
    
    # Get the number of rows in the DataFrame and append to the list
    total_rows.append(len(df))
    
# Calculate the total number of rows across all CSV files
total_rows_all_files = sum(total_rows)

# Print the total number of rows
print("Total number of rows from all CSV files:", total_rows_all_files)



split testing.csv to separate csv based on unique filename

In [None]:
# Define the regex pattern
pattern = r"(\d{8}_\d{6})"

# Get unique filenames and filter them to match the regex pattern
unique_filenames = testing_df['file'].str.extract(pattern).dropna().squeeze().unique()

# Iterate over unique filenames and create separate CSV files
for filename in unique_filenames:
    # Filter DataFrame for current filename
    subset_df = testing_df[testing_df['file'].str.contains(filename)]
    
    # Print the number of rows in the subset DataFrame
    num_rows = len(subset_df)
    print(f"Number of rows in {filename}: {num_rows}")
    
    # Define the filename for the new CSV file
    new_filename = f"testing_files/original_{filename}.csv"
    
    # Write the subset DataFrame to a new CSV file
    subset_df.to_csv(new_filename, index=False)


merge dataframes and get metrics

In [None]:
import os
import pandas as pd

# Define the regex pattern
pattern = r"(\d{8}_\d{6})"

# Function to extract the filename pattern
def extract_pattern(filename):
    match = re.search(pattern, filename)
    if match:
        return match.group()
    else:
        return None

# Dictionary to store DataFrames for each unique filename pattern
merged_dfs = {}

# Iterate over files in the testing_files folder
for filename in os.listdir("testing_files"):
    if filename.endswith(".csv"):
        # Read the CSV file into a DataFrame
        df = pd.read_csv(os.path.join("testing_files", filename))
        
        # Extract the filename pattern
        file_pattern = extract_pattern(filename)
        
        if file_pattern:
            # Check if the filename pattern already exists in the dictionary
            if file_pattern in merged_dfs:
                # Concatenate the current DataFrame with the existing one in the dictionary
                merged_dfs[file_pattern] = pd.concat([merged_dfs[file_pattern], df], axis=1, join='inner')
            else:
                # If the filename pattern is not in the dictionary, add the DataFrame to the dictionary
                merged_dfs[file_pattern] = df

# Save each merged DataFrame to a separate CSV file with only the first 5 columns and the last one
for pattern, df in merged_dfs.items():
    merged_filename = f"testing_files/merged_{pattern}.csv"
    
    # Rename the last column to "Actual NOBO"
    df.columns = list(df.columns[:-1]) + ['Actual NOBO']
    
    # Save the DataFrame to a CSV file
    df.to_csv(merged_filename, index=False)
    print(f"Merged CSV file for {pattern} saved as: {merged_filename}")


In [None]:
# Iterate over merged files
for pattern, df in merged_dfs.items():
    # Print column names of the DataFrame
    print(f"Columns in DataFrame for {pattern}: {df.columns.tolist()}")


In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, confusion_matrix

# Dictionary to store classification metrics and confusion matrices for each merged file
classification_results = {}

# Iterate over merged files
for pattern, df in merged_dfs.items():
    
    # Mapping string values to integers
    presence_mapping = {'Present': 1, '': 0}
    # Convert "Presence" column to integers
    nobo_values = df['Presence'].map(presence_mapping).fillna(0).astype(int)
    actual_nobo_values = df['Actual NOBO'].astype(int)
    
    # Compute classification metrics
    precision = precision_score(actual_nobo_values, nobo_values)
    recall = recall_score(actual_nobo_values, nobo_values)
    accuracy = accuracy_score(actual_nobo_values, nobo_values)
    f1 = f1_score(actual_nobo_values, nobo_values)
    
    # Compute confusion matrix
    cm = confusion_matrix(actual_nobo_values, nobo_values)
    
    # Add classification metrics and confusion matrix to dictionary
    classification_results[pattern] = {'Precision': precision,
                                       'Recall': recall,
                                       'Accuracy': accuracy,
                                       'F1 Score': f1,
                                       'Confusion Matrix': cm}

# Print classification metrics and confusion matrices
for pattern, results in classification_results.items():
    print(f"Classification results for {pattern}:")
    for metric, value in results.items():
        if metric == 'Confusion Matrix':
            print(f"{metric}:\n{value}")
        else:
            print(f"{metric}: {value}")
    print()
