In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.neighbors import NearestCentroid
from sklearn.impute import SimpleImputer

np.random.seed(42)

def parse_length_to_seconds(length_str):
    try:
        minutes, seconds = map(int, length_str.split(":"))
        return minutes * 60 + seconds
    except ValueError:
        return np.nan

train_file_location = "../../data/cleaned/80_20_cleaned_train.parquet"
val_file_location = "../../data/cleaned/80_20_cleaned_test.parquet"
train_df = pd.read_parquet(train_file_location)
val_df = pd.read_parquet(val_file_location)

train_df['length'] = train_df['length'].apply(parse_length_to_seconds)
val_df['length'] = val_df['length'].apply(parse_length_to_seconds)

feature_columns = ['lat', 'lng', 'length']

imputer = SimpleImputer(strategy='mean')

X_train = imputer.fit_transform(train_df[feature_columns])
X_val = imputer.transform(val_df[feature_columns])

y_train = train_df['en']
y_val = val_df['en']

clf = NearestCentroid()
clf.fit(X_train, y_train)

y_pred_centroid = clf.predict(X_val)

accuracy = accuracy_score(y_val, y_pred_centroid)
precision = precision_score(y_val, y_pred_centroid, average='macro', zero_division=1)
recall = recall_score(y_val, y_pred_centroid, average='macro', zero_division=1)
f1 = f1_score(y_val, y_pred_centroid, average='macro', zero_division=1)

print("Nearest Centroid Classifier Baseline Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

print("\nClassification Report:")
print(classification_report(y_val, y_pred_centroid, zero_division=1))

print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred_centroid))


Nearest Centroid Classifier Baseline Performance:
Accuracy: 0.0089
Precision: 0.1704
Recall: 0.0280
F1 Score: 0.0069

Classification Report:
                                precision    recall  f1-score   support

              African Blue Tit       0.03      0.17      0.06         6
               Alpine Accentor       0.02      0.05      0.02        21
                 Alpine Chough       0.00      0.00      0.00        12
                  Alpine Swift       1.00      0.00      0.00        13
                   Arctic Tern       0.00      0.00      0.00        24
                Arctic Warbler       0.00      0.00      0.00         5
               Atlantic Canary       0.06      0.25      0.10        12
                Audouin's Gull       0.00      0.00      0.00         5
               Baillon's Crake       0.00      0.00      0.00        17
             Bar-tailed Godwit       1.00      0.00      0.00        23
                  Barn Swallow       1.00      0.00      0.00     