In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.neighbors import NearestCentroid
from sklearn.impute import SimpleImputer

# Function to convert time duration from 'MM:SS' format to seconds
def parse_length_to_seconds(length_str):
    """Convert a duration written as 'MM:SS' into a total number of seconds.

    Args:
        length_str: Duration text such as ``"1:30"``. A value that is not a
            string (for example a float NaN standing in for a missing entry,
            or ``None``) is treated as missing instead of raising.

    Returns:
        ``minutes * 60 + seconds`` as an int, or ``np.nan`` when the value is
        missing or is not exactly two integer fields separated by a colon.
    """
    try:
        minutes, seconds = map(int, length_str.split(":"))
    except (AttributeError, ValueError):
        # AttributeError: the input is not a string and has no .split().
        # ValueError: a field is not an integer, or there are not exactly
        # two fields to unpack.
        return np.nan
    return minutes * 60 + seconds

# Read the training and validation parts of the cleaned 70/15/15 split
train_file_location = "../../data/cleaned/70_15_15_cleaned_train.csv"
val_file_location = "../../data/cleaned/70_15_15_cleaned_val.csv"
train_df, val_df = (
    pd.read_csv(csv_path) for csv_path in (train_file_location, val_file_location)
)

# Replace the 'MM:SS' text in 'length' with a number of seconds
train_df = train_df.assign(length=train_df['length'].apply(parse_length_to_seconds))
val_df = val_df.assign(length=val_df['length'].apply(parse_length_to_seconds))

# Target labels come from the 'en' column
y_train, y_val = train_df['en'], val_df['en']

# Columns used as model inputs
feature_columns = ['lat', 'lng', 'length', 'temperature']

# Mean-impute missing feature values: the imputer is fitted on the training
# features only and then reused unchanged on the validation features
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(train_df[feature_columns])
X_val = imputer.transform(val_df[feature_columns])

# Build the Nearest Centroid baseline and train it in one step
# (fit() hands back the fitted estimator, so the call can be chained)
clf = NearestCentroid().fit(X_train, y_train)

# Class predictions for the validation features
y_pred_centroid = clf.predict(X_val)

# Calculate and display metrics.
# zero_division=0: a class the model never predicts has an undefined
# precision (0/0). Scoring that as 1 would credit the model for classes it
# never finds and inflate the weighted averages, so undefined scores are
# counted as 0 instead.
accuracy = accuracy_score(y_val, y_pred_centroid)
precision = precision_score(y_val, y_pred_centroid, average='weighted', zero_division=0)
recall = recall_score(y_val, y_pred_centroid, average='weighted', zero_division=0)
f1 = f1_score(y_val, y_pred_centroid, average='weighted', zero_division=0)

print("Nearest Centroid Classifier Baseline Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Print the per-class report, using the same zero-division rule as above
print("\nClassification Report:")
print(classification_report(y_val, y_pred_centroid, zero_division=0))

# Print confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred_centroid))


Nearest Centroid Classifier Baseline Performance:
Accuracy: 0.0232
Precision: 0.3328
Recall: 0.0232
F1 Score: 0.0170

Classification Report:
                                precision    recall  f1-score   support

               Alpine Accentor       0.00      0.00      0.00         8
                 Alpine Chough       0.00      0.00      0.00         4
                   Arctic Tern       0.00      0.00      0.00         4
               Atlantic Canary       0.05      0.33      0.08         3
               Baillon's Crake       1.00      0.00      0.00        10
             Bar-tailed Godwit       0.00      0.00      0.00         8
                  Barn Swallow       1.00      0.00      0.00        16
                Barnacle Goose       1.00      0.00      0.00         4
                Barred Warbler       0.01      0.17      0.01         6
              Bearded Reedling       1.00      0.00      0.00         8
                    Black Kite       1.00      0.00      0.00     

