In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer

np.random.seed(42)

# Function to convert time duration from 'MM:SS' format to seconds
def parse_length_to_seconds(length_str):
    try:
        minutes, seconds = map(int, length_str.split(":"))
        return minutes * 60 + seconds
    except ValueError:
        return np.nan

# Load the training and validation datasets
train_file_location = "../../data/cleaned/80_20_cleaned_train.parquet"
val_file_location = "../../data/cleaned/80_20_cleaned_test.parquet"
train_df = pd.read_parquet(train_file_location)
val_df = pd.read_parquet(val_file_location)

# Convert 'length' from 'MM:SS' format to seconds
train_df['length'] = train_df['length'].apply(parse_length_to_seconds)
val_df['length'] = val_df['length'].apply(parse_length_to_seconds)

# Define feature columns (excluding 'temperature' if it contains only NaNs)
feature_columns = ['lat', 'lng', 'length']

# Impute missing values with column means
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(train_df[feature_columns])
X_val = imputer.transform(val_df[feature_columns])

# Extract target labels
y_train = train_df['en']
y_val = val_df['en']

# Perform clustering on training features
n_clusters = 10  # Set number of clusters; adjust based on dataset
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X_train)

# Map each cluster to the most frequent bird species within it using `value_counts`
cluster_to_species = {}
for cluster in range(n_clusters):
    species_in_cluster = y_train[kmeans.labels_ == cluster]
    if not species_in_cluster.empty:
        cluster_to_species[cluster] = species_in_cluster.value_counts().idxmax()
    else:
        cluster_to_species[cluster] = None  # Handle any empty clusters

# Predict clusters for the validation data
val_clusters = kmeans.predict(X_val)
y_pred_cluster = [cluster_to_species[cluster] for cluster in val_clusters]

# Calculate and display metrics
accuracy = accuracy_score(y_val, y_pred_cluster)
precision = precision_score(y_val, y_pred_cluster, average='weighted', zero_division=1)
recall = recall_score(y_val, y_pred_cluster, average='weighted', zero_division=1)
f1 = f1_score(y_val, y_pred_cluster, average='weighted', zero_division=1)

print("Clustering + Majority Voting Baseline Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_val, y_pred_cluster, zero_division=1))

# Print confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred_cluster))


Clustering + Majority Voting Baseline Performance:
Accuracy: 0.0443
Precision: 0.8992
Recall: 0.0443
F1 Score: 0.0098

Classification Report:
                                precision    recall  f1-score   support

               Alpine Accentor       1.00      0.00      0.00         8
                 Alpine Chough       1.00      0.00      0.00         4
                   Arctic Tern       1.00      0.00      0.00         4
               Atlantic Canary       1.00      0.00      0.00         3
               Baillon's Crake       1.00      0.00      0.00        10
             Bar-tailed Godwit       1.00      0.00      0.00         8
                  Barn Swallow       1.00      0.00      0.00        16
                Barnacle Goose       1.00      0.00      0.00         4
                Barred Warbler       1.00      0.00      0.00         6
              Bearded Reedling       1.00      0.00      0.00         8
                    Black Kite       1.00      0.00      0.00    