# 1nn

## NCBI 2020

In [18]:
# 3
#!/usr/bin/env python3

"""
leave_one_out_knn_eval_distance.py

Script to perform a leave-one-out 1-NN classification from a precomputed NxN
distance matrix, and then compute various metrics:
- Accuracy
- Balanced Accuracy
- Precision, Recall, F1
- ROC AUC (multi-class or binary)
"""

import numpy as np
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
import argparse
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler


def leave_one_out_knn(distance_matrix, labels, n_neighbors=1):
    """
    Perform leave-one-out k-NN classification from a precomputed distance matrix.

    For each sample i, row i of the matrix is copied, the self-distance is
    masked with np.inf, the n_neighbors smallest remaining entries are
    selected, and their labels are combined by majority vote (for
    n_neighbors == 1 this is simply the nearest neighbor's label).
    Vote ties are resolved in favour of the smallest label value.

    Args:
        distance_matrix (ndarray): shape (N, N), distance_matrix[i,j] is the
                                   distance between samples i and j. Integer
                                   matrices are converted to float internally.
        labels (ndarray): shape (N,), integer or string labels.
        n_neighbors (int): number of neighbors (default=1); must satisfy
                           1 <= n_neighbors <= N - 1.

    Returns:
        y_pred (ndarray): predicted labels for each sample (length N), with
                          the same dtype as labels.

    Raises:
        ValueError: if the matrix is not square, labels does not hold one
                    entry per row, or n_neighbors is out of range.
    """
    distance_matrix = np.asarray(distance_matrix)
    labels = np.asarray(labels)

    if distance_matrix.ndim != 2 or distance_matrix.shape[0] != distance_matrix.shape[1]:
        raise ValueError(
            f"distance_matrix must be square, got shape {distance_matrix.shape}"
        )

    N = distance_matrix.shape[0]
    if labels.ndim != 1 or labels.shape[0] != N:
        raise ValueError(
            f"labels must have shape ({N},), got {labels.shape}"
        )

    y_pred = np.zeros(N, dtype=labels.dtype)
    if N == 0:
        return y_pred

    if not 1 <= n_neighbors <= N - 1:
        raise ValueError(
            f"n_neighbors must be between 1 and N - 1 = {N - 1}, got {n_neighbors}"
        )

    # np.inf cannot be stored in an integer array, so promote once up front.
    # Floating-point inputs are left untouched.
    if not np.issubdtype(distance_matrix.dtype, np.floating):
        distance_matrix = distance_matrix.astype(float)

    for i in range(N):
        # distances from sample i to all others (copy: the caller's matrix
        # must not be modified)
        dist = distance_matrix[i].copy()
        # exclude itself
        dist[i] = np.inf

        # indices of the n_neighbors smallest distances (in arbitrary order)
        nn_idx = np.argpartition(dist, n_neighbors)[:n_neighbors]
        if n_neighbors == 1:
            y_pred[i] = labels[nn_idx[0]]
        else:
            # Majority vote. np.unique works for string as well as integer
            # labels and returns sorted values, so argmax picks the smallest
            # label among equally frequent ones.
            values, counts = np.unique(labels[nn_idx], return_counts=True)
            y_pred[i] = values[counts.argmax()]

    return y_pred


def compute_metrics(y_true, y_pred, average='weighted'):
    """
    Compute various classification metrics.
    For multi-class, use 'average' = 'macro' or 'weighted'.

    The ROC AUC is computed one-vs-rest from one-hot encodings of the hard
    predictions (no continuous scores are available), so each per-class
    value is based on a single operating point. Columns of the one-hot
    matrices follow the sorted unique values of y_true, so labels do not
    have to be contiguous integers starting at 0. A predicted label that
    never occurs in y_true yields an all-zero row.

    Args:
        y_true (array-like): shape (N,), ground-truth labels.
        y_pred (array-like): shape (N,), predicted labels.
        average (str): averaging mode passed to the scikit-learn scorers.

    Returns a dict of metrics:
    - accuracy
    - balanced_accuracy
    - precision_{average}
    - recall_{average}
    - f1_{average}
    - roc_auc_{average}  (None when roc_auc_score raises ValueError)
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    metrics_dict = {
        'accuracy': accuracy_score(y_true, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
        f'precision_{average}': precision_score(y_true, y_pred, average=average, zero_division=0),
        f'recall_{average}': recall_score(y_true, y_pred, average=average, zero_division=0),
        f'f1_{average}': f1_score(y_true, y_pred, average=average, zero_division=0)
    }

    # Sorted unique true labels define the one-hot columns.
    classes = np.unique(y_true)

    def _one_hot(values):
        # Column j stands for classes[j]; values absent from classes keep an
        # all-zero row instead of indexing out of range or wrapping around.
        encoded = np.zeros((len(values), len(classes)), dtype=int)
        if len(classes) == 0:
            return encoded
        col = np.clip(np.searchsorted(classes, values), 0, len(classes) - 1)
        known = classes[col] == values
        encoded[np.flatnonzero(known), col[known]] = 1
        return encoded

    try:
        roc = roc_auc_score(
            _one_hot(y_true),
            _one_hot(y_pred),
            multi_class='ovr',  # or 'ovo'
            average=average
        )
    except ValueError:
        # This can happen if there's only one class in y_true, etc.
        roc = None

    metrics_dict[f'roc_auc_{average}'] = roc

    return metrics_dict


from collections import Counter

def find_labels_with_min_count(labels, min_count=15):
    """
    Return the positions of all entries whose label is frequent enough.

    A label is kept when it appears at least `min_count` times in `labels`;
    the positions of every occurrence of such labels are returned in
    ascending order.

    Args:
        labels (list): A list of (hashable) labels.
        min_count (int): Minimum number of occurrences required (default=15).

    Returns:
        list: Indices into `labels` whose label meets the count condition.
    """
    # One pass to tally how often each label occurs
    frequency = Counter(labels)

    # Second pass: keep a position when its label's tally reaches the threshold
    return [
        position
        for position, label in enumerate(labels)
        if frequency[label] >= min_count
    ]






# --- Experiment configuration ---------------------------------------------
# Encoder names; each is used as a sub-directory of `distances/`.
encoders = [
    'PSRT',
]

# Dataset names; each is read from data/<name>.csv and also names the
# sub-directory holding its distance matrices.
csv_names = [
    'Yau2020_record_processed',
    # 'Yau2022_record_processed',
    # 'NCBI_record_valid_nucleotide',
    # 'NCBI_record_valid_count',
]

# (feature, max_dim) pairs selecting the file k{k}_distance_{feature}{max_dim}.npy
features_needed = [('facet', 0)]

# Number of neighbors used by the classifier. This is independent of the
# loop variable `k` below, which only selects the distance file.
n_neighbors = 1

if __name__ == "__main__":

    for encoder in encoders:
        print("encoder ", encoder)
        for data_name in csv_names:
            print("data ", data_name)
            # `k` is part of the distance file name (presumably the k-mer
            # size used to build the matrix -- TODO confirm).
            for k in range(4,5):
                print("k ", k)

                for feature, max_dim in features_needed:
                    print(feature)
                    print(max_dim)
                    path_to_data = f'data/{data_name}'
                    df = pd.read_csv(f"{path_to_data}.csv")
                    x_list = df['Accession (version)'].to_list()
                    y_list = df['Family'].to_list()
                    # Order samples by accession.
                    # NOTE(review): the rows/columns of the distance matrix are
                    # assumed to follow this accession-sorted order -- confirm
                    # against the code that produced the .npy file.
                    x_list, y_list = zip(*sorted(zip(x_list, y_list)))
                    x_list = list(x_list)
                    y_list = list(y_list)
                    # Keep only samples whose family occurs at least 3 times.
                    indices = find_labels_with_min_count(y_list, min_count=3)
                    x_list = [x_list[i] for i in indices]
                    y_list = [y_list[i] for i in indices]

                    path_to_features = f'distances/{encoder}/{data_name}'
                    distance_matrix = np.load(path_to_features+f'/k{k}_distance_{feature}{max_dim}.npy')
                    # Restrict the matrix to the retained samples.
                    subD = distance_matrix[np.ix_(indices, indices)]
                    # Alternative input: path_to_features + f'/k{k}_distance.npy'
                    # (matrix without the feature/max_dim suffix).

                    # Encode family names as integers 0..C-1.
                    le = LabelEncoder()
                    y_int = le.fit_transform(y_list)  # shape (N,)
                    y_np = np.array(y_int, dtype=np.int32)
                    labels = y_np

                    # 1-NN or k-NN using the distance matrix
                    y_pred = leave_one_out_knn(subD, labels, n_neighbors=n_neighbors)

                    # Compute metrics
                    metrics_result = compute_metrics(labels, y_pred, average='macro')

                    # Print results. The header reports the real neighbor
                    # count; `k` is shown separately as the distance-file k.
                    print("Leave-One-Out {}-NN Results (distance k={})".format(n_neighbors, k))
                    for s, v in metrics_result.items():
                        if v is not None:
                            print(f"{s}: {v:.4f}")
                        else:
                            print(f"{s}: None")
                    print("===============================================")
                    print("===============================================")
                    print("===============================================")

encoder  PSRT
data  Yau2020_record_processed
k  4
facet
0
Leave-One-Out k-NN Results (k=4)
accuracy: 0.9319
balanced_accuracy: 0.8664
precision_macro: 0.8903
recall_macro: 0.8664
f1_macro: 0.8717
roc_auc_macro: 0.9328


## NCBI 2022

In [19]:
# 3
#!/usr/bin/env python3

"""
leave_one_out_knn_eval_distance.py

Script to perform a leave-one-out 1-NN classification from a precomputed NxN
distance matrix, and then compute various metrics:
- Accuracy
- Balanced Accuracy
- Precision, Recall, F1
- ROC AUC (multi-class or binary)
"""

import numpy as np
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
import argparse
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler


def leave_one_out_knn(distance_matrix, labels, n_neighbors=1):
    """
    Perform leave-one-out k-NN classification from a precomputed distance matrix.

    For each sample i, row i of the matrix is copied, the self-distance is
    masked with np.inf, the n_neighbors smallest remaining entries are
    selected, and their labels are combined by majority vote (for
    n_neighbors == 1 this is simply the nearest neighbor's label).
    Vote ties are resolved in favour of the smallest label value.

    Args:
        distance_matrix (ndarray): shape (N, N), distance_matrix[i,j] is the
                                   distance between samples i and j. Integer
                                   matrices are converted to float internally.
        labels (ndarray): shape (N,), integer or string labels.
        n_neighbors (int): number of neighbors (default=1); must satisfy
                           1 <= n_neighbors <= N - 1.

    Returns:
        y_pred (ndarray): predicted labels for each sample (length N), with
                          the same dtype as labels.

    Raises:
        ValueError: if the matrix is not square, labels does not hold one
                    entry per row, or n_neighbors is out of range.
    """
    distance_matrix = np.asarray(distance_matrix)
    labels = np.asarray(labels)

    if distance_matrix.ndim != 2 or distance_matrix.shape[0] != distance_matrix.shape[1]:
        raise ValueError(
            f"distance_matrix must be square, got shape {distance_matrix.shape}"
        )

    N = distance_matrix.shape[0]
    if labels.ndim != 1 or labels.shape[0] != N:
        raise ValueError(
            f"labels must have shape ({N},), got {labels.shape}"
        )

    y_pred = np.zeros(N, dtype=labels.dtype)
    if N == 0:
        return y_pred

    if not 1 <= n_neighbors <= N - 1:
        raise ValueError(
            f"n_neighbors must be between 1 and N - 1 = {N - 1}, got {n_neighbors}"
        )

    # np.inf cannot be stored in an integer array, so promote once up front.
    # Floating-point inputs are left untouched.
    if not np.issubdtype(distance_matrix.dtype, np.floating):
        distance_matrix = distance_matrix.astype(float)

    for i in range(N):
        # distances from sample i to all others (copy: the caller's matrix
        # must not be modified)
        dist = distance_matrix[i].copy()
        # exclude itself
        dist[i] = np.inf

        # indices of the n_neighbors smallest distances (in arbitrary order)
        nn_idx = np.argpartition(dist, n_neighbors)[:n_neighbors]
        if n_neighbors == 1:
            y_pred[i] = labels[nn_idx[0]]
        else:
            # Majority vote. np.unique works for string as well as integer
            # labels and returns sorted values, so argmax picks the smallest
            # label among equally frequent ones.
            values, counts = np.unique(labels[nn_idx], return_counts=True)
            y_pred[i] = values[counts.argmax()]

    return y_pred


def compute_metrics(y_true, y_pred, average='weighted'):
    """
    Compute various classification metrics.
    For multi-class, use 'average' = 'macro' or 'weighted'.

    The ROC AUC is computed one-vs-rest from one-hot encodings of the hard
    predictions (no continuous scores are available), so each per-class
    value is based on a single operating point. Columns of the one-hot
    matrices follow the sorted unique values of y_true, so labels do not
    have to be contiguous integers starting at 0. A predicted label that
    never occurs in y_true yields an all-zero row.

    Args:
        y_true (array-like): shape (N,), ground-truth labels.
        y_pred (array-like): shape (N,), predicted labels.
        average (str): averaging mode passed to the scikit-learn scorers.

    Returns a dict of metrics:
    - accuracy
    - balanced_accuracy
    - precision_{average}
    - recall_{average}
    - f1_{average}
    - roc_auc_{average}  (None when roc_auc_score raises ValueError)
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    metrics_dict = {
        'accuracy': accuracy_score(y_true, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
        f'precision_{average}': precision_score(y_true, y_pred, average=average, zero_division=0),
        f'recall_{average}': recall_score(y_true, y_pred, average=average, zero_division=0),
        f'f1_{average}': f1_score(y_true, y_pred, average=average, zero_division=0)
    }

    # Sorted unique true labels define the one-hot columns.
    classes = np.unique(y_true)

    def _one_hot(values):
        # Column j stands for classes[j]; values absent from classes keep an
        # all-zero row instead of indexing out of range or wrapping around.
        encoded = np.zeros((len(values), len(classes)), dtype=int)
        if len(classes) == 0:
            return encoded
        col = np.clip(np.searchsorted(classes, values), 0, len(classes) - 1)
        known = classes[col] == values
        encoded[np.flatnonzero(known), col[known]] = 1
        return encoded

    try:
        roc = roc_auc_score(
            _one_hot(y_true),
            _one_hot(y_pred),
            multi_class='ovr',  # or 'ovo'
            average=average
        )
    except ValueError:
        # This can happen if there's only one class in y_true, etc.
        roc = None

    metrics_dict[f'roc_auc_{average}'] = roc

    return metrics_dict


from collections import Counter

def find_labels_with_min_count(labels, min_count=15):
    """
    Return the positions of all entries whose label is frequent enough.

    A label is kept when it appears at least `min_count` times in `labels`;
    the positions of every occurrence of such labels are returned in
    ascending order.

    Args:
        labels (list): A list of (hashable) labels.
        min_count (int): Minimum number of occurrences required (default=15).

    Returns:
        list: Indices into `labels` whose label meets the count condition.
    """
    # One pass to tally how often each label occurs
    frequency = Counter(labels)

    # Second pass: keep a position when its label's tally reaches the threshold
    return [
        position
        for position, label in enumerate(labels)
        if frequency[label] >= min_count
    ]






# --- Experiment configuration ---------------------------------------------
# Encoder names; each is used as a sub-directory of `distances/`.
encoders = [
    'PSRT',
]

# Dataset names; each is read from data/<name>.csv and also names the
# sub-directory holding its distance matrices.
csv_names = [
    # 'Yau2020_record_processed',
    'Yau2022_record_processed',
    # 'NCBI_record_valid_nucleotide',
    # 'NCBI_record_valid_count',
]

# (feature, max_dim) pairs selecting the file k{k}_distance_{feature}{max_dim}.npy
features_needed = [('facet', 0)]

# Number of neighbors used by the classifier. This is independent of the
# loop variable `k` below, which only selects the distance file.
n_neighbors = 1

if __name__ == "__main__":

    for encoder in encoders:
        print("encoder ", encoder)
        for data_name in csv_names:
            print("data ", data_name)
            # `k` is part of the distance file name (presumably the k-mer
            # size used to build the matrix -- TODO confirm).
            for k in range(4,5):
                print("k ", k)

                for feature, max_dim in features_needed:
                    print(feature)
                    print(max_dim)
                    path_to_data = f'data/{data_name}'
                    df = pd.read_csv(f"{path_to_data}.csv")
                    x_list = df['Accession (version)'].to_list()
                    y_list = df['Family'].to_list()
                    # Order samples by accession.
                    # NOTE(review): the rows/columns of the distance matrix are
                    # assumed to follow this accession-sorted order -- confirm
                    # against the code that produced the .npy file.
                    x_list, y_list = zip(*sorted(zip(x_list, y_list)))
                    x_list = list(x_list)
                    y_list = list(y_list)
                    # Keep only samples whose family occurs at least 3 times.
                    indices = find_labels_with_min_count(y_list, min_count=3)
                    x_list = [x_list[i] for i in indices]
                    y_list = [y_list[i] for i in indices]

                    path_to_features = f'distances/{encoder}/{data_name}'
                    distance_matrix = np.load(path_to_features+f'/k{k}_distance_{feature}{max_dim}.npy')
                    # Restrict the matrix to the retained samples.
                    subD = distance_matrix[np.ix_(indices, indices)]
                    # Alternative input: path_to_features + f'/k{k}_distance.npy'
                    # (matrix without the feature/max_dim suffix).

                    # Encode family names as integers 0..C-1.
                    le = LabelEncoder()
                    y_int = le.fit_transform(y_list)  # shape (N,)
                    y_np = np.array(y_int, dtype=np.int32)
                    labels = y_np

                    # 1-NN or k-NN using the distance matrix
                    y_pred = leave_one_out_knn(subD, labels, n_neighbors=n_neighbors)

                    # Compute metrics
                    metrics_result = compute_metrics(labels, y_pred, average='macro')

                    # Print results. The header reports the real neighbor
                    # count; `k` is shown separately as the distance-file k.
                    print("Leave-One-Out {}-NN Results (distance k={})".format(n_neighbors, k))
                    for s, v in metrics_result.items():
                        if v is not None:
                            print(f"{s}: {v:.4f}")
                        else:
                            print(f"{s}: None")
                    print("===============================================")
                    print("===============================================")
                    print("===============================================")

encoder  PSRT
data  Yau2022_record_processed
k  4
facet
0
Leave-One-Out k-NN Results (k=4)
accuracy: 0.9198
balanced_accuracy: 0.7668
precision_macro: 0.8001
recall_macro: 0.7668
f1_macro: 0.7737
roc_auc_macro: 0.8831


## NCBI 2024

In [20]:
# 3
#!/usr/bin/env python3

"""
leave_one_out_knn_eval_distance.py

Script to perform a leave-one-out 1-NN classification from a precomputed NxN
distance matrix, and then compute various metrics:
- Accuracy
- Balanced Accuracy
- Precision, Recall, F1
- ROC AUC (multi-class or binary)
"""

import numpy as np
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
import argparse
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler


def leave_one_out_knn(distance_matrix, labels, n_neighbors=1):
    """
    Perform leave-one-out k-NN classification from a precomputed distance matrix.

    For each sample i, row i of the matrix is copied, the self-distance is
    masked with np.inf, the n_neighbors smallest remaining entries are
    selected, and their labels are combined by majority vote (for
    n_neighbors == 1 this is simply the nearest neighbor's label).
    Vote ties are resolved in favour of the smallest label value.

    Args:
        distance_matrix (ndarray): shape (N, N), distance_matrix[i,j] is the
                                   distance between samples i and j. Integer
                                   matrices are converted to float internally.
        labels (ndarray): shape (N,), integer or string labels.
        n_neighbors (int): number of neighbors (default=1); must satisfy
                           1 <= n_neighbors <= N - 1.

    Returns:
        y_pred (ndarray): predicted labels for each sample (length N), with
                          the same dtype as labels.

    Raises:
        ValueError: if the matrix is not square, labels does not hold one
                    entry per row, or n_neighbors is out of range.
    """
    distance_matrix = np.asarray(distance_matrix)
    labels = np.asarray(labels)

    if distance_matrix.ndim != 2 or distance_matrix.shape[0] != distance_matrix.shape[1]:
        raise ValueError(
            f"distance_matrix must be square, got shape {distance_matrix.shape}"
        )

    N = distance_matrix.shape[0]
    if labels.ndim != 1 or labels.shape[0] != N:
        raise ValueError(
            f"labels must have shape ({N},), got {labels.shape}"
        )

    y_pred = np.zeros(N, dtype=labels.dtype)
    if N == 0:
        return y_pred

    if not 1 <= n_neighbors <= N - 1:
        raise ValueError(
            f"n_neighbors must be between 1 and N - 1 = {N - 1}, got {n_neighbors}"
        )

    # np.inf cannot be stored in an integer array, so promote once up front.
    # Floating-point inputs are left untouched.
    if not np.issubdtype(distance_matrix.dtype, np.floating):
        distance_matrix = distance_matrix.astype(float)

    for i in range(N):
        # distances from sample i to all others (copy: the caller's matrix
        # must not be modified)
        dist = distance_matrix[i].copy()
        # exclude itself
        dist[i] = np.inf

        # indices of the n_neighbors smallest distances (in arbitrary order)
        nn_idx = np.argpartition(dist, n_neighbors)[:n_neighbors]
        if n_neighbors == 1:
            y_pred[i] = labels[nn_idx[0]]
        else:
            # Majority vote. np.unique works for string as well as integer
            # labels and returns sorted values, so argmax picks the smallest
            # label among equally frequent ones.
            values, counts = np.unique(labels[nn_idx], return_counts=True)
            y_pred[i] = values[counts.argmax()]

    return y_pred


def compute_metrics(y_true, y_pred, average='weighted'):
    """
    Compute various classification metrics.
    For multi-class, use 'average' = 'macro' or 'weighted'.

    The ROC AUC is computed one-vs-rest from one-hot encodings of the hard
    predictions (no continuous scores are available), so each per-class
    value is based on a single operating point. Columns of the one-hot
    matrices follow the sorted unique values of y_true, so labels do not
    have to be contiguous integers starting at 0. A predicted label that
    never occurs in y_true yields an all-zero row.

    Args:
        y_true (array-like): shape (N,), ground-truth labels.
        y_pred (array-like): shape (N,), predicted labels.
        average (str): averaging mode passed to the scikit-learn scorers.

    Returns a dict of metrics:
    - accuracy
    - balanced_accuracy
    - precision_{average}
    - recall_{average}
    - f1_{average}
    - roc_auc_{average}  (None when roc_auc_score raises ValueError)
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    metrics_dict = {
        'accuracy': accuracy_score(y_true, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
        f'precision_{average}': precision_score(y_true, y_pred, average=average, zero_division=0),
        f'recall_{average}': recall_score(y_true, y_pred, average=average, zero_division=0),
        f'f1_{average}': f1_score(y_true, y_pred, average=average, zero_division=0)
    }

    # Sorted unique true labels define the one-hot columns.
    classes = np.unique(y_true)

    def _one_hot(values):
        # Column j stands for classes[j]; values absent from classes keep an
        # all-zero row instead of indexing out of range or wrapping around.
        encoded = np.zeros((len(values), len(classes)), dtype=int)
        if len(classes) == 0:
            return encoded
        col = np.clip(np.searchsorted(classes, values), 0, len(classes) - 1)
        known = classes[col] == values
        encoded[np.flatnonzero(known), col[known]] = 1
        return encoded

    try:
        roc = roc_auc_score(
            _one_hot(y_true),
            _one_hot(y_pred),
            multi_class='ovr',  # or 'ovo'
            average=average
        )
    except ValueError:
        # This can happen if there's only one class in y_true, etc.
        roc = None

    metrics_dict[f'roc_auc_{average}'] = roc

    return metrics_dict


from collections import Counter

def find_labels_with_min_count(labels, min_count=15):
    """
    Return the positions of all entries whose label is frequent enough.

    A label is kept when it appears at least `min_count` times in `labels`;
    the positions of every occurrence of such labels are returned in
    ascending order.

    Args:
        labels (list): A list of (hashable) labels.
        min_count (int): Minimum number of occurrences required (default=15).

    Returns:
        list: Indices into `labels` whose label meets the count condition.
    """
    # One pass to tally how often each label occurs
    frequency = Counter(labels)

    # Second pass: keep a position when its label's tally reaches the threshold
    return [
        position
        for position, label in enumerate(labels)
        if frequency[label] >= min_count
    ]






# --- Experiment configuration ---------------------------------------------
# Encoder names; each is used as a sub-directory of `distances/`.
encoders = [
    'PSRT',
]

# Dataset names; each is read from data/<name>.csv and also names the
# sub-directory holding its distance matrices.
csv_names = [
    # 'Yau2020_record_processed',
    # 'Yau2022_record_processed',
    'NCBI_record_valid_nucleotide',
    # 'NCBI_record_valid_count',
]

# (feature, max_dim) pairs selecting the file k{k}_distance_{feature}{max_dim}.npy
features_needed = [('facet', 0)]

# Number of neighbors used by the classifier. This is independent of the
# loop variable `k` below, which only selects the distance file.
n_neighbors = 1

if __name__ == "__main__":

    for encoder in encoders:
        print("encoder ", encoder)
        for data_name in csv_names:
            print("data ", data_name)
            # `k` is part of the distance file name (presumably the k-mer
            # size used to build the matrix -- TODO confirm).
            for k in range(4,5):
                print("k ", k)

                for feature, max_dim in features_needed:
                    print(feature)
                    print(max_dim)
                    path_to_data = f'data/{data_name}'
                    df = pd.read_csv(f"{path_to_data}.csv")
                    x_list = df['Accession (version)'].to_list()
                    y_list = df['Family'].to_list()
                    # Order samples by accession.
                    # NOTE(review): the rows/columns of the distance matrix are
                    # assumed to follow this accession-sorted order -- confirm
                    # against the code that produced the .npy file.
                    x_list, y_list = zip(*sorted(zip(x_list, y_list)))
                    x_list = list(x_list)
                    y_list = list(y_list)
                    # Keep only samples whose family occurs at least 3 times.
                    indices = find_labels_with_min_count(y_list, min_count=3)
                    x_list = [x_list[i] for i in indices]
                    y_list = [y_list[i] for i in indices]

                    path_to_features = f'distances/{encoder}/{data_name}'
                    distance_matrix = np.load(path_to_features+f'/k{k}_distance_{feature}{max_dim}.npy')
                    # Restrict the matrix to the retained samples.
                    subD = distance_matrix[np.ix_(indices, indices)]
                    # Alternative input: path_to_features + f'/k{k}_distance.npy'
                    # (matrix without the feature/max_dim suffix).

                    # Encode family names as integers 0..C-1.
                    le = LabelEncoder()
                    y_int = le.fit_transform(y_list)  # shape (N,)
                    y_np = np.array(y_int, dtype=np.int32)
                    labels = y_np

                    # 1-NN or k-NN using the distance matrix
                    y_pred = leave_one_out_knn(subD, labels, n_neighbors=n_neighbors)

                    # Compute metrics
                    metrics_result = compute_metrics(labels, y_pred, average='macro')

                    # Print results. The header reports the real neighbor
                    # count; `k` is shown separately as the distance-file k.
                    print("Leave-One-Out {}-NN Results (distance k={})".format(n_neighbors, k))
                    for s, v in metrics_result.items():
                        if v is not None:
                            print(f"{s}: {v:.4f}")
                        else:
                            print(f"{s}: None")
                    print("===============================================")
                    print("===============================================")
                    print("===============================================")

encoder  PSRT
data  NCBI_record_valid_nucleotide
k  4
facet
0
Leave-One-Out k-NN Results (k=4)
accuracy: 0.8913
balanced_accuracy: 0.7831
precision_macro: 0.8321
recall_macro: 0.7831
f1_macro: 0.7962
roc_auc_macro: 0.8913


## NCBI 2024 All

In [21]:
# 3
#!/usr/bin/env python3

"""
leave_one_out_knn_eval_distance.py

Script to perform a leave-one-out 1-NN classification from a precomputed NxN
distance matrix, and then compute various metrics:
- Accuracy
- Balanced Accuracy
- Precision, Recall, F1
- ROC AUC (multi-class or binary)
"""

import numpy as np
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
import argparse
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler


def leave_one_out_knn(distance_matrix, labels, n_neighbors=1):
    """
    Perform leave-one-out k-NN classification from a precomputed distance matrix.

    For each sample i, row i of the matrix is copied, the self-distance is
    masked with np.inf, the n_neighbors smallest remaining entries are
    selected, and their labels are combined by majority vote (for
    n_neighbors == 1 this is simply the nearest neighbor's label).
    Vote ties are resolved in favour of the smallest label value.

    Args:
        distance_matrix (ndarray): shape (N, N), distance_matrix[i,j] is the
                                   distance between samples i and j. Integer
                                   matrices are converted to float internally.
        labels (ndarray): shape (N,), integer or string labels.
        n_neighbors (int): number of neighbors (default=1); must satisfy
                           1 <= n_neighbors <= N - 1.

    Returns:
        y_pred (ndarray): predicted labels for each sample (length N), with
                          the same dtype as labels.

    Raises:
        ValueError: if the matrix is not square, labels does not hold one
                    entry per row, or n_neighbors is out of range.
    """
    distance_matrix = np.asarray(distance_matrix)
    labels = np.asarray(labels)

    if distance_matrix.ndim != 2 or distance_matrix.shape[0] != distance_matrix.shape[1]:
        raise ValueError(
            f"distance_matrix must be square, got shape {distance_matrix.shape}"
        )

    N = distance_matrix.shape[0]
    if labels.ndim != 1 or labels.shape[0] != N:
        raise ValueError(
            f"labels must have shape ({N},), got {labels.shape}"
        )

    y_pred = np.zeros(N, dtype=labels.dtype)
    if N == 0:
        return y_pred

    if not 1 <= n_neighbors <= N - 1:
        raise ValueError(
            f"n_neighbors must be between 1 and N - 1 = {N - 1}, got {n_neighbors}"
        )

    # np.inf cannot be stored in an integer array, so promote once up front.
    # Floating-point inputs are left untouched.
    if not np.issubdtype(distance_matrix.dtype, np.floating):
        distance_matrix = distance_matrix.astype(float)

    for i in range(N):
        # distances from sample i to all others (copy: the caller's matrix
        # must not be modified)
        dist = distance_matrix[i].copy()
        # exclude itself
        dist[i] = np.inf

        # indices of the n_neighbors smallest distances (in arbitrary order)
        nn_idx = np.argpartition(dist, n_neighbors)[:n_neighbors]
        if n_neighbors == 1:
            y_pred[i] = labels[nn_idx[0]]
        else:
            # Majority vote. np.unique works for string as well as integer
            # labels and returns sorted values, so argmax picks the smallest
            # label among equally frequent ones.
            values, counts = np.unique(labels[nn_idx], return_counts=True)
            y_pred[i] = values[counts.argmax()]

    return y_pred


def compute_metrics(y_true, y_pred, average='weighted'):
    """
    Compute various classification metrics.
    For multi-class, use 'average' = 'macro' or 'weighted'.

    The ROC AUC is computed one-vs-rest from one-hot encodings of the hard
    predictions (no continuous scores are available), so each per-class
    value is based on a single operating point. Columns of the one-hot
    matrices follow the sorted unique values of y_true, so labels do not
    have to be contiguous integers starting at 0. A predicted label that
    never occurs in y_true yields an all-zero row.

    Args:
        y_true (array-like): shape (N,), ground-truth labels.
        y_pred (array-like): shape (N,), predicted labels.
        average (str): averaging mode passed to the scikit-learn scorers.

    Returns a dict of metrics:
    - accuracy
    - balanced_accuracy
    - precision_{average}
    - recall_{average}
    - f1_{average}
    - roc_auc_{average}  (None when roc_auc_score raises ValueError)
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    metrics_dict = {
        'accuracy': accuracy_score(y_true, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
        f'precision_{average}': precision_score(y_true, y_pred, average=average, zero_division=0),
        f'recall_{average}': recall_score(y_true, y_pred, average=average, zero_division=0),
        f'f1_{average}': f1_score(y_true, y_pred, average=average, zero_division=0)
    }

    # Sorted unique true labels define the one-hot columns.
    classes = np.unique(y_true)

    def _one_hot(values):
        # Column j stands for classes[j]; values absent from classes keep an
        # all-zero row instead of indexing out of range or wrapping around.
        encoded = np.zeros((len(values), len(classes)), dtype=int)
        if len(classes) == 0:
            return encoded
        col = np.clip(np.searchsorted(classes, values), 0, len(classes) - 1)
        known = classes[col] == values
        encoded[np.flatnonzero(known), col[known]] = 1
        return encoded

    try:
        roc = roc_auc_score(
            _one_hot(y_true),
            _one_hot(y_pred),
            multi_class='ovr',  # or 'ovo'
            average=average
        )
    except ValueError:
        # This can happen if there's only one class in y_true, etc.
        roc = None

    metrics_dict[f'roc_auc_{average}'] = roc

    return metrics_dict


from collections import Counter

def find_labels_with_min_count(labels, min_count=15):
    """
    Return the positions of all entries whose label is frequent enough.

    A label is kept when it appears at least `min_count` times in `labels`;
    the positions of every occurrence of such labels are returned in
    ascending order.

    Args:
        labels (list): A list of (hashable) labels.
        min_count (int): Minimum number of occurrences required (default=15).

    Returns:
        list: Indices into `labels` whose label meets the count condition.
    """
    # One pass to tally how often each label occurs
    frequency = Counter(labels)

    # Second pass: keep a position when its label's tally reaches the threshold
    return [
        position
        for position, label in enumerate(labels)
        if frequency[label] >= min_count
    ]






# --- Experiment configuration ---------------------------------------------
# Encoder names; each is used as a sub-directory of `distances/`.
encoders = [
    'PSRT',
]

# Dataset names; each is read from data/<name>.csv and also names the
# sub-directory holding its distance matrices.
csv_names = [
    # 'Yau2020_record_processed',
    # 'Yau2022_record_processed',
    # 'NCBI_record_valid_nucleotide',
    'NCBI_record_valid_count',
]

# (feature, max_dim) pairs selecting the file k{k}_distance_{feature}{max_dim}.npy
features_needed = [('facet', 0)]

# Number of neighbors used by the classifier. This is independent of the
# loop variable `k` below, which only selects the distance file.
n_neighbors = 1

if __name__ == "__main__":

    for encoder in encoders:
        print("encoder ", encoder)
        for data_name in csv_names:
            print("data ", data_name)
            # `k` is part of the distance file name (presumably the k-mer
            # size used to build the matrix -- TODO confirm).
            for k in range(4,5):
                print("k ", k)

                for feature, max_dim in features_needed:
                    print(feature)
                    print(max_dim)
                    path_to_data = f'data/{data_name}'
                    df = pd.read_csv(f"{path_to_data}.csv")
                    x_list = df['Accession (version)'].to_list()
                    y_list = df['Family'].to_list()
                    # Order samples by accession.
                    # NOTE(review): the rows/columns of the distance matrix are
                    # assumed to follow this accession-sorted order -- confirm
                    # against the code that produced the .npy file.
                    x_list, y_list = zip(*sorted(zip(x_list, y_list)))
                    x_list = list(x_list)
                    y_list = list(y_list)
                    # Keep only samples whose family occurs at least 3 times.
                    indices = find_labels_with_min_count(y_list, min_count=3)
                    x_list = [x_list[i] for i in indices]
                    y_list = [y_list[i] for i in indices]

                    path_to_features = f'distances/{encoder}/{data_name}'
                    distance_matrix = np.load(path_to_features+f'/k{k}_distance_{feature}{max_dim}.npy')
                    # Restrict the matrix to the retained samples.
                    subD = distance_matrix[np.ix_(indices, indices)]
                    # Alternative input: path_to_features + f'/k{k}_distance.npy'
                    # (matrix without the feature/max_dim suffix).

                    # Encode family names as integers 0..C-1.
                    le = LabelEncoder()
                    y_int = le.fit_transform(y_list)  # shape (N,)
                    y_np = np.array(y_int, dtype=np.int32)
                    labels = y_np

                    # 1-NN or k-NN using the distance matrix
                    y_pred = leave_one_out_knn(subD, labels, n_neighbors=n_neighbors)

                    # Compute metrics
                    metrics_result = compute_metrics(labels, y_pred, average='macro')

                    # Print results. The header reports the real neighbor
                    # count; `k` is shown separately as the distance-file k.
                    print("Leave-One-Out {}-NN Results (distance k={})".format(n_neighbors, k))
                    for s, v in metrics_result.items():
                        if v is not None:
                            print(f"{s}: {v:.4f}")
                        else:
                            print(f"{s}: None")
                    print("===============================================")
                    print("===============================================")
                    print("===============================================")

encoder  PSRT
data  NCBI_record_valid_count
k  4
facet
0
Leave-One-Out k-NN Results (k=4)
accuracy: 0.8922
balanced_accuracy: 0.7807
precision_macro: 0.8243
recall_macro: 0.7807
f1_macro: 0.7936
roc_auc_macro: 0.8901
