In [None]:
# -------------------------------
# Import necessary libraries
# -------------------------------
import os
import sys
import random
from itertools import product
from collections import Counter

import numpy as np
import pandas as pd
from Bio import SeqIO

from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import pdist, squareform

# Local module
from psrt import *



# Datasets to process. Each name selects the sub-directory read from under
# features/<encoder>/ and written to under distances/<encoder>/.
# Currently disabled datasets (add to the list to enable):
#   'Yau2022_record_processed'
#   'NCBI_record_valid_nucleotide'
#   'NCBI_record_valid_count'
csv_names = ['Yau2020_record_processed']




# -------------------------------
# Distance
# -------------------------------

# -------------------------------
# Run Parameters (hard-coded)
# -------------------------------
# Encoder name; used as a path component for both input and output files.
encoder = 'PSRT'

# The main loop iterates range(k_chosen, k_chosen + 1), i.e. only this k.
k_chosen = 4

# -------------------------------
# Feature Configuration
# -------------------------------
# (feature name, max dimension) pairs. For each pair the main loop looks for
# one file per dimension d in 0..max dimension: k{k}_{feature}{d}.npy
features_needed = [
    ('betti', 2),
    ('f', 2),
    ('facet', 2),
    ('h', 3),
]

# -------------------------------
# Distance Computation Function
# -------------------------------
def compute_pairwise_distances(X, metric='euclidean', **kwargs):
    """
    Return the square, symmetric matrix of pairwise distances between rows of X.

    Parameters
    ----------
    X : array-like
        Observations, one per row, as accepted by scipy's ``pdist``.
    metric : str or callable
        Distance metric forwarded to ``pdist`` (default ``'euclidean'``).
    **kwargs
        Extra metric arguments forwarded to ``pdist`` (e.g. ``p`` for
        ``'minkowski'``).

    Returns
    -------
    numpy.ndarray
        The condensed ``pdist`` output expanded by ``squareform``.
    """
    # pdist yields the condensed form; squareform expands it to a full matrix.
    return squareform(pdist(X, metric=metric, **kwargs))

# -------------------------------
# Main Script
# -------------------------------
def scale_to_unit_max(dist_matrix):
    """
    Divide a distance matrix by its largest entry so that entry becomes 1.

    If the largest entry is 0 (e.g. a single sample, or all samples
    identical) the matrix is returned unchanged; dividing would turn every
    entry into NaN. An empty matrix is also returned unchanged.

    Parameters
    ----------
    dist_matrix : numpy.ndarray
        Matrix of pairwise distances.

    Returns
    -------
    numpy.ndarray
        The rescaled matrix, or the input itself in the degenerate cases.
    """
    if dist_matrix.size == 0:
        return dist_matrix
    peak = np.max(dist_matrix)
    if peak == 0:
        # All distances are zero: already "normalized", avoid 0/0 -> NaN.
        return dist_matrix
    return dist_matrix / peak


if __name__ == "__main__":

    # Distance settings are the same for every file, so set them once.
    # Minkowski with p = 2 is the Euclidean distance.
    metric = 'minkowski'
    p = 2

    print(f"Encoder: {encoder}")

    for data_name in csv_names:
        print(f"Dataset: {data_name}")

        # Only k_chosen itself is processed; the range form makes it easy to
        # widen to several k values later.
        for k in range(k_chosen, k_chosen + 1):
            print(f"k = {k}")

            for feature, max_dim in features_needed:
                print(f"  Feature: {feature}")

                # One feature file per dimension 0..max_dim (inclusive).
                for d in range(max_dim + 1):
                    input_path = f'features/{encoder}/{data_name}/k{k}_{feature}{d}.npy'

                    # Missing feature files are reported and skipped.
                    if not os.path.exists(input_path):
                        print(f"    [Skipped] File not found: {input_path}")
                        continue

                    # Load and normalize data
                    X = np.load(input_path)
                    X = StandardScaler().fit_transform(X)

                    # Compute distance matrix and rescale by its maximum
                    print(f"    Computing pairwise distances (metric: {metric})...")
                    dist_matrix = compute_pairwise_distances(X, metric=metric, p=p)
                    dist_matrix = scale_to_unit_max(dist_matrix)

                    # Save output
                    output_dir = f'distances/{encoder}/{data_name}'
                    os.makedirs(output_dir, exist_ok=True)
                    output_path = f'{output_dir}/k{k}_distance_{feature}{d}.npy'
                    np.save(output_path, dist_matrix)

                    print(f"    [Saved] {output_path}")

                print("  " + "-" * 50)
            print("=" * 60)



















