In [None]:
# -------------------------------
# Import necessary libraries
# -------------------------------
import os
import sys
import random
from itertools import product
from collections import Counter

import numpy as np
import pandas as pd
from Bio import SeqIO

from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import pdist, squareform

# Local module
from psrt import *




# Encoder name; it selects the features/<encoder>/ directory used below.
encoder = 'PSRT'

# Base names of the metadata CSV files (under data/) to process.
# Other datasets that can be enabled by adding them to the list:
#   'Yau2022_record_processed'
#   'NCBI_record_valid_nucleotide'
#   'NCBI_record_valid_count'
csv_names = ['Yau2020_record_processed']


# -------------------------------
# Stacking
# -------------------------------




def stack_arrays_vertically(file_paths):
    """
    Build a 2-D matrix with one row per saved NumPy array.

    Every file is loaded with ``np.load`` and flattened to 1-D, then the
    flattened arrays are stacked row-wise in the order of ``file_paths``.

    Parameters:
        file_paths (list of str): Paths to the ``.npy`` files, in row order.

    Returns:
        np.ndarray: Array whose i-th row is the flattened content of
            ``file_paths[i]``.
    """
    flattened_rows = []
    for npy_path in file_paths:
        loaded = np.load(npy_path)
        # Collapse any shape to a single row so that vstack yields one row per file.
        flattened_rows.append(loaded.reshape(-1))

    return np.vstack(flattened_rows)


# encoder = 'PSRT'

# (feature name, highest dimension) pairs; every dimension 0..highest
# (inclusive) is stacked into its own matrix.
features_needed = [('betti', 2), ('f', 2), ('facet', 2), ('h', 3)]


# For every dataset / k / feature / dimension: collect the per-accession
# .npy files, order them by sorted accession ID, and save them as one
# stacked matrix (one row per accession).
for data_name in csv_names:
    print(data_name)
    path_to_data = f'data/{data_name}'

    # The metadata CSV does not depend on k, so it is read once per dataset.
    df = pd.read_csv(f"{path_to_data}.csv")
    # Sorted accession IDs fix the row order of every stacked matrix.
    accessions = sorted(df['Accession (version)'].to_list())

    # NOTE(review): k_chosen is not assigned in the visible part of this file;
    # presumably it comes from `psrt` or an earlier notebook cell -- confirm.
    for k in range(k_chosen, k_chosen + 1):
        print(k)
        path_to_features = f'features/{encoder}/{data_name}/{k}'

        # Path to folder with the per-accession feature files
        folder_path = path_to_features

        # The stacked outputs are written to the parent directory, so this
        # listing is the same for every feature/dimension: scan it only once.
        npy_files = [
            f for f in os.listdir(folder_path)
            if f.endswith('.npy') and os.path.isfile(os.path.join(folder_path, f))
        ]

        for considered_feature, considered_max_dimension in features_needed:
            print(considered_feature, considered_max_dimension)
            for d in range(considered_max_dimension + 1):
                print("d ", d)

                # Feature files are expected to be named
                # <accession>_<feature><d>.npy; the same tag is used both to
                # select the files and to strip the suffix.
                feature_tag = f'_{considered_feature}{d}'
                file_names = [f for f in npy_files if feature_tag in f]

                # Extract accession IDs from filenames
                modified_names = [f.rsplit(feature_tag, 1)[0] for f in file_names]

                # Map accession -> index in file_names
                element_to_index = {name: idx for idx, name in enumerate(modified_names)}

                # Keep accessions that have a feature file; the dict lookup
                # avoids a linear scan of modified_names per accession.
                matched_accessions = [acc for acc in accessions if acc in element_to_index]
                print("accessions ", len(accessions))
                print("matched" , len(matched_accessions))

                if not matched_accessions:
                    # np.vstack raises on an empty list; skip this combination
                    # so the remaining features are still processed.
                    print(f"no feature files matched for {considered_feature}{d}; skipping")
                    continue

                # Indices into file_names, in sorted-accession order
                indices = [element_to_index[acc] for acc in matched_accessions]

                # Sort file names to match accession order
                file_names_sorted = [file_names[i] for i in indices]

                # Full paths to sorted .npy files
                file_paths = [os.path.join(folder_path, fname) for fname in file_names_sorted]

                np.save(f'features/{encoder}/{data_name}/k{k}_{considered_feature}{d}.npy', stack_arrays_vertically(file_paths))



