# In [None]:
# -------------------------------
# Import necessary libraries
# -------------------------------
import os
import sys
import random
from itertools import product
from collections import Counter

import numpy as np
import pandas as pd
from Bio import SeqIO

from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import pdist, squareform

# Local module
from psrt import *




# -------------------------------
# Configuration parameters
# -------------------------------
# Name of the feature family; used as a folder component of the output path.
encoder = "PSRT"

# Highest dimension for which curves are collected.
max_dimension = 2

# The filtration grid built in the featurization loop has ``num + 1`` values.
num = 2

# k-mer length processed by the featurization loop.
k_chosen = 4

# Alphabet from which the k-mers are generated.
alphabet = list("ACGT")

# Dataset base names (CSV and FASTA share the same prefix).
# Uncomment an entry to include that dataset in the run.
csv_names = [
    "Yau2020_record_processed",
    # "Yau2022_record_processed",
    # "NCBI_record_valid_nucleotide",
    # "NCBI_record_valid_count",
]

# -------------------------------
# Loop over datasets
# -------------------------------
# For every dataset and every sequence in it, compute one curve per k-mer for
# each feature family (betti, f, h, facet) and each dimension, stack the curves
# into one array with one row per k-mer, and save each array as a .npy file.
# `generate_all_kmers`, `occurrence` and `PH` are not defined in this file;
# presumably they are provided by the `psrt` star import.
for data_name in csv_names:
    print(data_name)
    path_to_data = f"./data/{data_name}"

    # The CSV and FASTA inputs do not depend on k, so they are read once per
    # dataset rather than once per k-mer size.
    df = pd.read_csv(f"{path_to_data}.csv")
    x = df['Accession (version)'].to_list()

    # Parse FASTA sequences into a list of SeqRecord objects
    file_path = f'{path_to_data}.fasta'
    DNAs = list(SeqIO.parse(file_path, "fasta"))

    # Create a dictionary: accession -> SeqRecord
    dna_dict = {record.id: record for record in DNAs}

    # Match sequences with accessions from the CSV, preserving CSV order.
    # Accessions without a FASTA record are dropped, so positions in
    # `ordered_dnas` do not necessarily line up with positions in `x`.
    ordered_dnas = [dna_dict[accession] for accession in x if accession in dna_dict]
    dna_list = [str(record.seq) for record in ordered_dnas]

    skipped = len(x) - len(ordered_dnas)
    if skipped:
        print(f"warning: {skipped} CSV accession(s) have no FASTA record and are skipped")

    # Only k = k_chosen is processed; widen this range to sweep several
    # k-mer sizes.
    for k in range(k_chosen, k_chosen + 1):
        print(k)

        # Generate all possible k-mers of length k
        kmers = generate_all_kmers(alphabet, k)

        # Filtration values: the multiples 0, 4**k, ..., num * 4**k
        specific_filtration = np.array([i * 4**k for i in range(0, num + 1)])

        # Curve recorded for a k-mer that never occurs in the sequence.
        zero_curve = np.array((1 + num) * [0])

        save_path = f"features/{encoder}/{data_name}/{k}"

        # -------------------------------
        # Process each DNA sequence
        # -------------------------------
        for i, (record, dna) in enumerate(zip(ordered_dnas, dna_list)):
            accession = record.id
            print("sample ", i)
            print("accession ", accession)

            # Per-dimension row accumulators (one row per k-mer). Rows are
            # collected in lists and stacked once at the end, which avoids
            # re-copying the whole array for every k-mer.
            betti_rows = [[] for _ in range(max_dimension + 1)]
            f_rows = [[] for _ in range(max_dimension + 1)]
            h_rows = [[] for _ in range(max_dimension + 2)]
            facet_rows = [[] for _ in range(max_dimension + 1)]

            # -------------------------------
            # Loop over all possible k-mers
            # -------------------------------
            for kmer in kmers:
                points = occurrence(dna, kmer)

                if len(points) == 0:
                    # Absent k-mer: every family and dimension gets the zero
                    # curve, expressed here as "no curve available for any d".
                    betti_curves = f_curves = h_curves = facet_curves = {}
                    missing_curve = zero_curve
                else:
                    # Turn the occurrences into a column of 1-D points.
                    points = np.array(points)[:, np.newaxis]

                    ph = PH(
                        points,
                        max_dimension=max_dimension,
                        max_edge_length=2.0,
                        specific_filtration=specific_filtration
                    )

                    alphas, betti_curves = ph.betti_curves()
                    f_curves = ph.compute_f_vector_curves()
                    h_curves = ph.compute_h_vector_curves()
                    facet_curves = ph.facet_curves()

                    # Dimensions missing from a result are filled with zeros
                    # shaped like the filtration axis.
                    missing_curve = np.zeros_like(alphas)

                for d in range(max_dimension + 1):
                    betti_rows[d].append(betti_curves.get(d, missing_curve))
                    f_rows[d].append(f_curves.get(d, missing_curve))
                    facet_rows[d].append(facet_curves.get(d, missing_curve))

                # The h-curves are collected for one more index than the others.
                for d in range(max_dimension + 2):
                    h_rows[d].append(h_curves.get(d, missing_curve))

            # Stack once per dimension: each result is 2-D, one row per k-mer.
            betti_result = [np.vstack(rows) for rows in betti_rows]
            f_result = [np.vstack(rows) for rows in f_rows]
            h_result = [np.vstack(rows) for rows in h_rows]
            facet_result = [np.vstack(rows) for rows in facet_rows]

            # -------------------------------
            # Save results to .npy files
            # -------------------------------
            os.makedirs(save_path, exist_ok=True)   # create folder path if missing

            # File names use the id of the record that was actually processed.
            # Indexing the CSV list by sample position would mislabel every
            # file after the first accession that has no FASTA record.
            for d in range(max_dimension + 1):
                np.save(f"{save_path}/{accession}_betti{d}.npy", betti_result[d])
                np.save(f"{save_path}/{accession}_f{d}.npy", f_result[d])
                np.save(f"{save_path}/{accession}_facet{d}.npy", facet_result[d])

            for d in range(max_dimension + 2):
                np.save(f"{save_path}/{accession}_h{d}.npy", h_result[d])

