# Cluster Gene Neighborhoods

In [4]:
import os
# Ensure multiprocessing is imported for os.cpu_count()
import multiprocessing # <-- Ensure this is here if not already.
from threadpoolctl import threadpool_limits 

# Set OMP_NUM_THREADS as early as possible.
# This variable primarily controls internal threading for numerical libraries like OpenBLAS/MKL
# used by NumPy/SciPy.
# When using joblib with n_jobs > 1 (process-based parallelism), each child process
# will inherit this setting. However, to avoid oversubscription and ensure optimal performance
# when a single joblib process performs an operation that *could* be multi-threaded itself
# (like sparse matrix dot products), we will use `threadpool_limits` within the joblib task.
# So, setting it globally is fine, but `threadpool_limits` will override it locally.
num_logical_cores = os.cpu_count()
if num_logical_cores:
    # For OMP_NUM_THREADS, typically use all logical cores.
    os.environ["OMP_NUM_THREADS"] = str(num_logical_cores) 
    print(f"Set OMP_NUM_THREADS to {num_logical_cores} for SciPy/NumPy internal multi-threading.")
else:
    print("Could not detect CPU count. OMP_NUM_THREADS not explicitly set.")


import pandas as pd
import sqlite3
from collections import defaultdict
import re
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
import matplotlib.pyplot as plt
import numpy as np
import datetime
import string
import scipy.sparse as sp
import time
import scipy.sparse as sp
from tqdm import tqdm
import gc
from joblib import Parallel, delayed 


# --- Configuration for the SQLite DB ---
# SQLITE_DB_PATH will be set in the __main__ block for convenience
GENES_TABLE = 'attributes' # Main table for hit gene data
NEIGHBORS_TABLE = 'neighbors' # Table for neighboring gene data

COL_NEIGHBORHOOD_ID = 'organism' # Use 'organism' as the unique identifier for each neighborhood for our clustering
COL_GENE_ID = 'id' # Unique identifier for each gene/protein in both tables
COL_LINKING_KEY = 'id' # The linking column between 'attributes' and 'neighbors'
COL_ACCESSION_ID = 'accession' # Column for accession (UniProt) ID

COL_FUNCTION_DESC = 'desc' # Main functional description in both tables
COL_PFAM_IDS = 'family' # Column for PFAM family IDs in both tables
COL_INTERPRO_IDS = 'ipro_family' # Column for InterPro family IDs in both tables
COL_REL_START = 'rel_start' # Column for relative start position
COL_REL_STOP = 'rel_stop' # Column for relative stop position

HIT_GENE_WEIGHT_FACTOR = 10 # Factor by which hit gene features are "copied" for emphasis
DIRECT_NEIGHBOR_WEIGHT_FACTOR = 3 # Factor for direct neighbor domain features

COL_SSN_CLUSTER_ID = 'cluster_num' # Column in 'attributes' that holds the SSN cluster ID
# DEFAULT_SSN_CLUSTER_VALUE_TO_FILTER can be a list of values that should be ignored as valid SSN clusters
DEFAULT_SSN_CLUSTER_VALUE_TO_FILTER = [None, 0] # Example: Filter out None or 0 as valid SSN IDs

SAVE_PLOTS = True # Set to True to save plots to files
OUTPUT_DIR = 'gnn_cluster_plots_OvoA' # Directory to save plots
REPORT_FILENAME_BASE = 'gnn_clustering_report' # Base name, will append info dynamically
OUTPUT_FORMATS = ['pdf'] # ['svg', 'png', 'pdf'] # List of formats to save plots in
DPI = 300 # Dots per inch for raster formats like 'png'
HIGHLIGHT_COLOR = 'red' # Color for the original input sequence's leaf label

# Dynamic plot sizing parameters
MIN_PLOT_HEIGHT = 8 # Minimum height of the plot in inches
HEIGHT_PER_LEAF = 0.05 # Adjust this value (e.g., 0.2 to 0.5) to compress/stretch the Y-axis
MAX_PLOT_HEIGHT = 40 # Maximum height to prevent excessively tall plots

MIN_PLOT_WIDTH = 10 # Minimum width of the plot in inches
WIDTH_PER_LEAF = 0.3 # Adjust this value (e.g., 0.3 to 0.7) to compress/stretch the X-axis (more horizontal space for branches)
MAX_PLOT_WIDTH = 150 # Maximum width to prevent excessively wide plots

# Configuration for collapsing similar neighborhoods
COLLAPSE_IDENTICAL_NEIGHBORHOODS = True # Set to True to enable collapsing
COLLAPSE_CORE_SIMILARITY_THRESHOLD = 0.0 # Strict similarity for hit gene + direct neighbors. Usually 0.0 for exact matches.
COLLAPSE_FULL_NEIGHBORHOOD_SIMILARITY_THRESHOLD = 0.3 # Similarity for the entire neighborhood. e.g., 0.3 for 70% similarity.

# Define a constant for the minimum number of samples to trigger parallel pdist
MIN_ITEMS_FOR_PARALLEL_PROCESSING = 20 # Adjust this value as needed

# Compile regex patterns and create frozenset for performance
_IPR_REGEX = re.compile(r'IPR\d+', re.IGNORECASE)
_PFAM_REGEX = re.compile(r'PF\d+', re.IGNORECASE)
_UNINFORMATIVE_TERMS = frozenset(['none', '', 'null', 'uncharacterized protein'])
# ----------------------------------------------------------------------


def parse_annotation_string(annotation_str, prefix=""):
    """
    Parses a string containing annotations (e.g., function, InterPro, PFAM)
    to extract individual features, applying a prefix.
    Handles potential multiple IDs separated by hyphens or semicolons.
    Does NOT split by spaces or underscores for terms like "cysteine desulfurase".
    Filters out "Uncharacterized protein", empty strings, and 'none' values.
    """
    if not isinstance(annotation_str, str) or pd.isna(annotation_str) or annotation_str.lower().strip() in _UNINFORMATIVE_TERMS:
        return set()

    features = set()
    # Split only by hyphens (-) and semicolons (;)
    parts = [p.strip() for p in re.split(r'[-;]', annotation_str) if p.strip()]

    for part in parts:
        # After splitting, re-check for uninformative parts
        if part.lower().strip() in _UNINFORMATIVE_TERMS:
            continue

        # Prioritize InterPro and PFAM IDs if they match the patterns
        if _IPR_REGEX.match(part):
            features.add(f"{prefix}{part.upper()}")
        elif _PFAM_REGEX.match(part):
            features.add(f"{prefix}{part.upper()}")
        else:
            # General terms, clean them up slightly (collapse multiple spaces)
            clean_part = re.sub(r'\s+', ' ', part).lower().strip()

            if clean_part: # Check if it's not empty after cleaning
                features.add(f"{prefix}{clean_part}")
    return features


def extract_features_from_gene_row(gene_row, current_weight_factor=1, base_prefix="N_", 
                                   include_desc=True, include_pfam=True, include_interpro=True):
    """
    Extracts features (InterPro, PFAM, function terms) from a single gene row,
    applying a base prefix and duplicating features by current_weight_factor.
    """
    
    features_set = set()
    raw_features = set()

    if include_desc:
        function_desc = gene_row[COL_FUNCTION_DESC]
        raw_features.update(parse_annotation_string(function_desc))
    
    if include_pfam:
        pfam_ids = gene_row[COL_PFAM_IDS]
        raw_features.update(parse_annotation_string(pfam_ids))
    
    if include_interpro:
        interpro_ids = gene_row[COL_INTERPRO_IDS]
        raw_features.update(parse_annotation_string(interpro_ids))

    if current_weight_factor > 1:
        for feature in raw_features:
            for i in range(current_weight_factor):
                features_set.add(f"{base_prefix}{feature}_w{i}") 
    else: 
        for feature in raw_features:
            features_set.add(f"{base_prefix}{feature}")

    return features_set


def _plot_dendrogram(linked, neighborhood_ids_subset, labels_map, distance_threshold, 
                     plot_title_base, label_type, original_input_sequence_id,
                     save_plots, output_dir, output_formats, dpi, 
                     min_plot_height, height_per_leaf, max_plot_height,
                     min_plot_width, width_per_leaf, max_plot_width,
                     report_file=None):
    """
    Helper function to generate a single dendrogram plot.
    """
    # An internal helper function for consistent logging
    def _write_and_print_internal(text):
        # Only print to console for plots as it can be very verbose
        # and only save to report if it's a critical message.
        # Plotting messages are now controlled outside this function.
        if report_file:
            report_file.write(text + '\n')
    
    fig_title = f"{plot_title_base} ({label_type.capitalize()} Labels)"
    
    labels_to_use = []
    # Store the hit_id for each label in the order it will be plotted, to match with xticklabels
    accession_ids_for_labels = []
    for nh_id in neighborhood_ids_subset:
        organism_name, hit_id_internal, ssn_cluster_id, accession_id, _ = labels_map.get(nh_id, ('Unknown', 'Unknown', None, 'Unknown', None))

        if label_type == 'organism':
            labels_to_use.append(organism_name.rstrip('.'))
        elif label_type == 'id': # This now means 'accession'
            labels_to_use.append(accession_id) # Use accession_id for 'id' labels
        else:
            labels_to_use.append(nh_id) # Fallback, should not be hit
        accession_ids_for_labels.append(accession_id)

    # Figure size calculation
    num_leaves = len(neighborhood_ids_subset)
    
    calculated_height = max(min_plot_height, num_leaves * height_per_leaf)
    final_height = min(calculated_height, max_plot_height)
    
    calculated_width = max(min_plot_width, num_leaves * width_per_leaf)
    final_width = min(calculated_width, max_plot_width)
    
    plt.figure(figsize=(final_width, final_height)) # Use dynamic width and height
    
    dendrogram(linked,
               orientation='top',
               labels=labels_to_use,
               distance_sort='descending',
               show_leaf_counts=True)
    
    plt.title(fig_title)
    plt.xlabel('Gene Neighborhood (Labeled by ' + ('Accession' if label_type == 'id' else label_type.capitalize()) + ')')     # Y-axis label is now fixed to Linear Scale
    plt.ylabel(f'Jaccard Distance (Linear Scale, Threshold: {distance_threshold})') 
    plt.axhline(y=distance_threshold, color='r', linestyle='--', label=f'Cut-off at {distance_threshold}')
    plt.legend()

    ax = plt.gca()    
    plt.setp(ax.get_xticklabels(), rotation=90, ha="right", rotation_mode="anchor")
    
    # Color specific leaf label
    if original_input_sequence_id:
        for i, tick_label in enumerate(ax.get_xticklabels()):
            if accession_ids_for_labels[i] == original_input_sequence_id:
                tick_label.set_color(HIGHLIGHT_COLOR)
                tick_label.set_weight('bold') # Make it bold for more prominence
    
    plt.tight_layout()

    if save_plots:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        clean_plot_title_base = re.sub(r'[^\w\s-]', '', plot_title_base).replace(' ', '_')
        base_filename = f"{clean_plot_title_base}_{label_type}_labels"
        
        # Save in multiple formats ---
        for fmt in output_formats:
            full_filename = f"{base_filename}.{fmt}"
            plt.savefig(os.path.join(output_dir, full_filename), format=fmt, dpi=dpi)
        plt.close()
    else:
        plt.show()


def parallel_pdist_jaccard(feature_matrix, num_cores=-1):
    """
    Calculates the condensed Jaccard distance matrix.
    Uses joblib parallelization only if n_samples > MIN_ITEMS_FOR_PARALLEL_PROCESSING.
    Otherwise, it runs sequentially.
    """
    if not isinstance(feature_matrix, sp.csr_matrix):
        if isinstance(feature_matrix, (sp.csc_matrix, sp.lil_matrix, sp.coo_matrix)):
            feature_matrix = feature_matrix.tocsr()
        else:
            raise TypeError("Input feature_matrix must be a SciPy sparse matrix (preferably CSR).")

    n_samples = feature_matrix.shape[0]
    if n_samples <= 1:
        return np.array([])
    
    # Determine the number of cores to use, with a fallback to 1 if detection fails
    if num_cores == -1:
        detected_cores = os.cpu_count()
        num_cores = detected_cores if detected_cores is not None and detected_cores > 0 else 1
    elif num_cores == 0: # Treat 0 as explicit sequential execution
        num_cores = 1

    # --- Pre-compute all feature sets once in the parent process ---
    # This is always beneficial for performance regardless of parallelization,
    # as it avoids repeated set construction from the sparse matrix.
    # The tqdm desc is now more informative and reflects the context.
    print(f"  Pre-calculating {n_samples} feature sets...")
    set_precomputation_start = time.time()
    feature_sets = [
        set(feature_matrix.indices[feature_matrix.indptr[i]:feature_matrix.indptr[i+1]])
        for i in tqdm(range(n_samples), desc=f"  Pre-computing feature sets (N={n_samples})", leave=False)
    ]
    print(f"  Feature set pre-calculation took {time.time() - set_precomputation_start:.2f} seconds.")

    # Explicitly delete the sparse matrix after feature_sets are extracted.
    del feature_matrix
    gc.collect()

    # Define the core distance calculation logic (for both sequential and parallel workers)
    def _calculate_distances_from_sets(start_i, end_i, all_feature_sets_ref, n_samples_total_ref):
        # IMPORTANT: Limit internal (BLAS/LAPACK) threading for each joblib process to 1.
        with threadpool_limits(limits=1, user_api='blas'):
            distances_chunk = []
            
            for i in range(start_i, end_i):
                set_i = all_feature_sets_ref[i]
                for j in range(i + 1, n_samples_total_ref):
                    set_j = all_feature_sets_ref[j]

                    intersection_size = len(set_i.intersection(set_j))
                    union_size = len(set_i.union(set_j))

                    if union_size == 0:
                        distances_chunk.append(0.0)
                    else:
                        distances_chunk.append(1.0 - (intersection_size / union_size))
            return distances_chunk

    total_i_iterations = n_samples - 1
    if total_i_iterations <= 0:
        return np.array([])

    # --- Conditional Parallelization Logic ---
    if n_samples < MIN_ITEMS_FOR_PARALLEL_PROCESSING or num_cores == 1:
        print(f"  Running Jaccard distance sequentially for N={n_samples} (below parallel threshold or num_cores=1).")
        results = [_calculate_distances_from_sets(0, total_i_iterations, feature_sets, n_samples)]
    else:
        print(f"  Running Jaccard distance in parallel for N={n_samples} using {num_cores} cores.")
        i_ranges_for_tasks = []
        # Create more granular tasks for better load balancing, but not excessively so for small N
        num_tasks = min(total_i_iterations, num_cores * 4) # Up to 4 tasks per core, but no more than i_iterations
        if num_tasks == 0: return np.array([]) # Safety check for very small N
        
        chunk_size_for_i = max(1, (total_i_iterations + num_tasks - 1) // num_tasks)
        
        for k in range(0, total_i_iterations, chunk_size_for_i):
            start_i = k
            end_i = min(k + chunk_size_for_i, total_i_iterations)
            i_ranges_for_tasks.append((start_i, end_i))
            
        tasks = [
            delayed(_calculate_distances_from_sets)(start_i_chunk, end_i_chunk, feature_sets, n_samples)
            for start_i_chunk, end_i_chunk in i_ranges_for_tasks
        ]
        
        results = Parallel(n_jobs=num_cores, backend="loky", verbose=0)(tasks)

    condensed_distances = np.concatenate(results)
    return np.array(condensed_distances)




def _perform_collapsing(all_neighborhood_features, full_neighborhood_labels_map, 
                        core_neighborhood_features,
                        collapse_core_similarity_threshold, collapse_full_neighborhood_similarity_threshold,
                        output_prefix="", report_file=None, parallelize_pdist=False):
    """
    Performs a two-stage collapsing of similar neighborhoods.
    Stage 1: Group by core (hit + direct neighbors) features.
    Stage 2: Within these groups, sub-group by full neighborhood features.

    Returns: (final_neighborhood_features, final_neighborhood_labels_map, collapsed_groups_report)
    """
    # An internal helper function for consistent logging
    def _write_and_print_internal(text):
        print(text)
        if report_file:
            report_file.write(text + '\n')

    _write_and_print_internal(f"{output_prefix}  Starting two-stage collapsing (Core Thr: {collapse_core_similarity_threshold}, Full Thr: {collapse_full_neighborhood_similarity_threshold}).")
    
    collapsing_overall_start = time.time()

    collapsed_groups_report = {}
    
    # --- Stage 1: Group by CORE features (hit + direct neighbors) ---
    stage1_start = time.time()
    core_labels_ordered = sorted(list(core_neighborhood_features.keys()))
    if len(core_labels_ordered) < 2:
        _write_and_print_internal(f"{output_prefix}  Only {len(core_labels_ordered)} neighborhood(s) to process. Skipping collapsing.")
        return all_neighborhood_features, full_neighborhood_labels_map, collapsed_groups_report

    core_vocabulary = sorted(list(set.union(*core_neighborhood_features.values())))
    if not core_vocabulary:
        _write_and_print_internal(f"{output_prefix}  Warning: No core features found for collapsing. Skipping collapsing step.")
        return all_neighborhood_features, full_neighborhood_labels_map, collapsed_groups_report

    _write_and_print_internal(f"{output_prefix}  Stage 1: Building sparse matrix for {len(core_labels_ordered)} neighborhoods and {len(core_vocabulary)} core features...")
    matrix_build_start = time.time()
    core_feature_to_idx = {feature: i for i, feature in enumerate(core_vocabulary)}
    num_core_neighborhoods = len(core_labels_ordered)
    num_core_features = len(core_vocabulary)

    # Use LIL for efficient construction, then convert to CSR for computation
    core_feature_vectors_lil = sp.lil_matrix((num_core_neighborhoods, num_core_features), dtype=np.int8) 
    for i, nh_label in enumerate(tqdm(core_labels_ordered, desc=f"{output_prefix}  Stage 1: Populating core features", leave=False)): # <-- MODIFIED: Added tqdm
        for feature in core_neighborhood_features[nh_label]:
            if feature in core_feature_to_idx: # Safety check
                j = core_feature_to_idx[feature]
                core_feature_vectors_lil[i, j] = 1
    core_feature_vectors = core_feature_vectors_lil.tocsr() # Convert to CSR
    _write_and_print_internal(f"{output_prefix}  Stage 1: Sparse matrix built in {time.time() - matrix_build_start:.2f} seconds. Shape: {core_feature_vectors.shape}, NNZ: {core_feature_vectors.nnz}") # <-- ADDED: Detailed report
    gc.collect() # <-- ADDED: Garbage collection

    if core_feature_vectors.shape[0] < 2: # Check again after creating vectors
        _write_and_print_internal(f"{output_prefix}  Only {core_feature_vectors.shape[0]} valid core feature vector(s). Skipping collapsing.")
        return all_neighborhood_features, full_neighborhood_labels_map, collapsed_groups_report

    # Check for identical sparse vectors
    if num_core_neighborhoods > 1 and all(
        (core_feature_vectors[0] != core_feature_vectors[i]).nnz == 0 # Compare sparse rows
        for i in range(1, num_core_neighborhoods)
    ):
        core_pre_clusters = {core_labels_ordered[i]: 1 for i in range(len(core_labels_ordered))}
        _write_and_print_internal(f"{output_prefix}  All core feature vectors are identical. Treating as one initial core group.")
    else:
        _write_and_print_internal(f"{output_prefix}  Stage 1: Calculating core distances using scipy.pdist...") # <-- MODIFIED: Detailed report
        distance_calc_start = time.time()
        core_distances = parallel_pdist_jaccard(core_feature_vectors, num_cores=-1 if parallelize_pdist else 1)
        _write_and_print_internal(f"{output_prefix}  Stage 1: Core distance calculation took {time.time() - distance_calc_start:.2f} seconds.")

        _write_and_print_internal(f"{output_prefix}  Stage 1: Performing linkage and clustering for core features...") # <-- MODIFIED: Detailed report
        linkage_start = time.time()
        core_linked = linkage(core_distances, method='average')
        _write_and_print_internal(f"{output_prefix}  Stage 1: Linkage took {time.time() - linkage_start:.2f} seconds.") # <-- ADDED: Detailed report
        core_pre_clusters = fcluster(core_linked, collapse_core_similarity_threshold, criterion='distance')
    
    # Group neighborhoods by their initial core-feature-based cluster
    initial_core_groups = defaultdict(list)
    # The output of fcluster is an array of cluster IDs, map them back to labels_ordered
    # Handling for when core_pre_clusters is already a dict (all identical) or an array
    if isinstance(core_pre_clusters, np.ndarray): 
        for i, group_id in enumerate(core_pre_clusters):
            initial_core_groups[group_id].append(core_labels_ordered[i])
    else: 
        initial_core_groups = {1: core_labels_ordered} # Put all in one group (from the `if all(...)` block)
    
    _write_and_print_internal(f"{output_prefix}  Stage 1: Grouped into {len(initial_core_groups)} initial core groups based on a threshold of {collapse_core_similarity_threshold}. Total stage 1 took {time.time() - stage1_start:.2f} seconds.") # <-- ADDED: Detailed report

    # Explicit memory cleanup for Stage 1 objects
    del core_feature_vectors 
    if 'core_distances' in locals(): del core_distances 
    if 'core_linked' in locals(): del core_linked 
    gc.collect() # Garbage collection

    # --- Stage 2: Sub-group by FULL neighborhood features within each core group ---
    stage2_start = time.time()
    
    # We'll generate letter codes like A, B, ..., Z, AA, AB, ... for robustness
    def generate_letter_code(index):
        if index < 26:
            return string.ascii_uppercase[index]
        else:
            first_char_idx = (index // 26) - 1
            second_char_idx = index % 26
            return f"{string.ascii_uppercase[first_char_idx]}{string.ascii_uppercase[second_char_idx]}"

    _write_and_print_internal(f"{output_prefix}  Stage 2: Processing {len(initial_core_groups)} core groups for full neighborhood similarity...")

    # --- Define the worker function for processing a chunk of core groups ---
    def process_core_group_chunk(core_group_ids_chunk,
                                 all_neighborhood_features_ref,
                                 full_neighborhood_labels_map_ref,
                                 collapse_full_neighborhood_similarity_threshold_ref,
                                 parallelize_pdist_ref):
        
        results_to_aggregate = []
        local_collapsed_total_count = 0
        
        # Determine internal parallelism for pdist calls *within this worker*
        # This decision is based on parallelize_pdist_ref (if outer parallelization is on)
        # and the MIN_ITEMS_FOR_PARALLEL_PROCESSING threshold.
        # So, if parallelize_pdist_ref is True, then we allow the internal pdist calls to be parallel
        # if they meet their own MIN_ITEMS_FOR_PARALLEL_PROCESSING threshold (which is now called MIN_ITEMS_FOR_PARALLEL_PROCESSING).
        # Otherwise, internal pdist calls are sequential.
        num_cores_for_internal_pdist = -1 if parallelize_pdist_ref else 1 

        with threadpool_limits(limits=1, user_api='blas'):
            for group_id in core_group_ids_chunk:
                members_in_core_group = initial_core_groups[group_id]
                
                if len(members_in_core_group) < 2:
                    member_label = members_in_core_group[0]
                    results_to_aggregate.append((member_label,
                                                 all_neighborhood_features_ref[member_label],
                                                 full_neighborhood_labels_map_ref[member_label],
                                                 None))
                    continue

                sub_group_vocabulary = sorted(list(set.union(*[all_neighborhood_features_ref[m] for m in members_in_core_group])))
                if not sub_group_vocabulary:
                    sub_group_assignments = {m: 1 for m in members_in_core_group}
                else:
                    sub_group_feature_to_idx = {feature: i for i, feature in enumerate(sub_group_vocabulary)}
                    num_sub_group_neighborhoods = len(members_in_core_group)
                    num_sub_group_features = len(sub_group_vocabulary)

                    sub_group_feature_vectors_lil = sp.lil_matrix((num_sub_group_neighborhoods, num_sub_group_features), dtype=np.int8)
                    for i, nh_label in enumerate(members_in_core_group):
                        for feature in all_neighborhood_features_ref[nh_label]:
                            if feature in sub_group_feature_to_idx:
                                j = sub_group_feature_to_idx[feature]
                                sub_group_feature_vectors_lil[i, j] = 1
                    sub_group_feature_vectors = sub_group_feature_vectors_lil.tocsr()

                    if num_sub_group_neighborhoods > 1 and all(
                        (sub_group_feature_vectors[0] != sub_group_feature_vectors[i]).nnz == 0
                        for i in range(1, num_sub_group_neighborhoods)
                    ):
                        sub_group_assignments = {members_in_core_group[i]: 1 for i in range(len(members_in_core_group))}
                    else:
                        sub_group_distances = parallel_pdist_jaccard(
                            sub_group_feature_vectors,
                            num_cores=num_cores_for_internal_pdist # Use the determined internal pdist cores
                        )
                        if sub_group_distances.size == 0 and num_sub_group_neighborhoods <= 1: # Added <=1 for robustness
                            sub_group_assignments = {members_in_core_group[0]: 1}
                        elif sub_group_distances.size == 0 and num_sub_group_neighborhoods > 1:
                            sub_group_assignments = {m: 1 for m in members_in_core_group}
                        else:
                            sub_group_linked = linkage(sub_group_distances, method='average')
                            sub_group_assignments = fcluster(sub_group_linked, collapse_full_neighborhood_similarity_threshold_ref, criterion='distance')
                    
                    del sub_group_feature_vectors
                    if 'sub_group_distances' in locals(): del sub_group_distances
                    if 'sub_group_linked' in locals(): del sub_group_linked
                    gc.collect()

                current_sub_groups = defaultdict(list)
                if isinstance(sub_group_assignments, np.ndarray):
                    for i, sub_cluster_id in enumerate(sub_group_assignments):
                        current_sub_groups[sub_cluster_id].append(members_in_core_group[i])
                else:
                    current_sub_groups = {1: members_in_core_group}

                for sub_cluster_id in sorted(current_sub_groups.keys()):
                    collapsed_members = current_sub_groups[sub_cluster_id]
                    
                    if len(collapsed_members) > 1:
                        local_collapsed_total_count += (len(collapsed_members) - 1)
                        representative_label = collapsed_members[0]
                        
                        union_features = set()
                        for member_label in collapsed_members:
                            union_features.update(all_neighborhood_features_ref[member_label])

                        results_to_aggregate.append((representative_label,
                                                     union_features,
                                                     full_neighborhood_labels_map_ref[representative_label],
                                                     collapsed_members))
                    else:
                        member_label = collapsed_members[0]
                        results_to_aggregate.append((member_label,
                                                     all_neighborhood_features_ref[member_label],
                                                     full_neighborhood_labels_map_ref[member_label],
                                                     None))
        return results_to_aggregate, local_collapsed_total_count


    # Prepare arguments for parallel execution
    all_core_group_ids = sorted(list(initial_core_groups.keys()))
    num_total_core_groups = len(all_core_group_ids)

    # --- NEW: Conditional Parallelization for Stage 2 Outer Loop ---
    if num_total_core_groups < MIN_ITEMS_FOR_PARALLEL_PROCESSING or not parallelize_pdist:
        _write_and_print_internal(f"{output_prefix}  Stage 2: Running sequentially for {num_total_core_groups} core groups (below parallel threshold or parallelization disabled).")
        # Run sequentially
        results_from_workers = [
            process_core_group_chunk(
                all_core_group_ids,
                all_neighborhood_features,
                full_neighborhood_labels_map,
                collapse_full_neighborhood_similarity_threshold,
                False # Explicitly tell inner pdist to run sequentially if outer is sequential
            )
        ]
    else:
        # Determine the number of cores to use for stage 2 parallelism
        num_stage2_cores = os.cpu_count() if parallelize_pdist else 1
        if num_stage2_cores <= 0: num_stage2_cores = 1

        _write_and_print_internal(f"{output_prefix}  Stage 2: Distributing {num_total_core_groups} core groups among {num_stage2_cores} workers (parallelized).")

        # Create chunks of core group IDs to distribute
        chunk_size = max(1, num_total_core_groups // num_stage2_cores)
        group_id_chunks = [all_core_group_ids[i:i + chunk_size] for i in range(0, num_total_core_groups, chunk_size)]
        
        results_from_workers = Parallel(n_jobs=num_stage2_cores, backend="loky", verbose=100)(
            delayed(process_core_group_chunk)(
                chunk,
                all_neighborhood_features,
                full_neighborhood_labels_map,
                collapse_full_neighborhood_similarity_threshold,
                parallelize_pdist # Pass the overall parallelize_pdist flag
            ) for chunk in group_id_chunks
        )

    # --- Aggregate results from workers in the main process ---
    final_neighborhood_features = {}
    final_neighborhood_labels_map = {}
    collapsed_total_count = 0
    unique_collapsed_group_counter = 0 # Single global counter

    for worker_results, worker_collapsed_count in results_from_workers:
        collapsed_total_count += worker_collapsed_count
        for representative_label, features, original_labels_map_entry, collapsed_members in worker_results:
            if collapsed_members is not None: # This was a collapsed group
                letter_code = generate_letter_code(unique_collapsed_group_counter)
                unique_collapsed_group_counter += 1
                
                orig_organism, orig_hit_id, orig_ssn_id, orig_accession, _ = original_labels_map_entry
                final_neighborhood_labels_map[representative_label] = (orig_organism, orig_hit_id, orig_ssn_id, orig_accession, (len(collapsed_members), letter_code))
                
                collapsed_groups_report[letter_code] = {
                    'representative': representative_label,
                    'members': sorted(collapsed_members),
                    'count': len(collapsed_members)
                }
                final_neighborhood_features[representative_label] = features
            else: # Not a collapsed group, just an individual neighborhood
                final_neighborhood_features[representative_label] = features
                final_neighborhood_labels_map[representative_label] = original_labels_map_entry


    if collapsed_total_count > 0:
        _write_and_print_internal(f"{output_prefix}  Collapsed a total of {collapsed_total_count} neighborhoods into {len(final_neighborhood_features)} unique entities after two stages. Stage 2 took {time.time() - stage2_start:.2f} seconds.")
    else:
        _write_and_print_internal(f"{output_prefix}  No neighborhoods were collapsed after two stages (or disabled). Stage 2 took {time.time() - stage2_start:.2f} seconds.")

    _write_and_print_internal(f"{output_prefix}  Overall collapsing took {time.time() - collapsing_overall_start:.2f} seconds.")
    return final_neighborhood_features, final_neighborhood_labels_map, collapsed_groups_report


def cluster_gene_neighborhoods_from_sqlite(
    db_path,
    genes_table=GENES_TABLE,
    neighbors_table=NEIGHBORS_TABLE,
    col_neighborhood_id=COL_NEIGHBORHOOD_ID,
    col_gene_id=COL_GENE_ID,
    col_linking_key=COL_LINKING_KEY,
    col_accession_id=COL_ACCESSION_ID,
    col_function_desc=COL_FUNCTION_DESC,
    col_pfam_ids=COL_PFAM_IDS,
    col_interpro_ids=COL_INTERPRO_IDS,
    col_rel_start=COL_REL_START, 
    col_rel_stop=COL_REL_STOP,   
    col_ssn_cluster_id=COL_SSN_CLUSTER_ID,
    hit_gene_weight_factor=HIT_GENE_WEIGHT_FACTOR,
    direct_neighbor_weight_factor=DIRECT_NEIGHBOR_WEIGHT_FACTOR, 
    differentiate_by_ssn_cluster=False,
    ssn_cluster_value_to_filter=DEFAULT_SSN_CLUSTER_VALUE_TO_FILTER,
    collapse_identical_neighborhoods=COLLAPSE_IDENTICAL_NEIGHBORHOODS,
    collapse_core_similarity_threshold=COLLAPSE_CORE_SIMILARITY_THRESHOLD,
    collapse_full_neighborhood_similarity_threshold=COLLAPSE_FULL_NEIGHBORHOOD_SIMILARITY_THRESHOLD,
    original_input_sequence_id=None,
    distance_threshold=0.8,
    plot_dendrogram=True,
    save_plots=SAVE_PLOTS,
    output_dir=OUTPUT_DIR,
    output_formats=OUTPUT_FORMATS, 
    dpi=DPI,
    min_plot_height=MIN_PLOT_HEIGHT, 
    height_per_leaf=HEIGHT_PER_LEAF, 
    max_plot_height=MAX_PLOT_HEIGHT,
    min_plot_width=MIN_PLOT_WIDTH,
    width_per_leaf=WIDTH_PER_LEAF,
    max_plot_width=MAX_PLOT_WIDTH,
    report_file_handle=None, 
    parallelize_pdist=False
):
    """
    Clusters gene neighborhoods from an SQLite database based on detailed functional annotations,
    considering every row in 'attributes' as a hit gene and including its neighbors.
    Applies extra weight to the hit gene and direct neighbors.
    Optionally differentiates by SSN cluster.
    Can collapse highly similar neighborhoods (hit + direct neighbors + entire neighborhood) for plotting.
    Optimized for database I/O and parallel distance calculation.
    
    Returns:
        tuple: A tuple containing (clusters_dict, full_neighborhood_labels_map, collapsed_groups_report).
               clusters_dict is a dict where keys are SSN cluster IDs (or 'All' if no differentiation)
               and values are dicts of cluster_id -> list of unique_neighborhood_labels (which might be representatives).
               final_labels_map is a dict mapping unique_neighborhood_label to (organism, hit_id, ssn_cluster_id, accession_id, collapsed_members_info).
               collapsed_groups_report is a dict detailing the collapsed groups.
    """
    # An internal helper function for consistent logging
    def _write_and_print_internal(text):
        print(text)
        if report_file_handle:
            report_file_handle.write(text + '\n')

    start_time_overall = time.time()
    conn = sqlite3.connect(db_path)
    
    # --- Step 1: Fetch all data into DataFrames (optimized DB I/O) ---
    _write_and_print_internal("Fetching all hit gene data...")
    db_fetch_start = time.time()
    query_hit_genes = f"""
        SELECT {col_gene_id}, {col_neighborhood_id}, {col_function_desc}, 
               {col_pfam_ids}, {col_interpro_ids}, {col_ssn_cluster_id}, 
               {col_accession_id}
        FROM {genes_table}
    """
    hit_genes_df = pd.read_sql_query(query_hit_genes, conn)
    _write_and_print_internal(f"  Fetched {len(hit_genes_df)} hit genes in {time.time() - db_fetch_start:.2f} seconds.")

    if hit_genes_df.empty:
        _write_and_print_internal("No hit genes found in the 'attributes' table. Please check your database and configuration.")
        conn.close()
        return {}, {}, {}

    _write_and_print_internal("Fetching all neighbor data...")
    db_fetch_start = time.time()
    query_neighbors_all = f"""
        SELECT {col_linking_key}, {col_gene_id}, {col_function_desc}, 
               {col_pfam_ids}, {col_interpro_ids}, {col_rel_start}, {col_rel_stop}
        FROM {neighbors_table}
    """
    neighbors_df = pd.read_sql_query(query_neighbors_all, conn)
    conn.close() # Close connection as soon as data is fetched
    _write_and_print_internal(f"  Fetched {len(neighbors_df)} neighbor genes in {time.time() - db_fetch_start:.2f} seconds.")

    # Create a dictionary for quick lookup of neighbors by linking key
    _write_and_print_internal("Grouping neighbor data by linking key...")
    grouping_start = time.time()
    # Using tqdm for groupby apply to show progress
    neighbors_by_linking_key = {
        key: group.to_dict('records') 
        for key, group in tqdm(neighbors_df.groupby(col_linking_key), desc="Grouping neighbors", leave=False)
    }
    _write_and_print_internal(f"  Grouped neighbor data in {time.time() - grouping_start:.2f} seconds.")
    del neighbors_df # Free memory
    gc.collect() # Garbage collection

    all_neighborhood_features = defaultdict(set)
    core_neighborhood_features = defaultdict(set)
    full_neighborhood_labels_map = {}
    raw_ssn_ids_counts = defaultdict(int)

    # --- Step 2: Iterate through hit genes and their pre-fetched neighbors ---
    _write_and_print_internal(f"Processing features for {len(hit_genes_df)} hit genes and their neighborhoods...")
    feature_extraction_start_time = time.time()

    # Added tqdm for the main hit_genes_df iteration
    for idx, hit_row in tqdm(hit_genes_df.iterrows(), total=len(hit_genes_df), desc="Extracting features", unit="neighborhood"):
        hit_id = hit_row[COL_GENE_ID]
        organism_name = hit_row[COL_NEIGHBORHOOD_ID]
        ssn_cluster_id = hit_row[COL_SSN_CLUSTER_ID]
        accession_id = hit_row[COL_ACCESSION_ID]
        
        raw_ssn_ids_counts[ssn_cluster_id] += 1

        unique_neighborhood_label = f"{organism_name}_{hit_id}"
        
        current_full_features = set()
        current_core_features = set()

        # Add features of the HIT gene itself
        hit_full_features = extract_features_from_gene_row(
            gene_row=hit_row, 
            current_weight_factor=hit_gene_weight_factor,
            base_prefix="HIT_",
            include_desc=True, include_pfam=True, include_interpro=True
        )
        current_full_features.update(hit_full_features)
        
        hit_core_features = extract_features_from_gene_row(
            gene_row=hit_row, 
            current_weight_factor=1,
            base_prefix="HIT_CORE_",
            include_desc=False, include_pfam=True, include_interpro=True
        )
        current_core_features.update(hit_core_features)

        full_neighborhood_labels_map[unique_neighborhood_label] = (organism_name, hit_id, ssn_cluster_id, accession_id, None)

        raw_neighbor_genes_data = neighbors_by_linking_key.get(hit_id, [])

        closest_left_neighbor_gene_id = None
        closest_right_neighbor_gene_id = None
        max_neg_rel_stop = -np.inf
        min_pos_rel_start = np.inf

        for neighbor_row_dict in raw_neighbor_genes_data:
            rel_start = neighbor_row_dict[COL_REL_START]
            rel_stop = neighbor_row_dict[COL_REL_STOP]
            neighbor_gene_id = neighbor_row_dict[COL_GENE_ID]

            if rel_stop is not None and rel_stop < 0 and rel_stop > max_neg_rel_stop:
                max_neg_rel_stop = rel_stop
                closest_left_neighbor_gene_id = neighbor_gene_id

            if rel_start is not None and rel_start > 0 and rel_start < min_pos_rel_start:
                min_pos_rel_start = rel_start
                closest_right_neighbor_gene_id = neighbor_gene_id
            
        for neighbor_row_dict in raw_neighbor_genes_data:
            neighbor_gene_id = neighbor_row_dict[COL_GENE_ID]
            
            current_neighbor_weight_factor = 1
            is_direct_neighbor = False

            if (closest_left_neighbor_gene_id is not None and neighbor_gene_id == closest_left_neighbor_gene_id) or \
               (closest_right_neighbor_gene_id is not None and neighbor_gene_id == closest_right_neighbor_gene_id):
                current_neighbor_weight_factor = direct_neighbor_weight_factor
                is_direct_neighbor = True
            
            neighbor_full_features = extract_features_from_gene_row(
                gene_row=neighbor_row_dict, 
                current_weight_factor=current_neighbor_weight_factor,
                base_prefix="N_",
                include_desc=True, include_pfam=True, include_interpro=True
            ) 
            current_full_features.update(neighbor_full_features)

            if is_direct_neighbor:
                 neighbor_core_features = extract_features_from_gene_row(
                    gene_row=neighbor_row_dict,
                    current_weight_factor=1,
                    base_prefix="N_CORE_",
                    include_desc=False, include_pfam=True, include_interpro=True
                )
                 current_core_features.update(neighbor_core_features)

        all_neighborhood_features[unique_neighborhood_label].update(current_full_features)
        core_neighborhood_features[unique_neighborhood_label].update(current_core_features)
    
    _write_and_print_internal(f"Finished feature extraction in {time.time() - feature_extraction_start_time:.2f} seconds.")
    del hit_genes_df # Free memory
    del neighbors_by_linking_key # Free memory
    gc.collect() # Garbage collection

    _write_and_print_internal("\n--- Diagnostic: Raw SSN Cluster ID Distribution in 'attributes' table ---")
    for ssn_id, count in sorted(raw_ssn_ids_counts.items(), key=lambda item: str(item[0])):
        _write_and_print_internal(f"  SSN ID '{ssn_id}': {count} neighborhoods")
    _write_and_print_internal("-------------------------------------------------------------------")

    if not all_neighborhood_features:
        _write_and_print_internal("No gene neighborhoods found or parsed. Exiting.")
        return {}, {}, {}

    all_unique_features_vocabulary_initial = sorted(list(set.union(*all_neighborhood_features.values())))
    
    if not all_unique_features_vocabulary_initial:
        _write_and_print_internal("No significant features extracted for clustering from any neighborhood. Check parsing logic and data. Exiting.")
        return {'All_Neighborhoods': {1: list(all_neighborhood_features.keys())}}, full_neighborhood_labels_map, {} 

    # Pre-clustering (collapsing) similar neighborhoods 
    if collapse_identical_neighborhoods:
        collapsing_start_time = time.time()
        final_neighborhood_features, final_neighborhood_labels_map, collapsed_groups_report = \
            _perform_collapsing(all_neighborhood_features, full_neighborhood_labels_map, 
                                core_neighborhood_features,
                                collapse_core_similarity_threshold, collapse_full_neighborhood_similarity_threshold,
                                output_prefix="  [Collapsing]", 
                                report_file=report_file_handle, 
                                parallelize_pdist=parallelize_pdist)
    else:
        _write_and_print_internal("\nCollapsing identical/similar neighborhoods is disabled. Proceeding with all original neighborhoods.")
        final_neighborhood_features = all_neighborhood_features
        final_neighborhood_labels_map = full_neighborhood_labels_map
        collapsed_groups_report = {}
    
    # Explicit memory cleanup for pre-collapse feature dictionaries
    del all_neighborhood_features 
    del core_neighborhood_features 
    gc.collect() # Garbage collection

    all_unique_features_vocabulary = sorted(list(set.union(*final_neighborhood_features.values())))
    
    if not all_unique_features_vocabulary:
        _write_and_print_internal("No significant features extracted from final neighborhoods for clustering. Check parsing logic and data. Exiting.")
        return {'All_Neighborhoods': {1: list(final_neighborhood_features.keys())}}, final_neighborhood_labels_map, collapsed_groups_report

    ssn_clusters_to_process = defaultdict(list)
    if differentiate_by_ssn_cluster:
        for nh_label, (_, _, ssn_id, _, _) in final_neighborhood_labels_map.items():
            if ssn_id not in ssn_cluster_value_to_filter:
                ssn_clusters_to_process[ssn_id].append(nh_label)
        _write_and_print_internal(f"\nFound {len(ssn_clusters_to_process)} distinct SSN clusters to process (after filtering invalid IDs).")
        if ssn_clusters_to_process:
            _write_and_print_internal(f"  SSN Clusters to be processed: {sorted(list(ssn_clusters_to_process.keys()), key=str)}")
    else:
        ssn_clusters_to_process['All_Neighborhoods'] = sorted(list(final_neighborhood_features.keys())) 
        _write_and_print_internal("Processing all neighborhoods together (no SSN cluster differentiation).")

    clusters_output_dict = defaultdict(dict) 
    
    if differentiate_by_ssn_cluster and not ssn_clusters_to_process:
        _write_and_print_internal("\nNo valid SSN clusters with any neighborhoods found after filtering for differentiation. No plots/reports generated for individual SSN clusters.")
        return {}, final_neighborhood_labels_map, collapsed_groups_report


    # Added tqdm for the SSN cluster iteration
    for ssn_id, neighborhood_labels_in_ssn_cluster in tqdm(ssn_clusters_to_process.items(), desc="  Processing SSN clusters", unit="cluster"):
        ssn_cluster_start_time = time.time()
        if differentiate_by_ssn_cluster:
            _write_and_print_internal(f"\n--- Processing SSN Cluster: {ssn_id} (contains {len(neighborhood_labels_in_ssn_cluster)} neighborhoods) ---")
            plot_title_prefix = f"SSN Cluster {ssn_id}"
        else:
            plot_title_prefix = "All Gene Neighborhoods"

        current_ssn_neighborhood_features = {
            label: final_neighborhood_features[label] for label in neighborhood_labels_in_ssn_cluster
        }
        current_ssn_neighborhood_labels_map = {
            label: final_neighborhood_labels_map[label] for label in neighborhood_labels_in_ssn_cluster
        }

        num_neighborhoods_in_group = len(current_ssn_neighborhood_features)
        if num_neighborhoods_in_group < 2:
            _write_and_print_internal(f"  Skipping group {ssn_id}: Not enough distinct neighborhoods ({num_neighborhoods_in_group}) for clustering. Requires at least 2.")
            clusters_output_dict[ssn_id] = {1: neighborhood_labels_in_ssn_cluster}
            continue

        neighborhood_ids_sorted = sorted(list(current_ssn_neighborhood_features.keys()))
        
        current_ssn_vocabulary = sorted(list(set.union(*current_ssn_neighborhood_features.values())))
        if not current_ssn_vocabulary:
            _write_and_print_internal(f"  No features found for SSN group {ssn_id}. Cannot cluster.")
            clusters_output_dict[ssn_id] = {1: neighborhood_labels_in_ssn_cluster}
            continue

        feature_vector_creation_start = time.time()
        
        feature_to_idx = {feature: i for i, feature in enumerate(current_ssn_vocabulary)}
        num_current_neighborhoods = len(neighborhood_ids_sorted)
        num_current_features = len(current_ssn_vocabulary)

        feature_vectors_lil = sp.lil_matrix((num_current_neighborhoods, num_current_features), dtype=np.int8)
        # Added tqdm to this loop as well
        for i, nh_id in enumerate(tqdm(neighborhood_ids_sorted, desc=f"  Populating features for SSN {ssn_id}", leave=False)): 
            for feature in current_ssn_neighborhood_features[nh_id]:
                if feature in feature_to_idx:
                    j = feature_to_idx[feature]
                    feature_vectors_lil[i, j] = 1
        feature_vectors_np = feature_vectors_lil.tocsr() # Convert to CSR

        _write_and_print_internal(f"  Feature vector creation for {num_neighborhoods_in_group} neighborhoods ({len(current_ssn_vocabulary)} features) took {time.time() - feature_vector_creation_start:.2f} seconds.") 
        _write_and_print_internal(f"  Matrix shape: {feature_vectors_np.shape}, NNZ: {feature_vectors_np.nnz}") 
        gc.collect() # Garbage collection

        if num_current_neighborhoods > 1 and all(
            (feature_vectors_np[0] != feature_vectors_np[i]).nnz == 0 # Compare sparse rows
            for i in range(1, num_current_neighborhoods)
        ):
            _write_and_print_internal(f"  All neighborhoods in {plot_title_prefix} have identical features. No meaningful distance calculated. Skipping plotting.")
            clusters_output_dict[ssn_id] = {1: neighborhood_labels_in_ssn_cluster}
            if plot_dendrogram:
                 _write_and_print_internal(f"  (No plots generated for {plot_title_prefix} due to identical features)")
            continue
            
        plot_start = 0 # Initialize to 0 or None, will be updated if plotting happens
        
        if plot_dendrogram: # Only start timer if plotting will occur
            plot_start = time.time() # Start timer specifically for plotting

        distance_calc_start = time.time()
        _write_and_print_internal(f"  Calculating distances for {num_current_neighborhoods} neighborhoods...") 
        distances = parallel_pdist_jaccard(feature_vectors_np, num_cores=-1 if parallelize_pdist else 1) 
        _write_and_print_internal(f"  Distance calculation took {time.time() - distance_calc_start:.2f} seconds.") 

        linkage_start = time.time()
        _write_and_print_internal(f"  Performing linkage for {num_current_neighborhoods} neighborhoods...") 
        linked = linkage(distances, method='average')
        _write_and_print_internal(f"  Linkage calculation took {time.time() - linkage_start:.2f} seconds.") 
        
        # Explicit memory cleanup
        del feature_vectors_np 
        del distances 
        gc.collect() # Garbage collection
        

        if plot_dendrogram:
            # Organism Labels
            _write_and_print_internal(f"  Generating dendrogram plots...") 
            _plot_dendrogram(linked, neighborhood_ids_sorted, current_ssn_neighborhood_labels_map, distance_threshold, 
                            f"{plot_title_prefix} GNN", 'organism', 
                            original_input_sequence_id, 
                            save_plots, output_dir, output_formats, dpi, 
                            min_plot_height, height_per_leaf, max_plot_height,
                            min_plot_width, width_per_leaf, max_plot_width,
                            report_file=report_file_handle)
            # ID Labels
            _plot_dendrogram(linked, neighborhood_ids_sorted, current_ssn_neighborhood_labels_map, distance_threshold, 
                            f"{plot_title_prefix} GNN", 'id', 
                            original_input_sequence_id, 
                            save_plots, output_dir, output_formats, dpi, 
                            min_plot_height, height_per_leaf, max_plot_height,
                            min_plot_width, width_per_leaf, max_plot_width,
                            report_file=report_file_handle)
            _write_and_print_internal(f"  Plotting took {time.time() - plot_start:.2f} seconds.") 
                        
        cluster_assignments = fcluster(linked, distance_threshold, criterion='distance')
        current_ssn_clusters = defaultdict(list)
        for i, cluster_id in enumerate(cluster_assignments):
            current_ssn_clusters[cluster_id].append(neighborhood_ids_sorted[i])
        
        clusters_output_dict[ssn_id] = current_ssn_clusters
        _write_and_print_internal(f"--- Finished SSN Cluster {ssn_id} in {time.time() - ssn_cluster_start_time:.2f} seconds. ---") 
        del linked # Free memory
        gc.collect() # Garbage collection

    _write_and_print_internal(f"\nTotal runtime: {time.time() - start_time_overall:.2f} seconds.")
    return clusters_output_dict, final_neighborhood_labels_map, collapsed_groups_report

Set OMP_NUM_THREADS to 12 for SciPy/NumPy internal multi-threading.


In [3]:
# SQLITE_DB_PATH = '39061_CaMES_10kBlast_10e_50eEdge_noFilter_300AST_min900AA_withoutEgtD_withoutMethyltrans.sqlite' 
# SQLITE_DB_PATH = '39063_CaMES_10kBlast_10e_50eEdge_noFilter_300AST_min900AA-clusterseperated_10N.sqlite' 
# SQLITE_DB_PATH = '39094_CaMES_10k-Blast_noFilter_40ID-60AST_min950AA_10N.sqlite' 
SQLITE_DB_PATH = '39151_EanB_10k_Blast_noFilter_50ASTcolorized_onlybigCluster-renamed_10N.sqlite'
# SQLITE_DB_PATH = '39150_OvoA_10k_Blast_search_noFilter_80ASTcolorized_onlyBigCluster-separated_10N.sqlite'

# Set this to the UniProt ID of your original query protein if you want to highlight it.
# Set to None or an empty string if you don't want to highlight any specific protein.
# ORIGINAL_INPUT_SEQUENCE_ID = 'A0A7V4WV16' # CaMES e.g., 'A0A0B0EG43' or None or ''
ORIGINAL_INPUT_SEQUENCE_ID = 'B3ECE3' # EanB
# ORIGINAL_INPUT_SEQUENCE_ID = 'A0A1I5R890' # OvoA

# You can change this setting here or keep the global config
DIFFERENTIATE_BY_SSN_CLUSTER = True     # Set to True (SSN clusters differentiated) or False as needed
chosen_distance_threshold = 0.5         # Change as needed
PARALLELIZE_PDIST_ENABLED = True        # Set to True to enable parallel pdist.

# Configuration for collapsing similar neighborhoods (Local Override)
COLLAPSE_IDENTICAL_NEIGHBORHOODS_ACTIVE = True 
COLLAPSE_CORE_SIMILARITY_THRESHOLD_ACTIVE = 0.0 # Use 0.0 for exact match of hit+direct neighbors
COLLAPSE_FULL_NEIGHBORHOOD_SIMILARITY_THRESHOLD_ACTIVE = 0.5 # e.g., 0.3 for 70% similarity of full neighborhood

# Prepare report file
report_suffix = "_ssn_differentiated" if DIFFERENTIATE_BY_SSN_CLUSTER else "_all_neighborhoods"
report_filename = f"{REPORT_FILENAME_BASE}{report_suffix}.txt"
report_path = os.path.join(OUTPUT_DIR, report_filename)
    
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR) # Ensure output directory exists for report and plots

with open(report_path, 'w') as report_file:
    # Redefine write_and_print to use the local report_file handle
    def write_and_print_to_file(text):
        print(text)
        report_file.write(text + '\n')

    # Use write_and_print_to_file for all report output
    write_and_print_to_file(f"\n--- GNN Clustering Report ({datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}) ---") 
    write_and_print_to_file(f"Database: {SQLITE_DB_PATH}")
    write_and_print_to_file(f"Jaccard Distance Threshold: {chosen_distance_threshold}")
    write_and_print_to_file(f"Hit Gene Weight Factor: {HIT_GENE_WEIGHT_FACTOR}")
    write_and_print_to_file(f"Direct Neighbor Weight Factor: {DIRECT_NEIGHBOR_WEIGHT_FACTOR}")
    if DIFFERENTIATE_BY_SSN_CLUSTER:
        write_and_print_to_file(f"Clustering differentiated by SSN Cluster ID (column: '{COL_SSN_CLUSTER_ID}').")
    else:
        write_and_print_to_file("Clustering all neighborhoods together (no SSN cluster differentiation).")
    
    if COLLAPSE_IDENTICAL_NEIGHBORHOODS_ACTIVE:
        write_and_print_to_file(f"Collapsing identical/similar neighborhoods enabled:")
        write_and_print_to_file(f"  Stage 1 (Hit+Direct Neighbor Core): Threshold {COLLAPSE_CORE_SIMILARITY_THRESHOLD_ACTIVE}")
        write_and_print_to_file(f"  Stage 2 (Full Neighborhood): Threshold {COLLAPSE_FULL_NEIGHBORHOOD_SIMILARITY_THRESHOLD_ACTIVE}")
    else:
        write_and_print_to_file("Collapsing identical/similar neighborhoods disabled.")

    # Report on OMP_NUM_THREADS status instead of parallelize_pdist flag
    write_and_print_to_file(f"Distance calculation parallelism: {'Enabled (via joblib)' if PARALLELIZE_PDIST_ENABLED else 'Disabled (sequential custom Jaccard)'}")
    write_and_print_to_file(f"SciPy/NumPy internal parallelism (OMP_NUM_THREADS): {os.environ.get('OMP_NUM_THREADS', 'Not set (defaults will apply)')}")
    
    if ORIGINAL_INPUT_SEQUENCE_ID:
        write_and_print_to_file(f"Original Input Sequence Accession ID for highlighting: '{ORIGINAL_INPUT_SEQUENCE_ID}' (colored '{HIGHLIGHT_COLOR}')")
    else:
        write_and_print_to_file("No specific original input sequence ID provided for highlighting.")
    write_and_print_to_file(f"Plots saved to: {OUTPUT_DIR} in {OUTPUT_FORMATS} formats at {DPI} DPI.")
    write_and_print_to_file(f"Report also saved to: {report_path}")
    write_and_print_to_file("-" * 70)


    clusters_by_ssn, final_labels_map, collapsed_groups_report = cluster_gene_neighborhoods_from_sqlite(
                                            db_path=SQLITE_DB_PATH,
                                            col_accession_id=COL_ACCESSION_ID,
                                            col_rel_start=COL_REL_START,
                                            col_rel_stop=COL_REL_STOP,
                                            hit_gene_weight_factor=HIT_GENE_WEIGHT_FACTOR,
                                            direct_neighbor_weight_factor=DIRECT_NEIGHBOR_WEIGHT_FACTOR,
                                            differentiate_by_ssn_cluster=DIFFERENTIATE_BY_SSN_CLUSTER,
                                            ssn_cluster_value_to_filter=DEFAULT_SSN_CLUSTER_VALUE_TO_FILTER,
                                            collapse_identical_neighborhoods=COLLAPSE_IDENTICAL_NEIGHBORHOODS_ACTIVE,
                                            collapse_core_similarity_threshold=COLLAPSE_CORE_SIMILARITY_THRESHOLD_ACTIVE,
                                            collapse_full_neighborhood_similarity_threshold=COLLAPSE_FULL_NEIGHBORHOOD_SIMILARITY_THRESHOLD_ACTIVE,
                                            original_input_sequence_id=ORIGINAL_INPUT_SEQUENCE_ID,
                                            distance_threshold=chosen_distance_threshold,
                                            plot_dendrogram=True,
                                            save_plots=SAVE_PLOTS,
                                            output_dir=OUTPUT_DIR,
                                            output_formats=OUTPUT_FORMATS, 
                                            dpi=DPI,
                                            min_plot_height=MIN_PLOT_HEIGHT,
                                            height_per_leaf=HEIGHT_PER_LEAF, 
                                            max_plot_height=MAX_PLOT_HEIGHT,
                                            min_plot_width=MIN_PLOT_WIDTH, 
                                            width_per_leaf=WIDTH_PER_LEAF, 
                                            max_plot_width=MAX_PLOT_WIDTH,
                                            report_file_handle=report_file,
                                            parallelize_pdist=PARALLELIZE_PDIST_ENABLED
                                        )

    if clusters_by_ssn:
        write_and_print_to_file("\n--- Final Clustering Results ---")
        for ssn_id, clusters_in_ssn in sorted(clusters_by_ssn.items(), key=lambda item: str(item[0])):
            write_and_print_to_file(f"\n### Results for SSN Cluster: {ssn_id} ###")
            if not clusters_in_ssn:
                write_and_print_to_file("  No clusters formed for this SSN group, or insufficient data.")
                continue

            for cluster_id, neighborhoods_in_cluster in sorted(clusters_in_ssn.items()):
                write_and_print_to_file(f"  Cluster {cluster_id}: {len(neighborhoods_in_cluster)} neighborhoods")
                for nh_id in neighborhoods_in_cluster:
                    organism_name, hit_id_internal, _, accession_id, collapsed_info = final_labels_map.get(nh_id, ('UNKNOWN', 'UNKNOWN', None, 'UNKNOWN', None))
                    
                    highlight_indicator = " (ORIGINAL INPUT)" if accession_id == ORIGINAL_INPUT_SEQUENCE_ID else ""
                    collapsed_suffix = ""
                    if collapsed_info:
                        count, letter_code = collapsed_info
                        collapsed_suffix = f" (Collapsed: {count} neighborhoods, Ref: {letter_code})"
                    
                    write_and_print_to_file(f"    - Organism: {organism_name}, Hit Accession: {accession_id}{highlight_indicator}{collapsed_suffix} (Internal ID: {hit_id_internal}) (NH ID: {nh_id})")
            write_and_print_to_file("  " + "-" * 30)
        
        if collapsed_groups_report:
            write_and_print_to_file("\n--- Detailed Report on Collapsed Neighborhood Groups ---")
            for code, group_data in sorted(collapsed_groups_report.items()):
                write_and_print_to_file(f"  Group ({code}): Representative: {group_data['representative']} (Total: {group_data['count']} members)")
                for member_nh_id in group_data['members']:
                    member_organism, member_hit_id, _, member_accession, _ = final_labels_map.get(member_nh_id, ('UNKNOWN', 'UNKNOWN', None, 'UNKNOWN', None))
                    write_and_print_to_file(f"    - {member_organism} (Accession: {member_accession}) (Internal ID: {member_hit_id}) (NH ID: {member_nh_id})")
            write_and_print_to_file("-------------------------------------------------------")
    else:
        write_and_print_to_file("\nNo clusters formed at all. This could mean your database is empty, or no features were extracted after filtering, or no valid SSN clusters with multiple neighborhoods were found.")

    write_and_print_to_file("\n--- Report End ---")


--- GNN Clustering Report (2025-09-02 23:57:26) ---
Database: 39151_EanB_10k_Blast_noFilter_50ASTcolorized_onlybigCluster-renamed_10N.sqlite
Jaccard Distance Threshold: 0.5
Hit Gene Weight Factor: 10
Direct Neighbor Weight Factor: 3
Clustering differentiated by SSN Cluster ID (column: 'cluster_num').
Collapsing identical/similar neighborhoods enabled:
  Stage 1 (Hit+Direct Neighbor Core): Threshold 0.0
  Stage 2 (Full Neighborhood): Threshold 0.5
Distance calculation parallelism: Enabled (via joblib)
SciPy/NumPy internal parallelism (OMP_NUM_THREADS): 12
Original Input Sequence Accession ID for highlighting: 'B3ECE3' (colored 'red')
Plots saved to: gnn_cluster_plots_EanB in ['pdf'] formats at 300 DPI.
Report also saved to: gnn_cluster_plots_EanB\gnn_clustering_report_ssn_differentiated.txt
----------------------------------------------------------------------
Fetching all hit gene data...
  Fetched 9866 hit genes in 0.15 seconds.
Fetching all neighbor data...
  Fetched 309674 neighbor

                                                                         

  Grouped neighbor data in 4.69 seconds.
Processing features for 9866 hit genes and their neighborhoods...


Extracting features: 100%|██████████| 9866/9866 [00:13<00:00, 716.52neighborhood/s] 


Finished feature extraction in 13.77 seconds.

--- Diagnostic: Raw SSN Cluster ID Distribution in 'attributes' table ---
  SSN ID '1': 5136 neighborhoods
  SSN ID '2': 1735 neighborhoods
  SSN ID '3': 1454 neighborhoods
  SSN ID '4': 761 neighborhoods
  SSN ID '5': 434 neighborhoods
  SSN ID '6': 346 neighborhoods
-------------------------------------------------------------------
  [Collapsing]  Starting two-stage collapsing (Core Thr: 0.0, Full Thr: 0.5).
  [Collapsing]  Stage 1: Building sparse matrix for 9360 neighborhoods and 23490 core features...


                                                                                                        

  [Collapsing]  Stage 1: Sparse matrix built in 5.23 seconds. Shape: (9360, 23490), NNZ: 1246166
  [Collapsing]  Stage 1: Calculating core distances using scipy.pdist...
  Pre-calculating 9360 feature sets...


                                                                                             

  Feature set pre-calculation took 0.20 seconds.
  Running Jaccard distance in parallel for N=9360 using 12 cores.
  [Collapsing]  Stage 1: Core distance calculation took 178.16 seconds.
  [Collapsing]  Stage 1: Performing linkage and clustering for core features...
  [Collapsing]  Stage 1: Linkage took 1.90 seconds.
  [Collapsing]  Stage 1: Grouped into 9026 initial core groups based on a threshold of 0.0. Total stage 1 took 185.75 seconds.
  [Collapsing]  Stage 2: Processing 9026 core groups for full neighborhood similarity...
  [Collapsing]  Stage 2: Distributing 9026 core groups among 12 workers (parallelized).
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:   12.6s
[Parallel(n_jobs=12)]: Done   2 out of  13 | elapsed:   23.7s remaining:  2.2min
[Parallel(n_jobs=12)]: Done   3 out of  13 | elapsed:   27.7s remaining:  1.5min
[Parallel(n_jobs=12)]: Done   4 out of  13 | elapsed:   36.4s remaining:  1.

  Processing SSN clusters:   0%|          | 0/6 [00:00<?, ?cluster/s]


--- Processing SSN Cluster: 4 (contains 739 neighborhoods) ---




  Feature vector creation for 739 neighborhoods (53876 features) took 1.63 seconds.
  Matrix shape: (739, 53876), NNZ: 413370
  Calculating distances for 739 neighborhoods...
  Pre-calculating 739 feature sets...




  Feature set pre-calculation took 0.05 seconds.
  Running Jaccard distance in parallel for N=739 using 12 cores.
  Distance calculation took 42.77 seconds.
  Performing linkage for 739 neighborhoods...
  Linkage calculation took 0.01 seconds.
  Generating dendrogram plots...


  Processing SSN clusters:  17%|█▋        | 1/6 [00:56<04:43, 56.61s/cluster]

  Plotting took 54.36 seconds.
--- Finished SSN Cluster 4 in 56.32 seconds. ---

--- Processing SSN Cluster: 2 (contains 1666 neighborhoods) ---




  Feature vector creation for 1666 neighborhoods (75416 features) took 4.58 seconds.
  Matrix shape: (1666, 75416), NNZ: 1139071
  Calculating distances for 1666 neighborhoods...
  Pre-calculating 1666 feature sets...




  Feature set pre-calculation took 0.13 seconds.
  Running Jaccard distance in parallel for N=1666 using 12 cores.
  Distance calculation took 136.89 seconds.
  Performing linkage for 1666 neighborhoods...
  Linkage calculation took 0.03 seconds.
  Generating dendrogram plots...
  Plotting took 163.13 seconds.
--- Finished SSN Cluster 2 in 168.16 seconds. ---


  Processing SSN clusters:  33%|███▎      | 2/6 [03:45<08:09, 122.46s/cluster]


--- Processing SSN Cluster: 1 (contains 4602 neighborhoods) ---




  Feature vector creation for 4602 neighborhoods (104758 features) took 12.10 seconds.
  Matrix shape: (4602, 104758), NNZ: 2358007
  Calculating distances for 4602 neighborhoods...
  Pre-calculating 4602 feature sets...




  Feature set pre-calculation took 0.28 seconds.
  Running Jaccard distance in parallel for N=4602 using 12 cores.
  Distance calculation took 304.16 seconds.
  Performing linkage for 4602 neighborhoods...
  Linkage calculation took 0.40 seconds.
  Generating dendrogram plots...
  Plotting took 407.49 seconds.
--- Finished SSN Cluster 1 in 420.28 seconds. ---


  Processing SSN clusters:  50%|█████     | 3/6 [10:46<12:56, 258.75s/cluster]


--- Processing SSN Cluster: 3 (contains 1265 neighborhoods) ---




  Feature vector creation for 1265 neighborhoods (66918 features) took 3.72 seconds.
  Matrix shape: (1265, 66918), NNZ: 778953
  Calculating distances for 1265 neighborhoods...
  Pre-calculating 1265 feature sets...




  Feature set pre-calculation took 0.11 seconds.
  Running Jaccard distance in parallel for N=1265 using 12 cores.
  Distance calculation took 111.21 seconds.
  Performing linkage for 1265 neighborhoods...
  Linkage calculation took 0.02 seconds.
  Generating dendrogram plots...
  Plotting took 135.25 seconds.
--- Finished SSN Cluster 3 in 139.67 seconds. ---


  Processing SSN clusters:  67%|██████▋   | 4/6 [13:06<07:04, 212.00s/cluster]


--- Processing SSN Cluster: 5 (contains 424 neighborhoods) ---




  Feature vector creation for 424 neighborhoods (37374 features) took 1.59 seconds.
  Matrix shape: (424, 37374), NNZ: 265630
  Calculating distances for 424 neighborhoods...
  Pre-calculating 424 feature sets...




  Feature set pre-calculation took 0.04 seconds.
  Running Jaccard distance in parallel for N=424 using 12 cores.
  Distance calculation took 33.39 seconds.
  Performing linkage for 424 neighborhoods...
  Linkage calculation took 0.00 seconds.
  Generating dendrogram plots...
  Plotting took 40.55 seconds.
--- Finished SSN Cluster 5 in 42.82 seconds. ---


  Processing SSN clusters:  83%|████████▎ | 5/6 [13:49<02:31, 151.24s/cluster]


--- Processing SSN Cluster: 6 (contains 332 neighborhoods) ---




  Feature vector creation for 332 neighborhoods (40985 features) took 0.73 seconds.
  Matrix shape: (332, 40985), NNZ: 164293
  Calculating distances for 332 neighborhoods...
  Pre-calculating 332 feature sets...




  Feature set pre-calculation took 0.03 seconds.
  Running Jaccard distance in parallel for N=332 using 12 cores.
  Distance calculation took 18.67 seconds.
  Performing linkage for 332 neighborhoods...
  Linkage calculation took 0.00 seconds.
  Generating dendrogram plots...
  Plotting took 24.53 seconds.
--- Finished SSN Cluster 6 in 25.97 seconds. ---


  Processing SSN clusters: 100%|██████████| 6/6 [14:16<00:00, 142.77s/cluster]



Total runtime: 1119.02 seconds.

--- Final Clustering Results ---

### Results for SSN Cluster: 1 ###
  Cluster 1: 3 neighborhoods
    - Organism: Candidatus Methylomirabilis lanthanidiphila., Hit Accession: A0A564ZLT5 (Internal ID: CABIKM010000034) (NH ID: Candidatus Methylomirabilis lanthanidiphila._CABIKM010000034)
    - Organism: Candidatus Methylomirabilota bacterium., Hit Accession: A0A2U1RKS1 (Internal ID: PQAM01000010) (NH ID: Candidatus Methylomirabilota bacterium._PQAM01000010)
    - Organism: Candidatus Methylomirabilota bacterium., Hit Accession: A0A2U1S3Z5 (Internal ID: PQAQ01000079) (NH ID: Candidatus Methylomirabilota bacterium._PQAQ01000079)
  Cluster 2: 1 neighborhoods
    - Organism: Candidatus Methylomirabilis tolerans., Hit Accession: A0AAJ1EU68 (Internal ID: JAIOIU010000162) (NH ID: Candidatus Methylomirabilis tolerans._JAIOIU010000162)
  Cluster 3: 1 neighborhoods
    - Organism: Candidatus Methylomirabilis limnetica., Hit Accession: A0A2T4TZL4 (Internal ID: NVQC

In [5]:
# SQLITE_DB_PATH = '39061_CaMES_10kBlast_10e_50eEdge_noFilter_300AST_min900AA_withoutEgtD_withoutMethyltrans.sqlite' 
# SQLITE_DB_PATH = '39063_CaMES_10kBlast_10e_50eEdge_noFilter_300AST_min900AA-clusterseperated_10N.sqlite' 
# SQLITE_DB_PATH = '39094_CaMES_10k-Blast_noFilter_40ID-60AST_min950AA_10N.sqlite' 
# SQLITE_DB_PATH = '39151_EanB_10k_Blast_noFilter_50ASTcolorized_onlybigCluster-renamed_10N.sqlite'
SQLITE_DB_PATH = '39150_OvoA_10k_Blast_search_noFilter_80ASTcolorized_onlyBigCluster-separated_10N.sqlite'

# Set this to the UniProt ID of your original query protein if you want to highlight it.
# Set to None or an empty string if you don't want to highlight any specific protein.
# ORIGINAL_INPUT_SEQUENCE_ID = 'A0A7V4WV16' # CaMES e.g., 'A0A0B0EG43' or None or ''
# ORIGINAL_INPUT_SEQUENCE_ID = 'B3ECE3' # EanB
ORIGINAL_INPUT_SEQUENCE_ID = 'A0A1I5R890' # OvoA

# You can change this setting here or keep the global config
DIFFERENTIATE_BY_SSN_CLUSTER = True     # Set to True (SSN clusters differentiated) or False as needed
chosen_distance_threshold = 0.5         # Change as needed
PARALLELIZE_PDIST_ENABLED = True        # Set to True to enable parallel pdist.

# Configuration for collapsing similar neighborhoods (Local Override)
COLLAPSE_IDENTICAL_NEIGHBORHOODS_ACTIVE = True 
COLLAPSE_CORE_SIMILARITY_THRESHOLD_ACTIVE = 0.0 # Use 0.0 for exact match of hit+direct neighbors
COLLAPSE_FULL_NEIGHBORHOOD_SIMILARITY_THRESHOLD_ACTIVE = 0.5 # e.g., 0.3 for 70% similarity of full neighborhood

# Prepare report file
report_suffix = "_ssn_differentiated" if DIFFERENTIATE_BY_SSN_CLUSTER else "_all_neighborhoods"
report_filename = f"{REPORT_FILENAME_BASE}{report_suffix}.txt"
report_path = os.path.join(OUTPUT_DIR, report_filename)
    
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR) # Ensure output directory exists for report and plots

with open(report_path, 'w') as report_file:
    # Redefine write_and_print to use the local report_file handle
    def write_and_print_to_file(text):
        print(text)
        report_file.write(text + '\n')

    # Use write_and_print_to_file for all report output
    write_and_print_to_file(f"\n--- GNN Clustering Report ({datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}) ---") 
    write_and_print_to_file(f"Database: {SQLITE_DB_PATH}")
    write_and_print_to_file(f"Jaccard Distance Threshold: {chosen_distance_threshold}")
    write_and_print_to_file(f"Hit Gene Weight Factor: {HIT_GENE_WEIGHT_FACTOR}")
    write_and_print_to_file(f"Direct Neighbor Weight Factor: {DIRECT_NEIGHBOR_WEIGHT_FACTOR}")
    if DIFFERENTIATE_BY_SSN_CLUSTER:
        write_and_print_to_file(f"Clustering differentiated by SSN Cluster ID (column: '{COL_SSN_CLUSTER_ID}').")
    else:
        write_and_print_to_file("Clustering all neighborhoods together (no SSN cluster differentiation).")
    
    if COLLAPSE_IDENTICAL_NEIGHBORHOODS_ACTIVE:
        write_and_print_to_file(f"Collapsing identical/similar neighborhoods enabled:")
        write_and_print_to_file(f"  Stage 1 (Hit+Direct Neighbor Core): Threshold {COLLAPSE_CORE_SIMILARITY_THRESHOLD_ACTIVE}")
        write_and_print_to_file(f"  Stage 2 (Full Neighborhood): Threshold {COLLAPSE_FULL_NEIGHBORHOOD_SIMILARITY_THRESHOLD_ACTIVE}")
    else:
        write_and_print_to_file("Collapsing identical/similar neighborhoods disabled.")

    # Report on OMP_NUM_THREADS status instead of parallelize_pdist flag
    write_and_print_to_file(f"Distance calculation parallelism: {'Enabled (via joblib)' if PARALLELIZE_PDIST_ENABLED else 'Disabled (sequential custom Jaccard)'}")
    write_and_print_to_file(f"SciPy/NumPy internal parallelism (OMP_NUM_THREADS): {os.environ.get('OMP_NUM_THREADS', 'Not set (defaults will apply)')}")
    
    if ORIGINAL_INPUT_SEQUENCE_ID:
        write_and_print_to_file(f"Original Input Sequence Accession ID for highlighting: '{ORIGINAL_INPUT_SEQUENCE_ID}' (colored '{HIGHLIGHT_COLOR}')")
    else:
        write_and_print_to_file("No specific original input sequence ID provided for highlighting.")
    write_and_print_to_file(f"Plots saved to: {OUTPUT_DIR} in {OUTPUT_FORMATS} formats at {DPI} DPI.")
    write_and_print_to_file(f"Report saved to: {report_path}")
    write_and_print_to_file("-" * 70)


    clusters_by_ssn, final_labels_map, collapsed_groups_report = cluster_gene_neighborhoods_from_sqlite(
                                            db_path=SQLITE_DB_PATH,
                                            col_accession_id=COL_ACCESSION_ID,
                                            col_rel_start=COL_REL_START,
                                            col_rel_stop=COL_REL_STOP,
                                            hit_gene_weight_factor=HIT_GENE_WEIGHT_FACTOR,
                                            direct_neighbor_weight_factor=DIRECT_NEIGHBOR_WEIGHT_FACTOR,
                                            differentiate_by_ssn_cluster=DIFFERENTIATE_BY_SSN_CLUSTER,
                                            ssn_cluster_value_to_filter=DEFAULT_SSN_CLUSTER_VALUE_TO_FILTER,
                                            collapse_identical_neighborhoods=COLLAPSE_IDENTICAL_NEIGHBORHOODS_ACTIVE,
                                            collapse_core_similarity_threshold=COLLAPSE_CORE_SIMILARITY_THRESHOLD_ACTIVE,
                                            collapse_full_neighborhood_similarity_threshold=COLLAPSE_FULL_NEIGHBORHOOD_SIMILARITY_THRESHOLD_ACTIVE,
                                            original_input_sequence_id=ORIGINAL_INPUT_SEQUENCE_ID,
                                            distance_threshold=chosen_distance_threshold,
                                            plot_dendrogram=True,
                                            save_plots=SAVE_PLOTS,
                                            output_dir=OUTPUT_DIR,
                                            output_formats=OUTPUT_FORMATS, 
                                            dpi=DPI,
                                            min_plot_height=MIN_PLOT_HEIGHT,
                                            height_per_leaf=HEIGHT_PER_LEAF, 
                                            max_plot_height=MAX_PLOT_HEIGHT,
                                            min_plot_width=MIN_PLOT_WIDTH, 
                                            width_per_leaf=WIDTH_PER_LEAF, 
                                            max_plot_width=MAX_PLOT_WIDTH,
                                            report_file_handle=report_file,
                                            parallelize_pdist=PARALLELIZE_PDIST_ENABLED
                                        )

    if clusters_by_ssn:
        write_and_print_to_file("\n--- Final Clustering Results ---")
        for ssn_id, clusters_in_ssn in sorted(clusters_by_ssn.items(), key=lambda item: str(item[0])):
            write_and_print_to_file(f"\n### Results for SSN Cluster: {ssn_id} ###")
            if not clusters_in_ssn:
                write_and_print_to_file("  No clusters formed for this SSN group, or insufficient data.")
                continue

            for cluster_id, neighborhoods_in_cluster in sorted(clusters_in_ssn.items()):
                write_and_print_to_file(f"  Cluster {cluster_id}: {len(neighborhoods_in_cluster)} neighborhoods")
                for nh_id in neighborhoods_in_cluster:
                    organism_name, hit_id_internal, _, accession_id, collapsed_info = final_labels_map.get(nh_id, ('UNKNOWN', 'UNKNOWN', None, 'UNKNOWN', None))
                    
                    highlight_indicator = " (ORIGINAL INPUT)" if accession_id == ORIGINAL_INPUT_SEQUENCE_ID else ""
                    collapsed_suffix = ""
                    if collapsed_info:
                        count, letter_code = collapsed_info
                        collapsed_suffix = f" (Collapsed: {count} neighborhoods, Ref: {letter_code})"
                    
                    write_and_print_to_file(f"    - Organism: {organism_name}, Hit Accession: {accession_id}{highlight_indicator}{collapsed_suffix} (Internal ID: {hit_id_internal}) (NH ID: {nh_id})")
            write_and_print_to_file("  " + "-" * 30)
        
        if collapsed_groups_report:
            write_and_print_to_file("\n--- Detailed Report on Collapsed Neighborhood Groups ---")
            for code, group_data in sorted(collapsed_groups_report.items()):
                write_and_print_to_file(f"  Group ({code}): Representative: {group_data['representative']} (Total: {group_data['count']} members)")
                for member_nh_id in group_data['members']:
                    member_organism, member_hit_id, _, member_accession, _ = final_labels_map.get(member_nh_id, ('UNKNOWN', 'UNKNOWN', None, 'UNKNOWN', None))
                    write_and_print_to_file(f"    - {member_organism} (Accession: {member_accession}) (Internal ID: {member_hit_id}) (NH ID: {member_nh_id})")
            write_and_print_to_file("-------------------------------------------------------")
    else:
        write_and_print_to_file("\nNo clusters formed at all. This could mean your database is empty, or no features were extracted after filtering, or no valid SSN clusters with multiple neighborhoods were found.")

    write_and_print_to_file("\n--- Report End ---")


--- GNN Clustering Report (2025-09-03 00:16:05) ---
Database: 39150_OvoA_10k_Blast_search_noFilter_80ASTcolorized_onlyBigCluster-separated_10N.sqlite
Jaccard Distance Threshold: 0.5
Hit Gene Weight Factor: 10
Direct Neighbor Weight Factor: 3
Clustering differentiated by SSN Cluster ID (column: 'cluster_num').
Collapsing identical/similar neighborhoods enabled:
  Stage 1 (Hit+Direct Neighbor Core): Threshold 0.0
  Stage 2 (Full Neighborhood): Threshold 0.5
Distance calculation parallelism: Enabled (via joblib)
SciPy/NumPy internal parallelism (OMP_NUM_THREADS): 12
Original Input Sequence Accession ID for highlighting: 'A0A1I5R890' (colored 'red')
Plots saved to: gnn_cluster_plots_OvoA in ['pdf'] formats at 300 DPI.
Report saved to: gnn_cluster_plots_OvoA\gnn_clustering_report_ssn_differentiated.txt
----------------------------------------------------------------------
Fetching all hit gene data...
  Fetched 8384 hit genes in 0.16 seconds.
Fetching all neighbor data...
  Fetched 264203 

                                                                         

  Grouped neighbor data in 4.27 seconds.
Processing features for 8384 hit genes and their neighborhoods...


Extracting features: 100%|██████████| 8384/8384 [00:11<00:00, 719.96neighborhood/s] 


Finished feature extraction in 11.65 seconds.

--- Diagnostic: Raw SSN Cluster ID Distribution in 'attributes' table ---
  SSN ID '1': 2794 neighborhoods
  SSN ID '10': 1 neighborhoods
  SSN ID '11': 1 neighborhoods
  SSN ID '12': 1 neighborhoods
  SSN ID '13': 1 neighborhoods
  SSN ID '15': 1 neighborhoods
  SSN ID '16': 1 neighborhoods
  SSN ID '18': 1 neighborhoods
  SSN ID '19': 1 neighborhoods
  SSN ID '2': 2542 neighborhoods
  SSN ID '22': 1 neighborhoods
  SSN ID '24': 1 neighborhoods
  SSN ID '25': 1 neighborhoods
  SSN ID '3': 1711 neighborhoods
  SSN ID '34': 1 neighborhoods
  SSN ID '35': 1 neighborhoods
  SSN ID '37': 1 neighborhoods
  SSN ID '38': 1 neighborhoods
  SSN ID '39': 1 neighborhoods
  SSN ID '4': 682 neighborhoods
  SSN ID '40': 1 neighborhoods
  SSN ID '41': 1 neighborhoods
  SSN ID '42': 1 neighborhoods
  SSN ID '43': 1 neighborhoods
  SSN ID '44': 1 neighborhoods
  SSN ID '45': 1 neighborhoods
  SSN ID '46': 1 neighborhoods
  SSN ID '5': 403 neighborhoods
  S

                                                                                                        

  [Collapsing]  Stage 1: Sparse matrix built in 5.66 seconds. Shape: (8185, 22212), NNZ: 1103155
  [Collapsing]  Stage 1: Calculating core distances using scipy.pdist...
  Pre-calculating 8185 feature sets...


                                                                                             

  Feature set pre-calculation took 0.18 seconds.
  Running Jaccard distance in parallel for N=8185 using 12 cores.
  [Collapsing]  Stage 1: Core distance calculation took 169.00 seconds.
  [Collapsing]  Stage 1: Performing linkage and clustering for core features...
  [Collapsing]  Stage 1: Linkage took 1.60 seconds.
  [Collapsing]  Stage 1: Grouped into 7992 initial core groups based on a threshold of 0.0. Total stage 1 took 176.68 seconds.
  [Collapsing]  Stage 2: Processing 7992 core groups for full neighborhood similarity...
  [Collapsing]  Stage 2: Distributing 7992 core groups among 12 workers (parallelized).
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:   15.9s
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:   18.8s remaining:  1.6min
[Parallel(n_jobs=12)]: Done   3 out of  12 | elapsed:   21.5s remaining:  1.1min
[Parallel(n_jobs=12)]: Done   4 out of  12 | elapsed:   26.7s remaining:   5

  Processing SSN clusters:   0%|          | 0/30 [00:00<?, ?cluster/s]


--- Processing SSN Cluster: 1 (contains 2658 neighborhoods) ---




  Feature vector creation for 2658 neighborhoods (95376 features) took 7.27 seconds.
  Matrix shape: (2658, 95376), NNZ: 1544629
  Calculating distances for 2658 neighborhoods...
  Pre-calculating 2658 feature sets...




  Feature set pre-calculation took 0.20 seconds.
  Running Jaccard distance in parallel for N=2658 using 12 cores.
  Distance calculation took 202.77 seconds.
  Performing linkage for 2658 neighborhoods...
  Linkage calculation took 0.11 seconds.
  Generating dendrogram plots...
  Plotting took 244.91 seconds.
--- Finished SSN Cluster 1 in 252.67 seconds. ---


  Processing SSN clusters:   3%|▎         | 1/30 [04:13<2:02:19, 253.09s/cluster]


--- Processing SSN Cluster: 3 (contains 1575 neighborhoods) ---




  Feature vector creation for 1575 neighborhoods (78558 features) took 4.43 seconds.
  Matrix shape: (1575, 78558), NNZ: 991070
  Calculating distances for 1575 neighborhoods...
  Pre-calculating 1575 feature sets...




  Feature set pre-calculation took 0.13 seconds.
  Running Jaccard distance in parallel for N=1575 using 12 cores.
  Distance calculation took 123.33 seconds.
  Performing linkage for 1575 neighborhoods...
  Linkage calculation took 0.03 seconds.
  Generating dendrogram plots...
  Plotting took 147.88 seconds.
--- Finished SSN Cluster 3 in 152.83 seconds. ---


  Processing SSN clusters:   7%|▋         | 2/30 [06:46<1:30:43, 194.39s/cluster]


--- Processing SSN Cluster: 38 (contains 1 neighborhoods) ---
  Skipping group 38: Not enough distinct neighborhoods (1) for clustering. Requires at least 2.

--- Processing SSN Cluster: 5 (contains 398 neighborhoods) ---




  Feature vector creation for 398 neighborhoods (29574 features) took 0.89 seconds.
  Matrix shape: (398, 29574), NNZ: 188375
  Calculating distances for 398 neighborhoods...
  Pre-calculating 398 feature sets...




  Feature set pre-calculation took 0.03 seconds.
  Running Jaccard distance in parallel for N=398 using 12 cores.
  Distance calculation took 20.02 seconds.
  Performing linkage for 398 neighborhoods...
  Linkage calculation took 0.00 seconds.
  Generating dendrogram plots...
  Plotting took 26.51 seconds.
--- Finished SSN Cluster 5 in 27.87 seconds. ---


  Processing SSN clusters:  13%|█▎        | 4/30 [07:14<35:16, 81.40s/cluster]   


--- Processing SSN Cluster: 6 (contains 222 neighborhoods) ---




  Feature vector creation for 222 neighborhoods (26337 features) took 0.49 seconds.
  Matrix shape: (222, 26337), NNZ: 111830
  Calculating distances for 222 neighborhoods...
  Pre-calculating 222 feature sets...




  Feature set pre-calculation took 0.02 seconds.
  Running Jaccard distance in parallel for N=222 using 12 cores.
  Distance calculation took 11.62 seconds.
  Performing linkage for 222 neighborhoods...
  Linkage calculation took 0.00 seconds.
  Generating dendrogram plots...
  Plotting took 15.61 seconds.
--- Finished SSN Cluster 6 in 16.59 seconds. ---


  Processing SSN clusters:  17%|█▋        | 5/30 [07:31<25:37, 61.50s/cluster]


--- Processing SSN Cluster: 4 (contains 674 neighborhoods) ---




  Feature vector creation for 674 neighborhoods (47779 features) took 1.80 seconds.
  Matrix shape: (674, 47779), NNZ: 404965
  Calculating distances for 674 neighborhoods...
  Pre-calculating 674 feature sets...




  Feature set pre-calculation took 0.06 seconds.
  Running Jaccard distance in parallel for N=674 using 12 cores.
  Distance calculation took 43.76 seconds.
  Performing linkage for 674 neighborhoods...
  Linkage calculation took 0.01 seconds.
  Generating dendrogram plots...
  Plotting took 54.35 seconds.
--- Finished SSN Cluster 4 in 56.70 seconds. ---


  Processing SSN clusters:  20%|██        | 6/30 [08:29<24:04, 60.20s/cluster]


--- Processing SSN Cluster: 2 (contains 2441 neighborhoods) ---




  Feature vector creation for 2441 neighborhoods (60834 features) took 7.10 seconds.
  Matrix shape: (2441, 60834), NNZ: 1636185
  Calculating distances for 2441 neighborhoods...
  Pre-calculating 2441 feature sets...




  Feature set pre-calculation took 0.18 seconds.
  Running Jaccard distance in parallel for N=2441 using 12 cores.
  Distance calculation took 204.00 seconds.
  Performing linkage for 2441 neighborhoods...
  Linkage calculation took 0.09 seconds.
  Generating dendrogram plots...
  Plotting took 243.63 seconds.
--- Finished SSN Cluster 2 in 251.44 seconds. ---


  Processing SSN clusters: 100%|██████████| 30/30 [12:41<00:00, 25.37s/cluster]



--- Processing SSN Cluster: 13 (contains 1 neighborhoods) ---
  Skipping group 13: Not enough distinct neighborhoods (1) for clustering. Requires at least 2.

--- Processing SSN Cluster: 9 (contains 1 neighborhoods) ---
  Skipping group 9: Not enough distinct neighborhoods (1) for clustering. Requires at least 2.

--- Processing SSN Cluster: 11 (contains 1 neighborhoods) ---
  Skipping group 11: Not enough distinct neighborhoods (1) for clustering. Requires at least 2.

--- Processing SSN Cluster: 19 (contains 1 neighborhoods) ---
  Skipping group 19: Not enough distinct neighborhoods (1) for clustering. Requires at least 2.

--- Processing SSN Cluster: 46 (contains 1 neighborhoods) ---
  Skipping group 46: Not enough distinct neighborhoods (1) for clustering. Requires at least 2.

--- Processing SSN Cluster: 37 (contains 1 neighborhoods) ---
  Skipping group 37: Not enough distinct neighborhoods (1) for clustering. Requires at least 2.

--- Processing SSN Cluster: 15 (contains 1 neig