In [43]:
import os
import torch
import glob
from tqdm.auto import tqdm
from torch_geometric.data import Data

# --- CONFIGURATION ---
# Please confirm these paths and strings are correct for your environment.

# 1. INPUT: Directory with your original graph files
ORIGINAL_GRAPH_DIR = "/cwStorage/nodecw_group/jijh/hest_graph_data_pca50_knn6"

# 2. OUTPUT: A new directory where the augmented graphs will be saved.
OUTPUT_DIR = "/cwStorage/nodecw_group/jijh/hest_graph_data_with_gigapath_y_resilient"

# 3. PATH REPLACEMENT LOGIC:
LATENT_PATH_SUBSTRING_TO_REPLACE = 'hest_output_latents_bf16'
GIGAPATH_PATH_REPLACEMENT_SUBSTRING = 'hest_output_gigapath_features_bf16_notebook'

print("--- Configuration ---")
print(f"Original Graphs Path: {ORIGINAL_GRAPH_DIR}")
print(f"Output (Augmented) Graphs Path: {OUTPUT_DIR}")
print(f"Path replacement: '{LATENT_PATH_SUBSTRING_TO_REPLACE}' -> '{GIGAPATH_PATH_REPLACEMENT_SUBSTRING}'")
print("-" * 20)

# Validate input path
if not os.path.isdir(ORIGINAL_GRAPH_DIR):
    raise FileNotFoundError(f"The specified ORIGINAL_GRAPH_DIR does not exist: {ORIGINAL_GRAPH_DIR}")

--- Configuration ---
Original Graphs Path: /cwStorage/nodecw_group/jijh/hest_graph_data_pca50_knn6
Output (Augmented) Graphs Path: /cwStorage/nodecw_group/jijh/hest_graph_data_with_gigapath_y_resilient
Path replacement: 'hest_output_latents_bf16' -> 'hest_output_gigapath_features_bf16_notebook'
--------------------


In [44]:
def resilient_preprocess_graphs(original_graph_dir, output_dir,
                                str_to_replace, str_replacement):
    """
    Reads original graphs, derives GigaPath feature paths, and saves new graphs
    with features attached as `data.y`.

    NEW BEHAVIOR: If a graph contains any `None` or empty `latent_paths`,
    it will be SKIPPED, a warning will be printed, and processing will continue.
    """
    print(f"Starting resilient preprocessing. Invalid graphs will be skipped.")
    os.makedirs(output_dir, exist_ok=True)

    all_graph_files = sorted(glob.glob(os.path.join(original_graph_dir, "*_graph.pt")))
    if not all_graph_files:
        print("ERROR: No graph files found. Nothing to process.")
        return

    # Counters for the final summary
    processed_count = 0
    skipped_count = 0

    # Main loop with progress bar
    for graph_path in tqdm(all_graph_files, desc="Processing Graphs"):
        graph_data = torch.load(graph_path, map_location='cpu', weights_only=False)
        
        if not hasattr(graph_data, 'latent_paths'):
            print(f"WARNING: Skipping graph '{os.path.basename(graph_path)}' because it lacks the 'latent_paths' attribute.")
            skipped_count += 1
            continue
            
        gigapath_features_for_this_graph = []
        is_graph_valid = True # Assume the graph is valid until a problem is found

        # Iterate over paths to check for validity and load features
        for node_idx, original_latent_path in enumerate(graph_data.latent_paths):
            
            # THE KEY CHANGE IS HERE: Check for invalid path
            if not original_latent_path:
                print(f"WARNING: Skipping graph '{os.path.basename(graph_path)}' due to None or empty path at node index {node_idx}.")
                is_graph_valid = False
                skipped_count += 1
                break # Stop processing nodes for this invalid graph
            
            # Derive the expected path via string replacement
            expected_gp_feature_path = original_latent_path.replace(str_to_replace, str_replacement)
            
            # This check is still important for files that have a path but the target does not exist.
            if not os.path.exists(expected_gp_feature_path):
                print(f"WARNING: Skipping graph '{os.path.basename(graph_path)}' because derived feature file does not exist for node {node_idx}.")
                print(f"  --> Missing file: {expected_gp_feature_path}")
                is_graph_valid = False
                skipped_count += 1
                break

            # If all checks pass, load the feature
            feature_tensor = torch.load(expected_gp_feature_path, map_location='cpu', weights_only=False)
            gigapath_features_for_this_graph.append(feature_tensor)

        # After checking all nodes, only save if the graph was deemed valid
        if is_graph_valid:
            try:
                augmented_graph_data = graph_data.clone()
                augmented_graph_data.y = torch.stack(gigapath_features_for_this_graph, dim=0).float()
                
                output_path = os.path.join(output_dir, os.path.basename(graph_path))
                torch.save(augmented_graph_data, output_path)
                processed_count += 1
            except Exception as e:
                # Catch potential errors during stacking or saving
                print(f"ERROR: Failed to save '{os.path.basename(graph_path)}' even after validation. Error: {e}")
                skipped_count += 1


    # Final Summary Report
    print("\n" + "="*50)
    print("--- Preprocessing Finished ---")
    print(f"Successfully processed and saved: {processed_count} graphs.")
    print(f"Skipped due to errors: {skipped_count} graphs.")
    print(f"Total graphs analyzed: {len(all_graph_files)}")
    print("="*50 + "\n")

In [45]:
# The FutureWarnings are expected and can be ignored as we trust our data source.
# Let's run the new, more robust function.
resilient_preprocess_graphs(
    original_graph_dir=ORIGINAL_GRAPH_DIR,
    output_dir=OUTPUT_DIR,
    str_to_replace=LATENT_PATH_SUBSTRING_TO_REPLACE,
    str_replacement=GIGAPATH_PATH_REPLACEMENT_SUBSTRING
)

Starting resilient preprocessing. Invalid graphs will be skipped.


Processing Graphs:   0%|          | 0/209 [00:00<?, ?it/s]


--- Preprocessing Finished ---
Successfully processed and saved: 172 graphs.
Skipped due to errors: 37 graphs.
Total graphs analyzed: 209



# Test Code

In [6]:
samples = os.listdir(GIGAPATH_FEATURE_DIR)
graphs = os.listdir(ORIGINAL_GRAPH_DIR)

In [24]:
test_sample_id = "MEND130"

In [25]:
test_graph = f"{test_sample_id}_graph.pt"
# Read the graph
test_graph_path = os.path.join(ORIGINAL_GRAPH_DIR, test_graph)
if test_graph not in graphs:
    print(f"Graph {test_graph} not found in {ORIGINAL_GRAPH_DIR}.")
else:
    test_graph_data = torch.load(test_graph_path, map_location='cpu')
    print(f"Loaded graph {test_graph} with {test_graph_data.num_nodes} nodes and {test_graph_data.num_edges} edges.")
    # Check if the sample ID exists in the GigaPath features
    if test_sample_id not in samples:
        print(f"Sample ID {test_sample_id} not found in GigaPath features directory {GIGAPATH_FEATURE_DIR}.")
    else:
        print(f"Sample ID {test_sample_id} exists in GigaPath features directory.")
        # Check if the expected feature file exists
        expected_feature_filename = f"{test_sample_id}_0_0.pt"
        expected_feature_path = os.path.join(GIGAPATH_FEATURE_DIR, f"{test_sample_id}_tiles", expected_feature_filename)
        if os.path.exists(expected_feature_path):
            print(f"Expected feature file {expected_feature_filename} exists.")
        else:
            print(f"Expected feature file {expected_feature_filename} does NOT exist at {expected_feature_path}.")

Loaded graph MEND130_graph.pt with 4557 nodes and 24592 edges.
Sample ID MEND130 not found in GigaPath features directory /cwStorage/nodecw_group/jijh/hest_output_gigapath_features_bf16_notebook.


  test_graph_data = torch.load(test_graph_path, map_location='cpu')


In [27]:
test_graph_data

Data(x=[4557, 50], edge_index=[2, 24592], coords=[4557, 2], spot_ids=[4557], latent_paths=[4557], sample_id='MEND130')

In [15]:
# Replace the substring 'hest_output_latents_bf16' in the "latent_paths" with "hest_output_gigapath_features_bf16_notebook"
test_graph_data.gigapath_paths = [
    path.replace('hest_output_latents_bf16', 'hest_output_gigapath_features_bf16_notebook') 
    for path in test_graph_data.latent_paths
]

In [17]:
# Test whether the gigapath_paths files exist
missing_files = []
existing_files = []

for i, gp_path in enumerate(test_graph_data.gigapath_paths):
    if os.path.exists(gp_path):
        existing_files.append(gp_path)
    else:
        missing_files.append((i, gp_path))

print(f"Total gigapath_paths: {len(test_graph_data.gigapath_paths)}")
print(f"Existing files: {len(existing_files)}")
print(f"Missing files: {len(missing_files)}")

if missing_files:
    print(f"\nFirst 5 missing files:")
    for i, (node_idx, path) in enumerate(missing_files[:5]):
        print(f"  Node {node_idx}: {path}")
    if len(missing_files) > 5:
        print(f"  ... and {len(missing_files) - 5} more")
else:
    print("\nAll gigapath feature files exist!")


Total gigapath_paths: 546
Existing files: 546
Missing files: 0

All gigapath feature files exist!


In [34]:
import os
import torch

# --- Debugging Configuration ---
# Set the specific file and node that caused the error.
TEST_GRAPH_FILENAME = "MEND130_graph.pt"
PROBLEM_NODE_INDEX = 552

# Use the same graph directory from your main script
ORIGINAL_GRAPH_DIR = "/cwStorage/nodecw_group/jijh/hest_graph_data_pca50_knn6"
# -----------------------------

test_graph_path = os.path.join(ORIGINAL_GRAPH_DIR, TEST_GRAPH_FILENAME)

print(f"--- Starting Inspection for: {TEST_GRAPH_FILENAME} ---")

if not os.path.exists(test_graph_path):
    print(f"\nERROR: The test graph file was not found at the specified path.")
    print(f"Path: {test_graph_path}")
else:
    print(f"Loading graph file: {test_graph_path}")
    # We ignore the Future-Warning as it's not the error source
    graph_data = torch.load(test_graph_path, map_location='cpu')
    print("Graph loaded successfully.")
    
    # 1. Check if the `latent_paths` attribute exists
    if not hasattr(graph_data, 'latent_paths'):
        print("\nFATAL ISSUE: The loaded graph object does NOT have a 'latent_paths' attribute!")
    else:
        print(f"\nFound 'latent_paths' attribute with {len(graph_data.latent_paths)} entries.")
        print(f"Graph has {graph_data.num_nodes} nodes.")
        if len(graph_data.latent_paths) != graph_data.num_nodes:
            print("WARNING: The number of latent paths does not match the number of nodes!")

        # 2. Find ALL problematic (None or empty) paths in this file
        problematic_indices = []
        for i, path in enumerate(graph_data.latent_paths):
            if not path: # This checks for both None and empty strings ""
                problematic_indices.append(i)
        
        if not problematic_indices:
            print("\nCONCLUSION: No 'None' or empty latent paths were found in this file. The issue might be different.")
        else:
            print(f"\nISSUE FOUND: Found {len(problematic_indices)} 'None' or empty entries in 'latent_paths'.")
            print(f"Indices with issues are: {problematic_indices}")

            # 3. Zoom in on the specific problematic node (552)
            print(f"\n--- Detailed view around the reported problem node: {PROBLEM_NODE_INDEX} ---")
            
            # Define a small window to view context around the problem index
            start = max(0, PROBLEM_NODE_INDEX - 2)
            end = min(len(graph_data.latent_paths), PROBLEM_NODE_INDEX + 3)

            for i in range(start, end):
                path_value = graph_data.latent_paths[i]
                prefix = f"  Node {i}:"
                if i == PROBLEM_NODE_INDEX:
                    # Highlight the specific problematic entry
                    print(f"{prefix} >> {path_value} <<  <-- THIS IS THE PROBLEM")
                else:
                    print(f"{prefix}    {path_value}")
            print("-" * 20)

--- Starting Inspection for: MEND130_graph.pt ---
Loading graph file: /cwStorage/nodecw_group/jijh/hest_graph_data_pca50_knn6/MEND130_graph.pt
Graph loaded successfully.

Found 'latent_paths' attribute with 4557 entries.
Graph has 4557 nodes.

ISSUE FOUND: Found 7 'None' or empty entries in 'latent_paths'.
Indices with issues are: [552, 1333, 2546, 2702, 2942, 3249, 4012]

--- Detailed view around the reported problem node: 552 ---
  Node 550:    /cwStorage/nodecw_group/jijh/hest_output_latents_bf16/MEND130_tiles/MEND130_13221_23758.pt
  Node 551:    /cwStorage/nodecw_group/jijh/hest_output_latents_bf16/MEND130_tiles/MEND130_33879_5517.pt
  Node 552: >> None <<  <-- THIS IS THE PROBLEM
  Node 553:    /cwStorage/nodecw_group/jijh/hest_output_latents_bf16/MEND130_tiles/MEND130_39957_27556.pt
  Node 554:    /cwStorage/nodecw_group/jijh/hest_output_latents_bf16/MEND130_tiles/MEND130_24810_13624.pt
--------------------


  graph_data = torch.load(test_graph_path, map_location='cpu')


In [35]:
# Check if other attributes for node 552 are also problematic
node_idx = 552

print(f"--- Checking all attributes for node {node_idx} ---")

# Check x (node features)
if hasattr(graph_data, 'x') and graph_data.x is not None:
    if node_idx < graph_data.x.shape[0]:
        node_features = graph_data.x[node_idx]
        print(f"Node features (x): shape={node_features.shape}, all zeros: {torch.all(node_features == 0).item()}")
        print(f"  First 5 values: {node_features[:5]}")
    else:
        print(f"Node {node_idx} is out of bounds for x tensor (size: {graph_data.x.shape[0]})")
else:
    print("Graph has no 'x' attribute or it's None")

# Check coords
if hasattr(graph_data, 'coords') and graph_data.coords is not None:
    if node_idx < graph_data.coords.shape[0]:
        node_coords = graph_data.coords[node_idx]
        print(f"Node coordinates: {node_coords}")
    else:
        print(f"Node {node_idx} is out of bounds for coords tensor (size: {graph_data.coords.shape[0]})")
else:
    print("Graph has no 'coords' attribute or it's None")

# Check spot_ids
if hasattr(graph_data, 'spot_ids') and graph_data.spot_ids is not None:
    if node_idx < len(graph_data.spot_ids):
        spot_id = graph_data.spot_ids[node_idx]
        print(f"Spot ID: {spot_id}")
    else:
        print(f"Node {node_idx} is out of bounds for spot_ids (length: {len(graph_data.spot_ids)})")
else:
    print("Graph has no 'spot_ids' attribute or it's None")

# Check latent_paths (we already know this is None)
if hasattr(graph_data, 'latent_paths') and graph_data.latent_paths is not None:
    if node_idx < len(graph_data.latent_paths):
        latent_path = graph_data.latent_paths[node_idx]
        print(f"Latent path: {latent_path}")
    else:
        print(f"Node {node_idx} is out of bounds for latent_paths (length: {len(graph_data.latent_paths)})")
else:
    print("Graph has no 'latent_paths' attribute or it's None")

print(f"\n--- Summary for node {node_idx} ---")

--- Checking all attributes for node 552 ---
Node features (x): shape=torch.Size([50]), all zeros: False
  First 5 values: tensor([ 36.2581, -12.4293,  16.5186,   1.6257,   0.3748])
Node coordinates: tensor([37961.9297, 12732.5000])
Spot ID: MEND130_552
Latent path: None

--- Summary for node 552 ---


In [38]:
# Compare the latent_paths and the files' paths in the /cwStorage/nodecw_group/jijh/hest_output_latents_bf16/MEND130_tiles directory to see the discrepancy
# Get the directory path for MEND130 latent files
latent_dir = "/cwStorage/nodecw_group/jijh/hest_output_latents_bf16/MEND130_tiles"

print(f"--- Comparing latent_paths vs actual files in {latent_dir} ---")

# Get all actual files in the directory
if os.path.exists(latent_dir):
    actual_files = sorted(os.listdir(latent_dir))
    print(f"Found {len(actual_files)} actual files in the directory")
else:
    print(f"ERROR: Directory {latent_dir} does not exist!")
    actual_files = []

# Extract expected filenames from latent_paths (non-None entries only)
expected_files = []
valid_latent_paths = []
for i, path in enumerate(graph_data.latent_paths):
    if path:  # Skip None entries
        filename = os.path.basename(path)
        expected_files.append(filename)
        valid_latent_paths.append(path)

expected_files = sorted(expected_files)
print(f"Expected {len(expected_files)} files from valid latent_paths")

# Find missing and extra files
missing_from_disk = set(expected_files) - set(actual_files)
extra_on_disk = set(actual_files) - set(expected_files)

print(f"\nFiles expected but missing from disk: {len(missing_from_disk)}")
if missing_from_disk:
    print("First 5 missing files:")
    for f in list(missing_from_disk)[:5]:
        print(f"  {f}")

print(f"\nFiles on disk but not in latent_paths: {len(extra_on_disk)}")
if extra_on_disk:
    print("First 10 extra files:")
    for f in list(extra_on_disk)[:10]:
        print(f"  {f}")

# Check the problematic indices to see what files they should point to
print(f"\n--- Checking what files the problematic nodes should point to ---")
for idx in problematic_indices[:5]:  # Check first 5 problematic nodes
    spot_id = graph_data.spot_ids[idx]
    coords = graph_data.coords[idx]
    
    # Try to find a matching file based on spot_id pattern
    potential_files = [f for f in actual_files if spot_id in f]
    
    print(f"Node {idx} (spot_id: {spot_id}, coords: {coords.tolist()}):")
    print(f"  latent_paths[{idx}]: {graph_data.latent_paths[idx]}")
    print(f"  Potential matching files: {potential_files}")

--- Comparing latent_paths vs actual files in /cwStorage/nodecw_group/jijh/hest_output_latents_bf16/MEND130_tiles ---
Found 4557 actual files in the directory
Expected 4550 files from valid latent_paths

Files expected but missing from disk: 0

Files on disk but not in latent_paths: 7
First 10 extra files:
  MEND130_38424_10451.pt
  MEND130_36912_11319.pt
  MEND130_12733_37962.pt
  MEND130_30361_16819.pt
  MEND130_35400_12187.pt
  MEND130_34397_16824.pt
  MEND130_35925_29871.pt

--- Checking what files the problematic nodes should point to ---
Node 552 (spot_id: MEND130_552, coords: [37961.9296875, 12732.5]):
  latent_paths[552]: None
  Potential matching files: []
Node 1333 (spot_id: MEND130_1333, coords: [16819.2109375, 30361.5]):
  latent_paths[1333]: None
  Potential matching files: []
Node 2546 (spot_id: MEND130_2546, coords: [16823.712890625, 34396.5]):
  latent_paths[2546]: None
  Potential matching files: []
Node 2702 (spot_id: MEND130_2702, coords: [11318.5, 36912.09765625]):


In [40]:
# Loop through all graph files to check for similar issues
output_file = "/cwStorage/nodecw_group/jijh/feature_path_discrepency.txt"

print(f"Checking all graph files for latent_paths issues...")
print(f"Results will be saved to: {output_file}")

with open(output_file, 'w') as f:
    f.write("Feature Path Discrepancy Analysis\n")
    f.write("="*50 + "\n\n")
    
    total_graphs = 0
    graphs_with_issues = 0
    total_problematic_nodes = 0
    
    for graph_file in tqdm(graphs, desc="Analyzing graphs"):
        graph_path = os.path.join(ORIGINAL_GRAPH_DIR, graph_file)
        
        try:
            graph_data = torch.load(graph_path, map_location='cpu')
            total_graphs += 1
            
            # Check if latent_paths attribute exists
            if not hasattr(graph_data, 'latent_paths'):
                message = f"CRITICAL: {graph_file} - Missing 'latent_paths' attribute\n"
                print(message.strip())
                f.write(message)
                graphs_with_issues += 1
                continue
            
            # Find problematic paths (None or empty)
            problematic_indices = []
            for i, path in enumerate(graph_data.latent_paths):
                if not path:  # None or empty string
                    problematic_indices.append(i)
            
            if problematic_indices:
                graphs_with_issues += 1
                total_problematic_nodes += len(problematic_indices)
                
                message = f"{graph_file}:\n"
                message += f"  Total nodes: {graph_data.num_nodes}\n"
                message += f"  Problematic nodes: {len(problematic_indices)}\n"
                message += f"  Problematic indices: {problematic_indices[:10]}"  # Show first 10
                if len(problematic_indices) > 10:
                    message += f" ... and {len(problematic_indices) - 10} more"
                message += "\n\n"
                
                print(f"{graph_file}: {len(problematic_indices)} problematic nodes")
                f.write(message)
            else:
                print(f"{graph_file}: OK")
                f.write(f"{graph_file}: No issues found\n")
                
        except Exception as e:
            error_message = f"ERROR loading {graph_file}: {str(e)}\n"
            print(error_message.strip())
            f.write(error_message)
    
    # Summary
    summary = f"\n{'='*50}\n"
    summary += f"SUMMARY:\n"
    summary += f"Total graphs analyzed: {total_graphs}\n"
    summary += f"Graphs with issues: {graphs_with_issues}\n"
    summary += f"Total problematic nodes: {total_problematic_nodes}\n"
    summary += f"Percentage of graphs with issues: {graphs_with_issues/total_graphs*100:.2f}%\n"
    
    print(summary)
    f.write(summary)

print(f"\nAnalysis complete. Results saved to: {output_file}")

Checking all graph files for latent_paths issues...
Results will be saved to: /cwStorage/nodecw_group/jijh/feature_path_discrepency.txt


Analyzing graphs:   0%|          | 0/209 [00:00<?, ?it/s]

TENX83_graph.pt: 20 problematic nodes
NCBI388_graph.pt: OK
NCBI528_graph.pt: OK
MEND71_graph.pt: OK
MEND46_graph.pt: OK
NCBI406_graph.pt: OK
NCBI349_graph.pt: OK


  graph_data = torch.load(graph_path, map_location='cpu')


NCBI531_graph.pt: OK
NCBI800_graph.pt: 4 problematic nodes
NCBI638_graph.pt: 1 problematic nodes
NCBI403_graph.pt: OK
NCBI340_graph.pt: OK
SPA6_graph.pt: OK
NCBI715_graph.pt: OK
MEND42_graph.pt: OK
NCBI402_graph.pt: OK
MEND130_graph.pt: 7 problematic nodes
ZEN60_graph.pt: 2 problematic nodes
NCBI654_graph.pt: OK
TENX85_graph.pt: 2 problematic nodes
TENX78_graph.pt: 20 problematic nodes
NCBI660_graph.pt: OK
NCBI367_graph.pt: OK
TENX43_graph.pt: OK
NCBI628_graph.pt: OK
NCBI720_graph.pt: OK
SPA12_graph.pt: OK
NCBI368_graph.pt: OK
SPA14_graph.pt: OK
NCBI342_graph.pt: OK
NCBI641_graph.pt: 3 problematic nodes
TENX61_graph.pt: OK
NCBI653_graph.pt: OK
TENX18_graph.pt: OK
SPA10_graph.pt: OK
NCBI348_graph.pt: OK
NCBI659_graph.pt: OK
MEND50_graph.pt: OK
NCBI397_graph.pt: OK
TENX84_graph.pt: 2 problematic nodes
MEND55_graph.pt: OK
NCBI339_graph.pt: OK
MISC9_graph.pt: 5 problematic nodes
TENX79_graph.pt: OK
TENX73_graph.pt: OK
SPA5_graph.pt: OK
NCBI640_graph.pt: 1 problematic nodes
MISC3_graph.pt: 

In [41]:
def analyze_sample_paths(sample_id, original_graph_dir=ORIGINAL_GRAPH_DIR):
    """
    Analyze latent_paths vs actual files for a specific sample.
    
    Args:
        sample_id (str): The sample ID to analyze (e.g., "MEND130")
        original_graph_dir (str): Directory containing the graph files
    """
    # Get the directory path for sample latent files
    latent_dir = f"/cwStorage/nodecw_group/jijh/hest_output_latents_bf16/{sample_id}_tiles"
    graph_file = f"{sample_id}_graph.pt"
    graph_path = os.path.join(original_graph_dir, graph_file)
    
    print(f"--- Analyzing sample: {sample_id} ---")
    print(f"Latent directory: {latent_dir}")
    print(f"Graph file: {graph_file}")
    
    # Load the graph
    if not os.path.exists(graph_path):
        print(f"ERROR: Graph file not found at {graph_path}")
        return
    
    graph_data = torch.load(graph_path, map_location='cpu')
    print(f"Loaded graph with {graph_data.num_nodes} nodes")
    
    # Get all actual files in the directory
    if os.path.exists(latent_dir):
        actual_files = sorted(os.listdir(latent_dir))
        print(f"Found {len(actual_files)} actual files in the directory")
    else:
        print(f"ERROR: Directory {latent_dir} does not exist!")
        return
    
    # Extract expected filenames from latent_paths (non-None entries only)
    expected_files = []
    valid_latent_paths = []
    problematic_indices = []
    
    for i, path in enumerate(graph_data.latent_paths):
        if path:  # Skip None entries
            filename = os.path.basename(path)
            expected_files.append(filename)
            valid_latent_paths.append(path)
        else:
            problematic_indices.append(i)
    
    expected_files = sorted(expected_files)
    print(f"Expected {len(expected_files)} files from valid latent_paths")
    print(f"Found {len(problematic_indices)} problematic (None/empty) latent_paths")
    
    # Find missing and extra files
    missing_from_disk = set(expected_files) - set(actual_files)
    extra_on_disk = set(actual_files) - set(expected_files)
    
    print(f"\nFiles expected but missing from disk: {len(missing_from_disk)}")
    if missing_from_disk:
        print("First 5 missing files:")
        for f in list(missing_from_disk)[:5]:
            print(f"  {f}")
    
    print(f"\nFiles on disk but not in latent_paths: {len(extra_on_disk)}")
    if extra_on_disk:
        print("First 10 extra files:")
        for f in list(extra_on_disk)[:10]:
            print(f"  {f}")
    
    # Check the problematic indices to see what files they should point to
    if problematic_indices:
        print(f"\n--- Checking what files the problematic nodes should point to ---")
        for idx in problematic_indices[:5]:  # Check first 5 problematic nodes
            spot_id = graph_data.spot_ids[idx]
            coords = graph_data.coords[idx]
            
            # Try to find a matching file based on spot_id pattern
            potential_files = [f for f in actual_files if spot_id in f]
            
            print(f"Node {idx} (spot_id: {spot_id}, coords: {coords.tolist()}):")
            print(f"  latent_paths[{idx}]: {graph_data.latent_paths[idx]}")
            print(f"  Potential matching files: {potential_files}")
    
    return {
        'sample_id': sample_id,
        'total_nodes': graph_data.num_nodes,
        'actual_files_count': len(actual_files),
        'expected_files_count': len(expected_files),
        'problematic_indices_count': len(problematic_indices),
        'missing_from_disk': missing_from_disk,
        'extra_on_disk': extra_on_disk,
        'problematic_indices': problematic_indices
    }

In [42]:
analyze_sample_paths("TENX159")

--- Analyzing sample: TENX159 ---
Latent directory: /cwStorage/nodecw_group/jijh/hest_output_latents_bf16/TENX159_tiles
Graph file: TENX159_graph.pt
Loaded graph with 3429 nodes


  graph_data = torch.load(graph_path, map_location='cpu')


Found 3043 actual files in the directory
Expected 2996 files from valid latent_paths
Found 433 problematic (None/empty) latent_paths

Files expected but missing from disk: 0

Files on disk but not in latent_paths: 47
First 10 extra files:
  TENX159_14277_6341.pt
  TENX159_14277_8895.pt
  TENX159_14277_16194.pt
  TENX159_14277_11085.pt
  TENX159_14277_5246.pt
  TENX159_14277_4881.pt
  TENX159_14277_5611.pt
  TENX159_14277_11450.pt
  TENX159_14277_14734.pt
  TENX159_14277_3422.pt

--- Checking what files the problematic nodes should point to ---
Node 0 (spot_id: TENX159_0, coords: [1961.8677978515625, -1050.4527587890625]):
  latent_paths[0]: None
  Potential matching files: []
Node 1 (spot_id: TENX159_1, coords: [2326.795166015625, -1050.4527587890625]):
  latent_paths[1]: None
  Potential matching files: ['TENX159_10262_10355.pt', 'TENX159_10262_10720.pt', 'TENX159_10262_11085.pt', 'TENX159_10262_11450.pt', 'TENX159_10262_11815.pt', 'TENX159_10262_12180.pt', 'TENX159_10262_1232.pt', 'T

{'sample_id': 'TENX159',
 'total_nodes': 3429,
 'actual_files_count': 3043,
 'expected_files_count': 2996,
 'problematic_indices_count': 433,
 'missing_from_disk': set(),
 'extra_on_disk': {'TENX159_14277_10355.pt',
  'TENX159_14277_10720.pt',
  'TENX159_14277_11085.pt',
  'TENX159_14277_11450.pt',
  'TENX159_14277_11815.pt',
  'TENX159_14277_12180.pt',
  'TENX159_14277_1232.pt',
  'TENX159_14277_12545.pt',
  'TENX159_14277_12910.pt',
  'TENX159_14277_13275.pt',
  'TENX159_14277_13640.pt',
  'TENX159_14277_137.pt',
  'TENX159_14277_14004.pt',
  'TENX159_14277_14369.pt',
  'TENX159_14277_14734.pt',
  'TENX159_14277_15099.pt',
  'TENX159_14277_15464.pt',
  'TENX159_14277_15829.pt',
  'TENX159_14277_1597.pt',
  'TENX159_14277_16194.pt',
  'TENX159_14277_16559.pt',
  'TENX159_14277_16924.pt',
  'TENX159_14277_1962.pt',
  'TENX159_14277_2327.pt',
  'TENX159_14277_2692.pt',
  'TENX159_14277_3057.pt',
  'TENX159_14277_3422.pt',
  'TENX159_14277_3787.pt',
  'TENX159_14277_4151.pt',
  'TENX159_