In [2]:


# Utility function to properly load parquet with spot names
def load_parquet_with_spots(file_path, n_rows=None):
    """
    Load a parquet file into a DataFrame and use ``spot_id`` as its index.

    Parameters:
    - file_path: Path to the parquet file
    - n_rows: Maximum number of rows to read from the start of the file.
      Any falsy value (None, 0) reads the whole file, which can be very
      memory-hungry for large matrices.

    Returns:
    - DataFrame indexed by ``spot_id`` when that column exists; otherwise the
      frame is returned with the index produced by the reader and a message
      listing the available columns is printed.
    """
    if n_rows:
        # Stream record batches and stop as soon as n_rows rows are collected.
        # iter_batches treats batch_size as an upper bound, so the first batch
        # alone may hold fewer rows than requested even if the file has more.
        parquet_file = pq.ParquetFile(file_path)
        batches = []
        remaining = n_rows
        for batch in parquet_file.iter_batches(batch_size=n_rows):
            batch = batch.slice(0, remaining)
            batches.append(batch)
            remaining -= batch.num_rows
            if remaining <= 0:
                break

        if batches:
            table = pa.Table.from_batches(batches)
        else:
            # No batch at all (file without rows): keep the column layout
            # instead of failing with StopIteration.
            table = parquet_file.schema_arrow.empty_table()
        df = table.to_pandas()
    else:
        # Read full file (use with caution!)
        df = pd.read_parquet(file_path)

    # Set spot_id as index if it exists
    if 'spot_id' in df.columns:
        df = df.set_index('spot_id')
        print(f"✅ Set spot_id as index. Sample spot names: {df.index[:3].tolist()}")
    else:
        print(f"❌ No spot_id column found. Available columns: {df.columns.tolist()}")

    return df



In [1]:
# Imports for memory-efficient (batched) parquet reading with pyarrow
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
import pandas as pd

In [18]:
# Inputs for the test run: the filtered counts matrix (parquet) and the
# per-spot tissue labels (CSV, first column used as the index)
file_path = "/storage/research/dbmr_luisierlab/temp/lfournier/repositories/TumorArchetype-FM/results/molecular/TNBC_processed/TNBC_filtered_counts.parquet"
annotation = pd.read_csv(
    "/storage/research/dbmr_luisierlab/temp/lfournier/repositories/TumorArchetype-FM/results/compute_patches/TNBC/spots_labels.csv",
    index_col=0,
)

In [21]:
# Memory-efficient tissue-specific spot loading
def load_parquet_by_tissue_chunks(file_path, annotation, target_tissues, chunk_size=1000):
    """
    Load only spots from specific tissue types using chunked processing.

    The parquet file is streamed in record batches; each batch's ``spot_id``
    values are rewritten (text after 'parquet.' is kept, every 'X' becomes
    'spot') before being matched against the annotation index, so only the
    matching rows are kept in memory.

    Parameters:
    - file_path: Path to parquet file
    - annotation: DataFrame with spot annotations (index = spot names, 'label' column = tissue type)
    - target_tissues: List of tissue types to load (e.g., ['adipose tissue']);
      a single tissue name given as a string is also accepted
    - chunk_size: Number of rows to process per chunk

    Returns:
    - DataFrame indexed by spot_id with only spots from target tissues, or an
      empty DataFrame when no tissue/spot matches or the file has no
      ``spot_id`` column
    """
    # Series.isin rejects a bare string, so wrap a single tissue name.
    if isinstance(target_tissues, str):
        target_tissues = [target_tissues]

    # Get target spot names
    target_spots = set(annotation[annotation['label'].isin(target_tissues)].index)
    print(f"Target tissues: {target_tissues}")
    print(f"Found {len(target_spots)} spots in target tissues")

    if len(target_spots) == 0:
        print("❌ No spots found for target tissues!")
        return pd.DataFrame()

    # Process file in chunks
    parquet_file = pq.ParquetFile(file_path)
    matching_chunks = []
    total_processed = 0

    print(f"Processing parquet file in chunks of {chunk_size}...")

    for i, batch in enumerate(parquet_file.iter_batches(batch_size=chunk_size)):
        chunk_df = batch.to_pandas()
        total_processed += len(chunk_df)

        # The column check has to come before any access to chunk_df['spot_id'];
        # without the column no chunk can ever match, so stop right away.
        if 'spot_id' not in chunk_df.columns:
            print(f"❌ No spot_id column found. Available columns: {chunk_df.columns.tolist()}")
            return pd.DataFrame()

        # Rewrite raw ids to the naming used by the annotation index.
        # NOTE(review): assumes every raw id contains 'parquet.' and that 'X'
        # only occurs as the spot prefix — confirm against the source file.
        chunk_df['spot_id'] = chunk_df['spot_id'].apply(
            lambda raw_id: raw_id.split('parquet.')[1].replace('X', 'spot')
        )

        # Filter for target spots
        matching_spots = chunk_df[chunk_df['spot_id'].isin(target_spots)]
        if len(matching_spots) > 0:
            matching_chunks.append(matching_spots.set_index('spot_id'))
            print(f"  Chunk {i+1}: Found {len(matching_spots)} matching spots")

        # Progress update every 10 chunks
        if (i + 1) % 10 == 0:
            print(f"  Processed {total_processed} rows...")

    if not matching_chunks:
        print("❌ No matching spots found in any chunk!")
        return pd.DataFrame()

    # Combine all matching chunks
    result = pd.concat(matching_chunks, axis=0)
    print(f"✅ Successfully loaded {len(result)} spots from {len(matching_chunks)} chunks")

    # Show tissue distribution
    result_annotation = annotation.loc[result.index]
    tissue_counts = result_annotation['label'].value_counts()
    print("Tissue distribution in result:")
    for tissue, count in tissue_counts.items():
        print(f"  {tissue}: {count} spots")

    return result


def load_adipose_tissue_spots(file_path, annotation, chunk_size=1000):
    """
    Shortcut for loading the spots labelled 'adipose tissue'.

    Delegates to load_parquet_by_tissue_chunks with a fixed one-element
    tissue list; file_path, annotation and chunk_size are forwarded as given.
    """
    adipose_only = ['adipose tissue']
    return load_parquet_by_tissue_chunks(
        file_path,
        annotation,
        adipose_only,
        chunk_size,
    )


def load_multiple_tissues(file_path, annotation, tissue_list, chunk_size=1000):
    """
    Shortcut for loading spots that belong to any of several tissue types.

    Parameters:
    - file_path: Path to parquet file (forwarded unchanged)
    - annotation: Spot annotation DataFrame (forwarded unchanged)
    - tissue_list: List of tissue types (e.g., ['adipose tissue', 'connective tissue'])
    - chunk_size: Number of rows to process per chunk (forwarded unchanged)

    Returns:
    - Whatever load_parquet_by_tissue_chunks returns for these arguments
    """
    forwarded = (file_path, annotation, tissue_list, chunk_size)
    return load_parquet_by_tissue_chunks(*forwarded)

In [22]:
# Test: load several tissue types in a single pass over the parquet file
print("\n=== LOADING MULTIPLE TISSUE TYPES ===")

# Report how many spots carry each label
print("Available tissue types:")
tissue_counts = annotation['label'].value_counts()
print(tissue_counts)

# Request every annotated tissue type, i.e. all labels except 'undetermined'
multiple_tissues = list(
    annotation[annotation['label'] != 'undetermined']['label'].unique()
)
multi_tissue_spots = load_multiple_tissues(
    file_path,
    annotation,
    multiple_tissues,
    chunk_size=500,
)

print(f"\nMulti-tissue result shape: {multi_tissue_spots.shape}")
if len(multi_tissue_spots) > 0:
    print("Sample spot names:", multi_tissue_spots.index[:3].tolist())


=== LOADING MULTIPLE TISSUE TYPES ===
Available tissue types:
undetermined         53782
connective tissue    30330
invasive cancer      12106
adipose tissue        3716
immune infiltrate     1873
cancer in situ          96
breast glands           51
Name: label, dtype: int64
Target tissues: ['adipose tissue', 'connective tissue', 'invasive cancer', 'immune infiltrate', 'breast glands', 'cancer in situ']
Found 48172 spots in target tissues
Processing parquet file in chunks of 500...
  Chunk 1: Found 280 matching spots
  Chunk 2: Found 218 matching spots
  Chunk 3: Found 342 matching spots
  Chunk 4: Found 148 matching spots
  Chunk 5: Found 172 matching spots
  Chunk 6: Found 300 matching spots
  Chunk 7: Found 320 matching spots
  Chunk 8: Found 286 matching spots
  Chunk 9: Found 336 matching spots
  Chunk 10: Found 326 matching spots
  Processed 5000 rows...
  Chunk 11: Found 323 matching spots
  Chunk 12: Found 316 matching spots
  Chunk 13: Found 157 matching spots
  Chunk 14: Fo

In [23]:
# Persist the tissue-filtered spots as a parquet file in the processed-data folder
multi_tissue_spots.to_parquet(
    "/storage/research/dbmr_luisierlab/temp/lfournier/repositories/TumorArchetype-FM/results/molecular/TNBC_processed/TNBC_filtered_counts_annotated_spots.parquet"
)

In [3]:
# Read the normalized counts of the annotated spots into memory.
# Note: `normalized_counts` holds the file path; the data lives in `filtered_normalized`.
normalized_counts = "/storage/research/dbmr_luisierlab/temp/lfournier/repositories/TumorArchetype-FM/results/molecular/TNBC_processed/TNBC_filtered_normalized_counts_annotated_spots.parquet"
filtered_normalized = pd.read_parquet(
    normalized_counts,
)

In [4]:
# Show the loaded frame as the cell output (rows are indexed by spot_id)
filtered_normalized

Unnamed: 0_level_0,A2M,A2ML1,A4GALT,AAAS,AACS,AAGAB,AAK1,AAMDC,AAMP,AAR2,...,LHX2,C12orf56,GATA5,HECW1,KRT6C,PRKG2,CLEC4D,HSPB9,UTS2,MTMR8
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TNBC1_spot2x18,0.000000,0.0,0.0,0.000000,0.0,0.0,1.371040,0.000000,1.928458,2.284252,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TNBC1_spot2x20,1.740722,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,1.740722,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TNBC1_spot2x22,1.578796,0.0,0.0,0.000000,0.0,0.0,1.578796,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TNBC1_spot2x24,1.589578,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,1.082089,1.082089,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TNBC1_spot2x26,2.371941,0.0,0.0,0.000000,0.0,0.0,0.000000,0.870586,1.328830,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TNBC96_spot64x22,2.977004,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TNBC96_spot64x24,3.044522,0.0,0.0,0.000000,0.0,0.0,0.000000,1.466337,0.000000,1.466337,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TNBC96_spot64x26,2.958109,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,1.537362,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TNBC96_spot64x28,2.826836,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
