In [12]:
import pandas as pd
import h5py
import os 

In [39]:
# Input and output directories
input_dir = '/home/evos/Outputs/CRC/FFPE/CLAM_patch_features/h5_files'
output_dir = '/home/evos/Outputs/CRC/UNI_features_TCGA/FFPE_CLAM'


def build_tile_frame(slide_name, coords, features):
    """Assemble the per-tile feature table for a single slide.

    Parameters
    ----------
    slide_name : str
        Slide identifier, e.g. 'TCGA-4N-A93T-01Z-00-DX1'.
    coords : 2-D array
        One row per tile; column 0 is written to 'Coord_X' and column 1 to
        'Coord_Y'.
    features : 2-D array
        One row per tile; column ``i`` is written to 'Feature_{i}'.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: 'slide_submitter_id', 'Coord_X', 'Coord_Y',
        'Feature_0' ... 'Feature_{k-1}', 'tile_ID', 'sample_submitter_id',
        'Section'.

    Raises
    ------
    ValueError
        If ``coords`` and ``features`` do not have the same number of rows.
    """
    # Fail with a message that names the slide instead of relying on the
    # generic pandas "arrays must be of the same length" error.
    if len(coords) != len(features):
        raise ValueError(
            f"{slide_name}: coords has {len(coords)} rows but features has "
            f"{len(features)} rows"
        )

    data = {
        'slide_submitter_id': [slide_name] * len(coords),
        'Coord_X': coords[:, 0],
        'Coord_Y': coords[:, 1],
    }
    # One column per feature dimension.
    data.update({f'Feature_{i}': features[:, i] for i in range(features.shape[1])})

    tiles = pd.DataFrame(data)

    # Tile identifier: "<slide>_<x>_<y>".
    tiles['tile_ID'] = (
        f"{slide_name}_"
        + tiles['Coord_X'].astype(str)
        + '_'
        + tiles['Coord_Y'].astype(str)
    )

    # Both IDs below are constant within a slide, so derive them once from the
    # slide name. Deriving them from 'tile_ID' re-splits every row and lets the
    # appended coordinates leak into the result for short or unusual names.
    name_fields = slide_name.split('-')
    tiles['sample_submitter_id'] = '-'.join(name_fields[:4])
    tiles['Section'] = name_fields[-1]

    return tiles


# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort so that the processing
# order (and the log below) is reproducible between runs.
for file_name in sorted(os.listdir(input_dir)):
    if not file_name.endswith('.h5'):  # Process only .h5 files
        continue

    file_path = os.path.join(input_dir, file_name)

    # Read both datasets fully into memory so the file can be closed right away.
    with h5py.File(file_path, 'r') as h5_file:
        coords = h5_file['coords'][:]
        features = h5_file['features'][:]

    # The slide name is everything before the first '.' in the file name.
    # NOTE: files sharing that prefix map to the same output path, so the one
    # processed later overwrites the earlier one.
    slide_name = file_name.split(".")[0]

    df = build_tile_frame(slide_name, coords, features)

    # Save the DataFrame to a parquet file
    output_file = os.path.join(output_dir, f"{slide_name}_features.parquet")
    df.to_parquet(output_file, index=False)
    print(f"Saved {output_file}")

print("Processing complete!")


Saved /home/evos/Outputs/CRC/UNI_features_TCGA/FFPE_CLAM/TCGA-4N-A93T-01Z-00-DX1_features.parquet
Saved /home/evos/Outputs/CRC/UNI_features_TCGA/FFPE_CLAM/TCGA-3L-AA1B-01Z-00-DX2_features.parquet
Saved /home/evos/Outputs/CRC/UNI_features_TCGA/FFPE_CLAM/TCGA-A6-2671-01Z-00-DX1_features.parquet
Saved /home/evos/Outputs/CRC/UNI_features_TCGA/FFPE_CLAM/TCGA-A6-2672-01Z-00-DX1_features.parquet
Processing complete!
