In [2]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import re
import numpy as np

# Prepare input data for Mosna analysis

Mosna require an input file with cells coordinates, marker levels (expression as counts or protein levels) and both a column with cell type for each cell and a set of columns one hot encoded for celltypes

In IMMUCAN, IMC files for NSCLC2 are composed of two separate files:
- raw count files with protein levels 
- cell metadata files with celltypes, coordinates and lot more.

Moreover, Mosna suppose to start with one sigle file with all the samples and patients inside. In later steps, this initial file will be processed into single nodes and edges files with .parquet extention

In [3]:
############# FILLED WITH TEST FOLDER ##########################

raw_counts_path = Path("/home/francesco.massaini/Desktop/IMMUCAN_data/NSCLC2/01_Imaging/IMC/test_samples/raw_counts")
cell_metadata_path = Path("/home/francesco.massaini/Desktop/IMMUCAN_data/NSCLC2/01_Imaging/IMC/test_samples/cell_metadata")

output_folder = Path("/home/francesco.massaini/Desktop/IMMUCAN_data/NSCLC2/01_Imaging/IMC/test_samples/combined_files")
############# FILLED WITH TEST FOLDER ##########################

In [4]:
raw_files = os.listdir(raw_counts_path)
metadata_files = os.listdir(cell_metadata_path)

In [38]:
raw_ids = [s.split('_#_raw_counts')[0] for s in raw_files]
raw_ids.sort()
print(raw_ids)

metadata_ids = [s.split('_#_cell_metadata')[0] for s in metadata_files]
metadata_ids.sort()
print(metadata_ids)

# Ideally, these ids should be the same
print(raw_ids == metadata_ids)

# Since we have checked they are equal, we can merge files based on their name
IMCs_all_TLS = [] # empty list to store all the final combined dfs
IMCs_all_otherROIs = [] # empty list to store all the final combined dfs

marker_columns = pd.read_csv(raw_counts_path/raw_files[0], index_col=0).columns.tolist() # Required to keep only relevant columns for Mosna in the final step
relevant_metadata_columns = ["Pos_X", "Pos_Y", "celltypes", "ROI", "tumor_patches"]
columns_to_select = relevant_metadata_columns + marker_columns

ROI_TLS = 4 # ROI which contains TLS

for raw_file in raw_files:
    # Attempt to read the raw data file, handle exceptions if file read fails
    try:
        df_raw = pd.read_csv(raw_counts_path/raw_file, index_col=0)  # First column is used as index for both dataframes
        df_raw.rename (columns={"CD4":"CD4_marker_col", "HLADR":"HLADR_marker_col"} , inplace=True) # To avoid duplicated columns during the hot-encoded columns creation   
    except Exception as e:
        print(f"Error reading {raw_file}: {e}")
        continue
    
    
    # Extract the sample ID from the filename
    sample_id = raw_file.split('_#_raw_counts')[0]
    print("raw:", raw_file, "\n        Corresponding sample:", sample_id)
    
    # Filter metadata files to find the one starting with the sample_id
    matching_files = [f for f in metadata_files if f.startswith(sample_id)]
    
    if len(matching_files) == 1:
        metadata_file = matching_files[0]
        
        # Attempt to read the metadata file, handle exceptions if file read fails
        try:
            df_metadata = pd.read_csv(cell_metadata_path/metadata_file, index_col=0)
        except Exception as e:
            print(f"Error reading metadata file {metadata_file}: {e}")
            continue
        
        print(" Corresponding metadata file:", metadata_file)

        # Further level of check: merge only if the number of rows (cells) matches
        if df_raw.shape[0] == df_metadata.shape[0]:
            try:
                df_combined = pd.merge(df_raw, df_metadata, left_index=True, right_index=True)
                file_name = sample_id + "_combined_raw_and_metadata.csv"
                df_combined.to_csv(output_folder / file_name)
                
                df_copy = df_combined[columns_to_select].copy()
                df_forMosna = pd.get_dummies(df_copy, columns=['celltypes'], drop_first=False, prefix=None) # one hot encoded cell types, as required by Mosna. Columns mustn't have any prefix in their name
                
                # Add back celltypes column used for the last step 
                if 'celltypes' not in df_forMosna.columns:
                    df_forMosna['celltypes'] = df_copy['celltypes']

                # Add some useful columns    
                df_forMosna['filename'] = file_name
                df_forMosna['sample'] = df_forMosna['filename'].str.split('-IMC').str[0]
                df_forMosna['patient'] = df_forMosna['filename'].str.split('-FIXT').str[0]    

                df_forMosna_TLS = df_forMosna[df_forMosna["ROI"] == ROI_TLS] # subset the df to analyze separatly the different ROIs
                df_forMosna_otherROIs = df_forMosna[df_forMosna["ROI"] != ROI_TLS]
                
                IMCs_all_TLS.append(df_forMosna_TLS)
                IMCs_all_otherROIs.append(df_forMosna_otherROIs)

                
            except Exception as e:
                print(f"Error during merging or saving the file: {e}")
    else:
        if len(matching_files) == 0:
            print(f"No matching metadata file found for {sample_id}")
        else:
            print(f"Multiple matching metadata files found for {sample_id}, please check data.")

IMC_all_TLS = pd.concat(IMCs_all_TLS)
IMC_all_TLS.to_csv(output_folder.parent / "IMC_all_TLS_test.csv") #### TOGLIERE TEST 

IMC_all_otherROIs = pd.concat(IMCs_all_otherROIs)
IMC_all_otherROIs.to_csv(output_folder.parent / "IMC_all_otherROIs_test.csv")  #### TOGLIERE TEST
    

['LUNG-NSCLC2-0620-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10', 'LUNG-NSCLC2-0632-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10', 'LUNG-NSCLC2-0633-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10', 'LUNG-NSCLC2-0654-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10', 'LUNG-NSCLC2-0685-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10', 'LUNG-NSCLC2-0690-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10', 'LUNG-NSCLC2-0692-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10', 'LUNG-NSCLC2-0703-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10', 'LUNG-NSCLC2-0704-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10', 'LUNG-NSCLC2-0743-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10']
['LUNG-NSCLC2-0620-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10', 'LUNG-NSCLC2-0632-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10', 'LUNG-NSCLC2-0633-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10', 'LUNG-NSCLC2-0654-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10', 'LUNG-NSCLC2-0685-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10', 'LUNG-NSCLC2-0690-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10', 'LUNG-NSCLC2-0692-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.

  values = values.astype(str)


raw: LUNG-NSCLC2-0620-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10_#_raw_counts_#_e18fda5fdecfd4e71948936ced26c944.csv 
        Corresponding sample: LUNG-NSCLC2-0620-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10
 Corresponding metadata file: LUNG-NSCLC2-0620-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10_#_cell_metadata_#_c96f181b417ceedf372f4d3263a917f5.csv


  values = values.astype(str)


raw: LUNG-NSCLC2-0654-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10_#_raw_counts_#_7dc081927735ca8105ddc4b7bed0cdc9.csv 
        Corresponding sample: LUNG-NSCLC2-0654-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10
 Corresponding metadata file: LUNG-NSCLC2-0654-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10_#_cell_metadata_#_38df0b46011881758ed93e5e90e40248.csv


  values = values.astype(str)


raw: LUNG-NSCLC2-0703-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10_#_raw_counts_#_3bdc76183b659bfc20481cd98bdae562.csv 
        Corresponding sample: LUNG-NSCLC2-0703-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10
 Corresponding metadata file: LUNG-NSCLC2-0703-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10_#_cell_metadata_#_17b7519b9ceef8f6534c03cfa32a97d6.csv


  values = values.astype(str)


raw: LUNG-NSCLC2-0632-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10_#_raw_counts_#_26ed6c84f93eb31906d943efaa28de2f.csv 
        Corresponding sample: LUNG-NSCLC2-0632-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10
 Corresponding metadata file: LUNG-NSCLC2-0632-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10_#_cell_metadata_#_6f6112ae900050d0856595b27a45d25d.csv


  values = values.astype(str)


raw: LUNG-NSCLC2-0692-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10_#_raw_counts_#_9d6913ae9db56f9c2967b732e5f367f2.csv 
        Corresponding sample: LUNG-NSCLC2-0692-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10
 Corresponding metadata file: LUNG-NSCLC2-0692-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10_#_cell_metadata_#_8383629848658c6496b0326c04ab54c7.csv


  values = values.astype(str)


raw: LUNG-NSCLC2-0633-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10_#_raw_counts_#_13260b10295253756c53a857ca8e744d.csv 
        Corresponding sample: LUNG-NSCLC2-0633-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10
 Corresponding metadata file: LUNG-NSCLC2-0633-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10_#_cell_metadata_#_ea0448ac79a6a53e7528608c31de99f3.csv


  values = values.astype(str)


raw: LUNG-NSCLC2-0743-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10_#_raw_counts_#_e1fb93fd9f172f6d39a94e3e73ff1b40.csv 
        Corresponding sample: LUNG-NSCLC2-0743-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10
 Corresponding metadata file: LUNG-NSCLC2-0743-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10_#_cell_metadata_#_475c7e8fa35530b658f80c60d7038bcf.csv


  values = values.astype(str)


raw: LUNG-NSCLC2-0685-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10_#_raw_counts_#_b319984eb843ccbbd6b2bbc5f6f47cc0.csv 
        Corresponding sample: LUNG-NSCLC2-0685-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10
 Corresponding metadata file: LUNG-NSCLC2-0685-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10_#_cell_metadata_#_6c9a8274e9a0c7885ef83e3aa1d50c6b.csv


  values = values.astype(str)


raw: LUNG-NSCLC2-0704-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10_#_raw_counts_#_5098c04b5f916ad463337321a487000f.csv 
        Corresponding sample: LUNG-NSCLC2-0704-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10
 Corresponding metadata file: LUNG-NSCLC2-0704-FIXT-01-IMC1-01_#_IMMUcan_panel_1_1.10_#_cell_metadata_#_e5b17b82674be648b77907e2e0303165.csv


  values = values.astype(str)
