In [1]:
import pandas as pd
import numpy as np  

from readimc import MCDFile, TXTFile

import anndata
import pickle

from imread import imread, imsave
import re
import os

In [1]:
%reset -f

# Table of Contents

1. [Store Patient and Sample IDs](#1.-Store-Patient-and-Sample-IDs)
2. [Store Mask IDs](#2.-Store-Mask-IDs)
3. [Store Image IDs](#3.-Store-Image-IDs)
4. [Check identical masks](#4.-Check-identical-masks)

# 1. Store Patient and Sample IDs

Store sample IDs based on the sce object 

In [2]:
ad = anndata.read_h5ad('/work/FAC/FBM/DBC/mrapsoma/prometex/data/NSCLC/02_processed/sce_objects/sce.h5ad')

In [5]:
# create column in anndata obs with sample id based on TmaID TmaBlock and acID
ad.obs["sample_ID"] = ad.obs["TmaID"].astype(str) + "_" + ad.obs["TmaBlock"].astype(str) + "_" + ad.obs["acID"].astype(str)

In [6]:
len(np.unique(ad.obs['sample_ID']))

2072

In [7]:
ad.obs.columns

Index(['ImageNumber', 'CellNumber', 'Center_X', 'Center_Y', 'Area',
       'MajorAxisLength', 'MinorAxisLength', 'Compartment', 'Area_Description',
       'BatchID', 'Panel', 'TmaID', 'TmaBlock', 'acID', 'CellID', 'mclust',
       'TMA', 'Tma_ac', 'cell_category', 'cell_type', 'cell_subtype', 'ROI_xy',
       'RoiID', 'Patient_Nr', 'X..spots', 'DX.name', 'x.y.localisation', 'Age',
       'Gender', 'Typ', 'Grade', 'Size', 'Vessel', 'Pleura', 'T.new', 'N',
       'M.new', 'Stage', 'R', 'Chemo', 'Radio', 'Chemo3', 'Radio4', 'Relapse',
       'Chemo5', 'Radio6', 'DFS', 'Ev.O', 'OS', 'Smok', 'Nikotin', 'ROI',
       'Patient_ID', 'LN.Met', 'Dist.Met', 'NeoAdj', 'Area_px_Stroma',
       'Area_px_Tumour', 'Area_px_Core', 'Area_mm_Stroma', 'Area_mm_Tumour',
       'Area_mm_Core', 'sample_ID'],
      dtype='object')

In [None]:
# need to store in a txt file unique sample_IDs and its associated patient ID 

unique_samples = ad.obs[['sample_ID', 'Patient_ID']].drop_duplicates()

unique_samples.to_csv('SampleIDs_PatientIDs.txt', sep='\t', index=False)

In [17]:
ad.obs['DX.name'].values

['Adenocarcinoma', 'Adenocarcinoma', 'Adenocarcinoma', 'Adenocarcinoma', 'Adenocarcinoma', ..., 'Squamous cell carcinoma', 'Adenocarcinoma', 'Adenocarcinoma', 'Adenocarcinoma', 'Adenocarcinoma']
Length: 5984454
Categories (10, object): ['Adeno squamous cell carcinoma', 'Adenocarcinoma', 'Basaloides Ca', 'Control', ..., 'Mesotheliom', 'NA', 'NSCLC', 'Squamous cell carcinoma']

# 2. Store Mask IDs

In [23]:
directory = '/work/FAC/FBM/DBC/mrapsoma/prometex/data/NSCLC/02_processed/Cell_masks_copy/renamed_masks_all'


file_names = []

for file in os.listdir(directory):
    if file.endswith('.tiff'): 
        file_names.append(file[:-5])  

output_file = 'masks_IDs.txt'

with open(output_file, 'w') as f:
    for name in file_names:
        f.write(name + '\n')

print(f"Filenames have been saved to {output_file}")

Filenames have been saved to masks_IDs.txt


# 3. Store Image IDs

In [24]:
directory = '/work/FAC/FBM/DBC/mrapsoma/prometex/data/NSCLC/01_raw/raw/img'

pattern = r'86_([A-C])_(\d+)' 


with open('imgs_ID.txt', 'w') as f:

    for filename in os.listdir(directory):

        if filename.endswith('.tiff'):

            match = re.search(pattern, filename)
            if match:

                letter = match.group(1)
                number = int(match.group(2))  
                pattern_str = f"86_{letter}_{number}"  
                f.write(pattern_str + '\n')


In [3]:
df_imgs_ID = pd.read_csv('imgs_ID.txt', header=None, names=['imgs_ID'])
df_masks_ID = pd.read_csv('masks_IDs.txt', header=None, names=['masks_ID'])
df_sample_ID = pd.read_csv('SampleIDs_PatientIDs.txt', sep='\t')


In [4]:
df_masks_ID

Unnamed: 0,masks_ID
0,88_A_100
1,176_C_34
2,87_A_86
3,178_C_30
4,175_B_47
...,...
1817,88_B_103
1818,88_B_121
1819,86_A_33
1820,176_C_65


In [5]:
df_sample_ID

Unnamed: 0,sample_ID,Patient_ID
0,86_A_1,86_1
1,86_A_10,86_37
2,86_A_100,86_8
3,86_A_101,86_12
4,86_A_102,86_16
...,...,...
2067,86_B_78,86_87
2068,88_A_7,88_362
2069,86_B_82,86_103
2070,176_C_30,Control


In [None]:
df_aligned = pd.merge(df_masks_ID, df_sample_ID, how='outer', left_on='imgs_ID', right_on='masks_ID')

df_aligned

Unnamed: 0,imgs_ID,masks_ID
0,,175_A_1
1,,175_A_10
2,,175_A_100
3,,175_A_102
4,,175_A_103
...,...,...
1817,,88_C_95
1818,,88_C_96
1819,,88_C_97
1820,,88_C_98


In [11]:
df_merged = pd.merge(df_masks_ID, df_sample_ID, left_on='masks_ID', right_on='sample_ID', how='outer')
df_merged

Unnamed: 0,masks_ID,sample_ID,Patient_ID
0,175_A_1,175_A_1,175_1
1,175_A_10,175_A_10,175_41
2,175_A_100,175_A_100,175_59
3,175_A_102,175_A_102,175_63
4,175_A_103,175_A_103,175_4
...,...,...,...
2073,88_C_95,88_C_95,88_505
2074,88_C_96,88_C_96,88_509
2075,88_C_97,88_C_97,88_513
2076,88_C_98,88_C_98,88_517


In [13]:
df_with_nan = df_merged[df_merged.isna().any(axis=1)]
df_with_nan

Unnamed: 0,masks_ID,sample_ID,Patient_ID
395,,176_A_1,176_193
396,,176_A_10,176_229
397,,176_A_100,176_208
398,,176_A_101,176_212
399,,176_A_102,176_216
...,...,...,...
1231,86_B_24,,
1734,88_A_1,,
1780,88_A_2,,
1791,88_A_3,,


# 4. Check identical masks

In [20]:
# Define directories
dir1 = "/work/FAC/FBM/DBC/mrapsoma/prometex/data/NSCLC/01_raw/Cell_masks/86_B_mask/"
dir2 = "/work/FAC/FBM/DBC/mrapsoma/prometex/data/NSCLC/01_raw/Cell_masks/86_C_mask/"

# Get list of files in both directories
files1 = set(os.listdir(dir1))
files2 = set(os.listdir(dir2))
renamed_files = {file: file.replace("_new.tiff", ".tiff") for file in files2}
second_names = list(renamed_files.values())
second_names

# Find common files (files that exist in both directories)
common_files = files1.intersection(second_names)

print(common_files)

# Compare images
for file in common_files:
    path1 = os.path.join(dir1, file)
    path2 = os.path.join(dir2, file.replace('.tiff', '_new.tiff'))

    # Open images
    img1 = Image.open(path1)
    img2 = Image.open(path2)

    # Convert to numpy arrays
    arr1 = np.array(img1)
    arr2 = np.array(img2)

    # Check if images are exactly the same
    if np.array_equal(arr1, arr2):
        print(f"{file}: ✅ Identical")
    else:
        print(f"{file}: ❌ Different")

{'2020120_LC_NSCLC_TMA_86_C_s0_a47_ac_ilastik_s2_Probabilitiescells_mask.tiff', '2020120_LC_NSCLC_TMA_86_C_s0_a98_ac_ilastik_s2_Probabilitiescells_mask.tiff', '2020120_LC_NSCLC_TMA_86_C_s0_a90_ac_ilastik_s2_Probabilitiescells_mask.tiff', '2020120_LC_NSCLC_TMA_86_C_s0_a39_ac_ilastik_s2_Probabilitiescells_mask.tiff', '2020120_LC_NSCLC_TMA_86_C_s0_a88_ac_ilastik_s2_Probabilitiescells_mask.tiff', '2020120_LC_NSCLC_TMA_86_C_s0_a71_ac_ilastik_s2_Probabilitiescells_mask.tiff', '2020120_LC_NSCLC_TMA_86_C_s0_a26_ac_ilastik_s2_Probabilitiescells_mask.tiff', '2020120_LC_NSCLC_TMA_86_C_s0_a53_ac_ilastik_s2_Probabilitiescells_mask.tiff', '2020120_LC_NSCLC_TMA_86_C_s0_a31_ac_ilastik_s2_Probabilitiescells_mask.tiff', '2020120_LC_NSCLC_TMA_86_C_s0_a100_ac_ilastik_s2_Probabilitiescells_mask.tiff', '2020120_LC_NSCLC_TMA_86_C_s0_a35_ac_ilastik_s2_Probabilitiescells_mask.tiff', '2020120_LC_NSCLC_TMA_86_C_s0_a3_ac_ilastik_s2_Probabilitiescells_mask.tiff', '2020120_LC_NSCLC_TMA_86_C_s0_a91_ac_ilastik_s2_Pro

In [5]:
dir1 = '/work/FAC/FBM/DBC/mrapsoma/prometex/data/NSCLC/01_raw/Cell_masks/178_B_mask/'
dir2 = '/work/FAC/FBM/DBC/mrapsoma/prometex/data/NSCLC/01_raw/Cell_masks/176_B_mask/'

# Get list of files
files1 = set(os.listdir(dir1))
files2 = set(os.listdir(dir2))

# Find common files
common_files = files1.intersection(files2)

# Compare images
for file in common_files:
    path1 = os.path.join(dir1, file)
    path2 = os.path.join(dir2, file)

    # Open images
    img1 = Image.open(path1)
    img2 = Image.open(path2)

    # Convert to numpy arrays
    arr1 = np.array(img1)
    arr2 = np.array(img2)

    # Check if images are exactly the same
    if np.array_equal(arr1, arr2):
        print(f"{file}: ✅ Identical")
    else:
        print(f"{file}: ❌ Different")

20210112_LC_NSCLC_TMA_176_B_s0_a1_ac_ilastik_s2_Probabilitiescells_mask.tiff: ✅ Identical
20210112_LC_NSCLC_TMA_176_B_s0_a72_ac_ilastik_s2_Probabilitiescells_mask.tiff: ✅ Identical
20210112_LC_NSCLC_TMA_176_B_s0_a62_ac_ilastik_s2_Probabilitiescells_mask.tiff: ✅ Identical
20210112_LC_NSCLC_TMA_176_B_s0_a12_ac_ilastik_s2_Probabilitiescells_mask.tiff: ✅ Identical
20210112_LC_NSCLC_TMA_176_B_s0_a117_ac_ilastik_s2_Probabilitiescells_mask.tiff: ✅ Identical
20210112_LC_NSCLC_TMA_176_B_s0_a100_ac_ilastik_s2_Probabilitiescells_mask.tiff: ✅ Identical
20210112_LC_NSCLC_TMA_176_B_s0_a65_ac_ilastik_s2_Probabilitiescells_mask.tiff: ✅ Identical
20210112_LC_NSCLC_TMA_176_B_s0_a7_ac_ilastik_s2_Probabilitiescells_mask.tiff: ✅ Identical
20210112_LC_NSCLC_TMA_176_B_s0_a54_ac_ilastik_s2_Probabilitiescells_mask.tiff: ✅ Identical
20210112_LC_NSCLC_TMA_176_B_s0_a49_ac_ilastik_s2_Probabilitiescells_mask.tiff: ✅ Identical
20210112_LC_NSCLC_TMA_176_B_s0_a102_ac_ilastik_s2_Probabilitiescells_mask.tiff: ✅ Identica