In [1]:
working_directory = '/mnt/DataRAID/melismail/PDAC'
import os
os.chdir(working_directory)
from pickle_utils import write_pickle, read_pickle


import sys, cv2
import numpy as np
import pandas as pd


from os import listdir 
from os.path import isfile, join
from tifffile import imread, imsave
from glob import glob

np.random.seed(42)

Load data

In [2]:
base_path = '/mnt/DataRAID/melismail/PDAC/data'
preprocessing_path ='Preprocessing_celltypes_annotation'
img_path = 'images/pdac_he_warped/images'
features_path = 'Features_extraction'
model_path = 'InceptionV3' #ResNet50 #VGG-16
dim = 512 #224
dim_path = f"{dim}x{dim}"

In [3]:
df_labels = pd.read_csv(os.path.join(base_path, preprocessing_path, f"combined_info_celltypes.csv"), sep='\t')

In [4]:
print(df_labels.columns)
print(df_labels['celltypes_fine'].unique())
print(df_labels['celltypes_coarse'].unique())

Index(['Cartana', 'x', 'y', 'celltypes_fine', 'celltypes_coarse', 'Cartana_x',
       'Cartana_y', 'Patient ID', 'Pseudonym'],
      dtype='object')
['Schwann cells' 'myCAF_POSTN' 'Classical_KRT7' 'Classical_CEACAM'
 'Acinar cells' 'T cells' 'Beta cells' 'Basal' 'Classical_TFF1'
 'Endothelial cells' 'B cells' 'Ductal cells' 'Macrophages_M2'
 'Alpha cells' 'NK cells' 'Delta cells' 'myCAF_ACTA2' 'Dendritic cells'
 'iCAF' 'Classical_REG4' 'Macrophages_M1' 'Gamma cells']
['Schwann cells' 'Fibroblasts' 'Classical PDAC' 'Acinar cells'
 'Lymphocytes' 'Endocrine cells' 'Basal-like PDAC' 'Endothelial cells'
 'Ductal cells' 'Macrophages' 'Dendritic cells']


In [5]:
image_size_dict = read_pickle(path=os.path.join(base_path,img_path, f"img_size_dict.pkl"))
if image_size_dict is None:

        image_size_dict = {file.split('_')[0]:  cv2.imread(os.path.join(base_path, img_path, file), cv2.COLOR_BGR2RGB).shape
                               for file in [f
                                            for f in os.listdir(os.path.join(base_path, img_path))
                                            if os.path.isfile(os.path.join(base_path, img_path, f)) and ".tif" in f]
                              }

        write_pickle(path=os.path.join(base_path, img_path, f"img_size_dict.pkl"), obj=image_size_dict)
else: 
    print("Read from Disk")

Read from Disk


In [6]:
def find_tile_id(img_dim: tuple(), x:float, y:float, tile_dim: int):
    if img_dim is None:
        return None
    # get point coordinates from csv
    x_tile_coord = int(x//tile_dim)
    y_tile_coord = int(y//tile_dim)
    return (x_tile_coord, y_tile_coord)

In [7]:
df_labels["tile_id"] = df_labels.apply(lambda row: find_tile_id(img_dim=image_size_dict.get(row["Pseudonym"], None),
                                                              x=row["x"], y=row["y"], tile_dim=dim),
                                       axis=1)

In [8]:
label_dict = {"Basal": "Basal", "Classical_KRT7": "Classical_KRT7", "Classical_CEACAM": "Classical_CEACAM", "Classical_TFF1": "Classical_TFF1", "Classical_REG4": "Classical_REG4"}
df_labels["just_cancer"] = df_labels.apply(lambda x: label_dict.get(x["celltypes_fine"], "non-cancer"), axis=1)

df_labels_prevalent = df_labels[df_labels["just_cancer"]!="non-cancer"].groupby(by=["tile_id", "Pseudonym", "celltypes_fine"])["celltypes_fine"].count().sort_values().groupby(level=0).tail(1).rename("cell_types")
df_labels_prevalent = df_labels_prevalent.reset_index(level=2).rename(columns={"celltypes_fine": "most_prevalent_cancer"})[["most_prevalent_cancer"]]
df_labels_prevalent

Unnamed: 0_level_0,Unnamed: 1_level_0,most_prevalent_cancer
tile_id,Pseudonym,Unnamed: 2_level_1
"(13, 25)",I3T95PWU4,Basal
"(16, 0)",IL771K246,Classical_KRT7
"(14, 28)",IEV81RDT6,Classical_TFF1
"(15, 1)",I3T95PWU4,Classical_KRT7
"(9, 28)",IEV81RDT6,Classical_TFF1
...,...,...
"(18, 5)",IEV81RDT6,Classical_TFF1
"(20, 4)",IL771K246,Basal
"(20, 5)",IEV81RDT6,Classical_TFF1
"(20, 6)",IEV81RDT6,Classical_TFF1


In [9]:
df_labels_grouped = df_labels.groupby(by=["tile_id", "Pseudonym", "celltypes_fine"])["celltypes_fine"].count()
df_tile_celltypes = df_labels_grouped.unstack(level=2).fillna(0)
df_tile_celltypes

Unnamed: 0_level_0,celltypes_fine,Acinar cells,Alpha cells,B cells,Basal,Beta cells,Classical_CEACAM,Classical_KRT7,Classical_REG4,Classical_TFF1,Delta cells,...,Endothelial cells,Gamma cells,Macrophages_M1,Macrophages_M2,NK cells,Schwann cells,T cells,iCAF,myCAF_ACTA2,myCAF_POSTN
tile_id,Pseudonym,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
"(0, 10)",IAA2LDX17,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(0, 11)",IAA2LDX17,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(0, 12)",IAA2LDX17,1.0,0.0,4.0,6.0,1.0,30.0,1.0,0.0,10.0,0.0,...,2.0,0.0,4.0,1.0,2.0,2.0,1.0,1.0,2.0,0.0
"(0, 13)",IAA2LDX17,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(1, 2)",1C73PUTH4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"(36, 19)",IEV81RDT6,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,3.0,0.0,1.0,0.0,0.0,1.0,1.0
"(36, 20)",IEV81RDT6,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,3.0,0.0,0.0,3.0,1.0,0.0,1.0,4.0
"(36, 21)",IEV81RDT6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
"(36, 22)",IEV81RDT6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
df_tile_celltypes = df_tile_celltypes.join(df_labels_prevalent).fillna("non-cancer").reset_index()
df_tile_celltypes

Unnamed: 0,tile_id,Pseudonym,Acinar cells,Alpha cells,B cells,Basal,Beta cells,Classical_CEACAM,Classical_KRT7,Classical_REG4,...,Gamma cells,Macrophages_M1,Macrophages_M2,NK cells,Schwann cells,T cells,iCAF,myCAF_ACTA2,myCAF_POSTN,most_prevalent_cancer
0,"(0, 10)",IAA2LDX17,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Classical_KRT7
1,"(0, 11)",IAA2LDX17,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Classical_CEACAM
2,"(0, 12)",IAA2LDX17,1.0,0.0,4.0,6.0,1.0,30.0,1.0,0.0,...,0.0,4.0,1.0,2.0,2.0,1.0,1.0,2.0,0.0,Classical_CEACAM
3,"(0, 13)",IAA2LDX17,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Classical_CEACAM
4,"(1, 2)",1C73PUTH4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Basal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3304,"(36, 19)",IEV81RDT6,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,2.0,3.0,0.0,1.0,0.0,0.0,1.0,1.0,Basal
3305,"(36, 20)",IEV81RDT6,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,3.0,0.0,0.0,3.0,1.0,0.0,1.0,4.0,Classical_KRT7
3306,"(36, 21)",IEV81RDT6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,non-cancer
3307,"(36, 22)",IEV81RDT6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,non-cancer


In [11]:
features = read_pickle(os.path.join(base_path, features_path, model_path, dim_path, f"{model_path}_{dim_path}_img_df.pkl"))

Process data

In [12]:
df_tiles_celltypes_and_features = pd.merge(df_tile_celltypes, features, on=["tile_id", "Pseudonym"])

In [13]:
print(df_tiles_celltypes_and_features.columns)

Index(['tile_id', 'Pseudonym', 'Acinar cells', 'Alpha cells', 'B cells',
       'Basal', 'Beta cells', 'Classical_CEACAM', 'Classical_KRT7',
       'Classical_REG4', 'Classical_TFF1', 'Delta cells', 'Dendritic cells',
       'Ductal cells', 'Endothelial cells', 'Gamma cells', 'Macrophages_M1',
       'Macrophages_M2', 'NK cells', 'Schwann cells', 'T cells', 'iCAF',
       'myCAF_ACTA2', 'myCAF_POSTN', 'most_prevalent_cancer', 'Features'],
      dtype='object')


In [14]:
df_tiles_celltypes_and_features["lbl"] = np.where((df_tiles_celltypes_and_features['most_prevalent_cancer'] == 'Basal') | (df_tiles_celltypes_and_features['most_prevalent_cancer'].str.contains('Classical')), "cancer", "non-cancer")

In [15]:
df_tiles_celltypes_and_features

Unnamed: 0,tile_id,Pseudonym,Acinar cells,Alpha cells,B cells,Basal,Beta cells,Classical_CEACAM,Classical_KRT7,Classical_REG4,...,Macrophages_M2,NK cells,Schwann cells,T cells,iCAF,myCAF_ACTA2,myCAF_POSTN,most_prevalent_cancer,Features,lbl
0,"(0, 10)",IAA2LDX17,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Classical_KRT7,"[0.11127892, 0.082550116, 0.013145252, 0.14383...",cancer
1,"(0, 11)",IAA2LDX17,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Classical_CEACAM,"[0.088218965, 0.045640353, 0.0048801894, 0.144...",cancer
2,"(0, 12)",IAA2LDX17,1.0,0.0,4.0,6.0,1.0,30.0,1.0,0.0,...,1.0,2.0,2.0,1.0,1.0,2.0,0.0,Classical_CEACAM,"[0.14599374, 0.060965236, 0.00016441147, 0.136...",cancer
3,"(0, 13)",IAA2LDX17,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Classical_CEACAM,"[0.16967882, 0.14898822, 0.01608835, 0.1665820...",cancer
4,"(1, 2)",1C73PUTH4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Basal,"[0.036901504, 0.026912397, 0.0048136557, 0.123...",cancer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2977,"(29, 18)",IEV81RDT6,3.0,0.0,4.0,0.0,2.0,1.0,3.0,6.0,...,0.0,0.0,1.0,1.0,3.0,8.0,0.0,Classical_TFF1,"[0.12156977, 0.05843921, 0.0, 0.08155618, 0.12...",cancer
2978,"(29, 19)",IEV81RDT6,2.0,2.0,4.0,0.0,1.0,2.0,2.0,0.0,...,0.0,0.0,2.0,0.0,1.0,10.0,2.0,Classical_KRT7,"[0.23044428, 0.06414968, 0.038849287, 0.092282...",cancer
2979,"(29, 20)",IEV81RDT6,0.0,0.0,1.0,1.0,0.0,1.0,0.0,2.0,...,1.0,0.0,0.0,1.0,0.0,2.0,0.0,Classical_TFF1,"[0.1988481, 0.023601765, 0.0003715857, 0.06648...",cancer
2980,"(29, 24)",IEV81RDT6,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,1.0,0.0,Classical_TFF1,"[0.13010928, 0.07688, 0.003299488, 0.102255315...",cancer


In [16]:
write_pickle(path=os.path.join(base_path, preprocessing_path, model_path, f"{model_path}_celltypes_lbl_df.pkl"), obj=df_tiles_celltypes_and_features)