In [3]:
import sys
sys.path.append('../utility_box/')
import image_utils as iu
import load

In [5]:
from cpath import WSI

In [6]:
import random
from pathlib import Path

import numpy as np
import pandas as pd
from openslide import OpenSlide

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
def split_positive_cases(positive_cases, val_ratio):
    """Split cases into train and validation sets, one 'group' at a time.

    Rows are grouped by the 'group' column and each group is split
    independently with ``train_test_split(..., random_state=42)``.

    Args:
        positive_cases: DataFrame containing a 'group' column.
        val_ratio: Fraction of each group to hold out. The per-group hold-out
            count is ``int(len(group) * val_ratio)``, i.e. rounded down.

    Returns:
        Tuple ``(train, val)`` of DataFrames, each with a fresh 0..n-1 index
        and the same columns as ``positive_cases``.

    Notes:
        * A group whose hold-out count rounds down to zero (which includes
          every single-row group) is placed entirely in the training set;
          handing a zero ``test_size`` to ``train_test_split`` would raise.
        * When no rows are held out, the validation frame is empty but still
          carries the input columns, so it can be fed back into this function
          for a second-stage split without failing on the 'group' lookup.
    """
    # Collect the pieces and concatenate once at the end instead of growing
    # a DataFrame inside the loop.
    train_parts = []
    val_parts = []

    for _, group in positive_cases.groupby('group'):
        # Number of rows of this group to hold out (rounded down).
        n_val = int(len(group) * val_ratio)

        if len(group) > 1 and n_val > 0:
            train, val = train_test_split(group, test_size=n_val, random_state=42)
            train_parts.append(train)
            val_parts.append(val)
        else:
            # Too small to hold anything out: keep the whole group for training.
            train_parts.append(group)

    # Zero-row slice of the input: keeps column names and dtypes.
    empty = positive_cases.iloc[0:0]
    train_positive_cases = pd.concat(train_parts) if train_parts else empty
    val_positive_cases = pd.concat(val_parts) if val_parts else empty

    return train_positive_cases.reset_index(drop=True), val_positive_cases.reset_index(drop=True)

In [9]:
def get_numpy_mask(poly, contours, width, height, shift_x=0, shift_y=0, scale=1):
    """Rasterize contours into a binary mask positioned relative to ``poly``.

    Every contour is translated so that the ``(min_x, min_y)`` corner of
    ``poly.bounds`` becomes the origin, offset by ``(shift_x, shift_y)``,
    multiplied by ``scale`` and then filled with the value 1 on a
    zero-initialised ``uint8`` array.

    Args:
        poly: Object exposing ``bounds`` as ``(min_x, min_y, max_x, max_y)``.
        contours: Iterable of point sequences; each point starts with ``x, y``.
        width: Mask extent along x (number of columns).
        height: Mask extent along y (number of rows).
        shift_x: Offset added to x after the translation, before scaling.
        shift_y: Offset added to y after the translation, before scaling.
        scale: Factor applied to the shifted coordinates.

    Returns:
        ``numpy.ndarray`` of dtype ``uint8`` and shape ``(height, width)``.

    NOTE(review): this function relies on ``np`` and ``cv2`` being available
    in the notebook namespace; no ``cv2`` import is visible in this notebook.
    """
    min_x, min_y, _, _ = poly.bounds

    # NumPy indexes images as [row, col] == [y, x] while cv2.fillPoly takes
    # points as (x, y), so the array must be (height, width). Allocating it as
    # (width, height) transposes/clips the drawing for non-square masks.
    poly_mask = np.zeros((height, width), dtype=np.uint8)

    for contour in contours:
        coords = np.asarray(contour, dtype=np.float64)
        if coords.size == 0:
            # Nothing to draw for an empty contour.
            continue

        # Translate, shift and scale in floating point and convert to integer
        # pixels last, so truncation error is not multiplied by `scale`.
        xs = (coords[:, 0] - min_x + shift_x) * scale
        ys = (coords[:, 1] - min_y + shift_y) * scale
        pixel_coords = np.floor(np.stack([xs, ys], axis=1)).astype(np.int32)

        # fillPoly expects int32 points shaped (n_points, 1, 2).
        cv2.fillPoly(poly_mask, [pixel_coords.reshape((-1, 1, 2))], color=1)

    return poly_mask

In [10]:
def get_bondary_coords(poly, patch_size, rng=None):
    """Expand the bounding box of ``poly`` by a randomised margin.

    The margin added on every side is::

        abs(patch_size - max(width, height)) + patch_size + jitter

    where ``width``/``height`` are the integer extents of ``poly.bounds`` and
    ``jitter`` is a random integer in ``[0, patch_size]`` (both ends included).

    Args:
        poly: Object exposing ``bounds`` as ``(min_x, min_y, max_x, max_y)``.
        patch_size: Integer patch edge length used to size the margin.
        rng: Optional source of randomness exposing ``randint(a, b)`` with
            inclusive bounds (e.g. ``random.Random(seed)``). Defaults to the
            global ``random`` module, so existing two-argument calls behave
            as before; pass a seeded instance for reproducible output.

    Returns:
        Tuple ``(start_x, start_y, stop_x, stop_y)``: the bounds of ``poly``
        grown by the same margin in all four directions.
    """
    if rng is None:
        rng = random

    min_x, min_y, max_x, max_y = poly.bounds
    width = int(max_x - min_x)
    height = int(max_y - min_y)

    # Absolute size gap between the patch and the longer polygon side.
    diff = abs(patch_size - max(width, height))
    delta = diff + patch_size + rng.randint(0, patch_size)

    return min_x - delta, min_y - delta, max_x + delta, max_y + delta

# Create Sampling Distribution

## CAMELYON17

In [15]:
# CAMELYON17 inputs: annotation folder, slide folder and the stages table.
annFolder = Path('/workspace/data/PublicDatasets/CAMELYON17/tumor_geoms')
wsisFolder = Path('/workspace/data/PublicDatasets/CAMELYON17/images')

# Every entry of the annotation folder, plus its file name without extension;
# the stems are used to look up whether a slide has an annotation.
annsPath = [entry for entry in annFolder.iterdir()]
annsName = [entry.stem for entry in annsPath]

# Table with (at least) the 'patient' and 'stage' columns read by the next cell.
stages = pd.read_csv('/workspace/data/PublicDatasets/CAMELYON17/stages.csv')

In [16]:
# Collect one record per usable CAMELYON17 slide listed in the stages table.
data_distribution = []
for idx, row in stages.iterrows():

    # Rows whose 'patient' entry contains 'zip' are not slides: skip them.
    if 'zip' in row['patient']:
        continue

    # Only slides that are actually present on disk are recorded.
    wsi_path = Path(f"/workspace/data/PublicDatasets/CAMELYON17/images/{row['patient']}")
    if not wsi_path.exists():
        continue

    slideStem = row['patient'].split('.')[0]
    is_negative = row['stage'] == 'negative'

    # A non-negative slide is kept only if an annotation with the same stem exists.
    if not is_negative and slideStem not in annsName:
        continue

    # Negative slides carry no annotation reference.
    tempDict = {
        'wsi_name': row['patient'],
        'group': row['stage'],
        'ann_name': None if is_negative else f"{slideStem}.pkl",
        'ann_folder': None if is_negative else '/workspace/data/PublicDatasets/CAMELYON17/tumor_geoms',
        'wsi_folder': '/workspace/data/PublicDatasets/CAMELYON17/images',
        'mpp': 0.25,
    }
    data_distribution.append(tempDict)

## CAMELYON16

In [17]:
# CAMELYON16 inputs. NOTE: this rebinds annFolder / wsisFolder / annsPath /
# annsName, which previously pointed at CAMELYON17.
annFolder = Path('/workspace/data/PublicDatasets/CAMELYON16/tumor_geoms')
wsisFolder = Path('/workspace/data/PublicDatasets/CAMELYON16/images')

# Entries of the annotation folder and their file names without extension.
annsPath = [entry for entry in annFolder.iterdir()]
annsName = [entry.stem for entry in annsPath]

In [18]:
# Append one record per entry of the CAMELYON16 slide folder. An entry whose
# stem matches an annotation is labelled 'tumor', every other one 'negative'.
for wsiFullName in list(wsisFolder.iterdir()):
    slideStem = wsiFullName.stem
    wsi_path = Path(f"/workspace/data/PublicDatasets/CAMELYON16/images/{wsiFullName.name}")
    if not wsi_path.exists():
        continue

    is_annotated = slideStem in annsName

    # Annotation fields are filled only for annotated slides.
    tempDict = {
        'wsi_name': wsiFullName.name,
        'group': 'tumor' if is_annotated else 'negative',
        'ann_name': f"{slideStem}.pkl" if is_annotated else None,
        'ann_folder': '/workspace/data/PublicDatasets/CAMELYON16/tumor_geoms' if is_annotated else None,
        'wsi_folder': '/workspace/data/PublicDatasets/CAMELYON16/images',
        'mpp': 0.25,
    }
    data_distribution.append(tempDict)

## Compile distribution

In [19]:
# Turn the accumulated list of per-slide dicts into a DataFrame (one row per slide).
# NOTE: the name is rebound from a list to a DataFrame here.
data_distribution=pd.DataFrame(data_distribution)

In [20]:
# Persist the combined table; index=False keeps the row index out of the file.
data_distribution.to_csv('data_distribution.csv', index=False)

In [21]:
# (rows, columns) of the combined table.
data_distribution.shape

(766, 6)

# Split into train, val, and test

In [22]:
import pandas as pd

In [23]:
# Reload the combined table from disk so this section can run on its own.
data_distribution = pd.read_csv('data_distribution.csv')

# A row counts as positive when it references an annotation file.
annotated_mask = data_distribution['ann_name'].notna()
positive_cases = data_distribution[annotated_mask]
negative_cases = data_distribution[~annotated_mask]

In [24]:
# Fractions of the data assigned to the train / validation / test splits.
# In the visible split cell only val_ratio and test_ratio are read; the
# training share is whatever remains after the hold-out.
train_ratio, val_ratio, test_ratio = 0.80, 0.10, 0.10

In [25]:
# Stage 1: separate the combined validation+test hold-out from the training data.
# Positives are split group by group, negatives in a single random split.
holdout_ratio = val_ratio + test_ratio
train_positive_cases, temp_positive_cases = split_positive_cases(positive_cases, holdout_ratio)
train_negative_cases, temp_negative_cases = train_test_split(
    negative_cases, test_size=holdout_ratio, random_state=42
)

# Stage 2: divide the hold-out between validation and test.
test_share = test_ratio / (test_ratio + val_ratio)
val_positive_cases, test_positive_cases = split_positive_cases(temp_positive_cases, test_share)
val_negative_cases, test_negative_cases = train_test_split(
    temp_negative_cases, test_size=test_share, random_state=42
)

# Recombine positives and negatives per split, renumbering rows from 0.
train_distribution = pd.concat([train_positive_cases, train_negative_cases], ignore_index=True)
val_distribution = pd.concat([val_positive_cases, val_negative_cases], ignore_index=True)
test_distribution = pd.concat([test_positive_cases, test_negative_cases], ignore_index=True)

# Sanity check: together the three splits hold as many rows as the full table.
# NOTE(review): rows are split individually; whether rows that belong together
# (e.g. slides of one patient) may land in different splits is not checked — confirm.
n_assigned = len(train_distribution) + len(val_distribution) + len(test_distribution)
assert n_assigned == len(data_distribution)

In [27]:
# Write each split to its own CSV file, without the row index.
for split_frame, csv_name in (
    (train_distribution, 'train_distribution.csv'),
    (val_distribution, 'val_distribution.csv'),
    (test_distribution, 'test_distribution.csv'),
):
    split_frame.to_csv(csv_name, index=False)

In [28]:
# Number of test rows per group.
test_distribution['group'].value_counts()

group
negative    56
tumor       16
itc          1
macro        1
micro        1
Name: count, dtype: int64

In [29]:
# Number of training rows per group.
train_distribution['group'].value_counts()

group
negative    445
tumor       128
micro        14
itc          13
macro        13
Name: count, dtype: int64

In [30]:
# Number of validation rows per group.
val_distribution['group'].value_counts()

group
negative    56
tumor       16
itc          2
macro        2
micro        2
Name: count, dtype: int64