In [1]:
import torch
import torch.nn as nn
from math import floor
import os
import random
import numpy as np
import pandas as pd
import pdb
import time
from datasets.dataset_h5 import Dataset_All_Bags, Whole_Slide_Bag
from torch.utils.data import DataLoader
from models.resnet_custom import resnet50_baseline
import argparse
from utils.utils import print_network, collate_features
from utils.file_utils import save_hdf5
from PIL import Image
import h5py

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
device

device(type='cuda')

In [15]:
import os
import cv2
import h5py
import numpy as np

# Get all files in the directory
folder = './test_10x_patches'
img_ext = '.png'
files = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f)) and f.endswith(img_ext)]
print(len(files))

# Read all images and append to a list
images = []
for file in files:
    img_path = os.path.join(folder, file)
    img = np.array(Image.open(img_path)) #cv2.imread(os.path.join(folder, file))
    if img is not None:
        images.append(img)
    else:
        print('WARNING: IMAGE NOT EXIST')

# Convert list to numpy array
images_np = np.array(images)

h5_folder = f'{folder}/h5'
# create csv folder if it doesn't exist
if not os.path.isdir(h5_folder):
    os.mkdir(h5_folder)
else: # remove all files in csv folder
    for f in os.listdir(h5_folder):
        os.remove(os.path.join(h5_folder, f))

# Create a new HDF5 file
h5f = h5py.File(os.path.join(folder, 'image_data.h5'), 'w')

# Store the images in the 'imgs' dataset
h5f.create_dataset('imgs', data=images_np)

# Close the file
h5f.close()


In [5]:
folder = 'test_10x_patches'
files = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))]
csv_folder = f'{folder}/csv'
# create csv folder if it doesn't exist
if not os.path.isdir(csv_folder):
    os.mkdir(csv_folder)
else: # remove all files in csv folder
    for f in os.listdir(csv_folder):
        os.remove(os.path.join(csv_folder, f))

# Generate full paths
full_paths = [f.split('.')[0] for f in files]

# Create a DataFrame
df = pd.DataFrame(full_paths, columns=['slide_id'])
df
# Save to CSV file
df.to_csv(os.path.join(csv_folder, 'slide_id.csv'), index=False)


In [4]:
os.path.join(folder, 'slide_id.csv')

'test_10x_patches/slide_id.csv'

In [3]:
from datasets.dataset_h5 import Dataset_All_Bags, Whole_Slide_Bag, Whole_Slide_Bag_FP

In [6]:
img_path = './test_10x_patches/patch_0.jpg'
img = Image.open(img_path)

In [11]:
np.array(img).shape

(201, 201, 3)

In [20]:
file_path = './test_10x_patches/h5/image_data.h5'
pretrained = True
custom_downsample=1
target_patch_size= 224

In [21]:
dataset = Whole_Slide_Bag(file_path=file_path, pretrained=True, 
							  target_patch_size=target_patch_size)

pretrained: True
transformations: <torchvision.transforms.Compose object at 0x7f69817dd7d0>
target_size:  (224, 224)


In [22]:
dataset[0]

KeyError: "Unable to open object (object 'coords' doesn't exist)"

In [20]:
def compute_w_loader(file_path, output_path, model, batch_size = 8, verbose = 0, 
	  				 print_every=20, pretrained=True, target_patch_size=-1):
	"""
	args:
		file_path: directory of bag (.h5 file)
		output_path: directory to save computed features (.h5 file)
		model: pytorch model
		batch_size: batch_size for computing features in batches
		verbose: level of feedback
		pretrained: use weights pretrained on imagenet
	"""
	dataset = Whole_Slide_Bag(file_path=file_path, pretrained=pretrained, 
							  target_patch_size=target_patch_size)
	# x, y = dataset[0]
	kwargs = {'num_workers': 4, 'pin_memory': True} if device.type == "cuda" else {}
	loader = DataLoader(dataset=dataset, batch_size=batch_size, **kwargs, collate_fn=collate_features)

	if verbose > 0:
		print('processing {}: total of {} batches'.format(file_path,len(loader)))

	mode = 'w'
	for count, (batch, coords) in enumerate(loader):
		with torch.no_grad():	
			if count % print_every == 0:
				print('batch {}/{}, {} files processed'.format(count, len(loader), count * batch_size))
			batch = batch.to(device, non_blocking=True)
			mini_bs = coords.shape[0]
			
			features = model(batch)
			
			features = features.cpu().numpy()

			asset_dict = {'features': features, 'coords': coords}
			
			save_hdf5(output_path, asset_dict, attr_dict= None, mode=mode)
			mode = 'a'
	
	return output_path

In [6]:
print('initializing dataset')
data_dir = './RESULTS_test_prostate'
csv_path = f'{data_dir}/csv/slide_id.csv' # args.csv_path
feat_dir = './FEATURE_DIRECTORY'
slide_ext = '.tif'


batch_size = 256
target_patch_size = 224
bags_dataset = Dataset_All_Bags(csv_path)

os.makedirs(feat_dir, exist_ok=True)
dest_files = os.listdir(feat_dir)

initializing dataset


In [8]:
total = len(bags_dataset)
slide_id = bags_dataset[0].split(slide_ext)[0]
bag_name = slide_id + '.h5'
os.path.join(data_dir, 'h5', f'{slide_id}.h5')

'./RESULTS_test_prostate/h5/prostate_adenocarcinoma_1.3.0.h5'

In [11]:
print('loading model checkpoint')
model = resnet50_baseline(pretrained=True)
model = model.to(device)

# print_network(model)
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)
    
model.eval()
total = len(bags_dataset)

loading model checkpoint


In [22]:
def create_folder(folder_path, clean_folder = True):
    if not os.path.isdir(folder_path):
        os.mkdir(folder_path)
    else: # remove all files in folder
        if clean_folder:
            for f in os.listdir(folder_path):
                os.remove(os.path.join(folder_path, f))

In [23]:
for bag_candidate_idx in range(total):
    slide_id = bags_dataset[bag_candidate_idx].split(slide_ext)[0]
    bag_name = slide_id + '.h5'
    bag_candidate = os.path.join(data_dir, 'h5', bag_name)
    # bag_candidate = os.path.join(data_dir, 'h5', 'image_data.h5')

    print('\nprogress: {}/{}'.format(bag_candidate_idx, total))
    print(bag_name)
    # if not no_auto_skip and slide_id+'.pt' in dest_files:
    #     print('skipped {}'.format(slide_id))
    #     continue 

    output_folder = os.path.join(feat_dir, 'h5_files')
    output_path = os.path.join(output_folder, bag_name)
    
    create_folder(output_folder, clean_folder = True)

    pt_folder = os.path.join(feat_dir, 'pt_files')
    create_folder(pt_folder, clean_folder = True)
    
    file_path = bag_candidate
    time_start = time.time()
    output_file_path = compute_w_loader(file_path, output_path, 
                                        model = model, batch_size = batch_size, 
                                        verbose = 1, print_every = 20,
                                        target_patch_size=target_patch_size)
    time_elapsed = time.time() - time_start
    print('\ncomputing features for {} took {} s'.format(output_file_path, time_elapsed))
    file = h5py.File(output_file_path, "r")

    features = file['features'][:]
    print('features size: ', features.shape)
    print('coordinates size: ', file['coords'].shape)
    features = torch.from_numpy(features)
    bag_base, _ = os.path.splitext(bag_name)
    torch.save(features, os.path.join(feat_dir, 'pt_files', bag_base+'.pt'))


progress: 0/1
prostate_adenocarcinoma_1.3.0.h5
pretrained: True
transformations: <torchvision.transforms.Compose object at 0x7f4988f6a4d0>
target_size:  (224, 224)
processing ./RESULTS_test_prostate/h5/prostate_adenocarcinoma_1.3.0.h5: total of 18 batches
batch 0/18, 0 files processed

computing features for ./FEATURE_DIRECTORY/h5_files/prostate_adenocarcinoma_1.3.0.h5 took 2.8208231925964355 s
features size:  (4371, 1024)
coordinates size:  (4371, 2)


In [18]:
os.mkdir(os.path.join(feat_dir, 'h5_files'), exist_ok = True)

TypeError: 'exist_ok' is an invalid keyword argument for mkdir()

In [15]:


h5f = h5py.File('./FEATURE_DIRECTORY/h5_files/prostate_adenocarcinoma_1.3.0.h5', 'w')

OSError: Unable to create file (unable to open file: name = './FEATURE_DIRECTORY/h5_files/prostate_adenocarcinoma_1.3.0.h5', errno = 2, error message = 'No such file or directory', flags = 13, o_flags = 242)