In [5]:
from pathlib import Path
import sys
sys.path.append(str(Path('../../').resolve()))
from utils.tools import save_multichannel_preview
import pandas as pd
from fastai.vision.all import *
from torchvision import transforms
from scifAI.dl.dataset import DatasetGenerator
from scifAI.dl.utils import get_statistics
from torch.utils.data import DataLoader

In [6]:
metadata = pd.read_csv("/home/jedrzej/projects/image_flow_cytometry_fine_tune/data/jedrzej/metadata_subset.csv.gz")
metadata.set.unique()
indx = metadata.condition.isin(["-SEA","+SEA"])
metadata = metadata.loc[indx, :].reset_index(drop = True )
set_of_interesting_classes = ['B_cell',  'T_cell', 
                        'T_cell_with_signaling',
                        'T_cell_with_B_cell_fragments',
                        'B_T_cell_in_one_layer',
                        'Synapses_without_signaling', 
                        'Synapses_with_signaling',
                        'No_cell_cell_interaction', 
                        'Multiplets'] 

indx = metadata.set.isin([ "train", "validation","test" ])
indx = indx & metadata.label.isin(set_of_interesting_classes)

train_index = metadata["set"] == "train"
train_index = train_index & metadata.label.isin(set_of_interesting_classes)
train_index = train_index[train_index].index

validation_index = metadata["set"] == "validation"
validation_index = validation_index & metadata.label.isin(set_of_interesting_classes)
validation_index = validation_index[validation_index].index

test_index = metadata["set"] == "test"
test_index = test_index & metadata.label.isin(set_of_interesting_classes)
test_index = test_index[test_index].index

  metadata = pd.read_csv("/home/jedrzej/projects/image_flow_cytometry_fine_tune/data/jedrzej/metadata_subset.csv.gz")


In [7]:
metadata["set"].unique()
label_map = dict()
for i, cl in enumerate(set_of_interesting_classes):
    label_map[cl] = i

label_map['-1'] = -1
label_map[-1] = -1


channels = {
     "Ch1": ("Greys", "BF"),  
     "Ch2": ("Greens", "Antibody"),
     "Ch3": ("Reds", "CD18"),
     "Ch4": ("Oranges", "F-Actin"),
     "Ch6": ("RdPu", "MHCII"),
     "Ch7": ("Purples", "CD3/CD4"),
     "Ch11": ("Blues", "P-CD3zeta"),
     "Ch12": ("Greens", "Live-Dead")
 }

selected_channels = [0,3,4,5,6]
model_dir = "models"
log_dir = "logs"
scaling_factor = 4095.
reshape_size = 256
train_transform = [
         transforms.RandomVerticalFlip(),
         transforms.RandomHorizontalFlip(),
         transforms.RandomRotation(45)
        ]
test_transform = [ ]

train_dataset = DatasetGenerator(metadata=metadata.loc[train_index,:],
                                 label_map=label_map,
                                 selected_channels=selected_channels,
                                 scaling_factor=scaling_factor,
                                 reshape_size=reshape_size,
                                 transform=transforms.Compose(train_transform))

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=False, num_workers=1)
statistics = get_statistics(train_loader, selected_channels=selected_channels)

class AddGaussianNoise(object):
    def __init__(self, mean=0., std=1.):
        self.std = std
        self.mean = mean
        
    def __call__(self, tensor):
        return tensor + torch.randn(tensor.size()) * self.std + self.mean
    
    def __repr__(self):
        return self.__class__.__name__ + '(mean={0}, std={1})'.format(self.mean, self.std)

class MinMaxScaler(object):
    def __init__(self, min_in , max_in, min_out, max_out):
        self.min_in = min_in.reshape(-1,1,1)
        self.max_in = max_in.reshape(-1,1,1)
        self.min_out = min_out
        self.max_out = max_out
        
    def __call__(self, tensor):
        
        tensor_ = (tensor - self.min_in)/(self.max_in - self.min_in)
        tensor_ = tensor_*(self.max_out - self.min_out) + self.min_out
        tensor_[tensor_<self.min_out]= self.min_out
        tensor_[tensor_>self.max_out]= self.max_out
        return tensor_
    
    def __repr__(self):
        return self.__class__.__name__ + '(min_out={0}, max_out={1})'.format(self.min_out, self.max_out)

train_transform = transforms.Compose([ 
        MinMaxScaler(           min_in =  statistics["p05"] , 
                                max_in =  statistics["p95"] , 
                                min_out =  0. , 
                                max_out =  1.),
        transforms.RandomResizedCrop(reshape_size, scale=(0.6, 1.0), ratio=(0.8, 1.2)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        AddGaussianNoise(mean=0., std=0.01),
])

validation_transform =  transforms.Compose([ 
        MinMaxScaler(           min_in =  statistics["p05"] , 
                                max_in =  statistics["p95"] , 
                                min_out =  0. , 
                                max_out =  1.),
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        AddGaussianNoise(mean=0., std=0.01),
])

test_transform =  transforms.Compose([ 
        MinMaxScaler(           min_in =  statistics["p05"] , 
                                max_in =  statistics["p95"] , 
                                min_out =  0. , 
                                max_out =  1.),
])


batch_size=128
train_dataset = DatasetGenerator(metadata=metadata.loc[train_index, :],
                                 label_map=label_map,
                                 selected_channels=selected_channels,
                                 scaling_factor=scaling_factor,
                                 reshape_size=reshape_size,
                                 transform=train_transform)

valid_dataset = DatasetGenerator(metadata=metadata.loc[validation_index, :],
                                 label_map=label_map,
                                 selected_channels=selected_channels,
                                 scaling_factor=scaling_factor,
                                 reshape_size=reshape_size,
                                 transform=validation_transform)

# Convert to FastAI DataLoaders
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=1)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, num_workers=1)
dls = DataLoaders(train_dl, valid_dl)

  0%|          | 0/23 [00:00<?, ?it/s]

100%|██████████| 23/23 [05:28<00:00, 14.30s/it]

statistics used: {'min': tensor([0., 0., 0., 0., 0.]), 'p01': tensor([0., 0., 0., 0., 0.]), 'p05': tensor([0., 0., 0., 0., 0.]), 'p25': tensor([0.1943, 0.0128, 0.0092, 0.0129, 0.0094]), 'p50': tensor([0.1950, 0.0163, 0.0136, 0.0200, 0.0096]), 'p75': tensor([0.1956, 0.0205, 0.0193, 0.0256, 0.0097]), 'p95': tensor([0.1962, 0.0347, 0.0295, 0.0346, 0.0101]), 'p99': tensor([0.1989, 0.0587, 0.0474, 0.0478, 0.0107]), 'max': tensor([0.3295, 0.7863, 0.4366, 0.3361, 0.1161]), 'mean': tensor([0.1721, 0.0172, 0.0148, 0.0198, 0.0085]), 'std': tensor([0.0629, 0.0141, 0.0109, 0.0122, 0.0031])}





In [None]:
channel_map = {
     "Ch1": ("Greys", "BF"),  
     "Ch2": ("Greens", "Antibody"),
     "Ch3": ("Reds", "CD18"),
     "Ch4": ("Oranges", "F-Actin"),
     "Ch6": ("RdPu", "MHCII"),
     "Ch7": ("Purples", "CD3/CD4"),
     "Ch11": ("Blues", "P-CD3zeta"),
     "Ch12": ("Greens", "Live-Dead")
 }

save_multichannel_preview(train_dl, n_samples=10, save_path="train_multichannel_preview.png")

In [15]:
metadata_exp1 = metadata[metadata['experiment'] == "Experiment 1"]

# Create dataset and DataLoader for Experiment 1
train_dataset_exp1 = DatasetGenerator(metadata=metadata_exp1.loc[train_index, :],
                                      label_map=label_map,
                                      selected_channels=selected_channels,
                                      scaling_factor=scaling_factor,
                                      reshape_size=reshape_size,
                                      transform=train_transform)

train_loader_exp1 = DataLoader(train_dataset_exp1, batch_size=batch_size, shuffle=True, num_workers=1)

# Preview a batch for Experiment 1
save_multichannel_preview(train_loader_exp1, title="Experiment 1 - Batch Preview")

# Filter metadata for Experiment 2
metadata_exp2 = metadata[metadata['experiment'] == "Experiment 2"]

# Create dataset and DataLoader for Experiment 2
train_dataset_exp2 = DatasetGenerator(metadata=metadata_exp2.loc[train_index, :],
                                      label_map=label_map,
                                      selected_channels=selected_channels,
                                      scaling_factor=scaling_factor,
                                      reshape_size=reshape_size,
                                      transform=train_transform)

train_loader_exp2 = DataLoader(train_dataset_exp2, batch_size=batch_size, shuffle=True, num_workers=1)

# Preview a batch for Experiment 2
save_multichannel_preview(train_loader_exp2, title="Experiment 2 - Batch Preview")

KeyError: "None of [Int64Index([     4,     36,    105,    203,    245,    288,    300,    350,\n               436,    446,\n            ...\n            424050, 424118, 424306, 424541, 424700, 424703, 424714, 424740,\n            425025, 425063],\n           dtype='int64', length=2923)] are in the [index]"