# Test Data Providers

In [1]:
import os
import json

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder

import torch
from torchvision import transforms

from pytorch_learning_tools.utils.dataframe_utils import filter_dataframe
from pytorch_learning_tools.utils.data_utils import classes_and_weights
from pytorch_learning_tools.data_providers.DataProviderDataFrame import dataframeDataProvider, dataframeDatasetFeatures, dataframeDatasetPIL, dataframeDatasetHDF5

  from ._conv import register_converters as _register_converters


In [2]:
# Notebook-wide settings.
GPU_ID = 3        # GPU index; not referenced by any cell visible in this notebook
BATCH_SIZE = 32   # batch size handed to every DataLoader built below

## PIL image DP

### load csv file as dataframe

In [3]:
# Read the per-cell job table, forcing the two segmentation-output columns to str.
df = pd.read_csv(
    '/root/aics/modeling/gregj/results/ipp/ipp_17_12_03/data_jobs_out.csv',
    dtype=dict(structureSegOutputFilename=str, structureSegOutputFolder=str),
)

# Keep only rows with mitoticLabel >= 0, then renumber the index from zero.
df = df.query('mitoticLabel >= 0').reset_index(drop=True)

# Keep only rows whose image (column 'save_flat_reg_path') is actually present
# under the given root directory.
df = filter_dataframe(
    df,
    '/root/aics/modeling/gregj/results/ipp/ipp_17_12_03/',
    'save_flat_reg_path',
)

# Encode each protein name as an integer class id.
le = LabelEncoder()
df['targetNumeric'] = le.fit_transform(df['structureProteinName']).astype(int)

# Print the class-name -> integer-id map as indented JSON.
label_map = {name: int(code) for name, code in zip(le.classes_, le.transform(le.classes_))}
print(json.dumps(label_map, indent=2))

scanning files: 100%|██████████| 3513/3513 [00:05<00:00, 639.94it/s]

{
  "Alpha tubulin": 0,
  "Beta actin": 1,
  "Desmoplakin": 2,
  "Lamin B1": 3,
  "ST6GAL1": 4,
  "Sec61 beta": 5,
  "Tom20": 6
}





### load dataprovider from dataframe

In [4]:
# Split definition: fraction of rows per split, and the seed handed to the provider.
split_fracs = dict(train=0.8, test=0.2)
split_seed = 1

# DataLoader settings, built as a separate dict for every split.
dataloader_kwargs = {
    split: dict(batch_size=BATCH_SIZE,
                shuffle=True,
                drop_last=True,
                num_workers=4,
                pin_memory=True)
    for split in split_fracs
}

# Dataset settings. Each split gets its own dict, so the per-split
# 'image_transform' entries added below do not leak into the other split.
dataset_kwargs = {
    split: dict(image_root_dir='/root/aics/modeling/gregj/results/ipp/ipp_17_12_03/',
                image_path_col='save_flat_reg_path',
                image_type='png',
                image_channels=(0, 2),
                target_col='targetNumeric',
                unique_id_col='save_h5_reg_path')
    for split in split_fracs
}

# Per-channel mean / std passed to Normalize in both pipelines.
norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]

# Training pipeline: random rotation and flips, then tensor conversion and
# normalisation. A RandomResizedCrop(224, scale=(0.9,1.1)) step is disabled here.
train_steps = [
    transforms.RandomRotation(degrees=90),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
]
dataset_kwargs['train']['image_transform'] = transforms.Compose(train_steps)

# Test pipeline: deterministic resize + centre crop, then the same tensor
# conversion and normalisation.
test_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
]
dataset_kwargs['test']['image_transform'] = transforms.Compose(test_steps)

# Build the provider over the filtered dataframe using the PIL-backed dataset type.
dp = dataframeDataProvider(
    df,
    dset_type=dataframeDatasetPIL,
    split_fracs=split_fracs,
    split_seed=split_seed,
    dataset_kwargs=dataset_kwargs,
    dataloader_kwargs=dataloader_kwargs,
)

### test dataset i/o (via indices)

In [5]:
# Take the 'test' split's dataset from the provider's private `_datasets`
# container so that it can be indexed directly in the cells below.
dset = dp._datasets['test']

#### with single index

In [6]:
# Index the dataset with one integer; it returns a 3-tuple unpacked as (x, y, u).
x, y, u = dset[4]
print(y.shape, x.shape, sep='\n')  # shape of y, then shape of x
print(len(u), type(u))             # length and type of the third element

torch.Size([1])
torch.Size([3, 224, 224])
125 <class 'str'>


#### with multiple indices

In [7]:
# Index the dataset with a list of integers; it returns a 3-tuple unpacked as (x, y, u).
x, y, u = dset[[110, 111, 112, 113]]
print(y.shape, x.shape, sep='\n')  # shape of y, then shape of x
print(len(u), type(u))             # length and type of the third element

torch.Size([4, 1])
torch.Size([4, 3, 224, 224])
4 <class 'tuple'>


### test dataprovider i/o (via unique ids)

#### with single id

In [8]:
# Look up one sample on the provider by its string id (no split is named).
sample_id = '3500001157_100X_20170807_5-Scene-10-P40-E08.czi_f28487ddfa8d6d4b4ce77ff5d/3500001157_100X_20170807_5-Scene-10-P40-E08.czi_13.0.h5'
x, y, u = dp[sample_id]
print(y.shape, x.shape, sep='\n')  # shape of y, then shape of x
print(len(u), type(u))             # length and type of the third element

torch.Size([1])
torch.Size([3, 224, 224])
129 <class 'str'>


#### with multiple ids

In [9]:
# Look up several samples on the provider by passing a list of string ids.
sample_ids = [
    '3500000949_100X_20170531_7-Scene-1-P16-E05.czi_a2fb0a90e0f5b8f69a1cb1d3b/3500000949_100X_20170531_7-Scene-1-P16-E05.czi_11.0.h5',
    '3500001238_10X_20170828_2-Scene-05-P8-E05.czi_83ced757a579063fc7b5e71a2/3500001238_10X_20170828_2-Scene-05-P8-E05.czi_2.0.h5',
    '3500000939_100X_20170526_7-Scene-05-P35-E07.czi_2f1b3595040633b64c247ef9b/3500000939_100X_20170526_7-Scene-05-P35-E07.czi_10.0.h5',
]
x, y, u = dp[sample_ids]
print(y.shape, x.shape, sep='\n')  # shape of y, then shape of x
print(len(u), type(u))             # length and type of the third element
# Bare last expression: display the returned third element itself.
u

torch.Size([3, 1])
torch.Size([3, 3, 224, 224])
3 <class 'tuple'>


('3500000949_100X_20170531_7-Scene-1-P16-E05.czi_a2fb0a90e0f5b8f69a1cb1d3b/3500000949_100X_20170531_7-Scene-1-P16-E05.czi_11.0.h5',
 '3500001238_10X_20170828_2-Scene-05-P8-E05.czi_83ced757a579063fc7b5e71a2/3500001238_10X_20170828_2-Scene-05-P8-E05.czi_2.0.h5',
 '3500000939_100X_20170526_7-Scene-05-P35-E07.czi_2f1b3595040633b64c247ef9b/3500000939_100X_20170526_7-Scene-05-P35-E07.czi_10.0.h5')

### test dataloader i/o

In [10]:
# Pull only the first batch from the 'test' dataloader; enumerate() pairs it
# with its batch index, so i is 0 here.
i,(x,y,u) = next(enumerate(dp.dataloaders['test']))

In [11]:
# Report the batch fetched in the previous cell.
print(y.shape, x.shape, sep='\n')  # shape of y, then shape of x
print(len(u), type(u))             # length and type of u

torch.Size([32, 1])
torch.Size([32, 3, 224, 224])
32 <class 'list'>


## HDF5 DP

### load csv file as dataframe

In [12]:
# Read the per-cell job table, forcing the two segmentation-output columns to str.
df = pd.read_csv(
    '/root/aics/modeling/gregj/results/ipp/ipp_17_12_03/data_jobs_out.csv',
    dtype=dict(structureSegOutputFilename=str, structureSegOutputFolder=str),
)

# Add a column of cleaned HDF5 paths in which every '.0.h5' is rewritten to '.h5'
# (TODO: report bug in greg's code)
df['save_h5_reg_path_clean'] = [h5_path.replace('.0.h5', '.h5') for h5_path in df['save_h5_reg_path']]

# Keep only rows with mitoticLabel >= 0, then renumber the index from zero.
df = df.query('mitoticLabel >= 0').reset_index(drop=True)

# Keep only rows whose image is actually present under the given root directory.
# NOTE(review): this checks 'save_flat_reg_path' under ipp_17_12_03, while the HDF5
# provider built later reads 'save_h5_reg_path_clean' -- confirm that this check
# covers the files that are actually loaded.
df = filter_dataframe(
    df,
    '/root/aics/modeling/gregj/results/ipp/ipp_17_12_03/',
    'save_flat_reg_path',
)

# Encode each protein name as an integer class id.
le = LabelEncoder()
df['targetNumeric'] = le.fit_transform(df['structureProteinName']).astype(int)

# Print the class-name -> integer-id map as indented JSON.
label_map = {name: int(code) for name, code in zip(le.classes_, le.transform(le.classes_))}
print(json.dumps(label_map, indent=2))

scanning files: 100%|██████████| 3513/3513 [00:00<00:00, 8266.76it/s]

{
  "Alpha tubulin": 0,
  "Beta actin": 1,
  "Desmoplakin": 2,
  "Lamin B1": 3,
  "ST6GAL1": 4,
  "Sec61 beta": 5,
  "Tom20": 6
}





### load dataprovider from dataframe

In [13]:
# Split definition: fraction of rows per split, and the seed handed to the provider.
split_fracs = dict(train=0.8, test=0.2)
split_seed = 1

# DataLoader settings, built as a separate dict for every split.
dataloader_kwargs = {
    split: dict(batch_size=BATCH_SIZE,
                shuffle=True,
                drop_last=True,
                num_workers=4,
                pin_memory=True)
    for split in split_fracs
}

# Dataset settings for the HDF5-backed dataset; paths come from the cleaned column.
# NOTE(review): image_root_dir points at ipp_17_10_25, whereas the CSV above was
# read from ipp_17_12_03 -- confirm that the different root is intentional.
dataset_kwargs = {
    split: dict(image_root_dir='/root/aics/modeling/gregj/results/ipp/ipp_17_10_25/',
                image_path_col='save_h5_reg_path_clean',
                image_channels=(3, 4, 2),
                target_col='targetNumeric',
                unique_id_col='save_h5_reg_path')
    for split in split_fracs
}

# Build the provider over the filtered dataframe using the HDF5-backed dataset type.
dp = dataframeDataProvider(
    df,
    dset_type=dataframeDatasetHDF5,
    split_fracs=split_fracs,
    split_seed=split_seed,
    dataset_kwargs=dataset_kwargs,
    dataloader_kwargs=dataloader_kwargs,
)

### test dataset i/o (via indices)

In [14]:
# Take the 'test' split's dataset from the provider's private `_datasets`
# container so that it can be indexed directly in the cells below.
dset = dp._datasets['test']

#### with single index

In [15]:
# Index the dataset with one integer; it returns a 3-tuple unpacked as (x, y, u).
x, y, u = dset[4]
print(y.shape, x.shape, sep='\n')  # shape of y, then shape of x
print(len(u), type(u))             # length and type of the third element

torch.Size([1])
torch.Size([3, 128, 96, 64])
125 <class 'str'>


#### with multiple indices

In [16]:
# Index the dataset with a list of integers; it returns a 3-tuple unpacked as (x, y, u).
x, y, u = dset[[110, 111, 112, 113]]
print(y.shape, x.shape, sep='\n')  # shape of y, then shape of x
print(len(u), type(u))             # length and type of the third element

torch.Size([4, 1])
torch.Size([4, 3, 128, 96, 64])
4 <class 'tuple'>


### test dataprovider i/o (via unique ids)

#### with single id

In [17]:
# Look up one sample on the provider by its string id (no split is named).
sample_id = '3500001157_100X_20170807_5-Scene-10-P40-E08.czi_f28487ddfa8d6d4b4ce77ff5d/3500001157_100X_20170807_5-Scene-10-P40-E08.czi_13.0.h5'
x, y, u = dp[sample_id]
print(y.shape, x.shape, sep='\n')  # shape of y, then shape of x
print(len(u), type(u))             # length and type of the third element

torch.Size([1])
torch.Size([3, 128, 96, 64])
129 <class 'str'>


#### with multiple ids

In [18]:
# Look up several samples on the provider by passing a list of string ids.
sample_ids = [
    '3500000949_100X_20170531_7-Scene-1-P16-E05.czi_a2fb0a90e0f5b8f69a1cb1d3b/3500000949_100X_20170531_7-Scene-1-P16-E05.czi_11.0.h5',
    '3500001238_10X_20170828_2-Scene-05-P8-E05.czi_83ced757a579063fc7b5e71a2/3500001238_10X_20170828_2-Scene-05-P8-E05.czi_2.0.h5',
    '3500000939_100X_20170526_7-Scene-05-P35-E07.czi_2f1b3595040633b64c247ef9b/3500000939_100X_20170526_7-Scene-05-P35-E07.czi_10.0.h5',
]
x, y, u = dp[sample_ids]
print(y.shape, x.shape, sep='\n')  # shape of y, then shape of x
print(len(u), type(u))             # length and type of the third element

torch.Size([3, 1])
torch.Size([3, 3, 128, 96, 64])
3 <class 'tuple'>


### test dataloader i/o

In [19]:
# Pull only the first batch from the 'test' dataloader; enumerate() pairs it
# with its batch index, so i is 0 here.
i,(x,y,u) = next(enumerate(dp.dataloaders['test']))

In [20]:
# Report the batch fetched in the previous cell.
print(y.shape, x.shape, sep='\n')  # shape of y, then shape of x
print(len(u), type(u))             # length and type of u

torch.Size([32, 1])
torch.Size([32, 3, 128, 96, 64])
32 <class 'list'>


## Feature DP

In [21]:
# Read the feature table, forcing the listed identifier/segmentation columns to str.
df = pd.read_csv(
    '/root/aics/modeling/gregj/results/ipp/ipp_17_12_03/feats_out.csv',
    dtype={col: str for col in ('colony_position', 'position_ID', 'well_ID',
                                'structureSegOutputFilename', 'structureSegOutputFolder')},
)

# Keep only rows with mitoticLabel >= 0, then renumber the index from zero.
df = df.query('mitoticLabel >= 0').reset_index(drop=True)

# Encode each protein name as an integer class id.
le = LabelEncoder()
df['targetNumeric'] = le.fit_transform(df['structureProteinName'])

# Set column dtypes explicitly -- the np -> torch conversion follows these types.
# NOTE(review): int16 labels come out as torch.ShortTensor in the cells below;
# confirm that downstream consumers accept that dtype.
feat_cols = df.columns[df.columns.str.contains('feat_')]
df[feat_cols] = df[feat_cols].astype(np.float32)
df['targetNumeric'] = df['targetNumeric'].astype(np.int16)

# Print the class-name -> integer-id map as indented JSON.
label_map = {name: int(code) for name, code in zip(le.classes_, le.transform(le.classes_))}
print(json.dumps(label_map, indent=2))

{
  "Alpha tubulin": 0,
  "Beta actin": 1,
  "Desmoplakin": 2,
  "Lamin B1": 3,
  "ST6GAL1": 4,
  "Sec61 beta": 5,
  "Tom20": 6
}


In [22]:
# Split definition: fraction of rows per split, and the seed handed to the provider.
split_fracs = dict(train=0.8, test=0.2)
split_seed = 1

# DataLoader settings, built as a separate dict for every split.
dataloader_kwargs = {
    split: dict(batch_size=BATCH_SIZE,
                shuffle=True,
                drop_last=True,
                num_workers=4,
                pin_memory=True)
    for split in split_fracs
}

# Dataset settings for the feature-backed dataset: feature columns are selected
# with the 'feat_' pattern, and targets/ids are taken from the named columns.
dataset_kwargs = {
    split: dict(feat_col_pattern='feat_',
                target_col='targetNumeric',
                unique_id_col='save_h5_reg_path')
    for split in split_fracs
}

# Build the provider over the dataframe using the feature-backed dataset type.
dp = dataframeDataProvider(
    df,
    dset_type=dataframeDatasetFeatures,
    split_fracs=split_fracs,
    split_seed=split_seed,
    dataset_kwargs=dataset_kwargs,
    dataloader_kwargs=dataloader_kwargs,
)

### test dataset i/o (via indices)

In [23]:
# Take the 'test' split's dataset from the provider's private `_datasets`
# container so that it can be indexed directly in the cells below.
dset = dp._datasets['test']

#### with single index

In [24]:
# Index the dataset with one integer; it returns a 3-tuple unpacked as (x, y, u).
x, y, u = dset[4]
print(y.shape, x.shape, sep='\n')  # shape of y, then shape of x
print(len(u), type(u))             # length and type of the third element

torch.Size([])
torch.Size([2479])
125 <class 'str'>


#### with multiple indices

In [25]:
# Index the dataset with a list of integers; it returns a 3-tuple unpacked as (x, y, u).
x, y, u = dset[[110, 111, 112, 113]]
# Shape and tensor type of y, then of x.
print(f'{y.shape} {y.type()}')
print(f'{x.shape} {x.type()}')
print(len(u), type(u))  # length and type of the third element

torch.Size([4]) torch.ShortTensor
torch.Size([4, 2479]) torch.FloatTensor
4 <class 'tuple'>


### test dataprovider i/o (via unique ids)

#### with single id

In [26]:
# Look up one sample on the provider by its string id (no split is named).
sample_id = '3500001157_100X_20170807_5-Scene-10-P40-E08.czi_f28487ddfa8d6d4b4ce77ff5d/3500001157_100X_20170807_5-Scene-10-P40-E08.czi_13.0.h5'
x, y, u = dp[sample_id]
# Shape and tensor type of y, then of x.
print(f'{y.shape} {y.type()}')
print(f'{x.shape} {x.type()}')
print(len(u), type(u))  # length and type of the third element

torch.Size([]) torch.ShortTensor
torch.Size([2479]) torch.FloatTensor
129 <class 'str'>


#### with multiple ids

In [27]:
# Look up several samples on the provider by passing a list of string ids.
sample_ids = [
    '3500000949_100X_20170531_7-Scene-1-P16-E05.czi_a2fb0a90e0f5b8f69a1cb1d3b/3500000949_100X_20170531_7-Scene-1-P16-E05.czi_11.0.h5',
    '3500001238_10X_20170828_2-Scene-05-P8-E05.czi_83ced757a579063fc7b5e71a2/3500001238_10X_20170828_2-Scene-05-P8-E05.czi_2.0.h5',
    '3500000939_100X_20170526_7-Scene-05-P35-E07.czi_2f1b3595040633b64c247ef9b/3500000939_100X_20170526_7-Scene-05-P35-E07.czi_10.0.h5',
]
x, y, u = dp[sample_ids]
# Shape and tensor type of y, then of x.
print(f'{y.shape} {y.type()}')
print(f'{x.shape} {x.type()}')
print(len(u), type(u))  # length and type of the third element

torch.Size([3]) torch.ShortTensor
torch.Size([3, 2479]) torch.FloatTensor
3 <class 'tuple'>


### test dataloader i/o

In [28]:
# Pull only the first batch from the 'test' dataloader; enumerate() pairs it
# with its batch index, so i is 0 here.
i,(x,y,u) = next(enumerate(dp.dataloaders['test']))

In [29]:
# Report the batch fetched in the previous cell: shape and tensor type of y, then of x.
print(f'{y.shape} {y.type()}')
print(f'{x.shape} {x.type()}')
print(len(u), type(u))  # length and type of u

torch.Size([32]) torch.ShortTensor
torch.Size([32, 2479]) torch.FloatTensor
32 <class 'list'>
