# Test Data Providers

In [1]:
import os, sys, gc, copy, itertools, json

import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook

from scipy.stats import mode

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

import torch
from torch import nn
from torch.autograd import Variable
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data.sampler import Sampler, SubsetRandomSampler
from torchvision import transforms, utils, models

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from PIL import Image
from IPython.core.display import display

In [2]:
# Presumably the index of the GPU this session should use.
# NOTE(review): no cell visible in this notebook reads GPU_ID — confirm it is still needed.
GPU_ID = 1

## PIL image data provider (DP)

In [3]:
from pytorch_learning_tools.data_providers.DataProviderDataFramePIL import dataframeDataProvider, dataframeDataset

### load csv file as dataframe

In [4]:
# Load the job-output CSV (the two segmentation-output columns are forced to str),
# keep only rows carrying a non-negative mitoticLabel, and renumber the rows from 0.
df = (
    pd.read_csv(
        '/root/aics/modeling/gregj/results/ipp/ipp_17_12_03/data_jobs_out.csv',
        dtype={'structureSegOutputFilename': str, 'structureSegOutputFolder': str},
    )
    .query('mitoticLabel >= 0')
    .reset_index(drop=True)
)

# Encode every protein name as an integer class id.
le = LabelEncoder()
df['targetNumeric'] = le.fit_transform(df['structureProteinName']).astype(int)

# Show the class-name -> integer mapping as pretty-printed JSON.
print(json.dumps(
    {name: int(code) for name, code in zip(le.classes_, le.transform(le.classes_))},
    indent=2,
))

{
  "Alpha tubulin": 0,
  "Beta actin": 1,
  "Desmoplakin": 2,
  "Lamin B1": 3,
  "ST6GAL1": 4,
  "Sec61 beta": 5,
  "Tom20": 6
}


### load dataprovider from dataframe

In [5]:
# Build the PIL-image provider on top of the filtered dataframe.
dp = dataframeDataProvider(
    df,
    # image location: root directory plus the dataframe column holding each path
    image_root_dir='/root/aics/modeling/gregj/results/ipp/ipp_17_12_03/',
    image_path_col='save_flat_reg_path',
    image_type='png',
    # NOTE(review): channels (0, 2) are requested here and selected again inside
    # the transform's lambda — confirm both are needed.
    image_channels=(0, 2),
    image_transform=transforms.Compose([transforms.ToTensor(), lambda x: x[(0, 2), :, :]]),
    # supervision target and the column used to address samples by id
    target_col='targetNumeric',
    unique_id_col='save_h5_reg_path',
    # 80/20 train/test split with a fixed seed
    split_fracs={'train': 0.8, 'test': 0.2},
    batch_size=32,
    shuffle=True,
    split_seed=1,
    # loader settings
    num_workers=4,
    pin_memory=True,
)

scanning files: 100%|██████████| 693/693 [00:01<00:00, 601.81it/s]
scanning files: 100%|██████████| 2820/2820 [00:03<00:00, 909.46it/s]

dropped 16 data points in test split
dropped 32 data points in train split





### test dataset i/o (via indices)

In [6]:
# Grab the 'test' split's dataset object from the provider's private `_datasets`
# attribute so the following cells can index it directly.
dset = dp._datasets['test']

#### with single index

In [7]:
# Fetch one test sample by integer position; the dataset hands back a 3-tuple,
# presumably (image tensor, target, unique id).
x, y, u = dset[4]
# NOTE(review): the last value printed is len(u); if u is a bare id string this is
# its character count rather than a number of samples — confirm.
print(y.shape, x.shape, len(u), sep='\n')

torch.Size([1])
torch.Size([2, 161, 101])
125


#### with multiple indices

In [8]:
# Fetch four test samples at once by passing a list of integer positions.
x, y, u = dset[[110, 111, 112, 113]]
# One line each: target shape, image shape, number of ids returned.
print(y.shape, x.shape, len(u), sep='\n')

torch.Size([4, 1])
torch.Size([4, 2, 161, 101])
4


### test dataprovider i/o (via unique ids)

#### with single id

In [9]:
# Look a sample up through the provider itself, keyed by its unique id string.
x, y, u = dp[
    '3500001157_100X_20170807_5-Scene-10-P40-E08.czi_f28487ddfa8d6d4b4ce77ff5d/3500001157_100X_20170807_5-Scene-10-P40-E08.czi_13.0.h5'
]
# NOTE(review): the id passed above is 129 characters long, so a printed len(u) of 129
# would mean u is the id string itself, not a one-element list — confirm.
print(y.shape, x.shape, len(u), sep='\n')

torch.Size([1])
torch.Size([2, 161, 101])
129


#### with multiple ids

In [10]:
# Look several samples up at once by passing a list of unique id strings to the provider.
x, y, u = dp[[
    '3500000949_100X_20170531_7-Scene-1-P16-E05.czi_a2fb0a90e0f5b8f69a1cb1d3b/3500000949_100X_20170531_7-Scene-1-P16-E05.czi_11.0.h5',
    '3500001238_10X_20170828_2-Scene-05-P8-E05.czi_83ced757a579063fc7b5e71a2/3500001238_10X_20170828_2-Scene-05-P8-E05.czi_2.0.h5',
    '3500000939_100X_20170526_7-Scene-05-P35-E07.czi_2f1b3595040633b64c247ef9b/3500000939_100X_20170526_7-Scene-05-P35-E07.czi_10.0.h5',
]]
# One line each: target shape, image shape, number of ids returned.
print(y.shape, x.shape, len(u), sep='\n')

torch.Size([3, 1])
torch.Size([3, 2, 161, 101])
3


### test dataloader i/o

In [11]:
# Pull only the first batch from the test dataloader; i is that batch's index (0).
i, (x, y, u) = 0, next(iter(dp.dataloaders['test']))

In [12]:
# Report the batch's target shape, image shape and number of ids, one per line.
print(y.shape, x.shape, len(u), sep='\n')

torch.Size([32, 1])
torch.Size([32, 2, 161, 101])
32


## HDF5 data provider (DP)

In [13]:
from pytorch_learning_tools.data_providers.DataProviderDataFrameHDF5 import dataframeDataProvider, dataframeDataset

  from ._conv import register_converters as _register_converters


### load csv file as dataframe

In [14]:
# Load the job-output CSV, forcing the two segmentation-output columns to str.
df = pd.read_csv(
    '/root/aics/modeling/gregj/results/ipp/ipp_17_12_03/data_jobs_out.csv',
    dtype={'structureSegOutputFilename': str, 'structureSegOutputFolder': str},
)

# Build cleaned h5 paths: every '.0.h5' occurrence in the recorded path becomes '.h5'
# (TODO: report bug in greg's code)
clean_paths = [raw_path.replace('.0.h5', '.h5') for raw_path in df['save_h5_reg_path']]

# Attach the cleaned paths, keep only rows with a non-negative mitoticLabel,
# and renumber the rows from 0.
df = (
    df.assign(save_h5_reg_path_clean=clean_paths)
    .query('mitoticLabel >= 0')
    .reset_index(drop=True)
)

# Encode every protein name as an integer class id.
le = LabelEncoder()
df['targetNumeric'] = le.fit_transform(df['structureProteinName']).astype(int)

# Show the class-name -> integer mapping as pretty-printed JSON.
print(json.dumps(
    {name: int(code) for name, code in zip(le.classes_, le.transform(le.classes_))},
    indent=2,
))

{
  "Alpha tubulin": 0,
  "Beta actin": 1,
  "Desmoplakin": 2,
  "Lamin B1": 3,
  "ST6GAL1": 4,
  "Sec61 beta": 5,
  "Tom20": 6
}


### load dataprovider from dataframe

In [15]:
# Build the HDF5-backed provider on top of the filtered dataframe.
dp = dataframeDataProvider(
    df,
    # NOTE(review): this root is the ipp_17_10_25 run, whereas the CSV above was read
    # from ipp_17_12_03 — confirm the mismatch is intentional.
    image_root_dir='/root/aics/modeling/gregj/results/ipp/ipp_17_10_25/',
    # paths come from the cleaned column, ids from the original one
    image_path_col='save_h5_reg_path_clean',
    image_channels=(3, 4, 2),
    target_col='targetNumeric',
    unique_id_col='save_h5_reg_path',
    # 80/20 train/test split with a fixed seed
    split_fracs={'train': 0.8, 'test': 0.2},
    batch_size=32,
    shuffle=True,
    split_seed=1,
    # loader settings
    num_workers=4,
    pin_memory=True,
)

scanning files: 100%|██████████| 693/693 [00:00<00:00, 755.80it/s]
scanning files: 100%|██████████| 2820/2820 [00:02<00:00, 969.24it/s]

dropped 0 data points in test split
dropped 0 data points in train split





### test dataset i/o (via indices)

In [16]:
# Grab the 'test' split's dataset object from the provider's private `_datasets`
# attribute so the following cells can index it directly.
dset = dp._datasets['test']

#### with single index

In [17]:
# Fetch one test sample by integer position; the dataset hands back a 3-tuple,
# presumably (image tensor, target, unique id).
x, y, u = dset[4]
# NOTE(review): the last value printed is len(u); if u is a bare id string this is
# its character count rather than a number of samples — confirm.
print(y.shape, x.shape, len(u), sep='\n')

torch.Size([1])
torch.Size([3, 128, 96, 64])
125


#### with multiple indices

In [18]:
# Fetch four test samples at once by passing a list of integer positions.
x, y, u = dset[[110, 111, 112, 113]]
# One line each: target shape, image shape, number of ids returned.
print(y.shape, x.shape, len(u), sep='\n')

torch.Size([4, 1])
torch.Size([4, 3, 128, 96, 64])
4


### test dataprovider i/o (via unique ids)

#### with single id

In [19]:
# Look a sample up through the provider itself, keyed by its unique id string.
x, y, u = dp[
    '3500001157_100X_20170807_5-Scene-10-P40-E08.czi_f28487ddfa8d6d4b4ce77ff5d/3500001157_100X_20170807_5-Scene-10-P40-E08.czi_13.0.h5'
]
# NOTE(review): the id passed above is 129 characters long, so a printed len(u) of 129
# would mean u is the id string itself, not a one-element list — confirm.
print(y.shape, x.shape, len(u), sep='\n')

torch.Size([1])
torch.Size([3, 128, 96, 64])
129


#### with multiple ids

In [20]:
# Look several samples up at once by passing a list of unique id strings to the provider.
x, y, u = dp[[
    '3500000949_100X_20170531_7-Scene-1-P16-E05.czi_a2fb0a90e0f5b8f69a1cb1d3b/3500000949_100X_20170531_7-Scene-1-P16-E05.czi_11.0.h5',
    '3500001238_10X_20170828_2-Scene-05-P8-E05.czi_83ced757a579063fc7b5e71a2/3500001238_10X_20170828_2-Scene-05-P8-E05.czi_2.0.h5',
    '3500000939_100X_20170526_7-Scene-05-P35-E07.czi_2f1b3595040633b64c247ef9b/3500000939_100X_20170526_7-Scene-05-P35-E07.czi_10.0.h5',
]]
# One line each: target shape, image shape, number of ids returned.
print(y.shape, x.shape, len(u), sep='\n')

torch.Size([3, 1])
torch.Size([3, 3, 128, 96, 64])
3


### test dataloader i/o

In [21]:
# Pull only the first batch from the test dataloader; i is that batch's index (0).
i, (x, y, u) = 0, next(iter(dp.dataloaders['test']))

In [22]:
# Report the batch's target shape, image shape and number of ids, one per line.
print(y.shape, x.shape, len(u), sep='\n')

torch.Size([32, 1])
torch.Size([32, 3, 128, 96, 64])
32


## Feature data provider (DP)

In [23]:
from pytorch_learning_tools.data_providers.DataProviderDataFrameFeatures import dataframeDataProvider, dataframeDataset

In [24]:
# Load the feature table, keep only rows carrying a non-negative mitoticLabel,
# and renumber the rows from 0.
df = (
    pd.read_csv('/root/aics/modeling/gregj/results/ipp/ipp_17_12_03/feats_out.csv')
    .query('mitoticLabel >= 0')
    .reset_index(drop=True)
)

# Encode every protein name as an integer class id.
le = LabelEncoder()
df['targetNumeric'] = le.fit_transform(df['structureProteinName']).astype(int)

# Show the class-name -> integer mapping as pretty-printed JSON.
print(json.dumps(
    {name: int(code) for name, code in zip(le.classes_, le.transform(le.classes_))},
    indent=2,
))

  interactivity=interactivity, compiler=compiler, result=result)


{
  "Alpha tubulin": 0,
  "Beta actin": 1,
  "Desmoplakin": 2,
  "Lamin B1": 3,
  "ST6GAL1": 4,
  "Sec61 beta": 5,
  "Tom20": 6
}


In [25]:
# Build the feature-vector provider on top of the filtered dataframe.
dp = dataframeDataProvider(
    df,
    # feature columns are picked by name pattern
    feat_col_pattern='feat_',
    # NOTE(review): the target here is 'mitoticLabel', although the previous cell
    # builds 'targetNumeric' (the target used by the PIL and HDF5 providers) —
    # confirm which label this provider is meant to serve.
    target_col='mitoticLabel',
    unique_id_col='save_h5_reg_path',
    batch_size=32,
    # 80/20 train/test split with a fixed seed
    split_fracs={'train': 0.8, 'test': 0.2},
    split_seed=1,
    # loader settings
    num_workers=4,
    pin_memory=True,
)

### test dataset i/o (via indices)

In [26]:
# Grab the 'test' split's dataset object from the provider's private `_datasets`
# attribute so the following cells can index it directly.
dset = dp._datasets['test']

#### with single index

In [27]:
# Fetch one test sample by integer position; the dataset hands back a 3-tuple,
# presumably (feature tensor, target, unique id).
x, y, u = dset[4]
# One line each: target shape, feature shape, and the id value itself.
print(y.shape, x.shape, u, sep='\n')

torch.Size([1])
torch.Size([1, 2479])
['3500000943_100X_20170530_2-Scene-2-P6-E04.czi_c2fc6577038a71c9dc3a28465/3500000943_100X_20170530_2-Scene-2-P6-E04.czi_12.0.h5']


#### with multiple indices

In [28]:
# Fetch four test samples at once by passing a list of integer positions.
x, y, u = dset[[110, 111, 112, 113]]
# One line each: target shape, feature shape, number of ids returned.
print(y.shape, x.shape, len(u), sep='\n')

torch.Size([4])
torch.Size([4, 2479])
4


### test dataprovider i/o (via unique ids)

#### with single id

In [29]:
# Look a sample up through the provider itself, keyed by its unique id string.
x, y, u = dp[
    '3500001157_100X_20170807_5-Scene-10-P40-E08.czi_f28487ddfa8d6d4b4ce77ff5d/3500001157_100X_20170807_5-Scene-10-P40-E08.czi_13.0.h5'
]
# One line each: target shape, feature shape, and the id value itself.
print(y.shape, x.shape, u, sep='\n')

torch.Size([1])
torch.Size([1, 2479])
['3500001157_100X_20170807_5-Scene-10-P40-E08.czi_f28487ddfa8d6d4b4ce77ff5d/3500001157_100X_20170807_5-Scene-10-P40-E08.czi_13.0.h5']


#### with multiple ids

In [30]:
# Look several samples up at once by passing a list of unique id strings to the provider.
x, y, u = dp[[
    '3500000949_100X_20170531_7-Scene-1-P16-E05.czi_a2fb0a90e0f5b8f69a1cb1d3b/3500000949_100X_20170531_7-Scene-1-P16-E05.czi_11.0.h5',
    '3500001238_10X_20170828_2-Scene-05-P8-E05.czi_83ced757a579063fc7b5e71a2/3500001238_10X_20170828_2-Scene-05-P8-E05.czi_2.0.h5',
    '3500000939_100X_20170526_7-Scene-05-P35-E07.czi_2f1b3595040633b64c247ef9b/3500000939_100X_20170526_7-Scene-05-P35-E07.czi_10.0.h5',
]]
# One line each: target shape, feature shape, number of ids returned.
print(y.shape, x.shape, len(u), sep='\n')

torch.Size([3, 1])
torch.Size([3, 1, 2479])
3


### test dataloader i/o

In [31]:
# Pull only the first batch from the test dataloader; i is that batch's index (0).
i, (x, y, u) = 0, next(iter(dp.dataloaders['test']))

In [32]:
# Report the batch's target shape, feature shape and len(u), one per line.
# NOTE(review): the recorded output shows len(u) == 1 for a 32-sample batch, unlike
# the PIL and HDF5 loaders (32) — the ids may be nested one level deeper here; verify.
print(y.shape, x.shape, len(u), sep='\n')

torch.Size([32, 1])
torch.Size([32, 1, 2479])
1
