In [1]:
# Don't need to run this each time notebook is started 
!pip install tifffile 
!pip install imagecodecs
!pip install torch
!pip install torchvision

Collecting tifffile
  Downloading tifffile-2020.7.24-py3-none-any.whl (146 kB)
[K     |████████████████████████████████| 146 kB 3.2 MB/s eta 0:00:01
Installing collected packages: tifffile
Successfully installed tifffile-2020.7.24
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m
Collecting imagecodecs
  Downloading imagecodecs-2020.5.30-cp36-cp36m-manylinux2014_x86_64.whl (17.9 MB)
[K     |████████████████████████████████| 17.9 MB 2.8 MB/s eta 0:00:01
Installing collected packages: imagecodecs
Successfully installed imagecodecs-2020.5.30
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m
Collecting torch
  Downloading torch-1.6.0-cp36-cp36m-manylinux1_x86_64.whl (748.8 MB)
[K     |████████████████████████████████| 748.8 MB 5.1 kB/s  eta 0:00:01    |███▊                            | 87.9 MB 74.7 MB/s eta 0:00:09     |███████

In [2]:
# Imports here
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from torchvision import datasets, transforms, models
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import pandas as pd
import os
import random
import math
from csv_loader import load_csv

# Tiff visualisation imports and downloads
import numpy as np
import tifffile as tiff

# For re-importing python modules
import importlib
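# To pick up edits to csv_loader.py without restarting the kernel:
#   import csv_loader; importlib.reload(csv_loader); from csv_loader import load_csv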

Before setting up a preliminary model, I would like to list all of the available files and then define a small sample of that list around which to build the preliminary model and helper functions.

In [7]:
# Load the slide metadata table. read_csv already returns a fresh DataFrame,
# so no extra .copy() is needed.
file_info = pd.read_csv('train.csv')

In [4]:
# Preview the metadata table; pandas elides the middle rows in the rendered output.
file_info

Unnamed: 0,image_id,data_provider,isup_grade,gleason_score
0,0005f7aaab2800f6170c399693a96917,karolinska,0,0+0
1,000920ad0b612851f8e01bcc880d9b3d,karolinska,0,0+0
2,0018ae58b01bdadc8e347995b69f99aa,radboud,4,4+4
3,001c62abd11fa4b57bf7a6c603a11bb9,karolinska,4,4+4
4,001d865e65ef5d2579c190a0e0350d8f,karolinska,0,0+0
...,...,...,...,...
10611,ffd2841373b39792ab0c84cccd066e31,radboud,0,negative
10612,ffdc59cd580a1468eac0e6a32dd1ff2d,radboud,5,4+5
10613,ffe06afd66a93258f8fabdef6044e181,radboud,0,negative
10614,ffe236a25d4cbed59438220799920749,radboud,2,3+4


In [3]:
# For each labelling scheme, list which data providers report each label value.
for label_column in ("gleason_score", "isup_grade"):
    providers_by_label = file_info.groupby(label_column)["data_provider"].unique()
    print(providers_by_label)

gleason_score
0+0                  [karolinska]
3+3         [karolinska, radboud]
3+4         [radboud, karolinska]
3+5         [karolinska, radboud]
4+3         [radboud, karolinska]
4+4         [radboud, karolinska]
4+5         [radboud, karolinska]
5+3         [radboud, karolinska]
5+4         [radboud, karolinska]
5+5         [radboud, karolinska]
negative                [radboud]
Name: data_provider, dtype: object
isup_grade
0    [karolinska, radboud]
1    [karolinska, radboud]
2    [radboud, karolinska]
3    [radboud, karolinska]
4    [radboud, karolinska]
5    [radboud, karolinska]
Name: data_provider, dtype: object


It may be worth standardizing the reported Gleason scores: Radboud uses "negative" where Karolinska uses "0+0".

In [3]:
# Map the "negative" label onto "0+0"; every other score passes through unchanged.
file_info['std_gleason'] = file_info['gleason_score'].replace('negative', '0+0')

In [8]:
# Work with a small random subset of slides (sample_size of them) while building the
# first neural network and debugging .TIFF related issues.
sample_size = 10
train_ratio = .45
valid_ratio = .25
test_ratio = 1-(train_ratio + valid_ratio)
train_size = int(train_ratio*sample_size)
valid_size = int(valid_ratio*sample_size)
test_size = int(test_ratio*sample_size)

# int() truncates each share, so the three sizes can differ from sample_size by more
# than one. Give the whole difference to the training split so the sizes always sum
# to sample_size, which torch.utils.data.random_split requires.
train_size += sample_size - (train_size + valid_size + test_size)

# Draw sample_size slide ids at random and pair each with its ISUP grade.
# The grades are looked up through a Series indexed by image_id instead of a
# boolean scan of the whole frame per id followed by int(<one-element Series>),
# which newer pandas versions deprecate and which fails if an id is duplicated.
image_id = random.sample(list(file_info['image_id']), sample_size)
grade_by_id = file_info.drop_duplicates('image_id').set_index('image_id')['isup_grade']
y_label = [int(grade) for grade in grade_by_id.loc[image_id]]
# File names on disk carry the .tiff extension.
image_id = [str(x) + '.tiff' for x in image_id]
sample_imgs = pd.DataFrame({'image_id':image_id, 'y_label':y_label})

In [5]:
# Show the sampled file names together with their labels.
sample_imgs

Unnamed: 0,image_id,y_label
0,98ec8d1647323eabbf356e9a87065edf.tiff,1
1,8fced11c69f98368c493a9fdb733544c.tiff,5
2,67b316fcbdac6392f500c587f2cb2b16.tiff,0
3,9469e763e512e05eca69e2e28b6f4ef7.tiff,1
4,458bcc2e21a2359ff2e4013522bc40b0.tiff,1
5,957cf20a917829002fc55aeecaef794c.tiff,2
6,105ae7bb05346ace6021610f723adf97.tiff,1
7,231c7be81560497b2015a0b9a9a04d32.tiff,4
8,140543a4d39b1a3ed0fce740ba2feb3d.tiff,0
9,df02f5c5fa4e85f143fef3593167ce62.tiff,5


In [6]:
# Persist the sample manifest so it can be read back by the CSV-based dataset loader.
# to_csv() fails when the target directory is missing, so create it first.
os.makedirs("sample_dir", exist_ok=True)
# NOTE: to_csv() returns None when given a path, so sample_dir is always None;
# the assignment is kept only in case another cell references the name.
sample_dir = sample_imgs.to_csv("sample_dir/sample.csv", sep=",", index=False)

In [8]:
# Path of a single slide used below to try out TIFF reading.
test_img = 'train_images/6aff87e11871f4ce9682eec497239c71.tiff'

In [9]:
# Read the slide with tifffile, wrap the array as a tensor, and move its last axis to
# the front with permute(2, 0, 1).
# NOTE(review): presumably height x width x channel -> channel x height x width — confirm.
torch.from_numpy(tiff.imread(test_img)).permute(2,0,1)

tensor([[[255, 255, 255,  ..., 255, 255, 255],
         [255, 255, 255,  ..., 255, 255, 255],
         [255, 255, 255,  ..., 255, 255, 255],
         ...,
         [255, 255, 255,  ..., 255, 255, 255],
         [255, 255, 255,  ..., 255, 255, 255],
         [255, 255, 255,  ..., 255, 255, 255]],

        [[255, 255, 255,  ..., 255, 255, 255],
         [255, 255, 255,  ..., 255, 255, 255],
         [255, 255, 255,  ..., 255, 255, 255],
         ...,
         [255, 255, 255,  ..., 255, 255, 255],
         [255, 255, 255,  ..., 255, 255, 255],
         [255, 255, 255,  ..., 255, 255, 255]],

        [[255, 255, 255,  ..., 255, 255, 255],
         [255, 255, 255,  ..., 255, 255, 255],
         [255, 255, 255,  ..., 255, 255, 255],
         ...,
         [255, 255, 255,  ..., 255, 255, 255],
         [255, 255, 255,  ..., 255, 255, 255],
         [255, 255, 255,  ..., 255, 255, 255]]], dtype=torch.uint8)

In [9]:
# TODO: Define the transforms for the training, validation, and testing sets

# The torchvision transforms are created inside the csv_loader.py function (called in the next cell);
# the block below is disabled and kept only for reference.
'''
train_transform = transforms.Compose([transforms.RandomRotation(30),
                                       transforms.RandomResizedCrop(224),
                                       transforms.RandomHorizontalFlip(),
                                       transforms.ToTensor(),
                                       transforms.Normalize([0.485, 0.456, 0.406],
                                                            [0.229, 0.224, 0.225])])

valid_transform = transforms.Compose([transforms.Resize(256),
                                      transforms.CenterCrop(224),
                                      transforms.ToTensor(),
                                      transforms.Normalize([0.485, 0.456, 0.406], 
                                                           [0.229, 0.224, 0.225])])

test_transform = transforms.Compose([transforms.Resize(256),
                                      transforms.CenterCrop(224),
                                      transforms.ToTensor(),
                                      transforms.Normalize([0.485, 0.456, 0.406], 
                                                           [0.229, 0.224, 0.225])])
                                                           
'''

In [9]:
#dataset = load_csv(csv_file='sample_dir/sample.csv', root_dir='train_images', transform=transforms.ToTensor())
# Build the dataset from the sample manifest written earlier.
# NOTE(review): load_csv comes from csv_loader.py (not shown here) — presumably it returns a
# torch Dataset yielding (image, label) pairs, as the loops below unpack; confirm that it
# also downsizes the slides before they reach the network.
dataset = load_csv(csv_file='sample_dir/sample.csv', root_dir='train_images')

In [10]:
# Randomly partition the dataset into the three subsets sized earlier.
split_lengths = [train_size, valid_size, test_size]
train_set, valid_set, test_set = torch.utils.data.random_split(dataset, split_lengths)

In [11]:
# One slide per batch. Only the training loader shuffles: summed evaluation metrics do
# not depend on batch order, and a fixed order keeps validation/test runs comparable.
# (Transforms are not a DataLoader argument; they are applied by the dataset.)
train_loader = torch.utils.data.DataLoader(train_set, batch_size=1, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_set, batch_size=1, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=1, shuffle=False)

In [14]:
#img = 'train_images/bbcd685ce62f785a441a2c8246fa93a6.tiff'

In [15]:
#image = transforms.ToPILImage(tiff.imread(img))

In [20]:
#tensorimage = torch.from_numpy(tiff.imread(img))

In [6]:
#image = np.asarray(tiff.imread(img))

In [None]:
#image = transforms.ToPILImage(image)

In [None]:
# data_dir = 'flowers'
# train_dir = data_dir + '/train'
# valid_dir = data_dir + '/valid'
# test_dir = data_dir + '/test'

In [12]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pretrained VGG16 with every existing weight frozen.
model = models.vgg16(pretrained=True)

for param in model.parameters():
    param.requires_grad_(False)

Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /home/ec2-user/.cache/torch/hub/checkpoints/vgg16-397923af.pth


HBox(children=(FloatProgress(value=0.0, max=553433881.0), HTML(value='')))




In [13]:
# Sequential classifier head with dropout 0.5 after each hidden layer:
# 25088 features -> 6320 -> 1580 -> one log-probability per class.
# The final layer used to have 102 outputs (apparently left over from another
# dataset); the isup_grade label takes the six values 0-5, so it is sized to match.
NUM_CLASSES = 6
classifier = nn.Sequential(nn.Linear(25088, 6320, bias=True),
                           nn.ReLU(),
                           nn.Dropout(.5),
                           nn.Linear(6320, 1580, bias=True),
                           nn.ReLU(),
                           nn.Dropout(.5),
                           nn.Linear(1580, NUM_CLASSES, bias=True),
                           nn.LogSoftmax(dim=1))

# Swap the head in and move the whole model to the selected device.
model.classifier = classifier
model.to(device);

In [14]:
# Loop settings: number of epochs, reporting interval in steps, and the running step counter.
epochs, print_every, steps = 2, 1, 0

# Negative log-likelihood loss on the classifier's LogSoftmax output;
# Adam updates the classifier parameters only.
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.classifier.parameters(), lr=0.001)

In [15]:
def validate_data_function(model, test_loader, criterion):
    """Evaluate ``model`` on every batch produced by ``test_loader``.

    Args:
        model: Network to evaluate. The caller chooses train/eval mode; this
            function does not change it.
        test_loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss callable invoked as ``criterion(output, labels)``.

    Returns:
        Tuple ``(test_loss, accuracy)``: the per-batch losses summed and the
        per-batch accuracies summed, as plain Python numbers. Divide by the
        number of batches to obtain means.
    """
    test_loss = 0
    accuracy = 0

    # Use the device the model lives on instead of a notebook-level ``device``
    # variable; a model without parameters leaves the batches where they are.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    # Evaluation never needs gradients; disabling them here keeps memory use
    # down even when the caller forgets to.
    with torch.no_grad():
        for inputs, labels in test_loader:
            if target_device is not None:
                inputs, labels = inputs.to(target_device), labels.to(target_device)

            # Call the module itself (not .forward) so registered hooks run.
            output = model(inputs)
            test_loss += criterion(output, labels).item()

            # The arg-max of the scores is the predicted class; exponentiating
            # first would not change which index is largest.
            predictions = output.argmax(dim=1)
            accuracy += (predictions == labels).float().mean().item()

    return test_loss, accuracy

In [16]:
# Train the classifier, checking the validation set every `print_every` steps.
# NOTE(review): the last recorded run of this cell died with an out-of-memory error
# (~124 GB requested) — presumably because full-resolution slides reach the network;
# confirm that load_csv downsizes/crops the images before re-running.
test_loss = 0
steps = 0  # reset so re-running this cell reports on the same schedule
for epoch in range(epochs):
    model.train()
    running_loss = 0

    for inputs, labels in train_loader:
        steps += 1

        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        # Call the module itself (not .forward) so registered hooks run.
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        if steps % print_every == 0:
            model.eval()

            with torch.no_grad():
                valid_loss, accuracy = validate_data_function(model, valid_loader, criterion)

            # validate_data_function returns sums over the validation batches, so both
            # metrics are averaged over the number of batches. The loss used to be
            # divided by print_every, which is unrelated to the validation set size.
            n_valid_batches = max(len(valid_loader), 1)

            print(f"Epoch {epoch+1}/{epochs}..| "
                  f"Train loss: {running_loss/print_every:.3f}..| "
                  f"Validation loss: {valid_loss/n_valid_batches:.3f}..| "
                  f"Validation accuracy: {accuracy/n_valid_batches:.3f}|")

            running_loss = 0
            model.train()

RuntimeError: [enforce fail at CPUAllocator.cpp:64] . DefaultCPUAllocator: can't allocate memory: you tried to allocate 123681636352 bytes. Error code 12 (Cannot allocate memory)