In [17]:
!pip install skorch

Collecting skorch
  Downloading skorch-0.11.0-py3-none-any.whl (155 kB)
     |████████████████████████████████| 155 kB 2.3 MB/s            
Installing collected packages: skorch
Successfully installed skorch-0.11.0


In [18]:
import os

import numpy as np
import pandas as pd
from skimage.morphology import skeletonize
from PIL import Image

from skorch import NeuralNetClassifier
from skorch.callbacks import LRScheduler, Checkpoint, EpochScoring, EarlyStopping
from skorch.dataset import Dataset
from skorch.helper import predefined_split
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, models, transforms
from sklearn.model_selection import train_test_split

from pathlib import Path
import tensorflow as tf
from tensorflow import keras
import matplotlib.patches as patches

from tqdm import tqdm

In [None]:
!nvidia-smi

In [19]:
segmentation_classifier = keras.models.load_model('models/MIMIC-256x25680-20-split-resnet-Float16_2-race_detection_rop_seg_data_rop_seg-0.001_20220321-054140_epoch:011.hdf5')
segmentation_classifier.summary()

Model: "functional_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 256, 256, 3) 0                                            
__________________________________________________________________________________________________
bn_data (BatchNormalization)    (None, 256, 256, 3)  9           input_3[0][0]                    
__________________________________________________________________________________________________
zero_padding2d_68 (ZeroPadding2 (None, 262, 262, 3)  0           bn_data[0][0]                    
__________________________________________________________________________________________________
conv0 (Conv2D)                  (None, 128, 128, 64) 9408        zero_padding2d_68[0][0]          
______________________________________________________________________________________

In [124]:
# code in one place, running in another directory
# code is to be run in below directory

os.chdir("/users/riya/race/classifier_experiments/CNN_train")

### Prepare Dataset

We'll try a 70/10/20 split (train/val/test). We don't have access to info outside of black/white, so we'll just do a simple split.

In [116]:
def prepare_dataset():
    
    csv_path = "/users/riya/race/csv/image_race_data.csv"
    data_path = "/users/riya/race/dataset/segmentations/"
    save_path = "/users/riya/race/classifier_experiments/CNN_train/dataset/"
    
    race_data = pd.read_csv(csv_path)
    race_data['stratify'] = race_data['race'] + '_' + race_data['variable'] 
    # new column so I can account for both variable and race in my stratification
    
    ratio_train = 0.7
    ratio_val = 0.1
    ratio_test = 0.2
    
    # split into 80% train and val, 20% test
    
    X_intermediate, X_test, y_intermediate, y_test = train_test_split(race_data, race_data['race'], test_size=ratio_test, 
                                                        stratify = race_data['stratify'], random_state=86)
    
    ratio_remaining = 1 - ratio_test
    ratio_val_adjusted = ratio_val / ratio_remaining
    
    # split into 70% train and 10% val
    
    X_train, X_val, y_train, y_val = train_test_split(X_intermediate, X_intermediate['race'], test_size=ratio_val_adjusted, 
                                                        stratify = X_intermediate['stratify'], random_state=86)

    
    def populate_folders(data_df, data_type):
    
        for i in tqdm(range(len(data_df))):
            data_df.reset_index(drop=True, inplace=True)
            img_id = data_df['image_id'][i]
            race = data_df['race'][i]

            img = np.array(Image.open(data_path + str(img_id) + '.bmp'))
            img = Image.fromarray(img)

            img.save(save_path + str(data_type) + '/' + str(race) + '/' + str(img_id) + '.bmp')
    
    populate_folders(X_train, 'train')
    populate_folders(X_val, 'val')
    populate_folders(X_test, 'test')


In [117]:
prepare_dataset()

100%|██████████| 3181/3181 [00:23<00:00, 137.97it/s]
100%|██████████| 455/455 [00:04<00:00, 113.36it/s]
100%|██████████| 910/910 [00:08<00:00, 112.69it/s]


### Model Definitions

In [None]:
class PretrainedModel(nn.Module):
    def __init__(self, output_features):
        super().__init__()
        model = models.resnet18(pretrained=True)
        num_ftrs = model.fc.in_features
        model.fc = nn.Linear(num_ftrs, output_features)
        self.model = model

    def forward(self, x):
        return self.model(x)

### Preprocessing

In [9]:
# Already done this, before I saved my images
# Yay, we did the same thing, repeating the image thrice to create the three layers. I can just do that part.

# Oh, I see. Rather than saving ALL the images (as I wrongly did), it's so much easier to just make them in the train code!

# depending on the inputs, I can run this code 8 times to train this model. It'll be easy to train at that point!

def shadow_regions(img, skeleton, shadow, radius, region, image_size = (224, 224)):
    
    img = np.array(img)
    img = cv2.resize(img, image_size)

    if skeleton is True:
        img[img > 0] = 255
        img = skeletonize(img, method='lee')
    
    if shadow is True:
        # developing mask that darkens center portion
        center_mask = np.full(image_size, 255, dtype=np.uint8) 
        # radius i changes, center, color, fill is the same
        cv2.circle(center_mask, (image_size[0]/2, image_size[0]/2), radius, (0, 0, 0), -1)

        # developing mask that darkens background region
        back_mask = cv2.bitwise_not(center_mask)

        if (region == 'dark_center'):
            img = cv2.bitwise_or(img, img, mask=center_mask)

        if (region == 'dark_background'):
            img = cv2.bitwise_or(img, img, mask=back_mask)

    img = np.repeat(img[:, :, np.newaxis], 3, axis=2).reshape((image_size[0],image_size[1],3)) # (1, 224, 224, 3)?
    img = Image.fromarray(img)

    return img

Options for different saving:
1. Skeleton = True,
    Shadow = True then
    Region has two options 
    Radius has two options
2. Original Training: shadow = False & Skeleton = False
3. no skeletonization training: Skeleton = false & Shadow = true

In [126]:
val_folder = os.path.join("/users/riya/race/classifier_experiments/CNN_train/dataset", 'val')
val_dataset = datasets.ImageFolder(val_folder)


FileNotFoundError: Found no valid file for the classes .ipynb_checkpoints. Supported extensions are: .jpg, .jpeg, .png, .ppm, .bmp, .pgm, .tif, .tiff, .webp

In [138]:
!ls -a dataset/val/black

.	    19757.bmp  31240.bmp  44765.bmp  7394.bmp	85889.bmp  96156.bmp
..	    19760.bmp  31330.bmp  45047.bmp  74081.bmp	87358.bmp  96175.bmp
101724.bmp  19761.bmp  31335.bmp  45050.bmp  74085.bmp	87402.bmp  96239.bmp
101738.bmp  19767.bmp  31338.bmp  45253.bmp  7419.bmp	87406.bmp  96248.bmp
101915.bmp  19768.bmp  31339.bmp  45258.bmp  74709.bmp	87411.bmp  96257.bmp
101921.bmp  24132.bmp  31341.bmp  45262.bmp  74711.bmp	87432.bmp  96631.bmp
105469.bmp  24133.bmp  33987.bmp  45267.bmp  74723.bmp	88464.bmp  96635.bmp
105646.bmp  25503.bmp  34075.bmp  45283.bmp  74727.bmp	88762.bmp  96850.bmp
105655.bmp  25506.bmp  34424.bmp  45286.bmp  74883.bmp	91925.bmp  96859.bmp
105667.bmp  25804.bmp  35273.bmp  45340.bmp  75661.bmp	91929.bmp  96870.bmp
107480.bmp  25813.bmp  37053.bmp  47256.bmp  75667.bmp	91946.bmp  96979.bmp
112614.bmp  25815.bmp  37988.bmp  47275.bmp  75669.bmp	91951.bmp  96982.bmp
112616.bmp  25822.bmp  37993.bmp  49165.bmp  75670.bmp	91964.bmp  96983.bmp
16522.bmp   

In [133]:
!ls -a

.  ..  dataset	outputs


In [137]:
!rm -rf dataset/val/.ipynb_checkpoints

### Train Code

In [None]:
def train(data_dir, skeleton=False, shadow = False, radius, region, num_classes=2, batch_size=64, num_epochs=10, lr=0.001):
    device = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")
    if device == 'cuda:0':
        torch.cuda.empty_cache()
    if skeleton is True: # Experiment #2: skeleton true, shadow true
        shadow = True
        f_params = f'./outputs/checkpoints/model_shadow_regions_{region}_{radius}_skeletonized.pt'
        f_history = f'./outputs/histories/model_shadow_regions_{region}_{radius}_skeletonized.json'
        csv_name = f'./outputs/probabilities/shadow_regions_{region}_{radius}_skeletonized.csv'
    elif shadow is True: # Experiment #3: skeleton false, shadow true
        f_params = f'./outputs/checkpoints/model_shadow_regions_{region}_{radius}.pt'
        f_history = f'./outputs/histories/model_shadow_regions_{region}_{radius}.json'
        csv_name = f'./outputs/probabilities/shadow_regions_{region}_{radius}.csv'
    else: # Original training: skeleton false, shadow false
        f_params = f'./outputs/checkpoints/model_original.pt'
        f_history = f'./outputs/histories/model_original.json'
        csv_name = f'./outputs/probabilities/original.csv'
        
    train_transforms = transforms.Compose([transforms.Lambda(lambda img: filter(img, skeleton,
                                                                                shadow, radius,
                                                                                region)), # image size pre-defined
                                           # transforms.Resize(image_size),
                                           transforms.RandomHorizontalFlip(),
                                           transforms.RandomVerticalFlip(),
                                           transforms.RandomRotation(25),
                                           transforms.ToTensor(),
                                           transforms.Normalize([0.5, 0.5, 0.5],
                                                                [0.5, 0.5, 0.5])]) # why this normalizing?

    test_transforms = transforms.Compose([transforms.Lambda(lambda img: filter(img, skeleton,
                                                                                shadow, radius,
                                                                                region)),
                                          # transforms.Resize(image_size),
                                          transforms.ToTensor(),
                                          transforms.Normalize([0.5, 0.5, 0.5],
                                                               [0.5, 0.5, 0.5])])

    train_folder = os.path.join(data_dir, 'train') # only training on segmentations      
    val_folder = os.path.join(data_dir, 'val')
    test_folder = os.path.join(data_dir, 'test')

    
    train_dataset = datasets.ImageFolder(train_folder, train_transforms)
    val_dataset = datasets.ImageFolder(val_folder, test_transforms)
    test_dataset = datasets.ImageFolder(test_folder, test_transforms)

    
    labels = np.array(train_dataset.samples)[:,1]
    
    # what even does the below code do?    
    labels = labels.astype(int)
    black_weight = 1 / len(labels[labels == 0])
    white_weight = 1 / len(labels[labels == 1])
    sample_weights = np.array([black_weight, white_weight])
    weights = sample_weights[labels]
    sampler = torch.utils.data.WeightedRandomSampler(weights, len(train_dataset), replacement=True)

    print()
    print(f'Data Directory: {data_dir}')
    print(f'Image Type: {image_type}')
    print(f'Threshold: {threshold}')
    print(f'Binarize: {binary}')
    print(f'Skeletonize: {skeleton}')
    print(f'Number of Classes: {num_classes}')
    print(f'Number of black eyes: {len(labels[labels == 0])}')
    print(f'Number of white eyes: {len(labels[labels == 1])}')
    print(f'Batch Size: {batch_size}')
    print(f'Number of Epochs: {num_epochs}')
    print(f'Initial Learning Rate: {lr}')
    print(f'Device: {device}')
    print()

    checkpoint = Checkpoint(monitor='valid_loss_best',
                            f_params=f_params,
                            f_history=f_history,
                            f_optimizer=None,
                            f_criterion=None)

    train_acc = EpochScoring(scoring='accuracy',
                             on_train=True,
                             name='train_acc',
                             lower_is_better=False)

    early_stopping = EarlyStopping()

    callbacks = [checkpoint, train_acc, early_stopping]

    net = NeuralNetClassifier(PretrainedModel,
                              criterion=nn.CrossEntropyLoss,
                              lr=lr,
                              batch_size=batch_size,
                              max_epochs=num_epochs,
                              module__output_features=num_classes,
                              optimizer=optim.SGD,
                              optimizer__momentum=0.9,
                              iterator_train__num_workers=16,
                              iterator_train__sampler=sampler,
                              iterator_valid__shuffle=False,
                              iterator_valid__num_workers=16,
                              train_split=predefined_split(val_dataset),
                              callbacks=callbacks,
                              device=device)

    net.fit(train_dataset, y=None)

    img_locs = [loc for loc, _ in test_dataset.samples]
    test_probs = net.predict_proba(test_dataset)
    test_probs = [prob[0] for prob in test_probs]
    data = {'img_loc' : img_locs, 'probability' : test_probs}
    pd.DataFrame(data=data).to_csv(csv_name, index=False)

In [None]:
def prepare_for_classification(img_id, img_path, race, radius):
    
    if (radius != None):
        img = Image.open(img_path + str(race) + '/' + str(radius) + '/' + str(img_id) + '.bmp')
    else: # case of original images
        img = Image.open(img_path + str(img_id) + '.bmp')
        
    arr = np.array(img)
    channels = np.repeat(resized[:, :, np.newaxis], 3, axis=2).reshape((1,256,256,3))
    
    return channels

# will replace this with train transforms from monai as Aaron did

In [None]:
def combine_images(data, img_path, race, radius):
    
    img_arr = np.zeros([1, 256, 256, 3], dtype='int')

    for i in tqdm(range(len(data))):
        img_id = data['image_id'][i] # because of this, predictions are in the right order, IMPORTANT
        # original preds
        img_prepared = prepare_for_classification(img_id, img_path, race, radius)
        img_arr = np.concatenate((img_arr, img_prepared), axis = 0)
        
    img_arr = img_arr[1:] # removing the unnecessary first element
    
    return img_arr