In [2]:
!pip install skorch

Collecting skorch
  Downloading skorch-0.11.0-py3-none-any.whl (155 kB)
[K     |████████████████████████████████| 155 kB 2.0 MB/s eta 0:00:01
Installing collected packages: skorch
Successfully installed skorch-0.11.0
You should consider upgrading via the '/usr/bin/python3.6 -m pip install --upgrade pip' command.[0m


In [3]:
!pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html


Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.8.0+cu111
  Downloading https://download.pytorch.org/whl/cu111/torch-1.8.0%2Bcu111-cp36-cp36m-linux_x86_64.whl (1982.2 MB)
[K     |████████████████████████████████| 1982.2 MB 4.1 kB/s eta 0:00:011  |█                               | 61.6 MB 8.6 MB/s eta 0:03:44     |█████▌                          | 342.8 MB 10.5 MB/s eta 0:02:36     |██████████████████████▌         | 1393.1 MB 9.6 MB/s eta 0:01:02     |█████████████████████████████▉  | 1847.7 MB 8.6 MB/s eta 0:00:16
[?25hCollecting torchvision==0.9.0+cu111
  Downloading https://download.pytorch.org/whl/cu111/torchvision-0.9.0%2Bcu111-cp36-cp36m-linux_x86_64.whl (17.6 MB)
[K     |████████████████████████████████| 17.6 MB 144 kB/s eta 0:00:01
[?25hCollecting torchaudio==0.8.0
  Downloading torchaudio-0.8.0-cp36-cp36m-manylinux1_x86_64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 2.3 MB/s eta 0:00:01
Installing collected package

In [4]:
import os

import numpy as np
import pandas as pd
from skimage.morphology import skeletonize
from PIL import Image
import cv2

from skorch import NeuralNetClassifier
from skorch.callbacks import LRScheduler, Checkpoint, EpochScoring, EarlyStopping
from skorch.dataset import Dataset
from skorch.helper import predefined_split
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, models, transforms
from sklearn.model_selection import train_test_split

from pathlib import Path
import tensorflow as tf
from tensorflow import keras
import matplotlib.patches as patches
import matplotlib.pyplot as plt

from tqdm import tqdm

In [5]:
!nvidia-smi

Sun Jun 12 04:18:32 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.129.06   Driver Version: 470.129.06   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:06:00.0 Off |                    0 |
| N/A   31C    P8    25W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla K80           Off  | 00000000:07:00.0 Off |                    0 |
| N/A   24C    P8    30W / 149W |      0MiB / 11441MiB |      0%      Defaul

In [2]:
# Load a saved Keras model from a path relative to the current working directory and print
# its layer summary.
# NOTE(review): `segmentation_classifier` is not referenced again in the visible part of this
# notebook, and this cell's execution count (In [2]) is out of sequence with the cells around
# it — confirm whether the cell is still needed for a clean Restart & Run All.
segmentation_classifier = keras.models.load_model('models/MIMIC-256x25680-20-split-resnet-Float16_2-race_detection_rop_seg_data_rop_seg-0.001_20220321-054140_epoch:011.hdf5')
segmentation_classifier.summary()

The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.
Instructions for updating:
Use tf.keras.mixed_precision.LossScaleOptimizer instead. LossScaleOptimizer now has all the functionality of DynamicLossScale
Model: "functional_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 256, 256, 3) 0                                            
__________________________________________________________________________________________________
bn_data (BatchNormalization)    (None, 256, 256, 3)  9           input_3[0][0]                    
__________________________________________________________________________________________________
zero_padding2d_68 (ZeroPadding2 (None, 262, 262, 3)  

In [6]:
# The notebook file lives in a different location than the experiment data, so switch the
# working directory here: the relative paths used further down ('dataset', './outputs/...')
# are resolved against this directory.

os.chdir("/users/riya/race/classifier_experiments/CNN_train")

In [7]:
!ls

dataset  outputs


### Prepare Dataset

We'll try a 70/10/20 split (train/val/test). The only race information we have access to is Black/White, so we'll just do a simple split.

In [None]:
race_data = pd.read_csv("/users/riya/race/csv/image_race_data.csv")

In [None]:
# Number of images per race label (Series method; the top-level pd.value_counts is deprecated).
race_data['race'].value_counts()

In [None]:
# Fraction of the total made up by the first count.
# NOTE(review): 1709 and 2837 are presumably the per-race image counts reported by the
# value_counts cell above — confirm which class each number belongs to.
1709 / (1709 + 2837)

In [None]:
def prepare_dataset():
    """Split the labelled images 70/10/20 and write them out in a class-per-folder layout.

    Reads the image/race CSV, performs a two-stage split stratified jointly on the
    ``race`` and ``variable`` columns, and saves every image to
    ``<save_path>/<train|val|test>/<race>/<image_id>.bmp``. Target folders are
    created on demand.
    """
    csv_path = "/users/riya/race/csv/image_race_data.csv"
    data_path = "/users/riya/race/dataset/segmentations/"
    save_path = "/users/riya/race/classifier_experiments/CNN_train/dataset/"

    race_data = pd.read_csv(csv_path)
    # Combined key so the stratification accounts for both race and variable.
    race_data['stratify'] = race_data['race'] + '_' + race_data['variable']

    # Target proportions: 70% train (the remainder), 10% validation, 20% test.
    ratio_val = 0.1
    ratio_test = 0.2

    # Stage 1: hold out the 20% test set; the remaining 80% is train + validation.
    train_val_df, test_df = train_test_split(race_data,
                                             test_size=ratio_test,
                                             stratify=race_data['stratify'],
                                             random_state=86)

    # Stage 2: validation must be 10% of the full data set, i.e. 0.1 / 0.8 of what is left.
    ratio_val_adjusted = ratio_val / (1 - ratio_test)
    train_df, val_df = train_test_split(train_val_df,
                                        test_size=ratio_val_adjusted,
                                        stratify=train_val_df['stratify'],
                                        random_state=86)

    def populate_folders(data_df, data_type):
        """Save every image listed in ``data_df`` under ``save_path/<data_type>/<race>/``."""
        # Iterate over the column values directly: no per-iteration reset_index and no
        # in-place mutation of the split frame.
        records = zip(data_df['image_id'], data_df['race'])
        for img_id, race in tqdm(records, total=len(data_df)):
            target_dir = os.path.join(save_path, str(data_type), str(race))
            # Create the class folder on first use so a fresh checkout does not fail on save.
            os.makedirs(target_dir, exist_ok=True)

            # Round-trip through a numpy array (as before) and close the source file promptly.
            with Image.open(data_path + str(img_id) + '.bmp') as source:
                img = Image.fromarray(np.array(source))

            img.save(os.path.join(target_dir, str(img_id) + '.bmp'))

    populate_folders(train_df, 'train')
    populate_folders(val_df, 'val')
    populate_folders(test_df, 'test')


In [None]:
prepare_dataset()

### Model Definitions

In [8]:
class PretrainedModel(nn.Module):
    """ResNet-18 loaded with pretrained weights and a replaced final linear layer."""

    def __init__(self, output_features):
        """Build the network; ``output_features`` is the size of the new output layer."""
        super().__init__()
        backbone = models.resnet18(pretrained=True)
        # Swap the stock classification head for one that emits `output_features` values.
        backbone.fc = nn.Linear(backbone.fc.in_features, output_features)
        self.model = backbone

    def forward(self, x):
        """Run ``x`` through the wrapped network and return its output."""
        return self.model(x)

### Preprocessing

In [9]:
# Already done this, before I saved my images
# Yay, we did the same thing, repeating the image thrice to create the three layers. I can just do that part.

# Oh, I see. Rather than saving ALL the images (as I wrongly did), it's so much easier to just make them in the train code!

# depending on the inputs, I can run this code 8 times to train this model. It'll be easy to train at that point!

def shadow_regions(img, skeleton, shadow, radius, region, image_size = (224, 224)):
    """Resize an image, then optionally skeletonize it and/or black out a circular region.

    Only the first channel of the resized image is processed; the processed
    channel is then written into channels 0, 1 and 2 of the returned image.

    Args:
        img: anything ``np.array`` can convert to an image array (2-D grayscale
            or H x W x C).
        skeleton: when ``True``, binarize the channel (every non-zero pixel
            becomes 255) and skeletonize it.
        shadow: when ``True``, black out either a centred disc or everything
            outside that disc, depending on ``region``.
        radius: disc radius in pixels (only used when ``shadow`` is ``True``).
        region: ``'dark_center'`` or ``'dark_background'`` (only used when
            ``shadow`` is ``True``).
        image_size: ``(width, height)`` passed to ``cv2.resize``.

    Returns:
        A ``PIL.Image.Image`` built from the processed array.

    Raises:
        ValueError: if ``shadow`` is ``True`` and ``region`` is not one of the
            two supported names.
    """
    img = np.array(img)
    img = cv2.resize(img, image_size)

    # Accept single-channel input by replicating it into three channels.
    if img.ndim == 2:
        img = np.repeat(img[:, :, np.newaxis], 3, axis=2)

    # Work on a copy of the first channel; each enabled step transforms the
    # output of the previous one.
    modified_img = img[:, :, 0].copy()

    if skeleton is True:
        # Binarize before thinning: every non-zero pixel counts as foreground.
        modified_img[modified_img > 0] = 255
        thinned = skeletonize(modified_img, method='lee')
        # Normalise to 0/255 regardless of whether skeletonize returned a
        # boolean or an integer array.
        modified_img = np.where(thinned > 0, 255, 0).astype(img.dtype)

    if shadow is True:
        if region not in ('dark_center', 'dark_background'):
            raise ValueError(
                "region must be 'dark_center' or 'dark_background' when shadow is True, "
                f"got {region!r}")

        # Build the mask from the actual array shape (rows, cols) so that
        # non-square image sizes work; cv2 points are (x, y).
        height, width = modified_img.shape[:2]
        center_mask = np.full((height, width), 255, dtype=np.uint8)
        # Filled black disc in the centre: this mask keeps the background.
        cv2.circle(center_mask, (width // 2, height // 2), radius, (0, 0, 0), -1)

        # Inverse mask: keeps only the central disc.
        back_mask = cv2.bitwise_not(center_mask)

        mask = center_mask if region == 'dark_center' else back_mask
        # Apply the mask to the current result (not to the raw channel), so a
        # preceding skeletonization is preserved instead of being overwritten.
        modified_img = cv2.bitwise_or(modified_img, modified_img, mask=mask)

    img[:,:,0] = modified_img
    img[:,:,1] = modified_img
    img[:,:,2] = modified_img

    return Image.fromarray(img)

Options for the different training configurations:
1. Skeleton + shadow training: skeleton = True and shadow = True; then
    region has two options ('dark_center' or 'dark_background') and
    radius has two options (45 or 90)
2. Original training: skeleton = False and shadow = False
3. No-skeletonization training: skeleton = False and shadow = True

In [None]:
img_arr = np.array(Image.open("/users/riya/race/dataset/segmentations/" + str(7571) + '.bmp'))
img = Image.fromarray(img_arr)

In [None]:
plt.imshow(img, interpolation = 'nearest', cmap = 'gray')
plt.show()

In [None]:
img_arr.shape

In [None]:
image_size = (224, 224)

In [None]:
img = cv2.resize(img_arr, image_size)

In [None]:
# Duplicate the single channel three times along a new last axis.
# Assuming `img` is 2-D at this point (TODO confirm), the result has shape
# (image_size[0], image_size[1], 3) — there is no leading batch dimension — and the
# trailing reshape leaves that shape unchanged.
img = np.repeat(img[:, :, np.newaxis], 3, axis=2).reshape((image_size[0],image_size[1],3))

### Train Code

In [10]:
def train(data_dir, radius, region, skeleton=False, shadow = False, num_classes=2, batch_size=64, num_epochs=10, lr=0.001):
    """Train ``PretrainedModel`` on one preprocessing variant and save test-set probabilities.

    Args:
        data_dir: folder containing ``train``, ``val`` and ``test`` sub-folders,
            each readable by ``datasets.ImageFolder``.
        radius: radius forwarded to ``shadow_regions``.
        region: region name forwarded to ``shadow_regions``.
        skeleton: enable skeletonization; this also forces ``shadow`` to ``True``.
        shadow: enable the circular shadow mask.
        num_classes: number of outputs of the classifier head.
        batch_size: mini-batch size.
        num_epochs: maximum number of epochs (early stopping may end training sooner).
        lr: learning rate passed to the SGD optimizer.

    Side effects:
        Writes a checkpoint and the training history under ``./outputs/checkpoints``
        and ``./outputs/histories``, and a CSV with columns ``img_loc`` and
        ``probability`` (predicted probability of class index 0 for every test
        image) under ``./outputs/probabilities``. Those folders must already exist.
    """
    if torch.cuda.is_available():
        # Keep using the second GPU when there is one, but fall back to GPU 0 on
        # single-GPU hosts instead of failing with an invalid device ordinal.
        device = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
        # The previous `device == 'cuda:1'` check compared a torch.device with a
        # str, so this cache release never ran.
        torch.cuda.empty_cache()
    else:
        device = torch.device("cpu")

    if skeleton is True: # Experiment #2: skeleton true, shadow true
        shadow = True
        run_name = f'shadow_regions_{region}_{radius}_skeletonized'
    elif shadow is True: # Experiment #3: skeleton false, shadow true
        run_name = f'shadow_regions_{region}_{radius}'
    else: # Original training: skeleton false, shadow false
        run_name = 'original'

    f_params = f'./outputs/checkpoints/model_{run_name}.pt'
    f_history = f'./outputs/histories/model_{run_name}.json'
    csv_name = f'./outputs/probabilities/{run_name}.csv'

    # Shared first step of both pipelines; shadow_regions also resizes the image
    # to its default image_size.
    preprocess = transforms.Lambda(lambda img: shadow_regions(img, skeleton,
                                                              shadow, radius,
                                                              region))

    # Training pipeline adds random flips/rotation as augmentation.
    train_transforms = transforms.Compose([preprocess,
                                           transforms.RandomHorizontalFlip(),
                                           transforms.RandomVerticalFlip(),
                                           transforms.RandomRotation(25),
                                           transforms.ToTensor(),
                                           # maps each channel from [0, 1] to [-1, 1]
                                           transforms.Normalize([0.5, 0.5, 0.5],
                                                                [0.5, 0.5, 0.5])])

    # Validation/test pipeline: same preprocessing, no augmentation.
    test_transforms = transforms.Compose([preprocess,
                                          transforms.ToTensor(),
                                          transforms.Normalize([0.5, 0.5, 0.5],
                                                               [0.5, 0.5, 0.5])])

    train_folder = os.path.join(data_dir, 'train') # only training on segmentations
    val_folder = os.path.join(data_dir, 'val')
    test_folder = os.path.join(data_dir, 'test')

    train_dataset = datasets.ImageFolder(train_folder, train_transforms)
    val_dataset = datasets.ImageFolder(val_folder, test_transforms)
    test_dataset = datasets.ImageFolder(test_folder, test_transforms)

    # Integer class index of every training sample, in dataset order.
    labels = np.array([label for _, label in train_dataset.samples], dtype=int)

    # Class balancing: each sample is drawn with probability inversely proportional
    # to the size of its class, so every class is sampled equally often on average.
    # Indexing the counts by label first means only non-empty classes are divided,
    # and any number of classes is supported.
    class_counts = np.bincount(labels)
    weights = 1.0 / class_counts[labels]
    sampler = torch.utils.data.WeightedRandomSampler(weights, len(train_dataset), replacement=True)

    print()
    print(f'Data Directory: {data_dir}')
    print(f'Skeletonize: {skeleton}')
    print(f'Shadow: {shadow}')
    print(f'Number of Classes: {num_classes}')
    print(f'Number of black eyes: {len(labels[labels == 0])}')
    print(f'Number of white eyes: {len(labels[labels == 1])}')
    print(f'Batch Size: {batch_size}')
    print(f'Number of Epochs: {num_epochs}')
    print(f'Initial Learning Rate: {lr}')
    print(f'Device: {device}')
    print()

    # Save parameters and history whenever the validation loss reaches a new best.
    checkpoint = Checkpoint(monitor='valid_loss_best',
                            f_params=f_params,
                            f_history=f_history,
                            f_optimizer=None,
                            f_criterion=None)

    # Additionally report accuracy on the training data each epoch.
    train_acc = EpochScoring(scoring='accuracy',
                             on_train=True,
                             name='train_acc',
                             lower_is_better=False)

    early_stopping = EarlyStopping()

    callbacks = [checkpoint, train_acc, early_stopping]

    net = NeuralNetClassifier(PretrainedModel,
                              criterion=nn.CrossEntropyLoss,
                              lr=lr,
                              batch_size=batch_size,
                              max_epochs=num_epochs,
                              module__output_features=num_classes,
                              optimizer=optim.SGD,
                              optimizer__momentum=0.9,
                              iterator_train__num_workers=16,
                              iterator_train__sampler=sampler,
                              iterator_valid__shuffle=False,
                              iterator_valid__num_workers=16,
                              train_split=predefined_split(val_dataset),
                              callbacks=callbacks,
                              device=device)

    net.fit(train_dataset, y=None)

    # NOTE(review): predictions below use the weights from the last trained epoch,
    # not the best checkpoint written to f_params — confirm that this is intended.
    img_locs = [loc for loc, _ in test_dataset.samples]
    test_probs = net.predict_proba(test_dataset)
    # Keep only the probability of class index 0.
    test_probs = [prob[0] for prob in test_probs]
    data = {'img_loc' : img_locs, 'probability' : test_probs}
    pd.DataFrame(data=data).to_csv(csv_name, index=False)

In [11]:
if __name__ == '__main__':
    # Make sure every output folder exists before training writes into it.
    for subfolder in ('probabilities', 'checkpoints', 'histories'):
        os.makedirs(os.path.join('outputs', subfolder), exist_ok=True)

    data_dir = os.path.join('dataset')

    # Baseline run: no skeletonization and no shadow, so radius/region are placeholders.
    train(data_dir, 0, 0)

    # Experiment #2: four skeleton + shadow models (two radii x two regions).
    for radius in (45, 90):
        for region in ('dark_center', 'dark_background'):
            train(data_dir, radius, region, skeleton=True, shadow=True)


Data Directory: dataset
Skeletonize: False
Shadow: False
Number of Classes: 2
Number of black eyes: 1197
Number of white eyes: 1984
Batch Size: 64
Number of Epochs: 10
Initial Learning Rate: 0.001
Device: cuda:1



Downloading: "https://download.pytorch.org/models/resnet18-5c106cde.pth" to /root/.cache/torch/hub/checkpoints/resnet18-5c106cde.pth


  0%|          | 0.00/44.7M [00:00<?, ?B/s]

  epoch    train_acc    train_loss    valid_acc    valid_loss    cp      dur
-------  -----------  ------------  -----------  ------------  ----  -------
      1       [36m0.6863[0m        [32m0.5921[0m       [35m0.7495[0m        [31m0.4688[0m     +  22.8890
      2       [36m0.8155[0m        [32m0.4124[0m       0.7407        0.5353        21.8471
      3       [36m0.8368[0m        [32m0.3691[0m       [35m0.8154[0m        [31m0.4002[0m     +  21.8220
      4       [36m0.8554[0m        [32m0.3248[0m       [35m0.8242[0m        0.4030        21.9117
      5       [36m0.8752[0m        [32m0.2868[0m       [35m0.8440[0m        [31m0.3105[0m     +  21.9440
      6       [36m0.8790[0m        [32m0.2809[0m       0.8440        [31m0.3081[0m     +  21.9060
      7       [36m0.8950[0m        [32m0.2489[0m       0.8396        0.3381        22.0916
      8       0.8887        0.2568       [35m0.8681[0m        [31m0.2731[0m     +  22.0817
      9       

In [12]:
if __name__ == '__main__':
    # Make sure every output folder exists before training writes into it.
    for subfolder in ('probabilities', 'checkpoints', 'histories'):
        os.makedirs(os.path.join('outputs', subfolder), exist_ok=True)

    data_dir = os.path.join('dataset')

    # Experiment #3: four shadow-only models without skeletonization (two radii x two regions).
    for radius in (45, 90):
        for region in ('dark_center', 'dark_background'):
            train(data_dir, radius, region, skeleton=False, shadow=True)


Data Directory: dataset
Skeletonize: False
Shadow: True
Number of Classes: 2
Number of black eyes: 1197
Number of white eyes: 1984
Batch Size: 64
Number of Epochs: 10
Initial Learning Rate: 0.001
Device: cuda:1

  epoch    train_acc    train_loss    valid_acc    valid_loss    cp      dur
-------  -----------  ------------  -----------  ------------  ----  -------
      1       [36m0.6841[0m        [32m0.5804[0m       [35m0.7231[0m        [31m0.5297[0m     +  22.4284
      2       [36m0.7843[0m        [32m0.4495[0m       0.6374        0.7344        22.4209
      3       [36m0.8255[0m        [32m0.3987[0m       [35m0.7934[0m        [31m0.4313[0m     +  22.4387
      4       [36m0.8412[0m        [32m0.3616[0m       [35m0.7956[0m        [31m0.4073[0m     +  22.3537
      5       [36m0.8434[0m        [32m0.3474[0m       0.6484        0.8646        22.3640
      6       0.8428        [32m0.3353[0m       [35m0.8176[0m        [31m0.3857[0m     +  22.4248


Next step: try training for more than 10 epochs!