In [None]:
# Root folder of the project (presumably a Google Drive mounted in Colab — confirm).
PROJECT_PATH = '/content/gdrive/MyDrive/CI Final Project'
# Folder that holds one sub-folder of .tif images per class label.
DATASET_PATH = f'{PROJECT_PATH}/ICAR 2018 BACH Dataset'

: 

In [None]:
import os
import glob

import random
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn
import torch.nn.functional as F
import torchvision 
from torchvision import *
import torchvision.transforms as transforms

from numpy import genfromtxt
from PIL import Image, ImageEnhance

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix

: 

# Dataset and Dataloaders

In [None]:
# Class folder names; the integer label of a class is its position in this list.
LABELS = ['Normal', 'Benign', 'InSitu', 'Invasive']
# Presumably the (width, height) of the source images in pixels — TODO confirm.
IMAGE_SIZE = (2048, 1536)
# Side length, in pixels, of the square patches.
PATCH_SIZE = 512
PROJECT_PATH = '/content/gdrive/MyDrive/CI Final Project'
DATASET_PATH = f'{PROJECT_PATH}/ICAR 2018 BACH Dataset'
# Run on the GPU whenever one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'


In [None]:
# Positions 0..399 into the list of image paths (assumes 400 images — TODO confirm).
indexes_for_dataset = list(range(400))
# Fixed seed so the same images are held out on every run.
TRAIN_INDEXES, TEST_INDEXES = train_test_split(
    indexes_for_dataset,
    test_size=0.1,
    random_state=5,
)

In [None]:
def __crop_img__(img, size, n):
    """Cut ``n`` random, randomly oriented square patches out of an image.

    Every patch is a ``size`` x ``size`` window taken at a uniformly random
    position, rotated by a random multiple of 90 degrees, and then flipped
    vertically and/or horizontally with probability 0.5 each. Randomness comes
    from NumPy's global generator (``np.random``).

    Args:
        img: NumPy array laid out as (height, width, channels) or
            (height, width).
        size: Side length of the square patches, in pixels.
        n: Number of patches to cut.

    Returns:
        Array of shape (n, size, size, channels), or (n, size, size) for a
        2-D input, with the dtype of ``img``.

    Raises:
        ValueError: If ``size`` is not positive or does not fit inside the
            image, or if ``n`` is negative.
    """
    # Only the two leading axes matter, so grayscale (2-D) input works too.
    h, w = img.shape[:2]
    if size <= 0 or size > h or size > w:
        raise ValueError(
            f'crop size {size} does not fit inside an image of {h}x{w} pixels'
        )
    if n < 0:
        raise ValueError(f'number of crops must be non-negative, got {n}')
    if n == 0:
        # np.stack rejects an empty list; return a correctly shaped empty stack.
        return np.empty((0, size, size) + tuple(img.shape[2:]), dtype=img.dtype)

    crops = []
    for _ in range(n):
        # `high` is exclusive, hence the +1 so the last valid offset is reachable.
        top = np.random.randint(low=0, high=h - size + 1)
        left = np.random.randint(low=0, high=w - size + 1)
        crop = img[top: top + size, left: left + size].copy()
        # The patch is square, so quarter turns never change its shape.
        crop = np.rot90(crop, np.random.randint(low=0, high=4))
        if np.random.random() > 0.5:
            crop = np.flipud(crop)
        if np.random.random() > 0.5:
            crop = np.fliplr(crop)
        crops.append(crop)

    return np.stack(crops)

In [None]:
class ICARDataset(Dataset):
    """Random-crop dataset over the class folders found under ``path``.

    Every ``*.tif`` file in ``path/<label>`` (one folder per name in
    ``LABELS``) is indexed; the label id of an image is the position of its
    folder name in ``LABELS``. An item is a stack of random crops taken from
    one (optionally augmented) image, together with that image's label id.

    NOTE(review): ``__getitem__`` ignores the index it receives and draws a
    random image from ``TRAIN_INDEXES`` / ``TEST_INDEXES`` instead. The dataset
    therefore behaves like a random sampler with replacement, and ``__len__``
    only sets the nominal number of items per pass. The data-generation loop in
    this notebook depends on that behaviour, so it is kept as is.

    Args:
        path: Root folder containing one sub-folder per class label.
        rotate: If True, rotate the image by a random multiple of 90 degrees.
        flip: If True, mirror the image along a random axis half of the time.
        enhance: If True, randomly change colour, contrast and brightness
            half of the time.
        train: If True sample from ``TRAIN_INDEXES``, else from
            ``TEST_INDEXES``.
        crop_size: Side length of the square crops, in pixels.
        n_crops: Number of crops taken from each image.
    """

    def __init__(self, path, rotate=False, flip=False, enhance=False, train=False,
                 crop_size=512, n_crops=20):
        super().__init__()

        # Map every image path to the integer id of the class folder it is in.
        dataset = {
            name: label_id
            for label_id, label_name in enumerate(LABELS)
            for name in glob.glob(path + '/' + label_name + '/*.tif')
        }
        self.path = path
        self.dataset = dataset
        # Sorted so that TRAIN_INDEXES / TEST_INDEXES always point at the same files.
        self.names = sorted(dataset)
        self.rotate = rotate
        self.flip = flip
        self.enhance = enhance
        self.train = train
        self.crop_size = crop_size
        self.n_crops = n_crops

        self.flip_options = [Image.FLIP_LEFT_RIGHT, Image.FLIP_TOP_BOTTOM]

    def __getitem__(self, index):
        """Return ``(crops, label)`` for a randomly drawn image of this split.

        ``crops`` is a tensor of shape (n_crops, channels, crop_size,
        crop_size); ``label`` is the integer class id of the drawn image.
        The ``index`` argument is not used (see the class docstring).
        """
        if self.train:
            index = random.choice(TRAIN_INDEXES)
        else:
            index = random.choice(TEST_INDEXES)

        img_path = self.names[index]
        label = self.dataset[img_path]
        with Image.open(img_path) as img:

            # Apply flip or not
            if random.randint(0, 1) == 1 and self.flip:
                flip_random_ind = random.randint(0, 1)
                img = img.transpose(self.flip_options[flip_random_ind])

            if self.rotate:
                rotate_random_angle = random.randint(0, 3)
                # expand=True lets the canvas follow the rotation. Without it a
                # quarter turn of a non-square image keeps the old canvas size,
                # cutting off content and padding the rest with black.
                img = img.rotate(rotate_random_angle * 90, expand=True)

            # Apply enhance or not
            if random.randint(0, 1) == 1 and self.enhance:
                # One independent factor in [0.5, 1.5) per enhancement.
                factors = np.random.uniform(.5, 1.5, 3)

                img = ImageEnhance.Color(img).enhance(factors[0])
                img = ImageEnhance.Contrast(img).enhance(factors[1])
                img = ImageEnhance.Brightness(img).enhance(factors[2])

            img = transforms.ToTensor()(img)

            # __crop_img__ works on channels-last NumPy arrays, so convert
            # there and back to channels-first afterwards.
            crops = __crop_img__(img.permute(1, 2, 0).numpy(), self.crop_size, self.n_crops)
            crops = torch.from_numpy(crops).permute(0, 3, 1, 2)
            return crops, label

    def __len__(self):
        """Number of indexed images (all classes, both splits)."""
        return len(self.dataset)



In [None]:
# Two views of the same image folder: they share the augmentation switches
# and differ only in which index pool (train or test) they sample from.
dataset_train = ICARDataset(
    DATASET_PATH,
    rotate=True,
    flip=True,
    enhance=True,
    train=True,
)
dataset_test = ICARDataset(
    DATASET_PATH,
    rotate=True,
    flip=True,
    enhance=True,
    train=False,
)

# Loaders that hand out 25 images (each a stack of crops) per batch.
train_dl = DataLoader(dataset_train, batch_size=25, shuffle=True)
test_dl = DataLoader(dataset_test, batch_size=25, shuffle=False)

# Feature Extraction


In [None]:
class Identity(torch.nn.Module):
    """Pass-through layer, used to replace a network's classification head.

    Relies on the inherited constructor; the layer has no parameters or state.
    """

    def forward(self, x):
        """Return ``x`` itself, unchanged."""
        return x

In [None]:
class BasicFeatureExtracter(nn.Module):
    """Fixed feature extractor built from three pretrained torchvision models.

    ResNet-50 and Inception v3 are used with their final ``fc`` layer replaced
    by ``Identity``; VGG-16 contributes only its convolutional ``features``
    stack, whose output is average-pooled and flattened. The three feature
    vectors of a crop are concatenated, and the vectors of all crops of one
    image are merged with a 3-norm (generalised mean) pooling.

    The models are put in evaluation mode: this class is only used for
    inference, and in training mode Dropout and batch-statistics BatchNorm
    would make the features random and dependent on the other crops in the
    batch.

    NOTE(review): features saved to disk before this change were produced in
    training mode; regenerate them so that stored and freshly extracted
    features stay comparable.
    """

    def __init__(self):
        super().__init__()
        # ResNet-50 without its classification head
        self.resnet50 = models.resnet50(pretrained=True)
        self.resnet50.fc = Identity()
        self.resnet50.to(device=DEVICE)
        # Inception v3 without its classification head
        self.inception_v3 = models.inception_v3(pretrained=True)
        self.inception_v3.fc = Identity()
        self.inception_v3.to(device=DEVICE)
        # VGG-16: convolutional part only
        self.vgg16 = models.vgg16(pretrained=True).features
        self.vgg16.to(device=DEVICE)
        # 6x6 average pooling applied to the VGG feature map before flattening.
        self.vgg_pool = nn.AvgPool2d(6)
        # Inference only: freeze BatchNorm statistics and disable Dropout.
        self.eval()

    def extract_feat_per_crop(self, x):
        """Return one concatenated feature vector per crop.

        Args:
            x: Tensor of crops, (n_crops, 3, height, width), already on
                ``DEVICE``.

        Returns:
            Tensor of shape (n_crops, n_features) holding the ResNet-50,
            Inception v3 and pooled VGG-16 features side by side.
        """
        with torch.no_grad():
            res_feat = self.resnet50(x)

            inc_v3_feat = self.inception_v3(x)
            # If the module is ever switched back to training mode, Inception
            # returns a (main, auxiliary) tuple; keep the main output only.
            if isinstance(inc_v3_feat, tuple):
                inc_v3_feat = inc_v3_feat[0]

            vgg16_feat = self.vgg_pool(self.vgg16(x))
            vgg16_feat = vgg16_feat.flatten(start_dim=1)

            return torch.cat((res_feat, inc_v3_feat, vgg16_feat), dim=1)

    def forward(self, batch_data):
        """Turn a batch of crop stacks into one feature vector per image.

        Args:
            batch_data: Iterable of tensors, one per image, each shaped
                (n_crops, 3, height, width) — e.g. a (batch, 20, 3, 512, 512)
                tensor as produced by the data loaders.

        Returns:
            NumPy array of shape (batch, n_features).
        """
        feature_array = []

        for each_image in batch_data:
            each_image = each_image.to(device=DEVICE)
            features = self.extract_feat_per_crop(each_image)
            # 3-norm pooling over the crops: cube, average, cube root.
            norm_features = torch.pow(features, 3)
            norm_features = torch.sum(norm_features, dim=0)
            norm_features = torch.div(norm_features, features.shape[0])
            norm_features = torch.pow(norm_features, 1 / 3)
            feature_array.append(norm_features.cpu().numpy())
        return np.array(feature_array)


basicFeatureExtracter = BasicFeatureExtracter()

# Generate Data

In [None]:
# run this if you want to generate data

train_dataset_path = f'{PROJECT_PATH}/results_v2/train'
test_dataset_path = f'{PROJECT_PATH}/results_v2/test'


def _generate_feature_files(data_loader, output_dir, n_batches, starting_index):
    """Extract features for ``n_batches`` random batches and save them as CSV.

    Batch ``i`` (0-based) is written to
    ``feature_array_<starting_index + i + 1>.csv`` and
    ``feature_label_<starting_index + i + 1>.csv`` inside ``output_dir``.

    Args:
        data_loader: Loader yielding ``(crops, labels)`` batches.
        output_dir: Folder the CSV files are written to; created if missing.
        n_batches: Number of batches (and file pairs) to produce.
        starting_index: Offset added to the file numbers.
    """
    # np.savetxt does not create folders, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)

    for i in range(n_batches):
        print(f'Epoch: {i+1}')
        # A new iterator per batch is deliberate: one pass over the loader is
        # finite, and the dataset draws a random image for every item anyway,
        # so the first batch of each fresh iterator is a new random sample.
        batch_crops, batch_labels = next(iter(data_loader))
        batch_features = basicFeatureExtracter.forward(batch_crops)

        file_number = starting_index + i + 1
        np.savetxt(f'{output_dir}/feature_array_{file_number}.csv', batch_features, delimiter=",")
        np.savetxt(f'{output_dir}/feature_label_{file_number}.csv', batch_labels.numpy(), delimiter=",")


# Offset of the train files written by this run; presumably one offset per
# collaborator so that parallel runs do not overwrite each other:
# lucia 0 berni 160 kuba 320 demetre 480
starting_index = 160

# generate train data
_generate_feature_files(train_dl, train_dataset_path, 160, starting_index)

starting_index = 0
# generate test data
_generate_feature_files(test_dl, test_dataset_path, 40, starting_index)

# Load Data


In [None]:
drive_path = f'{PROJECT_PATH}/results_v2'


def _load_feature_files(split_dir, n_files):
    """Load and stack the numbered feature / label CSV files of one split.

    Reads ``feature_array_<k>.csv`` and ``feature_label_<k>.csv`` for
    ``k = 1 .. n_files`` from ``split_dir``.

    Args:
        split_dir: Folder containing the CSV files.
        n_files: Number of file pairs to read.

    Returns:
        ``(features, labels)`` with all files concatenated along the first
        axis, or ``(None, None)`` when ``n_files`` is 0.
    """
    feature_chunks = []
    label_chunks = []
    for i in range(n_files):
        features = genfromtxt(f'{split_dir}/feature_array_{i+1}.csv', delimiter=',')
        label = genfromtxt(f'{split_dir}/feature_label_{i+1}.csv', delimiter=',')
        print(f'Feautres for file {i + 1} was loaded')
        feature_chunks.append(features)
        label_chunks.append(label)

    if not feature_chunks:
        return None, None
    # Concatenate once at the end: growing the array inside the loop would
    # copy everything loaded so far on every iteration.
    return np.concatenate(feature_chunks), np.concatenate(label_chunks)


# load train data
X_train, y_train = _load_feature_files(f'{drive_path}/train', 600)

# load test data
X_test, y_test = _load_feature_files(f'{drive_path}/test', 40)

Feautres for file 1 was loaded
Feautres for file 2 was loaded
Feautres for file 3 was loaded
Feautres for file 4 was loaded
Feautres for file 5 was loaded
Feautres for file 6 was loaded
Feautres for file 7 was loaded
Feautres for file 8 was loaded
Feautres for file 9 was loaded
Feautres for file 10 was loaded
Feautres for file 11 was loaded
Feautres for file 12 was loaded
Feautres for file 13 was loaded
Feautres for file 14 was loaded
Feautres for file 15 was loaded
Feautres for file 16 was loaded
Feautres for file 17 was loaded
Feautres for file 18 was loaded
Feautres for file 19 was loaded
Feautres for file 20 was loaded
Feautres for file 21 was loaded
Feautres for file 22 was loaded
Feautres for file 23 was loaded
Feautres for file 24 was loaded
Feautres for file 25 was loaded
Feautres for file 26 was loaded
Feautres for file 27 was loaded
Feautres for file 28 was loaded
Feautres for file 29 was loaded
Feautres for file 30 was loaded
Feautres for file 31 was loaded
Feautres for file

In [None]:
# Report how many feature vectors each split contains, one line per split.
print(
    f'Training Dataset Size: {len(X_train)}',
    f'Test Dataset Size: {len(X_test)}',
    sep='\n',
)

Training Dataset Size: 15000
Test Dataset Size: 1000


In [None]:
# Sanity check: one row per image, one column per concatenated backbone feature.
X_train.shape

(15000, 6144)

# Training and Testing the Gradient Boosting Classifier

In [None]:
# Gradient boosting over depth-1 trees, fitted on the extracted features.
clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
    verbose=1,
)
clf.fit(X_train, y_train)
# Last expression of the cell, so the score on the test features is displayed.
clf.score(X_test, y_test)

      Iter       Train Loss   Remaining Time 
         1           1.1397          107.10m
         2           1.0221          104.47m
         3           0.9415          106.14m
         4           0.8762          104.14m
         5           0.8356          102.05m
         6           0.7923          101.27m
         7           0.7578           99.69m
         8           0.7217           98.28m
         9           0.6984           97.00m
        10           0.6726           95.70m
        20           0.4842           81.10m
        30           0.3752           67.17m
        40           0.3022           55.83m
        50           0.2479           45.65m
        60           0.2102           36.11m
        70           0.1809           26.85m
        80           0.1602           17.77m
        90           0.1422            8.84m
       100           0.1271            0.00s


0.807

In [None]:
# Predicted class for every test feature vector, compared against the stored labels.
y_pred = clf.predict(X_test)
test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)

[[283  63  21   8]
 [ 28 160   3   7]
 [  5  26 222  16]
 [  0  12   4 142]]


In [None]:
# Evaluate on the held-out images themselves: every image in TEST_INDEXES is
# read once, without flip / rotate / enhance on the full image, and cropped.
dataset = {name: index for index in range(len(LABELS)) for name in glob.glob(DATASET_PATH + '/' + LABELS[index] + '/*.tif')}
# TEST_INDEXES are positions in the *sorted* path list (that is how
# ICARDataset builds its `names`). Indexing the unsorted dict keys would pick
# different files, possibly ones that were used for training.
sorted_names = sorted(dataset)
labels = []
feature_array = []
for img_inx in TEST_INDEXES:
    img_path = sorted_names[img_inx]
    label = dataset[img_path]
    labels.append(label)
    with Image.open(img_path) as img:
        # Keep the tensor on the CPU: .numpy() is not available for CUDA
        # tensors, and the extractor moves each crop stack to DEVICE itself.
        img = transforms.ToTensor()(img)

        crops = __crop_img__(img.permute(1, 2, 0).numpy(), 512, 20)
        feature_array.append(crops)
labels = np.array(labels)
# (images, crops, H, W, C) -> (images, crops, C, H, W), as the extractor expects.
feature_array = torch.from_numpy(np.array(feature_array)).permute(0, 1, 4, 2, 3)
print(f'Shape of feature array: {feature_array.shape}')
print(f'Shape of labels: {labels.shape}')
original_image_features = basicFeatureExtracter.forward(feature_array)


Shape of feature array: torch.Size([40, 20, 3, 512, 512])
Shape of labels: (40,)


In [None]:
# Score of the classifier on the features of the un-enhanced held-out images.
clf.score(original_image_features, labels)

0.925

In [None]:
# Same comparison as for the generated test set, now on the held-out images.
y_pred_org = clf.predict(original_image_features)
original_confusion = confusion_matrix(labels, y_pred_org)
print(original_confusion)

[[ 5  2  1  0]
 [ 0 10  0  0]
 [ 0  0  7  0]
 [ 0  0  0 15]]
