In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
#!pip install opencv-python 
import cv2


## Data Processing

In [6]:
# Source folders for the two classes, given relative to the notebook's working directory:
# `target_0` is loaded as the healthy class and `target_1` as the tumor class.
healthy_images = '../data/ML/train/target_0'
tumor_images = '../data/ML/train/target_1'


def load_images_from_folder(folder):
    """Read every entry of ``folder`` with OpenCV and return the decoded images.

    Entries for which ``cv2.imread`` returns ``None`` are skipped.
    The result follows the order in which ``os.listdir`` yields the entries.
    """
    decoded = (cv2.imread(os.path.join(folder, entry)) for entry in os.listdir(folder))
    return [image for image in decoded if image is not None]

In [7]:
# Load all healthy images into memory; the cells below only inspect the count and one image shape.
healthy_loaded = load_images_from_folder(healthy_images)

In [8]:
len(healthy_loaded)

3618

In [9]:
healthy_loaded[3000].shape

(224, 224, 3)

In [10]:
# Load all tumor images into memory; the cells below only inspect the count and one image shape.
tumor_loaded = load_images_from_folder(tumor_images)

In [11]:
len(tumor_loaded)

1382

In [12]:
tumor_loaded[300].shape

(224, 224, 3)

In [13]:
import os
import numpy as np
import shutil
import random

# Root folder that holds the per-class image folders; the train/val/test split folders
# are created underneath it as well.
# NOTE(review): absolute, machine-specific path, while the image-loading cell uses the relative
# '../data/ML/train' — confirm both point at the same folder.
root_dir = '/home/user/data/ML/train' # data root path
# Class folder names; index 0 is used for the healthy split and index 1 for the tumor split.
classes_names = ['target_0', 'target_1']


## Split into train/val/test folders

In [14]:
def create_dirs(base_dir=None, classes=None):
    """Create the ``train``/``val``/``test`` folders for every class.

    Args:
        base_dir: Directory under which the split folders are created.
            Defaults to the notebook-level ``root_dir``.
        classes: Iterable of class folder names. Defaults to the
            notebook-level ``classes_names``.

    The call is idempotent: folders that already exist are left untouched,
    so re-running the cell no longer raises ``FileExistsError``.
    """
    if base_dir is None:
        base_dir = root_dir
    if classes is None:
        classes = classes_names

    for cls in classes:
        for split in ('train', 'val', 'test'):
            os.makedirs(os.path.join(base_dir, split, cls), exist_ok=True)

In [15]:
# Fractions of each class that go to the test and validation splits;
# the remainder (1 - val_ratio - test_ratio) is used for training.
test_ratio = 0.05
val_ratio = 0.15

# Sort the listing (os.listdir order is arbitrary) and shuffle with a fixed seed so that
# the split is identical on every run. Otherwise re-running this cell and copying again
# would put the same file into several splits.
healthy_rng = np.random.default_rng(42)
all_healthy_filenames = sorted(os.listdir(os.path.join(root_dir, classes_names[0])))
healthy_rng.shuffle(all_healthy_filenames)

# Split points: [0, train_end) -> train, [train_end, val_end) -> val, [val_end, n) -> test.
num_healthy = len(all_healthy_filenames)
healthy_train_end = int(num_healthy * (1 - (val_ratio + test_ratio)))
healthy_val_end = int(num_healthy * (1 - test_ratio))
train_healthy, val_healthy, test_healthy = np.split(np.array(all_healthy_filenames),
                                                    [healthy_train_end, healthy_val_end])

# Turn the bare file names into full source paths inside the healthy class folder.
train_healthy = [os.path.join(root_dir, classes_names[0], name) for name in train_healthy]
val_healthy = [os.path.join(root_dir, classes_names[0], name) for name in val_healthy]
test_healthy = [os.path.join(root_dir, classes_names[0], name) for name in test_healthy]

In [16]:
def copy_data_healthy():
    """Copy the healthy-class files into their train, test and val folders.

    Reads the notebook-level lists ``train_healthy``, ``test_healthy`` and
    ``val_healthy`` and copies each file into
    ``root_dir/<split>/<classes_names[0]>``.
    """
    def _copy_into(split_name, sources):
        # Copy every source file into the folder of the given split.
        destination = os.path.join(root_dir, split_name, classes_names[0])
        for source in sources:
            shutil.copy(source, destination)

    _copy_into('train', train_healthy)
    _copy_into('test', test_healthy)
    _copy_into('val', val_healthy)

In [17]:
# Report how many healthy files ended up in each split.
healthy_split_report = (
    ('Total images: ', all_healthy_filenames),
    ('Training: ', train_healthy),
    ('Validation: ', val_healthy),
    ('Testing: ', test_healthy),
)
print('\n'.join('{} {}'.format(label, len(files)) for label, files in healthy_split_report))

Total images:  3618
Training:  2894
Validation:  543
Testing:  181


In [18]:
# Sort the listing (os.listdir order is arbitrary) and shuffle with a fixed seed so that
# the split is identical on every run. Otherwise re-running this cell and copying again
# would put the same file into several splits.
tumor_rng = np.random.default_rng(42)
all_tumor_filenames = sorted(os.listdir(os.path.join(root_dir, classes_names[1])))
tumor_rng.shuffle(all_tumor_filenames)

# Split points: [0, train_end) -> train, [train_end, val_end) -> val, [val_end, n) -> test.
num_tumor = len(all_tumor_filenames)
tumor_train_end = int(num_tumor * (1 - (val_ratio + test_ratio)))
tumor_val_end = int(num_tumor * (1 - test_ratio))
train_tumor, val_tumor, test_tumor = np.split(np.array(all_tumor_filenames),
                                              [tumor_train_end, tumor_val_end])

# Turn the bare file names into full source paths inside the tumor class folder.
train_tumor = [os.path.join(root_dir, classes_names[1], name) for name in train_tumor]
val_tumor = [os.path.join(root_dir, classes_names[1], name) for name in val_tumor]
test_tumor = [os.path.join(root_dir, classes_names[1], name) for name in test_tumor]


In [19]:
# Report how many tumor files ended up in each split.
tumor_split_report = (
    ('Total images: ', all_tumor_filenames),
    ('Training: ', train_tumor),
    ('Validation: ', val_tumor),
    ('Testing: ', test_tumor),
)
print('\n'.join('{} {}'.format(label, len(files)) for label, files in tumor_split_report))

Total images:  1382
Training:  1105
Validation:  207
Testing:  70


In [20]:
def copy_data_tumor(splits=None, dest_root=None, class_name=None):
    """Copy the tumor-class files into their train, val and test folders.

    Args:
        splits: Mapping of split name (``'train'``, ``'val'``, ``'test'``) to
            the list of source file paths for that split. Defaults to the
            notebook-level ``train_tumor``, ``val_tumor`` and ``test_tumor``.
        dest_root: Directory that contains the split folders. Defaults to
            the notebook-level ``root_dir``.
        class_name: Class sub-folder to copy into. Defaults to
            ``classes_names[1]``.

    Each file is copied to ``dest_root/<split>/<class_name>``. The previous
    version copied the validation files into ``test`` and the test files
    into ``val``; every split now goes to the folder of the same name.
    """
    if splits is None:
        splits = {'train': train_tumor, 'val': val_tumor, 'test': test_tumor}
    if dest_root is None:
        dest_root = root_dir
    if class_name is None:
        class_name = classes_names[1]

    for split_name, sources in splits.items():
        destination = os.path.join(dest_root, split_name, class_name)
        # Without an existing folder shutil.copy would treat `destination` as a file name.
        os.makedirs(destination, exist_ok=True)
        for source in sources:
            shutil.copy(source, destination)

In [21]:
# Sanity check: count the files that are present in the tumor training folder.
tumor_train_dir = os.path.join(root_dir, 'train', classes_names[1])
train_tumor_check = os.listdir(tumor_train_dir)
len(train_tumor_check)

1105

## Add augmentations (so far None)

In [22]:
# TODO: add augmentations

# All three splits currently share the same pipeline: convert the image to a tensor.
# NOTE(review): no Normalize step is applied although the models below are created with
# pretrained weights — confirm whether input normalization is intentionally omitted.
data_transforms = {
    split: transforms.Compose([transforms.ToTensor()])
    for split in ('train', 'val', 'test')
}

## Create DataSet and DataLoader for Training and Val Data

In [23]:
# Build one ImageFolder dataset per split from the `train` and `val` sub-folders of data_dir,
# each with the transform registered for that split.
data_dir = '/home/user/data/ML/train'
image_datasets = {
    'train': datasets.ImageFolder(os.path.join(data_dir, 'train'), data_transforms['train']),
    'val': datasets.ImageFolder(os.path.join(data_dir, 'val'), data_transforms['val']),
}

In [24]:
image_datasets

{'train': Dataset ImageFolder
     Number of datapoints: 3999
     Root location: /home/user/data/ML/train/train
     StandardTransform
 Transform: Compose(
                ToTensor()
            ),
 'val': Dataset ImageFolder
     Number of datapoints: 613
     Root location: /home/user/data/ML/train/val
     StandardTransform
 Transform: Compose(
                ToTensor()
            )}

In [25]:
# One loader per split; both use single-image batches, shuffling and in-process loading.
loader_options = dict(batch_size=1, shuffle=True, num_workers=0)
dataloaders = {
    'train': torch.utils.data.DataLoader(image_datasets['train'], **loader_options),
    'val': torch.utils.data.DataLoader(image_datasets['val'], **loader_options),
}

In [26]:
# Class names as discovered by ImageFolder, and the number of samples in each split.
class_names = image_datasets['train'].classes
dataset_sizes = {
    'train': len(image_datasets['train']),
    'val': len(image_datasets['val']),
}
class_names

['target_0', 'target_1']

In [27]:
dataset_sizes

{'train': 3999, 'val': 613}

In [28]:
len(dataloaders['train'])

3999

## Create DataSet and DataLoader for test data

In [29]:
import pandas as pd
import numpy as np


# Folder with the separate test images, read through ImageFolder in the next cell.
# NOTE(review): absolute, machine-specific path — confirm it is valid where the notebook runs.
test_data_path = '/home/user/data/test'
# Same class folder names as defined for the training data.
classes_names = ['target_0', 'target_1']


In [30]:
# Add the test split to the existing mapping instead of rebinding `image_datasets`:
# rebinding dropped the 'train'/'val' datasets, so earlier cells that read them failed when re-run.
image_datasets['test'] = datasets.ImageFolder(os.path.join(test_data_path), data_transforms['test'])
# Spot check: label of one test sample.
image_datasets['test'][3300][1]

1

In [31]:
# The test loader must iterate in dataset order (shuffle=False): the predictions collected
# from it are later paired with file paths, which is only valid for a deterministic order.
test_loader = {'test': torch.utils.data.DataLoader(image_datasets['test'], batch_size=1,
                                             shuffle=False, num_workers=0)}

In [32]:
len(test_loader['test'])

6000

## Train

In [35]:
def train_model(model, model_save, criterion, optimizer, scheduler, num_epochs=4, loaders=None):
    """Train a single-logit binary classifier and keep the best validation weights.

    Every epoch runs a ``'train'`` phase followed by a ``'val'`` phase. The
    weights of the epoch with the highest validation accuracy are saved to
    ``model_save`` and loaded back into ``model`` before it is returned.

    Args:
        model: Network whose output has shape ``(batch, 1)`` (one logit per sample).
        model_save: Path the best ``state_dict`` is written to with ``torch.save``.
        criterion: Loss called as ``criterion(logits, float_labels)``.
        optimizer: Optimizer stepped after every training batch.
        scheduler: LR scheduler stepped once after every training phase.
        num_epochs: Number of epochs to run.
        loaders: Mapping with ``'train'`` and ``'val'`` iterables of
            ``(inputs, labels)`` batches. Defaults to the notebook-level
            ``dataloaders``.

    Returns:
        ``model`` with the best validation weights loaded.

    Raises:
        ValueError: If a phase yields no samples.
    """
    if loaders is None:
        loaders = dataloaders

    # Run on whatever device the model already lives on instead of relying on a global.
    model_device = next(model.parameters()).device
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            model.train(is_train)  # training mode for 'train', evaluation mode for 'val'

            running_loss = 0.0
            running_corrects = 0
            num_samples = 0

            for inputs, labels in loaders[phase]:
                inputs = inputs.to(model_device)
                # The loss and the comparison with thresholded predictions need float labels.
                labels = labels.to(model_device).float()

                optimizer.zero_grad()

                # Track gradients only while training.
                with torch.set_grad_enabled(is_train):
                    output = model(inputs).squeeze(1)
                    loss = criterion(output, labels)
                    # Probability above 0.5 counts as the positive class.
                    preds = (torch.sigmoid(output) > 0.5).float()

                    if is_train:
                        loss.backward()
                        optimizer.step()

                # criterion returns a batch mean, so weight it by the batch size.
                batch_size = inputs.size(0)
                running_loss += loss.item() * batch_size
                running_corrects += (preds == labels).sum().item()
                num_samples += batch_size

            if is_train:
                scheduler.step()

            if num_samples == 0:
                raise ValueError("no samples in '{}' loader".format(phase))

            epoch_loss = running_loss / num_samples
            epoch_acc = running_corrects / num_samples

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # Remember the weights of the best validation epoch so far.
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # Load the best weights into the returned model (previously it kept the last epoch's
    # weights) and persist the same weights to disk.
    model.load_state_dict(best_model_wts)
    torch.save(best_model_wts, model_save)

    return model

In [36]:
def test(model, test_loader, saved_weights, criterion, optimizer, scheduler):
    
    model.load_state_dict(torch.load(saved_weights))
    
    correct = 0
    total = 0
    
    predictions = []
            
    # since we're not training, we don't need to calculate the gradients for our outputs
    with torch.no_grad():
        for data in test_loader['test']:
            images, labels = data[0].to(device), data[1].to(device)
            labels = labels.float()
            # calculate outputs by running images through the network
            output = model(images).squeeze(1)
            # the class with the highest energy is what we choose as prediction
            loss = criterion(output, labels)
            preds = torch.sigmoid(output)
            predictions.append(preds)
            thre = 0.5
            threshold = torch.tensor([0.5]).to(device)
            pred_binary = (preds>threshold).float()
            total += labels.size(0)
            correct += (pred_binary == labels).sum().item()

    print(total)
    print('Accuracy of the network on the 6000 test images: %d %%' % (
        100 * correct / total))
    
    return predictions


## ResNet

In [83]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Start from a pretrained ResNet-18 and read the input width of its final layer.
model_ft = models.resnet18(pretrained=True)
num_ftrs = model_ft.fc.in_features

# Replace the final layer with a single-output head: one logit per image,
# matching the BCEWithLogitsLoss criterion below.
model_ft.fc = nn.Linear(num_ftrs, 1)

model_ft = model_ft.to(device)

# Binary cross-entropy on raw logits (the sigmoid is applied inside the loss).
criterion = nn.BCEWithLogitsLoss()

# Observe that all parameters are being optimized
optimizer_ft = optim.Adam(model_ft.parameters(), lr=0.001)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

In [84]:
# Train the ResNet for 30 epochs; the best validation weights are written to
# "resnet_best_model_weights.pth" in the working directory.
model_ft = train_model(model_ft, "resnet_best_model_weights.pth", criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=30)


Epoch 0/29
----------
train Loss: 0.6207 Acc: 0.7059
val Loss: 9.0782 Acc: 0.4976

Epoch 1/29
----------
train Loss: 0.6067 Acc: 0.7182
val Loss: 0.3427 Acc: 0.8858

Epoch 2/29
----------
train Loss: 0.5978 Acc: 0.7229
val Loss: 0.4550 Acc: 0.8858

Epoch 3/29
----------
train Loss: 0.5969 Acc: 0.7237
val Loss: 0.4630 Acc: 0.8858

Epoch 4/29
----------
train Loss: 0.5939 Acc: 0.7237
val Loss: 0.4704 Acc: 0.8858

Epoch 5/29
----------
train Loss: 0.5914 Acc: 0.7232
val Loss: 0.4032 Acc: 0.8858

Epoch 6/29
----------
train Loss: 0.5928 Acc: 0.7237
val Loss: 0.6857 Acc: 0.8858

Epoch 7/29
----------
train Loss: 0.5888 Acc: 0.7237
val Loss: 0.5096 Acc: 0.8858

Epoch 8/29
----------
train Loss: 0.5865 Acc: 0.7237
val Loss: 0.5328 Acc: 0.8858

Epoch 9/29
----------
train Loss: 0.5842 Acc: 0.7237
val Loss: 0.6153 Acc: 0.8858

Epoch 10/29
----------
train Loss: 0.5806 Acc: 0.7237
val Loss: 0.6899 Acc: 0.8858

Epoch 11/29
----------
train Loss: 0.5761 Acc: 0.7237
val Loss: 0.8890 Acc: 0.8858

Ep

In [104]:
# Evaluate on the test loader using the weights file written by the ResNet training run.
PATH = './resnet_best_model_weights.pth'

# `predictions` holds one tensor of sigmoid probabilities per test batch.
predictions = test(model_ft, test_loader, PATH, criterion, optimizer_ft, exp_lr_scheduler)

6000
Accuracy of the network on the 6000 test images: 54 %


In [None]:
# Pairing predictions with file paths is only valid when the loader walks the dataset in order.
if not isinstance(test_loader['test'].sampler, torch.utils.data.SequentialSampler):
    raise RuntimeError("test_loader['test'] must be created with shuffle=False "
                       "so predictions line up with the dataset's file paths")

# `file_paths` was never defined in this notebook; take the paths from the test dataset,
# whose `samples` are (path, class_index) pairs in iteration order.
file_paths = [path for path, _ in image_datasets['test'].samples]
# `predictions` is a list of per-batch tensors; flatten it into plain Python floats.
prediction_scores = torch.cat(predictions).cpu().tolist()

df_submission = pd.DataFrame(data={'file_paths': file_paths, 'predictions': prediction_scores})
# Rewrite the local path prefix to the one expected in the submission file.
df_submission["file_paths"] = df_submission["file_paths"].apply(lambda x: x.replace("/home/user/data/test","/data/challenges_data/test"))

df_submission.to_csv('df_submission.csv', index=False)
df_submission.head()

## DenseNet

In [39]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Start from a pretrained DenseNet-161 and read the input width of its classifier layer.
model_ft = models.densenet161(pretrained=True)
num_ftrs = model_ft.classifier.in_features
# Replace the classifier with a single-output head: one logit per image,
# matching the BCEWithLogitsLoss criterion below.
model_ft.classifier = nn.Linear(num_ftrs, 1)

model_ft = model_ft.to(device)

# Binary cross-entropy on raw logits (the sigmoid is applied inside the loss).
criterion = nn.BCEWithLogitsLoss()

# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

In [42]:
# Train the DenseNet for 30 epochs; the best validation weights are written to
# "densenet_best_model_weights.pth" in the working directory.
model_ft = train_model(model_ft, "densenet_best_model_weights.pth", criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=30)

Epoch 0/29
----------
train Loss: 0.2673 Acc: 0.9057
val Loss: 0.7758 Acc: 0.7031

Epoch 1/29
----------
train Loss: 0.2388 Acc: 0.9172
val Loss: 0.3115 Acc: 0.9413

Epoch 2/29
----------
train Loss: 0.2085 Acc: 0.9237
val Loss: 0.5749 Acc: 0.9005

Epoch 3/29
----------
train Loss: 0.1228 Acc: 0.9595
val Loss: 0.4590 Acc: 0.8989

Epoch 4/29
----------
train Loss: 0.0719 Acc: 0.9780
val Loss: 0.7910 Acc: 0.9070

Epoch 5/29
----------
train Loss: 0.0461 Acc: 0.9885
val Loss: 0.2410 Acc: 0.9347

Epoch 6/29
----------
train Loss: 0.0297 Acc: 0.9942
val Loss: 0.6810 Acc: 0.7292

Epoch 7/29
----------
train Loss: 0.0208 Acc: 0.9957
val Loss: 0.3362 Acc: 0.9103

Epoch 8/29
----------
train Loss: 0.0143 Acc: 0.9970
val Loss: 0.5129 Acc: 0.9038

Epoch 9/29
----------
train Loss: 0.0111 Acc: 0.9977
val Loss: 1.0931 Acc: 0.8940

Epoch 10/29
----------
train Loss: 0.0087 Acc: 0.9980
val Loss: 0.5821 Acc: 0.8630

Epoch 11/29
----------
train Loss: 0.0084 Acc: 0.9980
val Loss: 0.3389 Acc: 0.9021

Ep

In [43]:
# Evaluate on the test loader using the weights file written by the DenseNet training run.
PATH = './densenet_best_model_weights.pth'

# `predictions` holds one tensor of sigmoid probabilities per test batch.
predictions = test(model_ft, test_loader, PATH, criterion, optimizer_ft, exp_lr_scheduler)

6000
Accuracy of the network on the 6000 test images: 68 %


In [None]:
# Pairing predictions with file paths is only valid when the loader walks the dataset in order.
if not isinstance(test_loader['test'].sampler, torch.utils.data.SequentialSampler):
    raise RuntimeError("test_loader['test'] must be created with shuffle=False "
                       "so predictions line up with the dataset's file paths")

# `file_paths` was never defined in this notebook; take the paths from the test dataset,
# whose `samples` are (path, class_index) pairs in iteration order.
file_paths = [path for path, _ in image_datasets['test'].samples]
# `predictions` is a list of per-batch tensors; flatten it into plain Python floats.
prediction_scores = torch.cat(predictions).cpu().tolist()

df_submission = pd.DataFrame(data={'file_paths': file_paths, 'predictions': prediction_scores})
# Rewrite the local path prefix to the one expected in the submission file.
df_submission["file_paths"] = df_submission["file_paths"].apply(lambda x: x.replace("/home/user/data/test","/data/challenges_data/test"))

In [None]:
# Write the submission without the index column and preview the first rows.
# Note: this is the same file name the ResNet submission cell writes to, so it is overwritten.
df_submission.to_csv('df_submission.csv', index=False)
df_submission.head()