In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
#!pip install opencv-python 
import cv2


## Data Processing

In [7]:
healthy_images = '../data/ML/train/target_0'
tumor_images = '../data/ML/train/target_1'


def load_images_from_folder(folder):
    images = []
    for filename in os.listdir(folder):
        img = cv2.imread(os.path.join(folder,filename))
        if img is not None:
            images.append(img)
    return images

In [8]:
healthy_loaded = load_images_from_folder(healthy_images)

In [9]:
len(healthy_loaded)

3618

In [10]:
healthy_loaded[3000].shape

(224, 224, 3)

In [11]:
tumor_loaded = load_images_from_folder(tumor_images)

In [12]:
len(tumor_loaded)

1382

In [13]:
tumor_loaded[300].shape

(224, 224, 3)

In [2]:
import os
import numpy as np
import shutil
import random

root_dir = '/home/user/data/ML/train' # data root path
classes_names = ['target_0', 'target_1']


## Split in train/test/val folders

In [None]:
def create_dirs():
    
    for cls in classes_names:
        os.makedirs(os.path.join(root_dir, 'train', cls))
        os.makedirs(os.path.join(root_dir, 'val', cls))
        os.makedirs(os.path.join(root_dir, 'test', cls))

In [3]:
test_ratio = 0.05
val_ratio = 0.15


all_healthy_filenames = os.listdir(os.path.join(root_dir,classes_names[0]))

np.random.shuffle(all_healthy_filenames)
train_healthy, val_healthy, test_healthy = np.split(np.array(all_healthy_filenames),
                                                          [int(len(all_healthy_filenames)* (1 - (val_ratio + test_ratio))), 
                                                           int(len(all_healthy_filenames)* (1 - test_ratio))])


train_healthy = [os.path.join(root_dir, classes_names[0], name) for name in train_healthy]
val_healthy = [os.path.join(root_dir, classes_names[0],  name) for name in val_healthy]
test_healthy = [os.path.join(root_dir, classes_names[0], name) for name in test_healthy]

In [4]:
def copy_data_healthy():
    for name in train_healthy:
        shutil.copy(name, os.path.join(root_dir, 'train', classes_names[0]))
    for name in test_healthy:
        shutil.copy(name, os.path.join(root_dir, 'test', classes_names[0]))
    for name in val_healthy:
        shutil.copy(name, os.path.join(root_dir, 'val', classes_names[0]))

In [5]:
print('Total images: ', len(all_healthy_filenames))
print('Training: ', len(train_healthy))
print('Validation: ', len(val_healthy))
print('Testing: ', len(test_healthy))

Total images:  3618
Training:  2894
Validation:  543
Testing:  181


In [6]:
all_tumor_filenames = os.listdir(os.path.join(root_dir,classes_names[1]))

np.random.shuffle(all_tumor_filenames)
train_tumor, val_tumor, test_tumor = np.split(np.array(all_tumor_filenames),
                                                          [int(len(all_tumor_filenames)* (1 - (val_ratio + test_ratio))), 
                                                           int(len(all_tumor_filenames)* (1 - test_ratio))])


train_tumor = [os.path.join(root_dir, classes_names[1], name) for name in train_tumor]
val_tumor = [os.path.join(root_dir, classes_names[1],  name) for name in val_tumor]
test_tumor = [os.path.join(root_dir, classes_names[1], name) for name in test_tumor]


In [7]:
print('Total images: ', len(all_tumor_filenames))
print('Training: ', len(train_tumor))
print('Validation: ', len(val_tumor))
print('Testing: ', len(test_tumor))

Total images:  1382
Training:  1105
Validation:  207
Testing:  70


In [8]:
def copy_data_tumor():
    for name in train_tumor:
        shutil.copy(name, os.path.join(root_dir, 'train', classes_names[1]))
    for name in val_tumor:
        shutil.copy(name, os.path.join(root_dir, 'test', classes_names[1]))
    for name in test_tumor:
        shutil.copy(name, os.path.join(root_dir, 'val', classes_names[1]))

In [9]:
train_tumor_check = os.listdir(os.path.join(root_dir,'train', classes_names[1]))
len(train_tumor_check)

1105

## Add augmentations (so far None)

In [10]:
# TODO: add augmentations

data_transforms = {
    'train': transforms.Compose([
        transforms.ToTensor(),
    ]),
    
    'val': transforms.Compose([
        transforms.ToTensor(),
        
    ]),    
    'test': transforms.Compose([
        transforms.ToTensor(),
    ]),
}

## Create DataSet and DataLoader for Training and Val Data

In [11]:
data_dir = '/home/user/data/ML/train'
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x), data_transforms[x])
                  for x in ['train', 'val']}

In [12]:
image_datasets

{'train': Dataset ImageFolder
     Number of datapoints: 3999
     Root location: /home/user/data/ML/train/train
     StandardTransform
 Transform: Compose(
                ToTensor()
            ),
 'val': Dataset ImageFolder
     Number of datapoints: 613
     Root location: /home/user/data/ML/train/val
     StandardTransform
 Transform: Compose(
                ToTensor()
            )}

In [13]:
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=1,
                                             shuffle=True, num_workers=0)
              for x in ['train', 'val']}

In [14]:
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
class_names = image_datasets['train'].classes
class_names

['target_0', 'target_1']

In [15]:
dataset_sizes

{'train': 3999, 'val': 613}

In [16]:
len(dataloaders['train'])

3999

## Create DataSet and DataLoader for test data

In [32]:
import pandas as pd
import numpy as np


test_data_path = '/home/user/data/test'
classes_names = ['target_0', 'target_1']


In [33]:
image_datasets = {'test': datasets.ImageFolder(os.path.join(test_data_path), data_transforms['test'])}

In [34]:
test_loader = {'test': torch.utils.data.DataLoader(image_datasets['test'], batch_size=1,
                                             shuffle=True, num_workers=0)}

In [35]:
len(test_loader['test'])

6000

## Train

In [63]:
def train_model(model, model_save, criterion, optimizer, scheduler, num_epochs=4):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    
    PATH = model_save
    torch.save(best_model_wts, PATH)
        
    
    return model

In [88]:
def test(model, test_loader, saved_weights, criterion, optimizer, scheduler):
    
    model.load_state_dict(torch.load(saved_weights))
    
    correct = 0
    total = 0
    
    predictions = np.array([])
    labels =  np.array([])
            
    # since we're not training, we don't need to calculate the gradients for our outputs
    with torch.no_grad():
        for data in test_loader['test']:
            images, labels = data[0].to(device), data[1].to(device)

            # calculate outputs by running images through the network
            outputs = model(images)
            # the class with the highest energy is what we choose as prediction
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            sigmoid = torch.sigmoid(outputs)    
            print(sigmoid)


    print('Accuracy of the network on the 10000 test images: %d %%' % (
        100 * correct / total))


# ResNet

In [60]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model_ft = models.resnet18(pretrained=True)
num_ftrs = model_ft.fc.in_features

# Here the size of each output sample is set to 2.
model_ft.fc = nn.Linear(num_ftrs, 2)

model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_ft = optim.Adam(model_ft.parameters(), lr=0.001)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

In [61]:
# train 
model_ft = train_model(model_ft, "resnet_best_model_weights.pth", criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=4)


Epoch 0/3
----------
train Loss: 0.6338 Acc: 0.7054
val Loss: 0.4059 Acc: 0.8858

Epoch 1/3
----------
train Loss: 0.6141 Acc: 0.7157
val Loss: 0.4370 Acc: 0.8858

Epoch 2/3
----------
train Loss: 0.5931 Acc: 0.7247
val Loss: 0.3559 Acc: 0.8858

Epoch 3/3
----------
train Loss: 0.5316 Acc: 0.7567
val Loss: 0.4506 Acc: 0.8858

Training complete in 4m 55s
Best val Acc: 0.885808


In [89]:
#test 
PATH = './_best_val_model_weights.pth'

predictions = test(model_ft, test_loader, PATH, criterion, optimizer_ft, exp_lr_scheduler)

tensor([[0.6545, 0.3498]], device='cuda:0')
tensor([[0.6396, 0.3620]], device='cuda:0')
tensor([[0.6349, 0.3545]], device='cuda:0')
tensor([[0.6545, 0.3503]], device='cuda:0')
tensor([[0.6523, 0.3516]], device='cuda:0')
tensor([[0.6253, 0.3470]], device='cuda:0')
tensor([[0.6283, 0.3470]], device='cuda:0')
tensor([[0.6330, 0.3544]], device='cuda:0')
tensor([[0.6552, 0.3486]], device='cuda:0')
tensor([[0.6259, 0.3457]], device='cuda:0')
tensor([[0.6327, 0.3573]], device='cuda:0')
tensor([[0.6373, 0.3622]], device='cuda:0')
tensor([[0.6242, 0.3447]], device='cuda:0')
tensor([[0.6419, 0.3591]], device='cuda:0')
tensor([[0.6518, 0.3560]], device='cuda:0')
tensor([[0.6549, 0.3492]], device='cuda:0')
tensor([[0.6536, 0.3514]], device='cuda:0')
tensor([[0.6450, 0.3569]], device='cuda:0')
tensor([[0.6448, 0.3573]], device='cuda:0')
tensor([[0.6525, 0.3552]], device='cuda:0')
tensor([[0.6418, 0.3585]], device='cuda:0')
tensor([[0.6526, 0.3531]], device='cuda:0')
tensor([[0.6256, 0.3463]], devic

KeyboardInterrupt: 

In [None]:
df_submission = pd.DataFrame(data={'file_paths': file_paths, 'predictions': predictions})
df_submission["file_paths"] = df_submission["file_paths"].apply(lambda x: x.replace("/home/user/data/test","/data/challenges_data/test"))

df_submission.to_csv('df_submission.csv', index=False)
df_submission.head()

## DenseNet

In [20]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model_ft = models.densenet161(pretrained=True)
num_ftrs = model_ft.classifier.in_features
# Here the size of each output sample is set to 2.
# Alternatively, it can be generalized to nn.Linear(num_ftrs, len(class_names)).
model_ft.classifier = nn.Linear(num_ftrs, 2)

model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

In [None]:
# train 
model_ft = train_model(model_ft, "densenet_best_model_weights.pth", criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=4)

In [None]:
# test 
PATH = './densenet_best_model_weights.pth'

predictions = test(model_ft, test_loader, PATH, criterion, optimizer_ft, exp_lr_scheduler)

2021-11-10 08:35:45.247863: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


NameError: name 'model' is not defined

In [44]:
df_submission = pd.DataFrame(data={'file_paths': file_paths, 'predictions': predictions})
df_submission["file_paths"] = df_submission["file_paths"].apply(lambda x: x.replace("/home/user/data/test","/data/challenges_data/test"))

NameError: name 'file_paths' is not defined

In [None]:
df_submission.to_csv('df_submission.csv', index=False)
df_submission.head()