In [2]:
# Matplotlib
import matplotlib.pyplot as plt
# Numpy
import numpy as np
# Pillow
from PIL import Image
# Torch
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import transforms
#OS
import os

In [3]:
class Lung_Dataset(Dataset):
    """
    Lung image dataset consisting of normal and infected (covid / non-covid) images.

    Images are read from the folders listed in ``self.dataset_paths`` and are looked up
    by file name ``<index>.jpg`` inside each folder (see ``open_img``).
    """

    def __init__(self, purpose, verbose=0):
        """
        Assemble the dataset parameters and count the images of each class.

        Parameters:
        - purpose: string, one of 'train', 'test' or 'val'.
        - verbose: int selecting the classification task:
            0 -> 'normal' vs 'infected' (covid and non-covid folders merged),
            1 -> 'normal' vs 'covid' vs 'non_covid',
            2 -> 'covid' vs 'non_covid'.

        Raises:
        - TypeError if verbose is not 0, 1 or 2.
        - KeyError if purpose is not one of the known groups.
        - OSError (e.g. FileNotFoundError) if an image folder cannot be listed.
        """
        self.purpose = purpose
        self.verbose = verbose

        # All images are of size 150 x 150
        self.img_size = (150, 150)

        # The dataset has been split in training, testing and validation datasets
        self.groups = ['train', 'test', 'val']

        # Path to images for different parts of the dataset
        self.dataset_paths = {'train_normal': './dataset/train/normal/',
                              'train_infected': './dataset/train/infected/',
                              'train_infected_covid': './dataset/train/infected/covid',
                              'train_infected_non_covid': './dataset/train/infected/non-covid',
                              'test_normal': './dataset/test/normal/',
                              'test_infected': './dataset/test/infected/',
                              'test_infected_covid': './dataset/test/infected/covid',
                              'test_infected_non_covid': './dataset/test/infected/non-covid',
                              'val_normal': './dataset/val/normal/',
                              'val_infected': './dataset/val/infected/',
                              'val_infected_covid': './dataset/val/infected/covid',
                              'val_infected_non_covid': './dataset/val/infected/non-covid'}

        # Number of images per class, keyed like dataset_paths and inserted in class order
        self.dataset_numbers = {}

        if verbose == 0:
            # Consider normal and infected only
            self.classes = {0: 'normal', 1: 'infected'}
        elif verbose == 1:
            # Consider normal, covid and non-covid
            self.classes = {0: 'normal', 1: 'covid', 2: 'non_covid'}
        elif verbose == 2:
            # Consider covid and non-covid
            self.classes = {0: 'covid', 1: 'non_covid'}
        else:
            err_msg = "Verbose argument only takes in an int of either 0,1 or 2"
            raise TypeError(err_msg)

        if purpose not in self.groups:
            # Same exception type as the failed path lookup would raise, with a clearer message
            raise KeyError("purpose should be one of {}, got {!r}".format(self.groups, purpose))

        # Populate self.dataset_numbers, listing each folder exactly once
        for class_val in self.classes.values():
            key = self._numbers_key(class_val)
            if class_val == 'infected':
                # 'infected' is the union of the covid and non-covid folders.
                covid_count = self._count_files(key + '_covid')
                non_covid_count = self._count_files(key + '_non_covid')
                # Remembered so open_img can split a merged index without re-listing the folder
                self._infected_covid_count = covid_count
                self.dataset_numbers[key] = covid_count + non_covid_count
            else:
                self.dataset_numbers[key] = self._count_files(key)

    def _numbers_key(self, class_val):
        """Return the dataset_numbers / dataset_paths key for a class name of this dataset."""
        if class_val in ('covid', 'non_covid'):
            class_val = 'infected_' + class_val
        return '{}_{}'.format(self.purpose, class_val)

    def _count_files(self, key):
        """Return the number of directory entries in the folder registered under key."""
        # NOTE(review): every entry is counted, so stray non-image files would inflate the count.
        return len(os.listdir(self.dataset_paths[key]))

    def describe(self):
        """
        Descriptor function.
        Will print details about the dataset when called.
        """
        # Generate description
        msg = "This is the Lung {} Dataset in the 50.039 Deep Learning class project".format(self.purpose)
        msg += " in Feb-March 2021. \n"
        msg += "It contains a total of {} images, ".format(sum(self.dataset_numbers.values()))
        msg += "of size {} by {}.\n".format(self.img_size[0], self.img_size[1])
        msg += "The images are stored in the following locations "
        msg += "and each one contains the following number of images:\n"
        # Every key of dataset_numbers is also a key of dataset_paths
        for key, val in self.dataset_numbers.items():
            msg += " - {}, in folder {}: {} images.\n".format(key, self.dataset_paths[key], val)
        print(msg)

    def open_img(self, class_val, index_val):
        """
        Opens image with specified parameters.

        Parameters:
        - class_val should be one of the class names of this dataset (the values of
          self.classes): 'normal', 'infected', 'covid' or 'non_covid' depending on verbose.
        - index_val should be an integer with 0 <= index_val < number of images of that class.
          For 'infected', indices first run over the covid folder, then the non-covid folder.

        Returns the loaded image as a Numpy array with values divided by 255.

        Raises AssertionError if class_val or index_val is invalid.
        """
        group_val = self.purpose
        # Explicit raises (instead of assert statements) so the checks survive `python -O`.
        if class_val not in self.classes.values():
            raise AssertionError(
                "Error - class_val variable should be set to 'normal', 'infected', 'covid' or 'non_covid'.")

        if class_val == 'covid' or class_val == 'non_covid':
            class_val = 'infected_' + class_val

        key = '{}_{}'.format(group_val, class_val)
        max_val = self.dataset_numbers[key]
        err_msg = "Error - index_val variable should be an integer between 0 and the maximal number of images."
        err_msg += "\n(In {}/{}, you have {} images.)".format(group_val, class_val, max_val)
        if not isinstance(index_val, (int, np.integer)):
            raise AssertionError(err_msg)
        # Valid indices are 0 .. max_val - 1; index == max_val has no file behind it.
        if not 0 <= index_val < max_val:
            raise AssertionError(err_msg)
        index_val = int(index_val)

        if class_val == 'infected':
            # Merged class: the first covid_count indices map to the covid folder,
            # the remaining ones to the non-covid folder.
            covid_count = self._infected_covid_count
            if index_val < covid_count:
                key += '_covid'
            else:
                key += '_non_covid'
                index_val -= covid_count

        path_to_file = os.path.join(self.dataset_paths[key], '{}.jpg'.format(index_val))
        # The with-block closes the file; no explicit close() is needed.
        with open(path_to_file, 'rb') as f:
            im = np.asarray(Image.open(f)) / 255
        return im

    def show_img(self, class_val, index_val):
        """
        Opens, then displays image with specified parameters.

        Parameters:
        - class_val should be one of the class names of this dataset (see open_img).
        - index_val should be an integer with 0 <= index_val < number of images of that class.
        """
        # Open image
        im = self.open_img(class_val, index_val)

        # Display
        plt.imshow(im)

    def __len__(self):
        """
        Length special method, returns the number of images in dataset.
        """
        return sum(self.dataset_numbers.values())

    def __getitem__(self, index):
        """
        Getitem special method.

        Expects an integer value index, between 0 and len(self) - 1. Indices run over the
        classes in the order of self.classes (all images of class 0 first, then class 1, ...).

        Returns the image and its label as a one hot vector, both
        in torch tensor format in dataset.

        Raises IndexError if index is out of range, as the sequence protocol expects.
        """
        if not 0 <= index < len(self):
            raise IndexError("index {} is out of range for a dataset of {} images".format(index, len(self)))

        # Walk the classes in order, subtracting each class size until the index fits.
        local_index = index
        for class_idx, class_val in self.classes.items():
            class_count = self.dataset_numbers[self._numbers_key(class_val)]
            if local_index < class_count:
                break
            local_index -= class_count

        # One hot label with one entry per class
        label = torch.zeros(len(self.classes))
        label[class_idx] = 1

        im = self.open_img(class_val, local_index)
        im = transforms.functional.to_tensor(np.array(im)).float()
        return im, label

In [9]:
# Training split with three classes (verbose = 1: normal / covid / non_covid),
# followed by a printed summary of the image folders and their image counts.
ld_train = Lung_Dataset('train', verbose = 1)
ld_train.describe()

This is the Lung train Dataset in the 50.039 Deep Learning class project in Feb-March 2021. 
It contains a total of 5216 images, of size 150 by 150.
The images are stored in the following locations and each one contains the following number of images:
 - train_normal, in folder ./dataset/train/normal/: 1341 images.
 - train_infected_covid, in folder ./dataset/train/infected/covid: 1345 images.
 - train_infected_non_covid, in folder ./dataset/train/infected/non-covid: 2530 images.



In [10]:
# Total number of images in the training split, summed over all classes
print(len(ld_train))

5216


In [11]:
# Image count per class folder; keys match the keys of ld_train.dataset_paths
print(ld_train.dataset_numbers)

{'train_normal': 1341, 'train_infected_covid': 1345, 'train_infected_non_covid': 2530}


In [12]:
# Fetch a single sample through __getitem__: the image tensor and its one hot label.
# Index 8 is below the number of 'normal' images, so it belongs to the first class.
im, class_oh = ld_train[8]
print(im.shape)
print(im)
print(class_oh)

torch.Size([1, 150, 150])
tensor([[[0.0431, 0.0275, 0.0000,  ..., 0.2863, 0.3373, 0.5098],
         [0.0706, 0.0471, 0.0000,  ..., 0.3020, 0.3137, 0.4902],
         [0.1137, 0.0824, 0.0157,  ..., 0.3255, 0.3098, 0.4784],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0039, 0.0078],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0039, 0.0039],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0039, 0.0039]]])
tensor([1., 0., 0.])


In [13]:
# Test and validation splits, using the same three-class setup as ld_train
ld_test = Lung_Dataset('test', verbose = 1)
ld_val = Lung_Dataset('val', verbose = 1)

In [14]:
# Batch size value to be used by all dataloaders (to be decided freely, set to 32 here)
bs_val = 32

In [15]:
# Shuffle only the training data. Test and validation batches keep a fixed order so that
# evaluation runs are reproducible and predictions can be traced back to dataset indices.
train_loader = DataLoader(ld_train, batch_size=bs_val, shuffle=True)
test_loader = DataLoader(ld_test, batch_size=bs_val, shuffle=False)
val_loader = DataLoader(ld_val, batch_size=bs_val, shuffle=False)

In [107]:
# Peek at the first mini-batch of the training dataloader.
# k is the batch number; v[0] holds the batch of images and v[1] the batch of one hot labels.
for k, v in enumerate(train_loader):
    print("-----", k, v[0], v[1], sep="\n")
    # One batch is enough for the demo
    break

-----
0
tensor([[[[0.0549, 0.1098, 0.1686,  ..., 0.0118, 0.0000, 0.0039],
          [0.1176, 0.1686, 0.2196,  ..., 0.0627, 0.0196, 0.0118],
          [0.1882, 0.2314, 0.2706,  ..., 0.1373, 0.0706, 0.0353],
          ...,
          [0.0353, 0.0392, 0.0431,  ..., 0.0118, 0.0196, 0.0196],
          [0.0392, 0.0431, 0.0431,  ..., 0.0118, 0.0157, 0.0196],
          [0.0431, 0.0431, 0.0431,  ..., 0.0118, 0.0157, 0.0157]]],


        [[[0.4667, 0.2118, 0.1255,  ..., 0.0824, 0.0784, 0.0784],
          [0.4824, 0.1725, 0.1961,  ..., 0.0824, 0.0745, 0.0667],
          [0.4784, 0.1451, 0.1333,  ..., 0.0824, 0.0902, 0.0980],
          ...,
          [0.1294, 0.1569, 0.1961,  ..., 0.3020, 0.2627, 0.0941],
          [0.1294, 0.1569, 0.1961,  ..., 0.2902, 0.1765, 0.0706],
          [0.1294, 0.1608, 0.2000,  ..., 0.2627, 0.1020, 0.0706]]],


        [[[0.0000, 0.6588, 0.4588,  ..., 0.0863, 0.0706, 0.0392],
          [0.0000, 0.6510, 0.4353,  ..., 0.1098, 0.0824, 0.0392],
          [0.0196, 0.6314, 0.4

In [108]:
class Net(nn.Module):
    """
    Minimal convolutional classifier: one 3x3 convolution followed by one linear layer.

    Parameters:
    - num_classes: number of output classes (defaults to 2, the original setting).
    - img_size: (height, width) of the single-channel input images (defaults to (150, 150)).
    """

    def __init__(self, num_classes=2, img_size=(150, 150)):
        super(Net, self).__init__()
        out_channels, kernel_size, stride = 4, 3, 1
        # Conv2D: 1 input channel, 4 output channels, 3 by 3 kernel, stride of 1.
        self.conv1 = nn.Conv2d(1, out_channels, kernel_size, stride)
        # Without padding, each spatial dimension shrinks to (size - kernel) // stride + 1,
        # i.e. 148 x 148 for 150 x 150 inputs, giving 4 * 148 * 148 = 87616 features.
        conv_height = (img_size[0] - kernel_size) // stride + 1
        conv_width = (img_size[1] - kernel_size) // stride + 1
        self.fc1 = nn.Linear(out_channels * conv_height * conv_width, num_classes)

    def forward(self, x):
        """
        Run a batch of images of shape (batch, 1, height, width) through the network.

        Returns the log-probabilities of each class, of shape (batch, num_classes).
        """
        x = self.conv1(x)
        # Keep the batch dimension, flatten channels and spatial dimensions
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        output = F.log_softmax(x, dim = 1)
        return output

In [109]:
# Instantiate the network with its default arguments
model = Net()

In [110]:
# Sanity check: forward a single training mini-batch through the model and
# print the predicted log-probabilities followed by the target one hot labels.
for batch_idx, (images_data, target_labels) in enumerate(train_loader):
    predicted_labels = model(images_data)
    print(predicted_labels, target_labels, sep="\n")
    # One batch is enough for the check
    break

tensor([[-0.7402, -0.6482],
        [-0.7618, -0.6289],
        [-0.7417, -0.6468],
        [-0.7235, -0.6637],
        [-0.7988, -0.5976],
        [-0.7516, -0.6379],
        [-0.7613, -0.6294],
        [-0.7057, -0.6807],
        [-0.7129, -0.6738],
        [-0.6982, -0.6882],
        [-0.7671, -0.6243],
        [-0.7569, -0.6332],
        [-0.7477, -0.6414],
        [-0.7313, -0.6564],
        [-0.7472, -0.6419],
        [-0.7414, -0.6471],
        [-0.7165, -0.6703],
        [-0.7204, -0.6666],
        [-0.7218, -0.6652],
        [-0.8364, -0.5678],
        [-0.7819, -0.6116],
        [-0.7147, -0.6721],
        [-0.7805, -0.6128],
        [-0.7543, -0.6355],
        [-0.7485, -0.6407],
        [-0.7606, -0.6300],
        [-0.7434, -0.6453],
        [-0.7564, -0.6337],
        [-0.7337, -0.6542],
        [-0.7400, -0.6483],
        [-0.7241, -0.6631],
        [-0.7676, -0.6239]], grad_fn=<LogSoftmaxBackward>)
tensor([[1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
 