In [1]:
# Matplotlib
import matplotlib.pyplot as plt
# Numpy
import numpy as np
# Pillow
from PIL import Image
# Torch
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import transforms
#OS
import os

In [2]:
class Lung_Dataset(Dataset):
    """
    Lung Dataset Consisting of Infected and Non-Infected.
    """

    def __init__(self, purpose, verbose=0):
        """
        Constructor for generic Dataset class - simply assembles
        the important parameters in attributes.
        
        Parameter:
        -purpose variable should be set to a string of either 'train', 'test' or 'val'
        -verbose takes an int of either 0,1 or 2. 0 will only differentiate between normal and infected, 1 will differentiate
            between normal, covid and non-covid while 2 will only differentiate between covid and non-covid
        """
        self.purpose = purpose
        self.verbose = verbose

        # All images are of size 150 x 150
        self.img_size = (150, 150)
            
        # The dataset has been split in training, testing and validation datasets
        self.groups = ['train', 'test', 'val']
        
        # Path to images for different parts of the dataset
        self.dataset_paths = {'train_normal': './dataset/train/normal/',
                              'train_infected': './dataset/train/infected/',
                              'train_infected_covid': './dataset/train/infected/covid',
                              'train_infected_non_covid': './dataset/train/infected/non-covid',
                              'test_normal': './dataset/test/normal/',
                              'test_infected': './dataset/test/infected/',
                              'test_infected_covid': './dataset/test/infected/covid',
                              'test_infected_non_covid': './dataset/test/infected/non-covid',
                              'val_normal': './dataset/val/normal/',
                              'val_infected': './dataset/val/infected/',
                              'val_infected_covid': './dataset/val/infected/covid',
                              'val_infected_non_covid': './dataset/val/infected/non-covid'}
        
        #Contains the number of images that will be used in our dataset. To be populated below
        self.dataset_numbers = {}
        
        # Consider normal and infected only
        if verbose == 0:
            self.classes = {0: 'normal', 1: 'infected'}
            
            #Populate self.dataset_numbers
            for condition in self.classes.values():
                #Key that will be used to access the dictionary
                key = "{}_{}".format(self.purpose, condition)
                if condition == "normal":
                    #Retrieve the filepath and populate dataset_numbers with the number of images in that folder
                    file_path = self.dataset_paths[key]
                    count = len(os.listdir(file_path))
                    self.dataset_numbers[key] = count
                    
                #For the infected case, we will be combining the covid and non-covid images into 1 category
                else:
                    #Need the keys for both covid and non-covid
                    key1 = key + "_covid"
                    key2 = key + "_non_covid"
                    file_path1 = self.dataset_paths[key1]
                    file_path2 = self.dataset_paths[key2]
                    count1 = len(os.listdir(file_path1))
                    count2 = len(os.listdir(file_path2))
                    #Number of infected images will be number of covid + number of non-covid
                    count = count1 + count2
                    self.dataset_numbers[key] = count
                       
        #Consider normal, covid and non-covid
        elif verbose == 1:
            self.classes = {0: 'normal', 1: 'covid', 2: 'non_covid'}
        
            #Populate self.dataset_numbers
            for condition in self.classes.values():
                #Similar to verbose == 0 above
                if condition == "normal":
                    key = "{}_{}".format(self.purpose, condition)
                    file_path = self.dataset_paths[key]
                    count = len(os.listdir(file_path))
                    self.dataset_numbers[key] = count
                    
                #For the infected case, we will be considering the covid and non-covid separately
                else:
                    #key = {purpose}_infected_covid or key = {purpose})_infected_non_covid
                    key = "{}_infected".format(self.purpose)
                    #Obtain the respective keys, retrieve the filepaths, populate dataset_numbers accordingly
                    key1 = key + "_covid"
                    key2 = key + "_non_covid"
                    file_path1 = self.dataset_paths[key1]
                    file_path2 = self.dataset_paths[key2]
                    count1 = len(os.listdir(file_path1))
                    count2 = len(os.listdir(file_path2))
                    self.dataset_numbers[key1] = count1
                    self.dataset_numbers[key2] = count2
                
        #Consider covid and non-covid
        elif verbose == 2:
            self.classes = {0: 'covid', 1 :'non_covid' }

            #Populate self.dataset_numbers
            for condition in self.classes.values():
                #Similar to the infected case in verbose 2
                key = "{}_infected".format(self.purpose)
                key1 = key + "_covid"
                key2 = key + "_non_covid"
                file_path1 = self.dataset_paths[key1]
                file_path2 = self.dataset_paths[key2]
                count1 = len(os.listdir(file_path1))
                count2 = len(os.listdir(file_path2))
                self.dataset_numbers[key1] = count1
                self.dataset_numbers[key2] = count2
            
        else:
            err_msg  = "Verbose argument only takes in an int of either 0,1 or 2"
            raise TypeError(err_msg)
        
        
    def describe(self):
        """
        Descriptor function.
        Will print details about the dataset when called.
        """
        
        # Generate description
        msg = "This is the Lung {} Dataset in the 50.039 Deep Learning class project".format(self.purpose)
        msg += " in Feb-March 2021. \n"
        msg += "It contains a total of {} images, ".format(sum(self.dataset_numbers.values()))
        msg += "of size {} by {}.\n".format(self.img_size[0], self.img_size[1])
        msg += "The images are stored in the following locations "
        msg += "and each one contains the following number of images:\n"
        for key, val in self.dataset_numbers.items():
            file_path = self.dataset_paths[key]
            msg += " - {}, in folder {}: {} images.\n".format(key, file_path, val)
        print(msg)
        
        
    def open_img(self, class_val, index_val):
        """
        Opens image with specified parameters.
        
        Parameters:
        - class_val variable should be set to 'normal' or 'infected'.
        - index_val should be an integer with values between 0 and the maximal number of images in dataset.
        
        Returns loaded image as a normalized Numpy array.
        """
        
        #Error handling 1
        group_val = self.purpose
        err_msg = "Error - For verbose = {}, class_val variable should be set to ".format(self.verbose)
        count = 0
        for value in self.classes.values():
            add_on = "'" + value + "'"
            if count < (len(self.classes.values()) -1):
                add_on += ' or '
            err_msg += add_on
            count += 1
        assert class_val in self.classes.values(), err_msg
        
        #For covid and non_covid, we want the class val to be "infected_covid" or "infected_non_covid"
        if class_val == 'covid' or class_val == 'non_covid':
            class_val = 'infected_' + class_val
            
        #Error Handling 2
        #Retrieve the max_val from self.dataset_numbers using the respective keys
        max_val = self.dataset_numbers['{}_{}'.format(group_val, class_val)]
        err_msg = "Error - index_val variable should be an integer between 0 and the maximal number of images."
        class_val_err = class_val.replace('_', '/')
        err_msg += "\n(In {}/{}, you have {} images.)".format(group_val, class_val_err, max_val)
        assert isinstance(index_val, int), err_msg
        assert index_val >= 0 and index_val <= max_val, err_msg
        
        #Retrieve the image file path
        if class_val != "infected":
            #path_to_file will be the filepath/index_val 
            #Filepath is the path to the dataset folder as stored in self.dataset_paths
            #index_val is the image number that we want to retrieve
            path_to_file = '{}/{}.jpg'.format(self.dataset_paths['{}_{}'.format(group_val, class_val)], index_val)
        
        #If the class_val == infected, we need to take into account for both covid and non_covid images as they both
        #make up the infected class
        else:
            #Retrieve the number of images in the covid folder
            covid_count = len(os.listdir(self.dataset_paths['{}_{}_covid'.format(group_val, class_val)]))
            
            #If the infected image number we want to retrieve is smaller than the max image number in the covid folder,
            #We retrieve the infected image from the covid folder
            if index_val < covid_count:
                path_to_file = '{}/{}.jpg'.format(self.dataset_paths['{}_{}_covid'.format(group_val, class_val)], index_val)
            
            #Else if it is greater, we move on to the non_covid folder and retrieve the image from there
            else:
                index_val = index_val - covid_count #remember that the image index starts from 0, so we got to reset the index
                path_to_file = '{}/{}.jpg'.format(self.dataset_paths['{}_{}_non_covid'.format(group_val, class_val)], index_val)
        
        #Retrieve the image using the file path
        with open(path_to_file, 'rb') as f:
            im = np.asarray(Image.open(f))/255 #normalize the pixel values to be in the range of [0,1]
        f.close()
        return im
    
    def show_img(self, class_val, index_val):
        """
        Opens, then displays image with specified parameters.
        
        Parameters:
        - class_val variable should be set to 'normal' or 'infected'.
        - index_val should be an integer with values between 0 and the maximal number of images in dataset.
        """
        # Open image
        im = self.open_img(class_val, index_val)
        
        # Display
        plt.imshow(im)
        
    def __len__(self):
        """
        Length special method, returns the number of images in dataset.
        """
        
        # Length function
        return sum(self.dataset_numbers.values())
    
    def length(self):
        return sum(self.dataset_numbers.values())
    
    
    def __getitem__(self, index):
        """
        Getitem special method.
        
        Expects an integer value index, between 0 and len(self) - 1.
        
        Returns the image and its label as a one hot vector, both
        in torch tensor format in dataset.
        """

        #If we only have 2 classes
        if self.verbose == 0 or self.verbose == 2:
            #Get the total number of images belonging to the first class
            first_val = int(list(self.dataset_numbers.values())[0])
            #As long as the index is smaller thatn the total, this image belongs to that class
            if index < first_val:
                class_val = self.classes[0]
                label = torch.Tensor([1, 0])
                
            #Else, we move on to the next class
            else:
                class_val = self.classes[1]
                index = index - first_val #Remember to reset the index
                label = torch.Tensor([0, 1])
            im = self.open_img(class_val, index)
            im = transforms.functional.to_tensor(np.array(im)).float()
          
        #If we have 3 classes to consider
        elif self.verbose == 1:
            #Similar to the 2 class problem, just that we have 3 classes to consider now
            first_val = int(list(self.dataset_numbers.values())[0]) #total number belonging to first class
            second_val = int(list(self.dataset_numbers.values())[1]) #total number belonging to second class
            
            #First class
            if index < first_val:
                class_val = self.classes[0]
                label = torch.Tensor([1, 0, 0])
            
            #Second class
            elif index >= first_val and index < first_val + second_val:
                index = index - first_val #Reset Index
                class_val = self.classes[1]
                label = torch.Tensor([0,1,0])
            
            #Third class
            else:
                index = index-(first_val + second_val) #Reset index
                class_val = self.classes[2]
                label = torch.Tensor([0,0,1])
            im = self.open_img(class_val, index)
            im = transforms.functional.to_tensor(np.array(im)).float()
                
        else:
            raise TypeError("Verbose value is not 0,1 or 2")
        return im, label

In [29]:
ld_train = Lung_Dataset('test', verbose =0)
ld_train.describe()
print(ld_train.dataset_numbers)

This is the Lung test Dataset in the 50.039 Deep Learning class project in Feb-March 2021. 
It contains a total of 615 images, of size 150 by 150.
The images are stored in the following locations and each one contains the following number of images:
 - test_normal, in folder ./dataset/test/normal/: 234 images.
 - test_infected, in folder ./dataset/test/infected/: 381 images.

{'test_normal': 234, 'test_infected': 381}


In [30]:
print(len(ld_train))

615


In [31]:
print(ld_train.dataset_numbers)

{'test_normal': 234, 'test_infected': 381}


In [32]:
im, class_oh = ld_train[139]
print(im.shape)
print(im)
print(class_oh)

torch.Size([1, 150, 150])
tensor([[[0.1373, 0.1647, 0.1961,  ..., 0.1569, 0.1294, 0.1059],
         [0.1255, 0.1569, 0.1882,  ..., 0.1529, 0.1255, 0.0980],
         [0.1176, 0.1451, 0.1804,  ..., 0.1529, 0.1216, 0.0941],
         ...,
         [0.0627, 0.0667, 0.0627,  ..., 0.0941, 0.0941, 0.0980],
         [0.0627, 0.0667, 0.0627,  ..., 0.0941, 0.0941, 0.0980],
         [0.0627, 0.0667, 0.0627,  ..., 0.0941, 0.0941, 0.0980]]])
tensor([1., 0.])


In [33]:
ld_test = Lung_Dataset('test', verbose = 1)
ld_val = Lung_Dataset('val', verbose = 1)

In [34]:
# Batch size value to be used (to be decided freely, but set to 4 for demo)
bs_val = 32

In [35]:
train_loader = DataLoader(ld_train, batch_size = bs_val, shuffle = True)
test_loader = DataLoader(ld_test, batch_size = bs_val, shuffle = True)
val_loader = DataLoader(ld_val, batch_size = bs_val, shuffle = True)

In [36]:
# Typical mini-batch for loop on dataloader (train)
for k, v in enumerate(train_loader):
    print("-----")
    print(k)
    print(v[0])
    print(v[1])
    # Forced stop
    break
    #assert False, "Forced stop after one iteration of the for loop"

-----
0
tensor([[[[0.0588, 0.0745, 0.1059,  ..., 0.3804, 0.3608, 0.3451],
          [0.0510, 0.0706, 0.1020,  ..., 0.3725, 0.3451, 0.3294],
          [0.0431, 0.0627, 0.0941,  ..., 0.3608, 0.3294, 0.3098],
          ...,
          [0.0000, 0.0039, 0.0039,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0039, 0.0039,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0039, 0.0039,  ..., 0.0000, 0.0000, 0.0000]]],


        [[[0.5608, 0.5569, 0.5294,  ..., 0.3020, 0.2980, 0.2980],
          [0.5608, 0.5725, 0.5569,  ..., 0.3098, 0.3059, 0.3059],
          [0.5451, 0.5647, 0.5608,  ..., 0.3176, 0.3137, 0.3098],
          ...,
          [0.0667, 0.0627, 0.0667,  ..., 0.0196, 0.0235, 0.0235],
          [0.0667, 0.0588, 0.0667,  ..., 0.0196, 0.0235, 0.0196],
          [0.0627, 0.0588, 0.0667,  ..., 0.0196, 0.0235, 0.0196]]],


        [[[0.0000, 0.0000, 0.0000,  ..., 0.0078, 0.0000, 0.0078],
          [0.0000, 0.0157, 0.0000,  ..., 0.0039, 0.0000, 0.0078],
          [0.0000, 0.0118, 0.0

In [37]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # Conv2D: 1 input channel, 8 output channels, 3 by 3 kernel, stride of 1.
        self.conv1 = nn.Conv2d(1, 4, 3, 1)
        self.fc1 = nn.Linear(87616, 2)

    def forward(self, x):
        x = self.conv1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        output = F.log_softmax(x, dim = 1)
        return output

In [38]:
model = Net()

In [39]:
# Try model on one mini-batch
for batch_idx, (images_data, target_labels) in enumerate(train_loader):
    predicted_labels = model(images_data)
    print(predicted_labels)
    print(target_labels)
    # Forced stop
    break
    #assert False, "Forced stop after one iteration of the mini-batch for loop"

tensor([[-0.7334, -0.6544],
        [-0.7805, -0.6128],
        [-0.7904, -0.6045],
        [-0.7848, -0.6092],
        [-0.7673, -0.6241],
        [-0.7157, -0.6711],
        [-0.6653, -0.7218],
        [-0.8203, -0.5803],
        [-0.8318, -0.5714],
        [-0.7809, -0.6125],
        [-0.7664, -0.6249],
        [-0.7603, -0.6302],
        [-0.7708, -0.6211],
        [-0.7361, -0.6519],
        [-0.7718, -0.6202],
        [-0.7835, -0.6103],
        [-0.7147, -0.6721],
        [-0.7311, -0.6566],
        [-0.7478, -0.6413],
        [-0.7233, -0.6639],
        [-0.7212, -0.6659],
        [-0.6976, -0.6887],
        [-0.7867, -0.6076],
        [-0.7352, -0.6528],
        [-0.7684, -0.6232],
        [-0.8197, -0.5808],
        [-0.7811, -0.6123],
        [-0.8205, -0.5802],
        [-0.7506, -0.6388],
        [-0.7980, -0.5983],
        [-0.7653, -0.6259],
        [-0.7089, -0.6777]], grad_fn=<LogSoftmaxBackward>)
tensor([[0., 1.],
        [0., 1.],
        [0., 1.],
        [1., 0.],
 