In [27]:
############
# Install dependency
############
!pip3 install numpy
!pip3 install tqdm
!pip3 install torch torchvision
!pip3 install opencv-python
!pip3 install pandas
!pip3 install h5py

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting h5py
  Downloading h5py-3.1.0-cp36-cp36m-manylinux1_x86_64.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 909 kB/s eta 0:00:01
[?25hCollecting cached-property; python_version < "3.8"
  Downloading cached_property-1.5.2-py2.py3-none-any.whl (7.6 kB)
Installing collected packages: cached-property, h5py
Successfully installed cached-property-1.5.2 h5py-3.1.0


In [4]:
############
# Numpy makes my computer run out of memory, fix it by forcing to not do multi threading
# For Frendy Only
###########
import os
os.environ['OPENBLAS_NUM_THREADS'] = '2'
os.environ['MKL_NUM_THREADS'] = '2'

In [1]:
import random
import csv
import os
import os.path
import shutil
import cv2

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm # Displays a progress bar

import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import Dataset, Subset, DataLoader, random_split

import pandas as pd
import h5py

In [2]:
#===========================
# This is to split the data into training and testing
# We also rotated the images
# ==========================

def splitData():
    # Paths to dst directories 
    trainImage = os.path.normpath('./Dataset/Train/image')
    testImage = os.path.normpath('./Dataset/Test/image')
    trainLabel = os.path.normpath('./Dataset/Train/label')
    testLabel = os.path.normpath('./Dataset/Test/label')
    dtset = os.path.normpath('./Dataset')

    trainingCSV = os.path.join(trainLabel, 'training.csv')
    testingCSV = os.path.join(testLabel, 'testing.csv')
    
    rowTraining = []
    rowTesting = []
    
    # To go through all files
    index = 1
    label = 0
    
    # Delete Dataset Folder
    if os.path.exists(dtset):
        print('Removing old data')
        shutil.rmtree(dtset) 

    # Create directories
    print('Creating new directories')
    os.makedirs(trainImage)
    os.makedirs(testImage)
    os.makedirs(trainLabel)
    os.makedirs(testLabel)
    
    # Dimensions for ResNet-50
    dimW = 224
    dimH = 224

    # Process images
    for dirpath, dirnames, filenames in os.walk("."):
        if 'Dataset' in dirpath:
            continue
        elif os.path.exists(os.path.join(dirpath, '_MACOSX')):
            print('Removing _MACOSX dir')
            shutil.rmtree(os.path.join(dirpath, '_MACOSX'))

        # Get isCovid = 0, IsHealthy = 1 isOthers = 2   
        if "Covid" in dirpath:
            label = 0
        elif "Healthy" in dirpath:
            label = 1
        elif 'Others'  in dirpath:
            label = 2
            
        for filename in [f for f in filenames if f.endswith(".png")]: 
            # Read image
            img = cv2.imread(os.path.join(dirpath, filename), cv2.IMREAD_UNCHANGED)
            if img is None:
                continue

            # Resize image
            imgResized = cv2.resize(img, (dimW, dimH), interpolation=cv2.INTER_AREA)
            # Convert to grayscale (faster processing)
            imgGray = cv2.cvtColor(imgResized, cv2.COLOR_BGR2GRAY)
            
            # Split into 4:1
            for i in range(4): 
                whereToGo = random.uniform(0, 1)
                # Training
                if whereToGo <= 0.80:
                    # Add number and label 
                    rowTraining.append([index,label])
                    cv2.imwrite(os.path.join(trainImage,str(index)+".png"), imgGray)
                # Testing
                else:
                    # Add number and label 
                    rowTesting.append([index,label])   
                    cv2.imwrite(os.path.join(testImage,str(index)+".png"), imgGray)
                index += 1
                # Rotate image
                imgGray = cv2.rotate(imgGray, cv2.ROTATE_90_CLOCKWISE)
                

    # field names  
    fields = ['Name', 'label']  
    
    # writing to trainingCSV file  
    print('Generating training CSV')
    with open(trainingCSV, 'w+') as csvfile:  
        # creatittng a csv writer object  
        csvwriter = csv.writer(csvfile)  

        # writing the fields  
        csvwriter.writerow(fields)  

        # writing the data rows  
        csvwriter.writerows(rowTraining)   
        
    # writing to trainingCSV file  
    print('Generating testing CSV')
    with open(testingCSV, 'w+') as csvfile:  
        # creating a csv writer object  
        csvwriter = csv.writer(csvfile)  

        # writing the fields  
        csvwriter.writerow(fields)  

        # writing the data rows  
        csvwriter.writerows(rowTesting)             
    
splitData()

Removing old data
Creating new directories
Generating training CSV
Generating testing CSV


In [15]:
import pandas as pd

class Data:

    def __init__(self, root):

        self.ROOT = root
        self.images = self.read_images(root + "/image")
        self.labels = self.read_labels(root + "/label")

    def __len__(self):
        # Return number of points in the dataset

        return len(self.images)

    def __getitem__(self, idx):
        # Here we have to return the item requested by `idx`. The PyTorch DataLoader class will use this method to make an iterable for training/validation loop.

        img = images[idx]
        label = labels[idx]

        return img, label
    
    # Read Images
    def read_images(self, path:str) -> list:
        output = []
        for file in sorted(os.listdir(path), key=lambda f : int(f[:-4])):
            if file.endswith(".png"):
                dir_path = os.path.join(path, file)
                img = cv2.imread(dir_path)   
                output.append(img)
        
        return output
        
    # Read Labels
    def read_labels(self, path:str) -> list:
        output = []
        for file in os.listdir(path):
            if file.endswith(".csv"):
                file = os.path.join(path, file)
                df = pd.read_csv(file)
                output = df.label #you can also use df['column_name']
            
        return output       

# Load the dataset and train and test splits
print("Loading datasets...")

# Data path
TrainData = Data('./Dataset/Train')
TestData = Data('./Dataset/Test')

######################################################
# Save data into binary files so we can just load them
######################################################
# For TrainData

# Images
with h5py.File("TrainDataImages.hdf5", "w") as data_file:
    for i,image in enumerate(TrainData.images):
        data_file.create_dataset(str(i), data=image)
# Labels
with h5py.File("TrainDataLabels.hdf5", "w") as data_file:
    data_file.create_dataset("labels", data=TrainData.labels)

# For TestData
# Images
with h5py.File("TestDataImages.hdf5", "w") as data_file:
    for i,image in enumerate(TestData.images):
        data_file.create_dataset(str(i), data=image)
# Labels
with h5py.File("TestDataLabels.hdf5", "w") as data_file:
    data_file.create_dataset("labels", data=TestData.labels)
    
# # Data normalization
# MyTransform = transforms.Compose([
#     transforms.Grayscale(num_output_channels=1), # Convert image to grayscale
#     transforms.ToTensor(), # Transform from [0,255] uint8 to [0,1] float
#     transforms.Normalize([0.1], [0.2] ) # TODO: Normalize to zero mean and unit variance with appropriate parameters
# ])

# DATA_train = datasets.ImageFolder(root=DATA_train_path, transform=MyTransform)
# DATA_test = datasets.ImageFolder(root=DATA_test_path, transform=MyTransform)

# print("Done!")

# # Create dataloaders
# # TODO: Experiment with different batch sizes
trainloader = DataLoader(Data_train, batch_size=1, shuffle=True)
testloader = DataLoader(Data_test, batch_size=2, shuffle=True)

# print(trainloader)

Loading datasets...


In [45]:
with h5py.File("./TestDataImages.hdf5", "r") as f:
    # List all groups
    print("Keys: %s" % sorted(f.keys(), key = int ))
    
    # label = list(f.keys())[0]

    # Get the data
    # data = list(f[label])
    # print(data)

Keys: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155', '156', '157




In [8]:
import pandas as pd

class Data:

    def __init__(self, root):

        self.ROOT = root
        self.images = self.read_images(os.path.join(root, "images"))
        self.labels = self.read_labels(os.path.join(root, "labels"))

    def __len__(self):
        # Return number of points in the dataset

        return len(self.images)

    def __getitem__(self, idx):
        # Here we have to return the item requested by `idx`. The PyTorch DataLoader class will use this method to make an iterable for training/validation loop.

        img = self.images[idx]
        label = self.labels[idx]

        return img, label
  
    # Read Images
    def read_images(self, path:str) -> list:
        output = []
        with h5py.File(os.path.join(path, "imagedata.hdf5"), "r") as f:
            for i in sorted(f.keys(), key = int ):
                output.append(f.get(str(i)))
        output = torch.tensor(output)
        return output
      
    # Read Labels
    def read_labels(self, path:str) -> list:
        output = []
        with h5py.File(os.path.join(path, "labeldata.hdf5"), "r") as f:
            output.append(f.get("labels"))
        return output       

# # Load the dataset and train and test splits
# print("Loading datasets...")

# # Data path
Data_train = Data(os.path.normpath('./Data/Train'))
Data_test = Data(os.path.normpath('./Data/Test'))

ValueError: Not a dataset (not a dataset)

In [5]:
# Data normalization
MyTransform = transforms.Compose([
    transforms.ToTensor(), # Transform from [0,255] uint8 to [0,1] float
    transforms.Normalize([0.5], [0.5], [0.5]) # TODO: Normalize to zero mean and unit variance with appropriate parameters
])

# DATA_train = datasets.ImageFolder(root=DATA_train_path, transform=MyTransform)
# DATA_test = datasets.ImageFolder(root=DATA_test_path, transform=MyTransform)

# print("Done!")

# # Create dataloaders
# # TODO: Experiment with different batch sizes
trainloader = DataLoader(Data_train, batch_size=1, shuffle=True)
testloader = DataLoader(Data_test, batch_size=2, shuffle=True)


In [6]:
from torchvision import datasets, transforms
import torchvision.models as models

In [7]:
class Network(nn.Module):
    def __init__(self):
        super().__init__()
        # TODO: [Transfer learning with pre-trained ResNet-50] 1) Define how many first layers of convolutoinal neural network (CNN) feature extractor in ResNet-50 to be "frozen" and 2) design your own fully-connected network (FCN) classifier.
        # 1) You will only refine last several layers of CNN feature extractor in ResNet-50 that mainly relate to high-level vision task. Determine how many first layers of ResNet-50 should be frozen to achieve best performances. Commented codes below will help you understand the architecture, i.e., "children", of ResNet-50.
        # 2) Design your own FCN classifier. Here I provide a sample of two-layer FCN.
        # Refer to PyTorch documentations of torch.nn to pick your layers. (https://pytorch.org/docs/stable/nn.html)
        # Some common Choices are: Linear, ReLU, Dropout, MaxPool2d, AvgPool2d
        # If you have many layers, consider using nn.Sequential() to simplify your code
        
        # Load pretrained ResNet-50
        self.model_resnet = models.resnet50(pretrained=True)
        
        # The code below can show children of ResNet-50
        #child_counter = 0
        #for child in model.children():
        #    print(" child", child_counter, "is -")
        #    print(child)
        #    child_counter += 1
        
        # TODO: Determine how many first layers of ResNet-50 to freeze
        child_counter = 0
        for child in self.model_resnet.children():
            if child_counter < 47:
                for param in child.parameters():
                    param.requires_grad = False
            elif child_counter == 47:
                children_of_child_counter = 0
                for children_of_child in child.children():
                    if children_of_child_counter < 3:
                        for param in children_of_child.parameters():
                            param.requires_grad = False
                    else:
                        children_of_child_counter += 1
            else:
                print("child ",child_counter," was not frozen")
            child_counter += 1
        
        # Set ResNet-50's FCN as an identity mapping
        num_fc_in = self.model_resnet.fc.in_features
        self.model_resnet.fc = nn.Identity()
        
        # TODO: Design your own FCN
        self.fc1 = nn.Linear(num_fc_in, 262548, bias = 3) # from input of size num_fc_in to output of size ?
        self.fc2 = nn.Linear(64, 3, bias = 3) # from hidden layer to 3 class scores

    def forward(self,x):
        # TODO: Design your own network, implement forward pass here
        
        relu = nn.ReLU() # No need to define self.relu because it contains no parameters
        
        with torch.no_grad():
            features = self.model_resnet(x)
            
        x = self.fc1(features) # Activation are flattened before being passed to the fully connected layers
        x = relu(x)
        x = self.fc2(x)
        
        # The loss layer will be applied outside Network class
        return x

device = "cuda" if torch.cuda.is_available() else "cpu" # Configure device
model= Network().to(device)
criterion = nn.CrossEntropyLoss() # Specify the loss layer (note: CrossEntropyLoss already includes LogSoftMax())
# TODO: Modify the line below, experiment with different optimizers and parameters (such as learning rate)
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=.01, weight_decay=2) # Specify optimizer and assign trainable parameters to it, weight_decay is L2 regularization strength (default: lr=1e-2, weight_decay=1e-4)
num_epochs =20 # TODO: Choose an appropriate number of training epochs

def train(model, loader, num_epoch = num_epochs): # Train the model
    print("Start training...")
    model.train() # Set the model to training mode
    for i in range(num_epoch):
        running_loss = []
        for batch, label in tqdm(loader):
            batch = batch.to(device)
            label = label.to(device)
            optimizer.zero_grad() # Clear gradients from the previous iteration
            pred = model(batch) # This will call Network.forward() that you implement
            loss = criterion(pred, label) # Calculate the loss
            running_loss.append(loss.item())
            loss.backward() # Backprop gradients to all tensors in the network
            optimizer.step() # Update trainable weights
        print("Epoch {} loss:{}".format(i+1,np.mean(running_loss))) # Print the average loss for this epoch
    print("Done!")

def evaluate(model, loader): # Evaluate accuracy on validation / test set
    model.eval() # Set the model to evaluation mode
    correct = 0
    with torch.no_grad(): # Do not calculate grident to speed up computation
        for batch, label in tqdm(loader):
            batch = batch.to(device)
            label = label.to(device)
            pred = model(batch)
            correct += (torch.argmax(pred,dim=1)==label).sum().item()
    acc = correct/len(loader.dataset)
    print("Evaluation accuracy: {}".format(acc))
    return acc
    
train(model, trainloader, num_epochs)
print("Evaluate on test set")
evaluate(model, testloader)

Start training...


  0%|                                                                                        | 0/13274 [00:00<?, ?it/s]

IndexError: list index out of range