In [73]:
##### IMPORTS ######
import os
import shutil


import pandas as pd
from sklearn.model_selection import train_test_split

import torch.nn as nn
import torch.optim as optim
from torch.utils.data.dataloader import DataLoader

from torchvision import transforms
from torchvision.datasets import ImageFolder

##### ARCHITECTURE #####
class BloodCell(nn.Module):
    def __init__(self):
        super().__init__()
        
        # NETWORK
        self.network = nn.Sequential(
            
            # 1
            nn.Conv2d(3, 16, kernel_size = 3 , stride = (2,2)),
            nn.ReLU(),
            nn.BatchNorm2d(16),
            nn.Dropout(p=0.2),
            nn.MaxPool2d(kernel_size =2),
            
            # 2
            nn.Conv2d(16, 8, kernel_size = 3 , stride = (2,2)),
            nn.ReLU(),
            nn.BatchNorm2d(8),
            nn.Dropout(p=0.2),
            nn.MaxPool2d(kernel_size =2),
            
            # 3
            nn.Conv2d(8, 4, kernel_size = 3, stride = (2,2)),
            nn.ReLU(),
            nn.BatchNorm2d(4),
            nn.Dropout(p=0.2),
            nn.MaxPool2d(kernel_size =2),
        )

        # AVGPOOL
        self.avgpool = nn.AdaptiveAvgPool2d((15,15))

        # CLASSIFIER
        self.classifier = nn.Sequential(
            
            nn.Flatten(),
            nn.Linear(4*15*15,200),
            nn.ReLU(),
            nn.Linear(200, 50),
            nn.ReLU(),
            nn.Linear(50,4),
            nn.ReLU(),
        )
    
    def forward(self, xb):
        xb = self.network(xb)
        xb = self.avgpool(xb)
        xb = self.classifier(xb)
        return xb

##### FUNCTIONS ######
def load_csv(path) :
    # 1 : read the csv
    df = pd.read_csv(path+"dataset-master/labels_full.csv",sep =",")
    
    # 2 : remove images with missing labels
    df = df[df['Category'].notnull()]
    
    # 3 : translate the "Image" column to the path of the actual image
    df['Image'] = df['Image'].apply(lambda x : 
                                    "BloodImage_" 
                                    + (5-len(str(x)))*"0"
                                    + str(x)
                                    + ".jpg")
    # 4 drop unnecessary columns
    df = df[['Image', 'Category']]
    
    # 5 drop categories without enough data to train the model on
    category_value_count = df.Category.value_counts()
    rare_categories = category_value_count[category_value_count < 10].index
    df = df.loc[~df["Category"].isin(rare_categories)]

    return df

def build_folder(path, folder ,df) :
    # reset and make folder
    if(os.path.isdir(path+folder)) :
        shutil.rmtree(path+folder) 
    os.mkdir(path+folder)
    
    # make subfolders
    for categories in df.Category.unique():
        if(not os.path.isdir(path+folder+"/"+categories)) :
            os.mkdir(path+folder+"/"+categories)
    
    # fill subfolders
    for index, row in df.iterrows() :
        origin = path + "dataset-master/JPEGImages/" + row["Image"]
        target = path + folder + "/" + row["Category"] + "/" + row["Image"]
        if(os.path.isfile(origin)) :
            shutil.copyfile(origin, target)
        else :
            print(origin + " is in the csv, but does not exist")
          
def build_training_and_validation(path, df, valid_size) :
    # 1 : separate training and validation set
    train_df, valid_df = train_test_split(
        df, 
        test_size = valid_size,
        stratify = df['Category'])
    
    # 2 : build training and validation folders
    build_folder(path, "training", train_df)
    build_folder(path, "validation", valid_df)

##### MAIN ######
# path where you want to store your data. Must contain the dataset-master file
path = "./data/"

# how you with to transform the data
transform = transforms.Compose([
    transforms.Resize((120,120)),
    transforms.ToTensor()])



# I : read the labels
df = load_csv(path)

# II : separate training and validation set
build_training_and_validation(path, df, 0.05)

# III : build
net = BloodCell()
batch_size = 128
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=1e-2)

training_set = ImageFolder(path+"training",transform = transform)
validation_set = ImageFolder(path+"validation",transform = transform)

training_loader = DataLoader(training_set, batch_size, shuffle = True)
validation_loader = DataLoader(validation_set, batch_size)



./data/dataset-master/JPEGImages/BloodImage_00280.jpg is in the csv, but does not exist
./data/dataset-master/JPEGImages/BloodImage_00116.jpg is in the csv, but does not exist


In [88]:
import numpy as np
epochs = 20
min_valid_loss = np.inf

for e in range(epochs):
    train_loss = 0.0
    net.train()
    for data, labels in training_loader :
        optimizer.zero_grad()
        output = net(data)
        loss = criterion(output,labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    valid_loss = 0.0
    #valid_accuracy = 0.0
    net.eval()
    for data, labels in validation_loader:
        output = net(data)
        loss = criterion(output,labels)
        valid_loss += loss.item()   
        #valid_accuracy += 100*(output == labels).float().sum()
                
                

    print(f'Epoch {e+1} \t Training Loss: {train_loss / len(training_loader)} \t\t Validation Loss: {valid_loss / len(validation_loader)}')
    #print(f'Epoch {e+1} \t Training Accuracy: {train_accuracy / len(training_loader)} \t\t Validation Accuracy: {valid_accuracy / len(validation_loader)}')
        

Epoch 1 	 Training Loss: 0.9868676861127218 		 Validation Loss: 1.1191450357437134
Epoch 2 	 Training Loss: 1.0033438603083293 		 Validation Loss: 1.0804510116577148
Epoch 3 	 Training Loss: 0.9762939016024271 		 Validation Loss: 1.0644100904464722
Epoch 4 	 Training Loss: 0.9672296841939291 		 Validation Loss: 1.1266156435012817
Epoch 5 	 Training Loss: 0.986436128616333 		 Validation Loss: 1.1443185806274414
Epoch 6 	 Training Loss: 1.0142105221748352 		 Validation Loss: 1.1317776441574097
Epoch 7 	 Training Loss: 1.0070547858874004 		 Validation Loss: 1.1558817625045776
Epoch 8 	 Training Loss: 0.9805910388628641 		 Validation Loss: 1.2118123769760132
Epoch 9 	 Training Loss: 1.0219677686691284 		 Validation Loss: 1.1996800899505615
Epoch 10 	 Training Loss: 0.9493559002876282 		 Validation Loss: 1.1676915884017944
Epoch 11 	 Training Loss: 0.9970597227414449 		 Validation Loss: 1.1284245252609253
Epoch 12 	 Training Loss: 0.9715195298194885 		 Validation Loss: 1.116477608680725
Epo