In [None]:
import numpy as np
import pandas as pd 
import cv2
import os
import tqdm
import glob

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.utils import shuffle

class OralCancerDataset(Dataset):
    """Image dataset read from a folder with OpenCV, optionally labelled by a CSV.

    Two modes, selected by ``path_to_csv``:

    * Labelled (``path_to_csv`` given): the CSV must have a ``Name`` column
      (image file name inside ``path_to_images``) and a ``Diagnosis`` column
      (the label). The rows are shuffled and split: the first
      ``int(len(rows) * val_ratio)`` rows form the validation part, the rest
      the training part. Items are ``(image, label)``.
    * Unlabelled (``path_to_csv`` is None): items are ``(image, name)`` where
      ``name`` is ``'image_<idx>.jpg'``; the length is the number of ``*.jpg``
      files in ``path_to_images``. Useful for a test set.

    Args:
        path_to_images: Folder containing the image files.
        path_to_csv: Path of the label CSV, or None for the unlabelled mode.
        validation: If truthy, expose the validation part of the split,
            otherwise the training part.
        val_ratio: Fraction of the shuffled rows assigned to validation.
            With the default of 1.0 the training part is empty.
        seed: Seed for the shuffle. A training and a validation dataset built
            from the same CSV must use the same seed (the default does), so
            that both slice the same permutation and never share a row.
            Pass None for a non-reproducible shuffle.
    """

    def __init__(self, path_to_images, path_to_csv=None, validation=False, val_ratio=1.0, seed=0):
        self.path_to_images = path_to_images
        self.path_to_csv = path_to_csv
        self.v = validation
        self.v_r = val_ratio
        self.seed = seed
        # Stays None in the unlabelled mode; __len__/__getitem__ branch on it.
        self.df = None

        if self.path_to_csv is not None:
            rows = pd.read_csv(self.path_to_csv)
            # A seeded shuffle gives every instance built with the same seed
            # the same row order, which keeps the train and validation slices
            # complementary instead of overlapping at random.
            rows = rows.sample(frac=1, random_state=seed).reset_index(drop=True)
            n_val = int(len(rows) * val_ratio)

            # Truthiness (rather than `== True` / `== False`) guarantees that
            # self.df is always assigned in the labelled mode.
            if validation:
                self.df = rows.iloc[:n_val]
            else:
                self.df = rows.iloc[n_val:]

    def __len__(self):
        """Return the number of rows of the selected split, or of ``*.jpg`` files."""
        if self.df is not None:
            return len(self.df)
        return len(glob.glob(os.path.join(self.path_to_images, '*.jpg')))

    def __getitem__(self, idx):
        """Return ``(image, label)`` in the labelled mode, else ``(image, name)``.

        Raises:
            FileNotFoundError: If the image file is missing or cannot be decoded.
        """
        if self.df is not None:
            row = self.df.iloc[idx]
            image = self._read_image(row['Name'])

            # You can input torchvision (or other) transforms and directly augment the data
            # if self.transform:
            #    image = self.transform(image)

            return image, row['Diagnosis']

        name = 'image_' + str(idx) + '.jpg'
        return self._read_image(name), name

    def _read_image(self, name):
        """Load ``name`` from the image folder, failing loudly if it cannot be read."""
        path = os.path.join(self.path_to_images, name)
        # Flag -1 (cv2.IMREAD_UNCHANGED) loads the file as stored.
        image = cv2.imread(path, -1)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising;
            # surface it here rather than as an obscure batching error later.
            raise FileNotFoundError('Could not read image: ' + path)
        return image

In [None]:
# class OralCancerDataset(Dataset):
#     """__init__ and __len__ functions are the same as in TorchvisionDataset"""

#     def __init__(self, path_to_images, path_to_csv = None):
        
#         # Passing the path to the train csv file reads the data from the csv with the labels
#         # If None is passed instead, only the images in the image folder are loaded (which is useful for the test set)
        
#         self.path_to_images = path_to_images
#         self.path_to_csv = path_to_csv
        
#         if self.path_to_csv is not None:
#             self.df = pd.read_csv(self.path_to_csv)
    
#     def __len__(self):
#         if self.path_to_csv:
#             return len(self.df)
#         else:
#             return len(glob.glob(self.path_to_images + '/*.jpg'))
    
#     def __getitem__(self, idx):
        
#         if self.path_to_csv:
#             data = self.df.iloc[idx]
#             image = cv2.imread(os.path.join(self.path_to_images, data['Name']), -1)
#             label = data['Diagnosis']
            
#             # You can input torchvision (or other) transforms and directly augment the data
#             # if self.transform:
#             #    image = self.transform(image)
#             # ..
            
#             return image, label
            
#         else:
#             name = 'image_' + str(idx) + '.jpg'
#             image = cv2.imread(os.path.join(self.path_to_images, name), -1)
            
#             return image, name

In [None]:
path_to_csv = 'Data/train.csv'
path_to_train_images = 'Data/train'
path_to_test_images = 'Data/test'

# Tunables shared by the datasets and loaders below.
SPLIT_SEED = 0
VAL_RATIO = 0.2
BATCH_SIZE = 32

# The training and validation datasets each shuffle the same CSV and keep
# complementary slices of it. Re-seed NumPy's global RNG with the same value
# before building each one: a shuffle that draws from that RNG then produces
# the same row order both times, so the two slices cannot share a row.
np.random.seed(SPLIT_SEED)
train_dataset = OralCancerDataset(path_to_train_images, path_to_csv, validation=False, val_ratio=VAL_RATIO)
np.random.seed(SPLIT_SEED)
val_dataset = OralCancerDataset(path_to_train_images, path_to_csv, validation=True, val_ratio=VAL_RATIO)

# No CSV: the dataset yields (image, file name) pairs for the test folder.
test_dataset = OralCancerDataset(path_to_test_images)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,  # * (1+torch.cuda.device_count())
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,  # * (1+torch.cuda.device_count())
)

# Keep the test order fixed so predictions line up with the file names.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,  # * (1+torch.cuda.device_count())
)

In [None]:
# Combined size of the training and validation splits (the original author noted 14683 here).
print(sum(len(split) for split in (train_dataset, val_dataset)))
# if  __name__ == '__main__':

#     k=0
#     for i in range(0,len(train_dataset)):
#         im,n=train_dataset.__getitem__(i)
#         print(n)
#         k=k+1
#         cv2.imshow(str(n),im)
#         cv2.waitKey(0) 
#         cv2.destroyWindow(str(n))
#         if k==1:
#             break

In [None]:
## Inference
# Simple example based on only a mean intensity threshold 

# Mean-intensity cut-off: images darker than this are predicted positive.
threshold = 120

# Accumulates one file name and one predicted label per test image.
d = {'Name': [], 'Diagnosis': []}

for images, names in tqdm.tqdm(test_dataloader):
    # Move the channel axis in front of the two spatial axes.
    channels_first = images.permute(0, 3, 1, 2)

    # One mean intensity per image: average over every axis except the batch axis.
    mean_intensity = torch.mean(channels_first.float(), dim=(1, 2, 3))

    # We guess that dark cells are dangerous: below the threshold -> 1.0, otherwise 0.0.
    predictions = (mean_intensity < threshold).float().tolist()

    d['Name'].extend(names)
    d['Diagnosis'].extend(predictions)
    #print(d)

In [None]:
# Turn the collected predictions into a table and write it without the index column.
df = pd.DataFrame(data=d)
df.to_csv(path_or_buf='submission.csv', index=False)