In [31]:
# Standard library
import gc
import os
import shutil
import tempfile
from typing import Optional, List, Dict

# Third-party
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import PIL
import torch
import torchvision
from matplotlib import pyplot as plt
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms, models
from torchvision.utils import make_grid
from tqdm import tqdm

PIL.Image.MAX_IMAGE_PIXELS = None # allows big images

# Loading the data

In [4]:
# Index every file below the competition folder by its file name, so that the
# metadata CSVs and the slide images can be looked up by name.
fileNames = []
filePaths = []
for dirname, _, filenames in os.walk('../input/mayo-clinic-strip-ai'):
    for filename in filenames:
        filePaths.append(os.path.join(dirname, filename))
        fileNames.append(filename)
# If the same file name occurs in several directories, the last one visited wins.
dictFiles = dict(zip(fileNames, filePaths))

# Look the CSVs up by name rather than by position in filePaths: the order in
# which os.walk lists files is not guaranteed, so positional indices can
# silently load the wrong file.
# NOTE(review): assumes the metadata files are named 'train.csv' and
# 'test.csv' -- confirm against the dataset listing.
dfTrain = pd.read_csv(dictFiles['train.csv'])
dfTest = pd.read_csv(dictFiles['test.csv'])

# Attach the full path of each slide; a missing '<image_id>.tif' raises KeyError.
dfTrain['paths'] = dfTrain['image_id'].map(lambda imageId: dictFiles[imageId + '.tif'])

## Direct approach

In [4]:
class MayoDataset(Dataset):
    """Map-style dataset that opens images from paths stored in a DataFrame.

    Args:
        dataframe: table with one row per sample.
        colPaths: name of the column holding the image file paths.
        colLabels: name of the column holding the labels.
        transform: optional callable applied to the opened image.
        label_transform: optional callable applied to the label.
    """

    def __init__(self, dataframe, colPaths, colLabels, transform=None, label_transform=None):
        self.dataframe = dataframe
        self.colPaths = colPaths
        self.colLabels = colLabels
        self.transform = transform
        self.label_transform = label_transform

    def __len__(self):
        """Return the number of rows (samples) in the DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Return the ``(image, label)`` pair at positional ``index``.

        The image is opened with ``Image.open`` and passed through
        ``transform`` when one was given; the label is passed through
        ``label_transform`` when one was given.
        """
        imagePath = self.dataframe[self.colPaths].iloc[index]
        label = self.dataframe[self.colLabels].iloc[index]

        im = Image.open(imagePath)
        if self.transform is not None:
            im = self.transform(im)
        if self.label_transform is not None:
            # Apply the transform stored by __init__ to the label read above.
            label = self.label_transform(label)

        return (im, label)
        

In [5]:
# Number of samples per mini-batch.
batch_size = 10

# Preprocessing / augmentation pipeline, applied in the order listed:
# resize, random horizontal flip, random rotation, centre crop, then
# conversion to a tensor.
transform = transforms.Compose(
    [
        transforms.Resize(size=224),
        transforms.RandomHorizontalFlip(0.5),
        transforms.RandomRotation(degrees=30),
        transforms.CenterCrop(size=224),
        transforms.ToTensor(),
    ]
)

# Dataset over the training table, wrapped in a shuffling loader.
trainData = MayoDataset(
    dataframe=dfTrain,
    colPaths='paths',
    colLabels='label',
    transform=transform,
)
train_DataLoader = torch.utils.data.DataLoader(
    dataset=trainData,
    batch_size=batch_size,
    shuffle=True,
)

In [6]:
# Pull a single mini-batch from the loader to inspect it; next(iter(...))
# fetches exactly one batch and fails right here if the loader is empty.
images, labels = next(iter(train_DataLoader))
labels



In [7]:
# Tile the batch into one image with five pictures per row.
im = make_grid(images, nrow=5)
plt.figure(figsize=(10, 4))
# Move the first axis last (axes 1, 2, 0) before handing the array to imshow.
plt.imshow(im.permute(1, 2, 0).numpy())

* Some images seem to be empty

## Indirect approach

* One output directory per class label (`CE`, `LAA`) is created under `/kaggle/working`

In [5]:
# One output directory per entry, created below parentDir.
dirsToCreate = ['CE','LAA']
parentDir = "/kaggle/working"

for newdir in dirsToCreate:
    path = os.path.join(parentDir, newdir)
    # exist_ok makes the cell safe to re-run; makedirs also creates parentDir
    # itself when it does not exist yet.
    os.makedirs(path, exist_ok=True)

In [None]:

def scaleKeepingAspectRatio(imSize:tuple,maxdim:int)->tuple:
    '''
    Compute a new image size whose largest dimension equals ``maxdim`` while
    keeping the aspect ratio of ``imSize``.

    Parameters
    ----------
    imSize : tuple
        Original size as a tuple of positive integers.
    maxdim : int
        Target length (>= 1) of the largest dimension.

    Returns
    -------
    tuple
        Scaled size, in the same order as ``imSize``. Every dimension equal to
        the largest one becomes exactly ``maxdim`` (so square inputs stay
        square); the others are scaled by ``maxdim / max(imSize)``, truncated
        to an integer and clamped to at least 1.

    Raises
    ------
    ValueError
        If ``maxdim`` is smaller than 1 or ``imSize`` has no positive dimension.
    '''
    if maxdim < 1:
        raise ValueError("maxdim must be >= 1, got %r" % (maxdim,))
    largest = max(imSize)
    if largest <= 0:
        raise ValueError("imSize must contain a positive dimension, got %r" % (imSize,))

    scaleFactor = maxdim / largest
    # Clamp to 1 so extreme aspect ratios never produce a zero-length side.
    return tuple(
        maxdim if dim == largest else max(1, int(dim * scaleFactor))
        for dim in imSize
    )


# Downscale every training image so that its largest side is 1024 and store
# the result under <parentDir>/<label>/<image_id>.
# NOTE(review): the output file name has no extension; the PNG format is
# selected explicitly in save(). Confirm whether downstream readers need '.png'.
for imName, imPath, label in tqdm(
        zip(dfTrain['image_id'], dfTrain['paths'], dfTrain['label']),
        total=len(dfTrain)):  # total lets tqdm show real progress
    # The context manager closes the source file even when resizing fails.
    with Image.open(imPath) as im:
        newSize = scaleKeepingAspectRatio(im.size, 1024)
        resized = im.resize(newSize)
    path = os.path.join(parentDir, label, imName)
    resized.save(path, 'png')
    del resized  # drop the resized image before the next one is loaded
    gc.collect()  # release memory


In [7]:
# Bundle everything under /kaggle/working into images.zip in the current
# working directory.
# The archive is built in a temporary directory and moved into place
# afterwards: if the current directory is /kaggle/working itself (presumably
# the case, given the '../input' relative path used when loading the data --
# confirm), writing the zip there directly would make the half-written
# archive part of its own contents.
archiveTarget = os.path.abspath('images.zip')
if os.path.exists(archiveTarget):
    # A stale archive from a previous run would otherwise be zipped again.
    os.remove(archiveTarget)
with tempfile.TemporaryDirectory() as tmpDir:
    builtArchive = shutil.make_archive(os.path.join(tmpDir, 'images'), 'zip', '/kaggle/working')
    shutil.move(builtArchive, archiveTarget)
archiveTarget