In [1]:
import os
import pandas as pd
import numpy as np
from skimage import io, transform

# Torch utilities
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

### Constants

In [2]:
# Root directory of the car images
imgs_dir = "result_data/images"

# Location of the CSV file describing the cars
csv_path = "data/final.csv"

# Names used to tell exterior shots from interior shots
EXTERIOR, INTERIOR = "exterior", "interior"

### Load data from csv file

In [3]:
# Read the CSV and keep only the rows whose Doors value is present
data = pd.read_csv(csv_path).loc[lambda frame: frame.Doors.notna()]
data.head()

Unnamed: 0,ID,Manufacturer,Model,Category,Mileage,Gear box type,Doors,Wheel,Color,Interior color,VIN,Leather interior,Price,Customs
1,45788844,TOYOTA,RAV 4,Jeep,30402 km,Variator,4/5,Left wheel,Blue,Black,,0,15000,518.0
2,45653468,HONDA,Insight,Hatchback,210758 km,Automatic,4/5,Left wheel,Silver,,JHMZE2H57AS029004,1,800,574.0
3,45731431,KIA,Optima,Sedan,131040 km,Tiptronic,4/5,Left wheel,White,Black,KNAGM4AD8D5052655,0,5500,751.0
4,45771182,LEXUS,ES 300,Sedan,135500 km,Tiptronic,4/5,Left wheel,White,Black,,1,13500,
5,45761498,TOYOTA,Prius,Hatchback,226000 km,Automatic,4/5,Left wheel,Blue,Beige,,1,2980,761.0


### Door Types

In [4]:
# Distinct values present in the Doors column
door_types = data["Doors"].unique()
door_types

array(['4/5', '2/3', '>5'], dtype=object)

As we can see, there are 3 door types: `4/5`, `2/3` and `>5`.

### Divide data into train, validation, test

In [5]:
# Seed for the shuffle so the split is identical after every Restart & Run All
SPLIT_SEED = 42

# Row positions at which the shuffled data is cut: 70% train, 15% validate, 15% test
props = [int(.7*len(data)), int(.85*len(data))]
train_end, validate_end = props

# Shuffle once (reproducibly), then slice the shuffled frame into the 3 parts
shuffled = data.sample(frac=1, random_state=SPLIT_SEED)
train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

# Every row must land in exactly one part
assert len(train) + len(validate) + len(test) == len(data)

# PyTorch DataLoader

In [6]:
class CarsDataset(Dataset):
    """Cars Dataframe Dataset.

    Each sample is a dict ``{'doors': ..., 'images': [...]}`` holding the
    row's ``Doors`` value and every image read from
    ``<root_dir>/<value of the row's first column>/exterior``.
    """

    def __init__(self, df, root_dir=imgs_dir, transform=None):
        """
        Args:
            df (pd.DataFrame): Dataframe object for data. It must have a
                ``Doors`` column; the value in its first column is used as
                the name of the row's image folder.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.df = df
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the dataframe."""
        return len(self.df)

    def __getitem__(self, idx):
        """Build the sample for the row at position ``idx``.

        Args:
            idx: Positional row index; a tensor index is converted with
                ``tolist()`` first.

        Returns:
            dict: ``{'doors': doors, 'images': images}``, passed through
            ``self.transform`` when one was given.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Get Doors Type
        doors = self.df.iloc[idx].Doors

        # The image folder is named after the value in the first column.
        # NOTE(review): in the frame displayed by this notebook column 0 appears
        # to be 'Unnamed: 0', not 'ID' - confirm the folders are keyed by it.
        imgs_ext_dir = os.path.join(self.root_dir, str(self.df.iloc[idx, 0]), EXTERIOR)

        # os.listdir returns entries in arbitrary order; sort the names so the
        # images of a sample always come back in the same order.
        images = [
            io.imread(os.path.join(imgs_ext_dir, img_rel_path))
            for img_rel_path in sorted(os.listdir(imgs_ext_dir))
        ]

        # Data (Sample) to return
        sample = {'doors' : doors,'images':images}
        if self.transform:
            sample = self.transform(sample)
        return sample

# Create Transform Classes

In [7]:
class Rescale(object):
    """Resize every image of a sample to one fixed size.

    Args:
        output_size (tuple): Target ``(height, width)``; each image is resized
            to exactly this size.
    """

    def __init__(self, output_size):
        assert isinstance(output_size, tuple)
        self.output_size = output_size

    def __call__(self, sample):
        """Replace ``sample['images']`` with resized copies and return the sample.

        Args:
            sample(dict): {'doors' : doors,'images':images}
                    the door type of a car together with the list of its images

        Returns:
            dict: the same ``sample`` object, with its ``'images'`` entry
            rebound to the list of resized images.
        """
        def _resized(picture):
            # Height and width are cast to int before being handed to resize
            height, width = self.output_size
            return transform.resize(picture, (int(height), int(width)))

        sample['images'] = [_resized(picture) for picture in sample['images']]
        return sample

### * Datasets

In [8]:
# Shared pipeline applied to every sample: resize each image to 256x256
_transforms = transforms.Compose([Rescale((256, 256))])

# One CarsDataset per split, all using the same pipeline
train_dataset, validate_dataset, test_dataset = (
    CarsDataset(split, transform=_transforms) for split in (train, validate, test)
)

### * Dataloaders

#### If `torch.cuda.is_available()` returns True we set `device = "cuda"`, otherwise `device = "cpu"`. This lets the program run on GPU or CPU depending on whether a GPU is available

We set `num_workers` to 1 and `pin_memory` to True in `kwargs` (only when CUDA is available; on CPU `kwargs` stays empty). `num_workers` denotes the number of processes that generate batches in parallel. Setting `num_workers` to a positive integer turns on multi-process data loading with the specified number of loader worker processes. For data loading, passing `pin_memory=True` to a DataLoader automatically puts the fetched data Tensors in pinned memory, which enables faster data transfer to CUDA-enabled GPUs.

In [9]:
# Use the GPU when CUDA is available; the extra DataLoader options are only set in that case
if torch.cuda.is_available():
    device = "cuda"
    kwargs = {'num_workers': 1, 'pin_memory': True}
else:
    device = "cpu"
    kwargs = {}

In [10]:
# One DataLoader per split, each under its own *_dataloader name.
# The validation and test loaders wrap a freshly built CarsDataset instead of
# reading validate_dataset / test_dataset, so re-running this cell never wraps
# a DataLoader inside another DataLoader.
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, **kwargs)
validate_dataloader = DataLoader(
    CarsDataset(validate, transform=_transforms), batch_size=4, shuffle=True, **kwargs
)
test_dataloader = DataLoader(
    CarsDataset(test, transform=_transforms), batch_size=4, shuffle=True, **kwargs
)

# Backward-compatible aliases: this cell used to rebind these two names to the
# loaders, so they are kept for any later cell that iterates them for batches.
# Prefer validate_dataloader / test_dataloader in new code.
validate_dataset = validate_dataloader
test_dataset = test_dataloader