# Car Doors Type Classification

## Part 1: Data Manipulation

### 1.1. Imports

In [1]:
import os
import pandas as pd
import numpy as np
from skimage import io, transform

# Torch utilities
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils,models

### 1.2. Constants

In [2]:
# Images Path
imgs_dir = 'result_data/images'

# CSV Path
csv_path = 'data/final.csv'
mod_csv_path = 'result_data/final.csv'

# Exterior/Interior strings
EXTERIOR = 'exterior'
INTERIOR = 'interior'

### 1.3. Loading data from csv file

In [3]:
data = pd.read_csv(csv_path)
data = data[data.Doors.notna()]
data

Unnamed: 0,ID,Manufacturer,Model,Category,Mileage,Gear box type,Doors,Wheel,Color,Interior color,VIN,Leather interior,Price,Customs
1,45788844,TOYOTA,RAV 4,Jeep,30402 km,Variator,4/5,Left wheel,Blue,Black,,0,15000,518.0
2,45653468,HONDA,Insight,Hatchback,210758 km,Automatic,4/5,Left wheel,Silver,,JHMZE2H57AS029004,1,800,574.0
3,45731431,KIA,Optima,Sedan,131040 km,Tiptronic,4/5,Left wheel,White,Black,KNAGM4AD8D5052655,0,5500,751.0
4,45771182,LEXUS,ES 300,Sedan,135500 km,Tiptronic,4/5,Left wheel,White,Black,,1,13500,
5,45761498,TOYOTA,Prius,Hatchback,226000 km,Automatic,4/5,Left wheel,Blue,Beige,,1,2980,761.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85932,40988107,MITSUBISHI,Pajero Sport,Jeep,0 km,Automatic,4/5,Left wheel,White,Black,,1,31200,
85933,43245646,JEEP,Wrangler rubicon unlimit,Jeep,0 km,Automatic,4/5,Left wheel,Grey,Black,,0,Price,
85934,39483245,JEEP,Grand Cherokee Laredo,Jeep,0 km,Automatic,4/5,Left wheel,Grey,Black,,0,Price,
85935,32774020,FIAT,500 Abarth,Hatchback,0 km,Automatic,2/3,Left wheel,White,Black,,0,Price,


### 1.4. Append image path to each record

If file is already created it will jump over this part

In [4]:
if not os.path.exists(mod_csv_path):
    # Create Empty Dataframe from old columns and plus image path
    old_columns = list(data.columns)
    mod_data = pd.DataFrame(columns=old_columns + ['img_path'])
    new_index = 0

    # Iterate over all cars and append record to given DF
    for index, row in data.iterrows():
        imgs_ext_dir = os.path.join(imgs_dir, str(row['ID']), EXTERIOR)
        # Check if path to exterior exists
        if not os.path.exists(imgs_ext_dir):
            continue
        for img_rel_path in os.listdir(imgs_ext_dir):
            img_abs_path = os.path.join(imgs_ext_dir, img_rel_path)
            # If image exists put into dataframe
            if os.path.exists(img_abs_path):   
                row['img_path'] = img_abs_path
                mod_data.loc[new_index] = row
                new_index +=1
    mod_data.to_csv(mod_csv_path)

In [5]:
data = pd.read_csv(mod_csv_path)
data

Unnamed: 0.1,Unnamed: 0,ID,Manufacturer,Model,Category,Mileage,Gear box type,Doors,Wheel,Color,Interior color,VIN,Leather interior,Price,Customs,img_path
0,0,45788844,TOYOTA,RAV 4,Jeep,30402 km,Variator,4/5,Left wheel,Blue,Black,,0,15000,518.0,result_data/images/45788844/exterior/4.jpg
1,1,45788844,TOYOTA,RAV 4,Jeep,30402 km,Variator,4/5,Left wheel,Blue,Black,,0,15000,518.0,result_data/images/45788844/exterior/1.jpg
2,2,45788844,TOYOTA,RAV 4,Jeep,30402 km,Variator,4/5,Left wheel,Blue,Black,,0,15000,518.0,result_data/images/45788844/exterior/5.jpg
3,3,45653468,HONDA,Insight,Hatchback,210758 km,Automatic,4/5,Left wheel,Silver,,JHMZE2H57AS029004,1,800,574.0,result_data/images/45653468/exterior/1.jpg
4,4,45653468,HONDA,Insight,Hatchback,210758 km,Automatic,4/5,Left wheel,Silver,,JHMZE2H57AS029004,1,800,574.0,result_data/images/45653468/exterior/2.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58191,58191,45737045,HYUNDAI,Tucson,Jeep,67950 km,Automatic,4/5,Left wheel,White,Black,,1,12826,770.0,result_data/images/45737045/exterior/4.jpg
58192,58192,45737045,HYUNDAI,Tucson,Jeep,67950 km,Automatic,4/5,Left wheel,White,Black,,1,12826,770.0,result_data/images/45737045/exterior/2.jpg
58193,58193,45737045,HYUNDAI,Tucson,Jeep,67950 km,Automatic,4/5,Left wheel,White,Black,,1,12826,770.0,result_data/images/45737045/exterior/3.jpg
58194,58194,45737045,HYUNDAI,Tucson,Jeep,67950 km,Automatic,4/5,Left wheel,White,Black,,1,12826,770.0,result_data/images/45737045/exterior/0.jpg


### 1.5. View Available Door Types

In [6]:
door_types = data.Doors.unique()
door_types

array(['4/5', '2/3', '>5'], dtype=object)

As we can see there are 3 types of doors 

### 1.6. Divide data into train, validation, test

In [7]:
# Propotions to be divided by
props = [int(.7*len(data)), int(.85*len(data))]

# Divide data into 3 parts
train, validate, test = np.split(data.sample(frac=1), props)

## Part 2: PyTorch DataSet/DataLoader

### 2.1 Dataset

In [8]:
class CarsDataset(Dataset):
    """ Cars Dataframe Dataset """

    def __init__(self, df, transform=None):
        """
        Args:
            df (pd.Dataframe): Dataframe object for data
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.df = df
        self.transform = transform
    
    def __len__(self):
         return len(self.df)

    def __getitem__(self, idx):
        sample = {'doors' : [-1],'image':[]}
        if torch.is_tensor(idx):
            idx = idx.tolist()
            
        # Get Doors Type
        doors = np.where(door_types == self.df.iloc[idx].Doors)[0]
        
        # Get all images into array
        img_path = str(self.df.iloc[idx]['img_path'])
        if not os.path.exists(img_path):
            return sample
                       
        image = io.imread(img_path)
        
        # Data (Sample) to return
        sample = {'doors' : doors,'image': image}
        if self.transform:
            sample = self.transform(sample)
        return sample

### 2.2 Transform Clases

**Class Rescale**: Scales every picture to given size (by default its 256x256)

In [9]:
class Rescale(object):
    """ Rescale the image in a sample to a given size.

    Args:
        output_size (tuple): Desired output size. Output is matched to output_size. 
    """

    def __init__(self, output_size):
        assert isinstance(output_size, tuple)
        self.output_size = output_size

    def __call__(self, sample):
        """
        Args:
            sample(dict): {'doors' : doors,'images':images}
                    dictionary that represents doors type of car and array of images
        """
        # Get Image
        image = sample['image']
        
        # Rescale Image
        new_h, new_w = self.output_size
        new_h, new_w = int(new_h), int(new_w)
        image = transform.resize(image, (new_h, new_w))
        
        # Change Sample
        sample['image'] = image
        return sample

**Class ToTensor**: Changes all values to Tensors

In [10]:
class ToTensor(object):
    """ Convert ndarrays in sample to Tensors."""

    def __call__(self, sample):
        """
        Args:
            sample(dict): {'doors' : doors,'images':images}
                    dictionary that represents doors type of car and array of images
        """

        # 1. Change (i, j, k) to (k, i, j)
        #        numpy image: H x W x C
        #        torch image: C X H X W
        sample['image'] = sample['image']
        
        # 2. change to tensor long
        sample['doors'] = torch.Tensor(sample['doors']).long()
        
        # return changed Sample
        return sample

### 2.3 Create Datasets with Transforms

In [11]:
# Create transforms array from Rescale class
_transforms = transforms.Compose([Rescale((256, 256)), ToTensor()])

# Load Datasets
train_dataset = CarsDataset(train, transform=_transforms)
validate_dataset = CarsDataset(validate, transform=_transforms)
test_dataset = CarsDataset(test, transform=_transforms)

### 2.4 Create Dataloaders from Datasets

#### If torch.cuda.is_available(), we set device =”cuda”. This allows the program to be run on GPU or CPU based on the availability of GPU

We are setting num_workers as 1 and pin_memory as True in kwargs.num_workers denotes the number of processes that generate batches in parallel. Setting num_workers as a positive integer will turn on multi-process data loading with the specified number of loader worker processes. For data loading, passing pin_memory=True to a DataLoader will automatically put the fetched data Tensors in pinned memory, and thus enables faster data transfer to CUDA-enabled GPUs.

In [12]:
device = "cuda" if torch.cuda.is_available() else "cpu"
kwargs = {'num_workers': 1, 'pin_memory': True} if device=='cuda' else {}
batch_size = 10

In [13]:
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, **kwargs)
validate_dataloader = DataLoader(validate_dataset, batch_size=batch_size, shuffle=True, **kwargs)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, **kwargs)