In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pathlib
import tensorflow as tf

In [None]:
# Shell listing (long format, human-readable sizes) of the raw vehicle dataset folder.
!ls -lh ../data/raw/vehicle

In [None]:
# Shell listing of the train folder; the loops below treat each entry here as one class.
!ls -lh ../data/raw/vehicle/train/train

In [None]:
# Shell listing of the test folder, truncated to the first 5 output lines by `head -5`.
!ls -lh ../data/raw/vehicle/test/testset | head -5

In [None]:
# Accumulator for (class, file path) pairs, one per training image,
# and the folder whose sub-directories are scanned to fill it.
data_train = list()
root_train = '../data/raw/vehicle/train/train/'

In [None]:
# Build one (class, file path) tuple per training image.
# The list is re-created here so that re-running this cell never duplicates rows.
data_train = []
for category in sorted(os.listdir(root_train)):
    category_dir = os.path.join(root_train, category)
    # Stray files next to the class folders (e.g. macOS ".DS_Store") are not
    # categories; calling os.listdir on them would raise NotADirectoryError.
    if not os.path.isdir(category_dir):
        continue
    for file in sorted(os.listdir(category_dir)):
        data_train.append((category, os.path.join(category_dir, file)))

In [None]:
# One row per training image: its class label and the path to the file.
train_df = pd.DataFrame.from_records(
    data_train,
    columns=['class', 'file_path'],
)

In [None]:
# Preview the first rows of the training table.
train_df.head()

In [None]:
# Report how many training images were indexed (row count of train_df).
print(f'There are {train_df.shape[0]} images on train folder')

In [None]:
# Accumulator for the test image entries, and the folder that is scanned to fill it.
data_test = list()
root_test = '../data/raw/vehicle/test/testset/'

In [None]:
# Collect the test folder entries in sorted order.
# A fresh list is assigned (rather than appending to the existing one) so that
# re-running this cell does not duplicate entries.
# NOTE(review): only bare file names are stored here, while the train list holds
# full paths -- join with root_test before opening a file.
data_test = sorted(os.listdir(root_test))

In [None]:
# Single-column table holding the collected test folder entries.
test_df = pd.DataFrame({'file_path': data_test})

In [None]:
# Preview the first rows of the test table.
test_df.head()

In [None]:
# Report how many test images were indexed (row count of test_df).
print(f'There are {test_df.shape[0]} images on test folder')

In [None]:
# Display the class label column of the training table.
train_df['class']

In [None]:
# Number of training images per class.
train_df['class'].value_counts()

In [None]:
# Bar chart of the number of training images per class.
train_df['class'].value_counts().plot.bar().set_title('Types of vehicles')

The dataset is highly imbalanced: there are 8892 images in the boat class and only 73 images in the limousine class.

In [None]:
# From here on root_train is a pathlib.Path (it was a plain string above),
# which is what the glob() calls below rely on.
root_train = pathlib.Path(root_train)
# Count the .jpg files located exactly one directory level below the train root.
image_count = sum(1 for jpg_path in root_train.glob('*/*.jpg'))

In [None]:
# Class names are the names of the sub-directories of the train root.
# - is_dir() and the leading-dot check drop stray entries such as ".DS_Store",
#   which are not classes.
# - sorted() fixes the order: glob() yields entries in a filesystem-dependent
#   order, and this array's order is what maps label indices to class names.
CLASS_NAMES = np.array(sorted(
    item.name
    for item in root_train.glob('*')
    if item.is_dir() and not item.name.startswith('.')
))

In [None]:
# Display the detected class names.
CLASS_NAMES

In [None]:
# Generator that multiplies every pixel value by 1/255 when images are loaded.
image_generator = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1. / 255)

# Batch size and the size every image is resized to.
BATCH_SIZE = 32
IMG_HEIGHT = 224
IMG_WIDTH = 224

# Number of batches needed to see every counted image once. np.ceil returns a
# float, so cast to int: a step count is a whole number.
STEPS_PER_EPOCH = int(np.ceil(image_count / BATCH_SIZE))

In [None]:
# Iterator over shuffled, resized training batches read from the class sub-folders.
# Passing `classes` explicitly ties label indices to the order of CLASS_NAMES.
train_data_gen = image_generator.flow_from_directory(
    directory=str(root_train),
    classes=list(CLASS_NAMES),
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=BATCH_SIZE,
    shuffle=True,
)

In [None]:
# Draw one batch and show up to 25 of its images in a 5x5 grid, titled by class.
image_batch, label_batch = next(train_data_gen)

# Never index past the end of the batch (e.g. if BATCH_SIZE is set below 25).
n_shown = min(25, len(image_batch))

plt.figure(figsize=(10, 10))
for n in range(n_shown):
    plt.subplot(5, 5, n + 1)
    plt.imshow(image_batch[n])
    # The label row is compared against 1 to pick the matching class name.
    plt.title(CLASS_NAMES[label_batch[n] == 1][0].title())
    plt.axis('off')
plt.show()

In [None]:
#other way to show images
import cv2
import random

In [None]:
# Distinct class labels, in order of first appearance in train_df.
cols = list(train_df['class'].unique())

In [None]:
# Display the list of distinct class labels.
cols

In [None]:
print("="*70)
print("Displaying 4 random image per vehicle category")
print("="*70)

# One figure per category, each with 4 randomly chosen images side by side.
# Iterating over `cols` directly avoids hard-coding the number of categories.
for category in cols:
    # os.path.join accepts both str and pathlib.Path, so this works whether or
    # not root_train has already been converted to a Path.
    class_dir = os.path.join(root_train, category)
    candidate_files = os.listdir(class_dir)

    fig, axes = plt.subplots(1, 4, figsize=(20, 20))
    for ax in axes:
        random_file = random.choice(candidate_files)
        img = cv2.imread(os.path.join(class_dir, random_file))
        ax.set_title(category)
        if img is None:
            # cv2.imread returns None for files it cannot decode; leave the
            # panel empty instead of crashing in imshow.
            ax.axis('off')
            continue
        # OpenCV loads images as BGR while matplotlib expects RGB.
        ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.show()