# **Import Modules**

In [27]:
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import numpy as np
import pandas as pd
import shutil
from google.colab import drive


# **Download Dataset From Kaggle**

In [None]:
! pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download landrykezebou/vcor-vehicle-color-recognition-dataset

In [None]:
# Extract the downloaded Kaggle archive into /content/Dataset.
!unzip vcor-vehicle-color-recognition-dataset.zip -d /content/Dataset

# **Dataset Info**

This dataset is a combination of two datasets:

1.   [**KaggleDS**](https://www.kaggle.com/datasets/landrykezebou/vcor-vehicle-color-recognition-dataset)
2.   [**CVisionDS**](https://class.vision/)

________________________________________________________________________________
Training images come from both KaggleDS and CVisionDS.

Validation images come only from CVisionDS (results on this dataset are the ones that matter to us).



In [16]:
# Mount Google Drive at /content/drive so the dataset stored on Drive is reachable.
# `drive` is provided by the notebook's import cell, so it is not re-imported here.
# force_remount=True remounts even when Drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [28]:
# Root folder of the combined dataset on the mounted Drive.
DS_DIR = os.path.join("/content/drive/MyDrive", "Car-Color-Recognition", "CarDS")


def _count_per_class(split, class_names):
    """Return a Series with the number of directory entries in DS_DIR/<split>/<class> for each class."""
    counts = []
    for class_name in class_names:
        class_dir = os.path.join(DS_DIR, split, class_name)
        counts.append(len(os.listdir(class_dir)))
    return pd.Series(counts)


# Class names are the sub-folder names of the training split, in alphabetical order.
colors = sorted(os.listdir(os.path.join(DS_DIR, 'train')))
train_len = _count_per_class('train', colors)
valid_len = _count_per_class('val', colors)

# One row per color with its train / validation / total image counts.
dic = {
    "color": colors,
    "train_len": train_len,
    "valid_len": valid_len,
    "total_len": train_len + valid_len,
}
df = pd.DataFrame(dic)

# Persist the summary next to the dataset, then display it as the cell output.
df.to_csv(os.path.join(DS_DIR, 'Ds-Info.csv'))
df

Unnamed: 0,color,train_len,valid_len,total_len
0,beige,1050,124,1174
1,black,1235,298,1533
2,blue,1102,275,1377
3,brown,1051,237,1288
4,cream,600,33,633
5,crimson,623,9,632
6,gold,600,25,625
7,green,904,46,950
8,grey,1373,343,1716
9,navy-blue,590,45,635


# **Prepare Data**

**Implemented Process**


*   Upsample images to at least (600, 900)
*   Augment classes that have fewer than 1000 samples



Upsample images to at least 600x900 when they are smaller than that.

In [None]:
def upsample(path, min_height=600, min_width=900):
    """Load an image and enlarge it by an integer factor to at least ``min_height`` x ``min_width``.

    The same factor is applied to both axes, so the aspect ratio is kept.
    Images that already meet both minimums are returned at their original
    size (factor 1).

    Args:
        path: Path of the image file to read.
        min_height: Minimum number of rows of the result (default 600).
        min_width: Minimum number of columns of the result (default 900).

    Returns:
        A rank-3 tensor ``(height, width, 3)`` holding the enlarged image.
    """
    # Decode to exactly three channels: without `channels=3` a grayscale or RGBA
    # file yields 1 or 4 channels and cannot be returned as an (H, W, 3) image.
    image = tf.image.decode_image(tf.io.read_file(path), channels=3, expand_animations=False)
    height, width = image.shape[0], image.shape[1]

    # Smallest integer factor that brings BOTH sides up to their minimum.
    # np.ceil returns a float, so cast before handing it to the layer.
    scale = int(max(np.ceil(min_height / height), np.ceil(min_width / width)))

    # UpSampling2D works on batches, so add a batch axis and drop it again afterwards.
    batched = tf.expand_dims(image, axis=0)
    batched = tf.keras.layers.UpSampling2D(size=(scale, scale))(batched)
    return tf.squeeze(batched, axis=0)

In [None]:
def save_upsampled(src, dist):
    """Upsample the image stored at ``src`` and write the result to ``dist``.

    Args:
        src: Path of the source image file.
        dist: Path of the image file to write.

    Prints a single ``.`` (no newline) per saved image as a progress marker.
    """
    image = upsample(src)
    # The image is written as returned by upsample(): the former tf.image.resize
    # call targeted the image's own height/width and therefore changed nothing.
    tf.keras.utils.save_img(dist, image)
    print('.', end='')

In [None]:
def make_upsample(color, dist, src_root="/content/drive/MyDrive"):
    """Upsample every file of one color folder and save the results into ``dist``.

    Args:
        color: Name of the folder (relative to ``src_root``) holding the source images.
        dist: Destination directory; created if it does not exist yet.
        src_root: Directory that contains the ``color`` folder. Defaults to the
            mounted Drive root that was previously hard-coded.
    """
    src_dir = os.path.join(src_root, color)
    # Writing into a missing directory would fail on the first image.
    os.makedirs(dist, exist_ok=True)
    # Sorted so the processing order does not depend on the filesystem.
    for file in sorted(os.listdir(src_dir)):
        save_upsampled(os.path.join(src_dir, file), os.path.join(dist, file))

In [None]:
def _split_to_df(split_dir):
    """Index one split laid out as ``<split_dir>/<class>/<file>`` into a DataFrame.

    Returns a DataFrame with columns ``file_paths`` and ``labels`` (the class
    folder name), ordered by class name and then by file name.
    """
    file_paths = []
    labels = []
    for class_ in sorted(os.listdir(split_dir)):
        classpath = os.path.join(split_dir, class_)
        # Skip stray files lying next to the class folders; listing them would raise.
        if not os.path.isdir(classpath):
            continue
        for file in sorted(os.listdir(classpath)):
            file_path = os.path.join(classpath, file)
            # Only regular files are samples; nested folders are ignored.
            if os.path.isfile(file_path):
                file_paths.append(file_path)
                labels.append(class_)
    return pd.DataFrame({'file_paths': file_paths, 'labels': labels})


def dirsTOdf (dsDir):
    """Build the train and validation DataFrames from a dataset directory.

    Args:
        dsDir: Dataset root containing a ``train`` and a ``val`` folder, each
            with one sub-folder per class.

    Returns:
        ``(train_df, valid_df)``: two DataFrames with the columns
        ``file_paths`` and ``labels``.
    """
    train_df = _split_to_df(os.path.join(dsDir, 'train'))
    valid_df = _split_to_df(os.path.join(dsDir, 'val'))
    return train_df, valid_df

In [None]:
# Dataset root on the mounted Drive; dirsTOdf reads its 'train' and 'val' sub-folders.
DS_DIR = os.path.join("/content/drive/MyDrive","Car-Color-Recognition","CarDS")
# Index both splits into DataFrames with the columns 'file_paths' and 'labels'.
train_df, valid_df= dirsTOdf(DS_DIR)

Check Data Distribution

In [None]:
# Number of rows (images) in each split.
print('train_df length: ', len(train_df),'  valid_df length: ', len(valid_df))
# Per-class sample counts of the training split; list() keeps the counts only, not the class names.
print("\nTrain labels distribution :\n\n{}".format(list(train_df["labels"].value_counts())))

train_df length:  17043   valid_df length:  3495

Train labels distribution :

[5744, 1373, 1235, 1102, 1040, 951, 950, 845, 824, 804, 762, 300, 300, 300, 290, 223]


In [None]:
# Number of training images per class, indexed by label.
train_df.groupby('labels').count()

Unnamed: 0_level_0,file_paths
labels,Unnamed: 1_level_1
beige,950
black,1235
blue,1102
brown,951
cream,300
gold,300
green,804
grey,1373
navy-blue,290
orange,762


As you can see, the training data is imbalanced.

We will bring all classes up to 400 samples with Pop and Augmentation.

In [None]:
def trim (df, size, column):
    """Keep only the classes that have at most ``size`` samples.

    Classes with more than ``size`` rows are dropped entirely (their rows are
    not sub-sampled). The kept rows are grouped class by class, in order of
    first appearance of each class, and re-indexed from 0. The input frame is
    left untouched.

    Args:
        df: DataFrame holding one row per sample.
        size: Maximum number of rows a class may have to be kept.
        column: Name of the column holding the class label.

    Returns:
        A new DataFrame with the rows of the kept classes; empty (same
        columns) when no class qualifies.
    """
    original_class_count = len(df[column].unique())
    print ('Original Number of classes in dataframe: ', original_class_count)

    # sort=False iterates the groups in order of first appearance of each label.
    kept_groups = [group for _, group in df.groupby(column, sort=False) if len(group) <= size]

    if kept_groups:
        df = pd.concat(kept_groups, axis=0).reset_index(drop=True)
    else:
        # pd.concat raises on an empty list; return an empty frame with the same columns instead.
        df = df.iloc[0:0].reset_index(drop=True)

    final_class_count = len(df[column].unique())
    if final_class_count != original_class_count:
        print ('*** WARNING***  dataframe has a reduced number of classes' )
    print( list(df[column].value_counts()) )
    return df

In [None]:
def balance(train_df,limit_size,unit_size,column, working_dir, image_size):
    """Write augmented images to disk for the under-represented classes.

    Classes with more than ``limit_size`` samples are first removed by
    ``trim``. For every remaining class with fewer than ``limit_size``
    samples, at least ``(limit_size // sample_count) * unit_size`` randomly
    augmented images (flip, rotation, shift, zoom) are generated and saved as
    JPEG files into ``<working_dir>/train/<label>``.

    Args:
        train_df: DataFrame with a ``file_paths`` column and a label column.
        limit_size: Classes with fewer samples than this are augmented.
        unit_size: Number of images generated per whole multiple of the class
            size that fits into ``limit_size``.
        column: Name of the label column in ``train_df``.
        working_dir: Directory whose ``train`` sub-folder receives the images.
        image_size: ``target_size`` handed to the generator for the written images.

    Prints the total number of augmented images created; returns nothing.
    """
    train_df = trim(train_df, limit_size, column)
    aug_dir = os.path.join(working_dir, 'train')
    total = 0
    gen = ImageDataGenerator(horizontal_flip=True, rotation_range=20, width_shift_range=.2,
                             height_shift_range=.2, zoom_range=.1)
    # Group by the caller-supplied label column (previously hard-coded to 'labels',
    # which ignored the `column` argument).
    groups = train_df.groupby(column)
    for label in train_df[column].unique():
        group = groups.get_group(label)  # rows of this class only
        sample_count = len(group)
        if sample_count >= limit_size:
            continue  # class is already large enough

        # Number of augmented images to create for this class.
        delta = (limit_size // sample_count) * unit_size
        target_dir = os.path.join(aug_dir, label)
        # The generator writes into save_to_dir; make sure it exists first.
        os.makedirs(target_dir, exist_ok=True)
        aug_gen = gen.flow_from_dataframe(group, x_col='file_paths', y_col=None, target_size=image_size,
                                          class_mode=None, batch_size=1, shuffle=False,
                                          save_to_dir=target_dir, color_mode='rgb',
                                          save_format='jpg')
        # Each next() yields one batch and, via save_to_dir, writes it to disk.
        aug_img_count = 0
        while aug_img_count < delta:
            images = next(aug_gen)
            aug_img_count += len(images)
        total += aug_img_count
    print('Total Augmented images created= ', total)

In [None]:
# Classes with fewer samples than this are augmented by balance().
limit_size = 1000
# Images generated per whole multiple of the class size that fits into limit_size.
unit_size = 100
# Label column of train_df.
column='labels'
# Passed to balance() as image_size, i.e. the generator's target_size.
img_size=(600,900)
# Augmented images are written into DS_DIR/train/<label>.
balance(train_df, limit_size, unit_size, column, DS_DIR, img_size)

Original Number of classes in dataframe:  16
[951, 950, 845, 824, 804, 762, 300, 300, 300, 290, 223]
Found 290 validated image filenames.
Found 223 validated image filenames.
Found 804 validated image filenames.
Found 300 validated image filenames.
Found 824 validated image filenames.
Found 950 validated image filenames.
Found 762 validated image filenames.
Found 300 validated image filenames.
Found 300 validated image filenames.
Found 951 validated image filenames.
Found 845 validated image filenames.
Total Augmented images created=  2200
