In [None]:
from PIL import Image
import numpy as np
import os
import shutil
#from tqdm import tqdm
from tqdm.notebook import trange, tqdm
import imageio
import matplotlib.pyplot as plt
import pandas as pd
from random import shuffle
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import keras
from keras_tqdm import TQDMNotebookCallback
from keras.models import Sequential
from keras.utils import to_categorical
from keras.regularizers import l2
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ReduceLROnPlateau
from keras import optimizers
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)
import cv2
import sys

from keras.layers import Activation, Dense, Dropout, Flatten
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Conv2D # to add convolutional layers
from keras.layers.convolutional import MaxPooling2D # to add pooling layers
from keras.layers.advanced_activations import LeakyReLU,ThresholdedReLU

In [None]:
def plot_loss_accuracy(history):
    """Plot every metric logged during training on a single chart.

    The chart title reports the validation loss and validation accuracy of
    the last epoch.

    Parameters
    ----------
    history : object
        Must expose `.history` (dict mapping metric name to a per-epoch list)
        and `.epoch` (list of epoch indices), e.g. the value returned by
        `model.fit_generator`.

    Raises
    ------
    KeyError
        If `history.history` has no 'val_loss' entry, or neither a 'val_acc'
        nor a 'val_accuracy' entry.
    """
    historydf = pd.DataFrame(history.history, index=history.epoch)
    # Create the axes explicitly and hand them to pandas: DataFrame.plot()
    # without `ax` opens a figure of its own, which left the 8x6 figure empty.
    _, ax = plt.subplots(figsize=(8, 6))
    historydf.plot(ax=ax, ylim=(0, max(1, historydf.values.max())))
    loss = history.history['val_loss'][-1]
    # The accuracy key is 'val_acc' or 'val_accuracy' depending on the Keras
    # version; accept whichever one was logged.
    acc_key = 'val_acc' if 'val_acc' in history.history else 'val_accuracy'
    acc = history.history[acc_key][-1]
    ax.set_title('Validation Loss: %.3f, Validation Accuracy: %.3f' % (loss, acc))

In [None]:
# Ids of the labelled training images, as listed in train.csv.
train_categories = pd.read_csv('train.csv')
train_images = train_categories.id.values.tolist()
# NOTE(review): machine-specific absolute Windows path; every later cell builds
# its folders from it, so adjust it before running elsewhere.
location = 'C:\\Users\\ak19919\\Downloads\\Github\\Analytics-Vidya\\Apparels'

# Map image id -> label, both kept as the raw strings found in train.csv.
trainLabels = {}
# `with` guarantees the handle is closed once the text has been read.
with open("train.csv", "r") as f:
    # splitlines() plus the blank-line filter keeps the last record even when
    # the file has no trailing newline (the old `len(...) - 1` loop dropped it).
    clothes = [row.split(',') for row in f.read().splitlines() if row.strip()]

for row in tqdm(clothes):
    trainLabels[row[0]] = row[1]
# The header row was parsed like any other record; remove it.
del trainLabels['id']

# Ids of the unlabelled images that have to be classified.
test_images = pd.read_csv('test.csv')
testImages = test_images.id.values.tolist()

In [None]:
# Distinct labels that occur in the training set.
Apparels = trainLabels.values()
trainSet = set(Apparels)
# One counter per label, all starting at zero.
itr_set = dict.fromkeys(trainSet, 0)

In [None]:
# One-off dataset preparation: runs only while the 'train_labelled' folder does
# not exist yet. It re-saves every training image as '<label>-<n>.png' and then
# moves 20% of each label's images into 'test_labelled' for validation.
if not os.path.exists(location + str('\\train_labelled')):
    os.makedirs(location + str('\\train_labelled'))
    # exist_ok: an interrupted earlier run may have left this folder behind.
    os.makedirs(location + str('\\test_labelled'), exist_ok=True)

    # Set lookup keeps the per-file membership test O(1) instead of scanning
    # the whole id list for every image.
    train_ids = set(train_images)

    # Combine labels and images and move to labelled train folder
    for img in tqdm(os.listdir(location + '\\train')):
        imgName = img.split('.')[0]
        try:
            image_id = int(imgName)
        except ValueError:
            # Stray non-image entries (e.g. Thumbs.db) have no numeric id.
            continue
        if image_id not in train_ids:
            continue
        label = trainLabels[str(imgName)]
        itr_set[label] += 1
        path = os.path.join(location + '\\train\\', img)
        saveName = location + '\\train_labelled\\' + label + '-' + str(itr_set[label]) + '.png'
        # Context manager releases the file handle right after decoding.
        with Image.open(path) as source_image:
            image_data = np.array(source_image)
        imageio.imwrite(saveName, image_data)

    # Move 20% of labelled data to validation folder for testing
    validation_data = os.listdir(location + '\\train_labelled')
    random.Random(28).shuffle(validation_data)
    # From here on itr_set holds the number of files still to move per label.
    for i in itr_set:
        itr_set[i] = int(itr_set[i]*0.2)
    # Single pass over the shuffled listing: a file is moved while its label
    # still has quota left. This picks the same files as scanning the listing
    # once per label, without the repeated scans.
    for j in tqdm(validation_data):
        label = j.split('-')[0]
        if itr_set.get(label, 0) > 0:
            shutil.move(location + '\\train_labelled\\' + str(j), location + str('\\test_labelled'))
            itr_set[label] -= 1

# Move unlabelled data for classification to test folder
if not os.path.exists(location + str('\\test_images')):
    os.makedirs(location + str('\\test_images'))
    for image in tqdm(testImages):
        shutil.move(location + '\\test\\' + str(image) + '.png', location + str('\\test_images'))

In [None]:
def label_img(name):
    """Return the one-hot vector for the class encoded in a file name.

    The class is the text before the first '-' in `name` and must be one of
    the strings '0' to '9'. The result is a length-10 integer array with a
    single 1 at that class index. Any other prefix yields None.
    """
    index_by_prefix = {str(digit): digit for digit in range(10)}
    class_index = index_by_prefix.get(name.split('-')[0])
    if class_index is None:
        return None
    one_hot = np.zeros(10, dtype=int)
    one_hot[class_index] = 1
    return one_hot

In [None]:
def get_size_statistics(DIR):
    """Print average / max / min height and width of the images in a folder.

    Parameters
    ----------
    DIR : str
        Folder whose entries are all opened with PIL.

    Returns
    -------
    None. The statistics are printed; a notice is printed instead when the
    folder contains no entries.
    """
    heights = []
    widths = []
    for img in tqdm(os.listdir(DIR)):
        path = os.path.join(DIR, img)
        # Image.size is (width, height) and is read without converting the
        # image to an array; the context manager closes the file afterwards.
        with Image.open(path) as image:
            width, height = image.size
        heights.append(height)
        widths.append(width)
    if not heights:
        # Nothing to average: avoid dividing by zero below.
        print("No images found in " + str(DIR))
        return
    avg_height = sum(heights) / len(heights)
    avg_width = sum(widths) / len(widths)
    print("Average Height: " + str(avg_height))
    print("Max Height: " + str(max(heights)))
    print("Min Height: " + str(min(heights)))
    print("Average Width: " + str(avg_width))
    print("Max Width: " + str(max(widths)))
    print("Min Width: " + str(min(widths)))

In [None]:
# Print the height/width statistics of the images in the labelled training folder.
get_size_statistics(location + '\\train_labelled')

In [None]:
# Side length (pixels) every image is resized to before it is fed to the model.
IMG_SIZE = 48


def _read_resized_gray(path):
    """Read `path` as a grayscale image and resize it to IMG_SIZE x IMG_SIZE."""
    image = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread signals an unreadable file by returning None; fail here
        # with the offending path instead of an opaque error in cv2.resize.
        raise IOError('Could not read image: ' + str(path))
    return cv2.resize(image, (IMG_SIZE, IMG_SIZE))


def _load_labelled_data(DIR):
    """Load every file in `DIR` as [image array, one-hot label], shuffled in place."""
    samples = []
    for file_name in tqdm(os.listdir(DIR)):
        label = label_img(file_name)
        image = _read_resized_gray(os.path.join(DIR, file_name))
        samples.append([np.array(image), label])
    shuffle(samples)
    return samples


def load_training_data(DIR):
    """Return the shuffled list of [image array, one-hot label] pairs for `DIR`.

    Labels come from `label_img` applied to each file name.

    Raises
    ------
    IOError
        If a file in `DIR` cannot be read as an image.
    """
    return _load_labelled_data(DIR)


def load_validation_data(DIR):
    """Return the shuffled list of [image array, one-hot label] pairs for `DIR`.

    Same loading procedure as `load_training_data`; kept as a separate entry
    point for the validation folder.

    Raises
    ------
    IOError
        If a file in `DIR` cannot be read as an image.
    """
    return _load_labelled_data(DIR)


def load_testing_data(DIR):
    """Return a list of [image array, file name] pairs for `DIR`, in listing order.

    The file name is kept so predictions can be matched back to image ids.

    Raises
    ------
    IOError
        If a file in `DIR` cannot be read as an image.
    """
    test_data = []
    for file_name in tqdm(os.listdir(DIR)):
        image = _read_resized_gray(os.path.join(DIR, file_name))
        test_data.append([np.array(image), file_name])
    return test_data

In [None]:
train_data = load_training_data(location + '\\train_labelled')
val_data = load_validation_data(location + '\\test_labelled')

# Images: add the trailing channel axis and scale pixel values by 1/255.
X_train = np.array([i[0] for i in train_data]).reshape(-1,IMG_SIZE,IMG_SIZE,1)
X_train = X_train / 255 # normalize training data
# Labels are one-hot vectors and must stay 0/1: dividing them by 255 made the
# categorical_crossentropy targets sum to 1/255 instead of 1.
y_train = np.array([i[1] for i in train_data])

X_test = np.array([i[0] for i in val_data]).reshape(-1,IMG_SIZE,IMG_SIZE,1)
X_test = X_test / 255 # normalize validation data
y_test = np.array([i[1] for i in val_data])

In [None]:
# Preview the image stored at index 1247 of the shuffled train_data list,
# drawn with the 'gist_gray' colormap.
plt.imshow(train_data[1247][0], cmap = 'gist_gray')

In [None]:
def convolutional_model():
    """Build and compile the CNN used to classify the apparel images.

    Architecture: three blocks of Conv2D -> LeakyReLU -> BatchNormalization ->
    MaxPooling2D -> Dropout(0.5) with 32, 64 and 128 filters, followed by
    Flatten, Dense(256) + LeakyReLU and a 10-way softmax output.

    Returns
    -------
    A compiled `Sequential` model (Adamax optimizer, categorical
    cross-entropy loss, accuracy metric) expecting inputs of shape
    (IMG_SIZE, IMG_SIZE, 1).
    """
    ADAMAX = optimizers.Adamax(lr = 0.002, beta_1 = 0.9, beta_2 = 0.999)

    # create model
    model = Sequential()
    # The convolution itself stays linear; the LeakyReLU layer that follows is
    # the non-linearity. With activation='relu' here the LeakyReLU only ever
    # saw non-negative values and had no effect, unlike the other two blocks.
    model.add(Conv2D(32, (3, 3), padding = 'same', activation = 'linear', kernel_initializer = 'he_uniform', input_shape = (IMG_SIZE, IMG_SIZE, 1)))
    model.add(LeakyReLU(alpha = 0.1))
    model.add(BatchNormalization())
    model.add(MaxPooling2D((2, 2), padding = 'same'))
    model.add(Dropout(0.5))
    model.add(Conv2D(64, (3, 3), activation = 'linear', padding = 'same'))
    model.add(LeakyReLU(alpha = 0.1))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(pool_size = (2, 2), padding = 'same'))
    model.add(Dropout(0.5))
    model.add(Conv2D(128, (3, 3), activation = 'linear',padding = 'same'))
    model.add(LeakyReLU(alpha = 0.1))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(pool_size = (2,2),padding = 'same'))
    model.add(Dropout(0.5))
    # Classifier head
    model.add(Flatten())
    model.add(Dense(256, activation = 'linear'))
    model.add(LeakyReLU(alpha = 0.1))
    model.add(Dense(10, activation = 'softmax'))
    # Compile model
    model.compile(optimizer = ADAMAX, loss = 'categorical_crossentropy',  metrics = ['accuracy'])
    return model

In [None]:
# Random augmentation applied to training batches: small rotations, shifts,
# shear, zoom and horizontal flips. No centering or std-normalisation is used.
augmentation_options = dict(
    rotation_range = 10,
    width_shift_range = 0.1,
    height_shift_range = 0.1,
    shear_range = 0.1,
    zoom_range = 0.25,
    fill_mode = 'nearest',
    horizontal_flip = True,
    vertical_flip = False,
    featurewise_center = False,
    samplewise_center = False,
    featurewise_std_normalization = False,
    samplewise_std_normalization = False,
)
gen = ImageDataGenerator(**augmentation_options)
# Validation images are passed through without augmentation.
test_gen = ImageDataGenerator()

# Feed the model in batches of 256 samples.
train_generator = gen.flow(X_train, y_train, batch_size = 256)
test_generator = test_gen.flow(X_test, y_test, batch_size = 256)

In [None]:
# Learning-rate annealer: once 'val_acc' has stopped improving for 20 epochs,
# the learning rate is multiplied by 0.4, never going below 2e-5.
reduce_lr = ReduceLROnPlateau(
    monitor = 'val_acc',
    mode = 'auto',
    patience = 20,
    cooldown = 0,
    factor = 0.4,
    min_lr = 0.00002,
    verbose = 1,
)

In [None]:
# build the model
model = convolutional_model()
epochs = 100

# One epoch = one full pass over the data. The step counts are derived from
# the array sizes and the generators' batch size instead of the hard-coded
# 190 / 48, so they stay correct when the dataset or batch size changes.
train_steps = int(np.ceil(len(X_train) / train_generator.batch_size))
val_steps = int(np.ceil(len(X_test) / test_generator.batch_size))

# fit the model
history = model.fit_generator(train_generator, steps_per_epoch = train_steps, epochs = epochs,
                              validation_data = test_generator, validation_steps = val_steps, verbose = 1,
                              callbacks=[reduce_lr])

# evaluate the model
# scores[0] is the loss, scores[1] the accuracy metric given to compile().
scores = model.evaluate(X_test, y_test, verbose = 1)
print("Accuracy: {} \n Error: {}".format(scores[1], 100-scores[1]*100))

In [None]:
# Chart the metrics recorded in `history` by the training cell.
plot_loss_accuracy(history)

In [None]:
# Load the unlabelled images; each entry is [image array, file name].
test_data = load_testing_data(location + '\\test_images')
test = np.array([i[0] for i in test_data]).reshape(-1, IMG_SIZE, IMG_SIZE, 1)
test_labels = np.array([i[1] for i in test_data])  # file names, used later as ids
test = test / 255 # normalize test data
# Take the argmax of the raw softmax probabilities. Rounding them first turned
# every row whose best probability was <= 0.5 into all zeros, so argmax then
# returned class 0 for those samples.
Y_pred = np.argmax(model.predict(test), axis = 1)
Y_pred = pd.Series(Y_pred, name = "label")

In [None]:
# Submission table: one row per test image with its predicted label.
# The id is the file name up to its first '.'.
submission_ids = [file_name.split('.')[0] for file_name in test_labels]
submission_df = pd.DataFrame({"id": pd.Series(test_labels), "label": pd.Series(Y_pred)})
submission_df['id'] = submission_ids
submission_df.to_csv('submission_3_96p_3CLBMD_FDLD.csv', index = False)

In [None]:
# Persist the trained model under a name that records the run configuration.
# NOTE(review): LR only feeds the file name; the optimizer built in
# convolutional_model() uses lr = 0.002, so the two values disagree - confirm
# which one is intended.
LR = 0.001
model_name = 'classify_apparels-%s-%s.model' % (LR, '96p_3CLBMD_FDLD_conv_v3')
model.save(model_name)