# Shopee Image Classification Task

## Import Dataset

In [24]:
from PIL import Image, ImageOps
import numpy as np
import pandas as pd
import glob

# Class names double as sub-directory names under ./dataset/Training/;
# the integer label of a class is its position in this list.
categories = ['BabyBibs', 'BabyHat', 'BabyPants', 'BabyShirt', 'PackageFart','womanshirtsleeve',
             'womencasualshoes','womenchiffontop','womendollshoes','womenknittedtop','womenlazyshoes',
             'womenlongsleevetop','womenpeashoes','womenplussizedtop','womenpointedflatshoes',
             'womensleevelesstop','womenstripedtop','wrapsnslings']

# Every image is letterboxed onto a desired_size x desired_size RGB canvas.
desired_size = 64

X_train = []
y_train = []

for label, cate in enumerate(categories):
    print('Starting to load ' + cate)
    for filename in glob.glob('./dataset/Training/' + cate + '/*.jpg'):
        with Image.open(filename) as img:
            # Scale so the longer side becomes desired_size while keeping the
            # aspect ratio; clamp to 1 so a very thin image never yields a
            # zero-sized dimension, which resize() rejects.
            old_size = img.size
            ratio = float(desired_size) / max(old_size)
            new_size = tuple(max(1, int(x * ratio)) for x in old_size)
            # Convert first so palette/greyscale files are resampled in RGB.
            # Image.LANCZOS is the filter formerly exposed as Image.ANTIALIAS.
            resized = img.convert("RGB").resize(new_size, Image.LANCZOS)
        # Centre the *resized* image on a black square canvas. (The previous
        # code pasted the original, un-resized image here.)
        new_img = Image.new("RGB", (desired_size, desired_size))
        new_img.paste(resized, ((desired_size - new_size[0]) // 2,
                                (desired_size - new_size[1]) // 2))
        # (H, W, 3) -> (3, H, W) -> flat vector: all R values, then G, then B.
        pixels = np.asarray(new_img).transpose(2, 0, 1).reshape(-1)
        X_train.append(pixels)
        y_train.append(label)
    # Running sample count and the label just finished, as a progress trace.
    print(len(X_train))
    print(label)

X_train = np.array(X_train, dtype = 'float32')
y_train = np.array(y_train, dtype = 'int32')

X_train_df = pd.DataFrame(X_train)
y_train_df = pd.DataFrame(y_train)

# Persist without index or header: each row is one sample / one label.
# Readers must therefore pass header=None to pd.read_csv.
X_train_df.to_csv('./dataset/data_train_values/train.csv', index = False, header = False)
y_train_df.to_csv('./dataset/data_train_values/label.csv', index = False, header = False)

print(X_train.shape)
print(y_train.shape)


Starting to load BabyBibs
3388
0
Starting to load BabyHat
6037
1
Starting to load BabyPants
7963
2
Starting to load BabyShirt
9875
3
Starting to load PackageFart
11849
4
Starting to load womanshirtsleeve
13960
5
Starting to load womencasualshoes
17060
6
Starting to load womenchiffontop
20247
7
Starting to load womendollshoes
21694
8
Starting to load womenknittedtop
23982
9
Starting to load womenlazyshoes
26969
10
Starting to load womenlongsleevetop
29884
11
Starting to load womenpeashoes
31703
12
Starting to load womenplussizedtop
33320
13
Starting to load womenpointedflatshoes
34999
14
Starting to load womensleevelesstop
36292
15
Starting to load womenstripedtop
37101
16
Starting to load wrapsnslings
38199
17
(38199, 12288)
(38199,)


## Data Preprocessing

In [25]:
# Reload the flattened pixels and integer labels written by the import cell.
# Both files were saved without a header row, so header=None is required;
# otherwise pandas consumes the first sample as column names.
# float32 halves the memory of the pixel matrix compared to the float64 default.
X_train = pd.read_csv('./dataset/data_train_values/train.csv',
                      header=None, dtype=np.float32).values
y_train = pd.read_csv('./dataset/data_train_values/label.csv', header=None).values

# Scale pixel intensities from [0, 255] to [0, 1].
X_train = X_train / 255

# .values yields an (N, 1) column; flatten it so the fancy-index assignment
# below sets exactly one cell per row instead of broadcasting to (N, N).
_y_train = y_train.ravel().astype(int)

# One-hot encode. The width is derived from the labels (0 .. max) rather than
# hard-coded, so it always covers every class id present in label.csv.
num_classes = int(_y_train.max()) + 1
y_train = np.zeros((X_train.shape[0], num_classes))
y_train[np.arange(X_train.shape[0]), _y_train] = 1

# Rows were stored channel-major (all R, all G, all B), so this restores
# (N, 3, H, W) images.
desired_size = 64
X_train = X_train.reshape(X_train.shape[0], 3, desired_size, desired_size)

# Shuffle images and labels with the same permutation, then hold out the
# last 10% as the validation split.
num_train, num_val = int(X_train.shape[0] * 0.9), X_train.shape[0] - int(X_train.shape[0] * 0.9)
idx = np.random.permutation(X_train.shape[0])
X_train = X_train[idx]
y_train = y_train[idx]
X_train, X_val = X_train[: num_train], X_train[num_train: ]
y_train, y_val = y_train[: num_train], y_train[num_train: ]

KeyboardInterrupt: 

## Baseline Approach

In [None]:
from keras.applications import VGG16
from keras import models
from keras import layers
from keras import optimizers

# One softmax unit per one-hot column, so the classifier head always matches
# the label encoding produced by the preprocessing cell.
num_category = y_train.shape[1]
batch_size = 64

# X_train is stored channels-first as (N, 3, H, W). Keras defaults to
# channels-last, so the network is declared with an (H, W, 3) input and the
# images are transposed when they are fed in.
# NOTE(review): assumes image_data_format is the default 'channels_last' - confirm.
vgg16 = VGG16(weights = 'imagenet', include_top = False,
              input_shape = tuple(X_train.shape[2:]) + (3,))


def _extract_features(images):
    """Run (N, 3, H, W) images through the VGG16 base; return (N, D) features."""
    nhwc = np.transpose(images, (0, 2, 3, 1))
    # predict() does its own batching, including the final partial batch,
    # which avoids the manual loop and its extra empty iteration.
    features = vgg16.predict(nhwc, batch_size = batch_size)
    # Flatten whatever spatial size the base emits (it depends on the input
    # size; the 7x7x512 figure only holds for 224x224 inputs).
    return features.reshape(len(images), -1)


train_features = _extract_features(X_train)
train_labels = y_train
# The dense head consumes VGG features, so validation data must go through
# the same extractor rather than being passed as raw images.
val_features = _extract_features(X_val)

model = models.Sequential()
model.add(layers.Dense(256, activation='relu', input_dim = train_features.shape[1]))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(num_category, activation='softmax'))

# NOTE(review): newer Keras releases spell this argument `learning_rate`;
# `lr` is kept for the Keras version this notebook was written against.
model.compile(optimizer=optimizers.Adam(lr=1e-4),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

history = model.fit(train_features,
                    train_labels,
                    epochs=2,
                    batch_size=batch_size,
                    validation_data=(val_features, y_val))