In [1]:
import numpy as np
import pandas as pd

import os # used for navigating to image path
import imageio # used for writing images

from sklearn.preprocessing import LabelEncoder

import keras

#  Keras preprocessing
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.preprocessing import image_dataset_from_directory

# Keras modeling
from keras.models import Sequential
from keras.layers import  Lambda , Dense, Flatten, Dropout, Conv2D, MaxPool2D
from keras.callbacks import EarlyStopping
from keras.layers import BatchNormalization, Convolution2D , MaxPooling2D

In [2]:
train = pd.read_csv('train.csv', error_bad_lines=False, sep = '\t')
test = pd.read_csv('test.csv', error_bad_lines=False, sep = '\t')

In [3]:
train.head()

Unnamed: 0,imageid,label,productname
0,2653,Bags,Murcia Women Leather Office Grey Bag
1,55997,Others,Colorbar Velvet Matte Temptation Lipstick 24MA
2,2640,Shoes,Carlton London Men Brown Formal Shoes
3,40565,Topwear,W Women Maroon Kurta
4,38932,Bottomwear,Gini and Jony Girls Pink Leggings


In [4]:
train['image'] = train.apply(lambda row: str(row['imageid']) + ".jpg", axis=1)
test['image'] = test.apply(lambda row: str(row['imageid']) + ".jpg", axis=1)

In [5]:
path = 'images/'
batch_size = 40
target_size = (100,100)
epochs = 40
num_train_samples = train.shape[0]
num_test_samples = test.shape[0]

In [6]:

image_generator = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    rescale=1/255,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest',
    validation_split=0.2)

print('Training data')
training_generator = image_generator.flow_from_dataframe(
    dataframe=train,
    directory=path,
    x_col="image",
    y_col="label",
    target_size=target_size,
    class_mode='categorical',
    #color_mode="grayscale",
    batch_size=batch_size,
    subset="training"
)
print('Validation data')
validation_generator = image_generator.flow_from_dataframe(
    dataframe=train,
    directory=path,
    x_col="image",
    y_col="label",
    target_size=target_size,
    class_mode='categorical',
    #color_mode="grayscale",
    batch_size=batch_size,
    subset="validation"
)
print('Test data')
test_image_gen = ImageDataGenerator(rescale=1/255)
test_generator = test_image_gen.flow_from_dataframe(
    dataframe=test,
    directory=path,
    x_col='image',
    y_col=None,
    class_mode=None,
    #color_mode="grayscale",
    target_size=target_size,
    batch_size=batch_size,
    shuffle=False
)




Training data
Found 32353 validated image filenames belonging to 13 classes.
Validation data
Found 8088 validated image filenames belonging to 13 classes.
Test data
Found 4000 validated image filenames.


In [7]:
base_model = keras.applications.Xception(
        weights="imagenet",  # Load weights pre-trained on ImageNet.
        input_shape=(100, 100, 3),
        include_top=False,
    )
base_model.trainable = False  # Freeze the model for later fine turning
base_model.summary()

Model: "xception"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 100, 100, 3) 0                                            
__________________________________________________________________________________________________
block1_conv1 (Conv2D)           (None, 49, 49, 32)   864         input_1[0][0]                    
__________________________________________________________________________________________________
block1_conv1_bn (BatchNormaliza (None, 49, 49, 32)   128         block1_conv1[0][0]               
__________________________________________________________________________________________________
block1_conv1_act (Activation)   (None, 49, 49, 32)   0           block1_conv1_bn[0][0]            
___________________________________________________________________________________________

In [8]:
# Freeze the base_model
base_model.trainable = False

# Create new model on top
inputs = keras.Input(shape=(100,100,3))

norm_layer = keras.layers.experimental.preprocessing.Normalization()
mean = np.array([127.5] * 3) # (255+0)/2 = 127.5
var = mean ** 2

# Scale inputs to [-1, +1]
x = norm_layer(inputs)
norm_layer.set_weights([mean, var])

# when we unfreeze the base model for fine-tuning, so we make sure that the base_model is running in inference mode here.
x = base_model(x, training=False)
x = keras.layers.GlobalAveragePooling2D()(x)
x = Dropout(0.1)(x)  # Regularize with dropout
x = Dense(256)(x) 
outputs = Dense(13)(x) 
model = keras.Model(inputs, outputs)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 100, 100, 3)]     0         
_________________________________________________________________
normalization (Normalization (None, 100, 100, 3)       7         
_________________________________________________________________
xception (Functional)        (None, 3, 3, 2048)        20861480  
_________________________________________________________________
global_average_pooling2d (Gl (None, 2048)              0         
_________________________________________________________________
dropout (Dropout)            (None, 2048)              0         
_________________________________________________________________
dense (Dense)                (None, 256)               524544    
_________________________________________________________________
dense_1 (Dense)              (None, 13)                3341  

In [10]:
# Train the top layer

model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[keras.metrics.BinaryAccuracy()],
)
epochs = 5
model.fit_generator(
        training_generator,
        steps_per_epoch= 0.8 * num_train_samples // batch_size,
        epochs=epochs,# lower the computational cost
        #verbose=1,
        validation_data=validation_generator,
        validation_steps= 0.2 * num_train_samples // batch_size)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1d4507d5788>

In [12]:
# fine-tuning of the entire model

base_model.trainable = True
model.summary()

model.compile(
    optimizer=keras.optimizers.Adam(1e-5),  # Low learning rate
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[keras.metrics.BinaryAccuracy()],
)

epochs = 1
model.fit(
        training_generator,
        steps_per_epoch= 0.8 * num_train_samples // batch_size,
        epochs=epochs,# lower the computational cost
        #verbose=1,
        validation_data=validation_generator,
        validation_steps= 0.2 * num_train_samples // batch_size)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 100, 100, 3)]     0         
_________________________________________________________________
normalization (Normalization (None, 100, 100, 3)       7         
_________________________________________________________________
xception (Functional)        (None, 3, 3, 2048)        20861480  
_________________________________________________________________
global_average_pooling2d (Gl (None, 2048)              0         
_________________________________________________________________
dropout (Dropout)            (None, 2048)              0         
_________________________________________________________________
dense (Dense)                (None, 256)               524544    
_________________________________________________________________
dense_1 (Dense)              (None, 13)                3341  

<tensorflow.python.keras.callbacks.History at 0x1d45462acc8>

In [13]:
predict = model.predict_generator(test_generator, steps=num_test_samples //batch_size )

# convert prediction result of integers to categorical names
predicted_class_indices=np.argmax(predict,axis=1)
labels = (training_generator.class_indices)
labels = dict((v,k) for k,v in labels.items())
predictions = [labels[k] for k in predicted_class_indices]



In [None]:
test['label']

In [14]:
from sklearn.metrics import accuracy_score

accuracy_score(test['label'], predictions)


0.768

In [22]:
score = model.evaluate(validation_generator)



In [15]:
#https://www.tensorflow.org/guide/keras/save_and_serialize
model.save("my_model")

INFO:tensorflow:Assets written to: my_model\assets


# Text analysis

prepare for text data

In [75]:
train_text = train['productname']
test_text = test['productname']
train_tags = train['label']
test_tags = test['label']

In [76]:
from keras.preprocessing import text, sequence

max_words = 10000
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
train_text = train_text.astype(str)
tokenize.fit_on_texts(train_text)
x_train = tokenize.texts_to_matrix(train_text)
test_text = test_text.astype(str)
x_test = tokenize.texts_to_matrix(test_text)

In [57]:
from keras import utils


encoder = LabelEncoder()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)
num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)

In [58]:
model_text = Sequential()

model_text.add(Flatten(input_shape=(max_words,)))
model_text.add(Dense(256, activation='relu'))
model_text.add(Dropout(0.1))
model_text.add(Dense(256, activation='relu'))
model_text.add(Dense(13, activation='sigmoid'))


In [59]:
from keras.models import Model
from keras.layers import *

mergedOut = Add()([model.output,model_text.output])
newModel = Model([model.input,model_text.input], mergedOut)

In [81]:
newModel.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 100, 100, 3) 0                                            
__________________________________________________________________________________________________
normalization (Normalization)   (None, 100, 100, 3)  7           input_2[0][0]                    
__________________________________________________________________________________________________
flatten_1_input (InputLayer)    [(None, 10000)]      0                                            
__________________________________________________________________________________________________
xception (Functional)           (None, 3, 3, 2048)   20861480    normalization[0][0]              
____________________________________________________________________________________________

In [60]:
#prepare for image data
image_generator = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    rescale=1/255,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest')

train_generator = image_generator.flow_from_dataframe(
    dataframe=train,
    directory=path,
    x_col="image",
    y_col="label",
    target_size=target_size,
    class_mode='categorical',
    #color_mode="grayscale",
    batch_size=batch_size,
)

Found 40441 validated image filenames belonging to 13 classes.


In [62]:

# for pictures
X_train_p=np.concatenate([train_generator.next()[0] for i in range(train_generator.__len__())])
#y_train_p=np.concatenate([train_generator.next()[1] for i in range(train_generator.__len__())])
#X_test_p=np.concatenate([test_generator.next()[0] for i in range(test_generator.__len__())])
#y_test_p=np.concatenate([test_generator.next()[1] for i in range(test_generator.__len__())])
print(X_train_p.shape)
#print(y_train_p.shape)
print(X_test_p.shape)
#print(y_test_p.shape)

(40441, 100, 100, 3)
(10000, 100, 3)


In [None]:
X_test_p=np.concatenate([test_generator.next()[0] for i in range(test_generator.__len__())])

In [82]:
newModel.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[keras.metrics.BinaryAccuracy()],
)
batch_size = 32

history = newModel.fit([X_train_p,x_train], y_train, batch_size=batch_size, epochs=1, verbose=1, validation_split=0.1)
#score = newModel.evaluate([X_test_p,x_test], y_test, batch_size=batch_size, verbose=1) #This line is copied below



ValueError: Data cardinality is ambiguous:
  x sizes: 10000, 4000
  y sizes: 4000
Make sure all arrays contain the same number of samples.

In [87]:
test_generator = image_generator.flow_from_dataframe(
    dataframe=test,
    directory=path,
    x_col="image",
    y_col="label",
    target_size=target_size,
    class_mode='categorical',
    #color_mode="grayscale",
    batch_size=batch_size,
)

Found 4000 validated image filenames belonging to 13 classes.


In [88]:
X_test_p=np.concatenate([test_generator.next()[0] for i in range(test_generator.__len__())])
print(X_test_p.shape)

(4000, 100, 100, 3)


In [89]:
score = newModel.evaluate([X_test_p,x_test], y_test, batch_size=batch_size, verbose=1)



# Example for (2)

In [None]:
img = load_img('images/1163.jpg')  # this is a PIL image
x = img_to_array(img)  # this is a Numpy array with shape (80, 60, 3)
x = x.reshape((1,) + x.shape)  # this is a Numpy array with shape (1, 80, 60, 3)
i = 0
for batch in image_generator.flow(x, batch_size=1,
                          save_to_dir='preview', save_prefix='shirt', save_format='jpeg'):
    i += 1
    if i > 20:
        break

In [None]:
#https://www.tensorflow.org/guide/keras/save_and_serialize
model.save("my_model")