In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn.decomposition import PCA

### reading directory

In [2]:
# dataset that contains the images' file names and labels
# os.listdir returns entries in arbitrary order; sorting the listing keeps the
# row order of `data` (and of every array derived from it) stable across runs.
data = pd.DataFrame({'image-name': sorted(os.listdir('../data/flowers/'))})

# extracting species: the first two characters of each file name are parsed
# as the integer class id
data['species'] = data['image-name'].apply(lambda x: int(x[:2]))

# saving "true" label
np.save('../data/true_label.npy', data['species'].values)

### raw images

In [3]:
# read every image listed in `data` and flatten each one into a 1-D vector
flower_paths = ['../data/flowers/' + file_name for file_name in data['image-name']]
raw_images_flatten = np.array([plt.imread(path).ravel() for path in flower_paths])
# persist the flattened images
np.save('../data/raw_images_flatten.npy', raw_images_flatten)

### applying PCA to reduce the images' dimensionality

In [4]:
def reduce_dim(imgs, k):
    """Fit a PCA on `imgs` and return the transformed data.

    `k` is forwarded to PCA as `n_components`. A new PCA model is fitted on
    every call; only the transformed array is returned, the fitted model is
    discarded.
    """
    return PCA(n_components=k).fit_transform(imgs)

# PCA-reduced version of the flattened images (600 components)
images_600 = reduce_dim(imgs=raw_images_flatten, k=600)
# persist the reduced images
np.save('../data/images_600.npy', images_600)

### using a neural network to create a better mapping

In [5]:
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Flatten, Dense, concatenate,  Dropout, Conv2D, MaxPool2D
from tensorflow.keras import backend as K
import tensorflow as tf

# size of the embedding vector produced for each image
EMBED_SIZE = 128

# Shared embedding network: three Conv2D (3x3, relu) + MaxPool2D (2x2) stages
# with 32, 16 and 8 filters, followed by a relu Dense layer of EMBED_SIZE units.
input_image = Input(shape=(128, 128, 4))
x = input_image
for n_filters in (32, 16, 8):
    x = Conv2D(n_filters, (3, 3), activation='relu')(x)
    x = MaxPool2D((2, 2))(x)
x = Flatten()(x)
x = Dense(EMBED_SIZE, activation='relu')(x)

base_network = Model(inputs=input_image, outputs=x)

def triplet_loss(y_true, y_pred):
    """Hinge-style triplet loss over concatenated embeddings.

    `y_pred` is sliced along axis 1 into anchor, positive and negative parts
    using the module-level EMBED_SIZE. `y_true` is not used; it is only part
    of the signature. For each row the result is
    max(d(anchor, positive) - d(anchor, negative) + 0.2, 0), where d is the
    mean of the squared element-wise differences.
    """
    anchor = y_pred[:, :EMBED_SIZE]
    positive = y_pred[:, EMBED_SIZE:2*EMBED_SIZE]
    negative = y_pred[:, 2*EMBED_SIZE:]

    def _mean_sq_diff(other):
        # mean squared difference between the anchor and `other`, per row
        return tf.reduce_mean(tf.square(anchor - other), axis=1)

    margin_violation = _mean_sq_diff(positive) - _mean_sq_diff(negative) + 0.2
    return tf.maximum(margin_violation, 0.)

# one input layer per member of the triplet
input_image_one = Input(shape=(128, 128, 4), name='input_image_one')
input_image_two = Input(shape=(128, 128, 4), name='input_image_two')
input_image_three = Input(shape=(128, 128, 4), name='input_image_three')
triplet_inputs = [input_image_one, input_image_two, input_image_three]

# the same base network (shared weights) embeds all three inputs; the three
# embeddings are concatenated so triplet_loss can slice them apart again
out = concatenate([base_network(branch_input) for branch_input in triplet_inputs])

model = Model(inputs=triplet_inputs, outputs=out)
model.compile(loss=triplet_loss, optimizer='rmsprop')

### using the whole dataset

In [6]:
def image_batch_generator(images_names_df, embed_size, batchsize=32, image_dir='../data/flowers/'):
    """Endlessly yield random (anchor, positive, negative) image triplets.

    Parameters
    ----------
    images_names_df : pandas.DataFrame
        Must provide an 'image-name' column (file names, appended to
        `image_dir`) and a 'species' column (class label of each file).
        Needs at least two distinct species, otherwise no negative example
        can be drawn and pandas raises ValueError.
    embed_size : int
        Size of a single embedding; the dummy targets are 3 * embed_size wide.
    batchsize : int
        Number of triplets per batch. Rows are sampled without replacement,
        so it must not exceed len(images_names_df).
    image_dir : str
        Prefix prepended to every file name before reading it.

    Yields
    ------
    tuple
        ([anchors, positives, negatives], targets), where `targets` is an
        all-zero array of shape (batchsize, 3 * embed_size).
    """
    # Decoded images are kept in memory so each file is read from disk at most
    # once for the lifetime of the generator instead of on every draw.
    image_cache = {}

    def load_image(name):
        if name not in image_cache:
            image_cache[name] = plt.imread(image_dir + name)
        return image_cache[name]

    names = images_names_df['image-name']
    species = images_names_df['species']

    while True:

        batch = images_names_df.sample(batchsize)
        anchors, positives, negatives = [], [], []

        # take the class from the 'species' column, the same column the
        # positive/negative filters use, rather than re-parsing the file name
        for anchor_name, anchor_class in zip(batch['image-name'], batch['species']):

            same_class = species == anchor_class

            # Prefer a positive that is a different image than the anchor: an
            # (anchor, anchor) pair has zero distance and teaches nothing.
            # Fall back to the full class when the anchor is its only member.
            positive_pool = names[same_class & (names != anchor_name)]
            if positive_pool.empty:
                positive_pool = names[same_class]
            negative_pool = names[~same_class]

            anchors.append(load_image(anchor_name))
            positives.append(load_image(positive_pool.sample(1).iloc[0]))
            negatives.append(load_image(negative_pool.sample(1).iloc[0]))

        # all-zero targets of the same width as the model output
        dummy_targets = np.zeros((len(anchors), 3 * embed_size))

        yield [np.array(anchors), np.array(positives), np.array(negatives)], dummy_targets

In [7]:
train_gen = image_batch_generator(data, EMBED_SIZE)
# Model.fit accepts Python generators directly; Model.fit_generator is
# deprecated and no longer available in newer Keras releases.
model.fit(train_gen, epochs=15, steps_per_epoch=60)



Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f46ba828d00>

In [8]:
# read every image (unflattened) in the row order of `data`
image_files = ['../data/flowers/' + file_name for file_name in data['image-name']]
raw_images = np.array([plt.imread(image_file) for image_file in image_files])
# embed each image with the shared base network
embeddings = base_network.predict(raw_images)
# persist the image embeddings
np.save('../data/full_data_embeddings.npy', embeddings)

### using 10% of images - simulating manual labeling

In [9]:
# Re-initialise the embedding network before the partial-data experiment.
# Wrapping the existing `out` tensor in a new Model would reuse the weights
# already trained on the whole dataset; clone_model creates new layers with
# newly initialised weights. `base_network` is reassigned on purpose so the
# later `base_network.predict` cell embeds with the partially-trained network.
base_network = tf.keras.models.clone_model(base_network)

out = concatenate([base_network(input_image_one), base_network(input_image_two),
                   base_network(input_image_three)])

model = Model(inputs=[input_image_one, input_image_two, input_image_three],
            outputs=out)
model.compile(loss=triplet_loss, optimizer='rmsprop')

In [10]:
# fixed seed so the simulated "manually labelled" subset is the same on every run
data_undersized = data.sample(60, random_state=42)
train_gen = image_batch_generator(data_undersized, EMBED_SIZE, batchsize=16)
# Model.fit accepts Python generators directly; Model.fit_generator is
# deprecated and no longer available in newer Keras releases.
model.fit(train_gen, epochs=15, steps_per_epoch=10)



Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f46bf8d9040>

In [11]:
# read every image (unflattened) in the row order of `data`
all_image_names = data['image-name'].tolist()
raw_images = np.array([plt.imread('../data/flowers/' + name) for name in all_image_names])
# embed each image with the current base network
embeddings = base_network.predict(raw_images)
# persist the image embeddings
np.save('../data/partial_data_embeddings.npy', embeddings)