In [None]:
# We will now extract feature vectors from an existing (pretrained) network so that we can use them later to cluster our scraped data.
# As an example, this notebook extracts the features with the BiT network (https://tfhub.dev/google/bit/m-r152x4/1).
# However, another Keras model can be used instead (https://keras.io/api/applications/).

In [None]:
%tensorflow_version 2.x
import tensorflow as tf
import numpy as np

In [None]:
# Mount Google Drive when running on Colab. On any other runtime this cell
# does nothing, so the notebook still survives "Restart & Run All".
try:
    from google.colab import drive
except ImportError:
    # `google.colab` could not be imported, so there is no drive to mount.
    print("Not running on Colab - skipping the Google Drive mount.")
else:
    drive.mount("/content/drive")

### Building the feature extractor

In [None]:
import tensorflow_hub as hub

In [None]:
# TF Hub handle of the feature extractor (BiT-M R152x4).
# For a different version check: https://tfhub.dev/s?q=bit
BIT_MODULE_URL = "https://tfhub.dev/google/bit/m-r152x4/1"

# Wrap the hub module as a Keras layer so it can be called directly on image batches.
base_model = hub.KerasLayer(BIT_MODULE_URL)

### Building the data pipeline

In [None]:
# Folder that contains the scraped images; replace this placeholder with your own path.
# Every file directly inside this folder is picked up by the file listing below.
scraped_images_folder = '/set/the/path/to/your/scraped/images/'

In [None]:
# Build the glob pattern for every file directly inside the image folder.
# Trailing slashes are stripped first so the pattern never contains a doubled
# separator ('//*') when the configured folder already ends with '/'.
file_pattern = scraped_images_folder.rstrip('/') + '/*'

# `list_files` shuffles the matches by default; shuffle=False keeps the file order
# deterministic so that repeated runs process the images in the same order.
filelist_ds = tf.data.Dataset.list_files(file_pattern, shuffle=False)
# If you get an input/output error in Colab, try a few times until the whole dataset is loaded

In [None]:
# Target size (in pixels) every image is resized to before it is fed to the model.
IMG_WIDTH, IMG_HEIGHT = 224, 224

def load_img(file_path):
    """Read a single image file and turn it into a resized float tensor.

    Args:
        file_path: Path of the image file (one element of the file-list dataset).

    Returns:
        A tuple ``(file_path, img)``: the unchanged path, so the features can be
        matched to their file later, and the decoded image converted to float32
        and resized to ``[IMG_HEIGHT, IMG_WIDTH]`` with 3 channels.
    """
    raw_bytes = tf.io.read_file(file_path)
    # Decode the compressed bytes into a 3D uint8 tensor with three channels.
    img = tf.image.decode_jpeg(raw_bytes, channels=3)
    # `convert_image_dtype` converts to floats in the [0,1] range.
    img = tf.image.convert_image_dtype(img, tf.float32)
    # `tf.image.resize` expects the size as [height, width]; the previous
    # [width, height] order only worked because both values are equal.
    img = tf.image.resize(img, [IMG_HEIGHT, IMG_WIDTH])
    return file_path, img

In [None]:
# Apply `load_img` to every file path, producing (file_path, image) pairs.
dataset = filelist_ds.map(map_func=load_img)

### Extracting features

In [None]:
# Number of images pushed through the network per forward pass.
BATCH_SIZE = 64
batched_ds = dataset.batch(BATCH_SIZE)

# Maps image file name (without directory) -> feature vector of that image.
feature_collection = {}
for filenames, images in batched_ds:
    batch_features = base_model(images).numpy()
    for name, feature in zip(filenames.numpy(), batch_features):
        # File names arrive as `bytes`. Decode them instead of calling `str()`,
        # which would produce "b'...'" and leave a stray quote at the end of the key.
        key = name.decode('utf-8').split('/')[-1]
        feature_collection[key] = feature.squeeze()
    print('*', end='')  # one progress mark per processed batch

# Stack the vectors of ALL processed images into one matrix (one row per image,
# in insertion order of `feature_collection`). Previously only the features of
# the last batch were stacked here.
features = np.vstack(list(feature_collection.values()))

In [None]:
# Sanity check: number of entries (file name -> feature vector) collected above.
len(feature_collection)

### Save the data

In [None]:
# Destination of the .npz archive holding the extracted features;
# replace this placeholder with your own path.
feature_file = '/create/a/path/to/save/file/BiT-m-r152x4_feature.npz'

In [None]:
# Write all feature vectors into a single .npz archive. Each dictionary entry is
# passed as a keyword argument, so every vector is stored as its own array under
# the corresponding key of `feature_collection`.
np.savez(feature_file, **feature_collection)

In [None]:
features.shape # Shape of the stacked feature array: [number_of_rows, 8192], one feature vector per row.