In [None]:
# Load (or reload) IPython's autoreload extension, then select mode 2 so that
# edits to imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
import warnings

warnings.filterwarnings("ignore")

import keras, pickle
import matplotlib.pyplot as plt

# Basic Libraries
import numpy as np
import pandas as pd

# Image manipulation libraries
import PIL.Image

# Deep Learning Libraries
import tensorflow as tf
from keras.applications.resnet import ResNet50, preprocess_input
from keras.datasets import mnist
from keras.layers import Concatenate, Input
from tqdm import tqdm

# Apply the "ggplot" matplotlib style sheet to the figures drawn in this notebook
plt.style.use("ggplot")
# Render matplotlib figures inline, below the cell that draws them
%matplotlib inline

# Step 1 Data Loading

- Download the MNIST data
- Download a pretrained model (pretrained for ImageNet classification)
- Extract embeddings of the MNIST images from this pretrained model
- Convert data into correct format (tensors) and plot two input-output pairs

In [None]:
# Load the MNIST dataset through keras.datasets. The call returns two
# (images, labels) pairs, unpacked here into the train and test splits.
(X_train, y_train), (X_test, y_test) = mnist.load_data()

In [None]:
# Sanity check: report the image-array and label-array shapes, one split per line
print(f"{X_train.shape} {y_train.shape}\n{X_test.shape} {y_test.shape}")

(60000, 28, 28) (60000,)
(10000, 28, 28) (10000,)


In [None]:
# Create the ResNet50 model used for feature extraction.
# The input is declared as 28x28 with 3 channels: the MNIST images are 28x28
# and are converted to RGB before being fed to the model.
new_input = Input(shape=(28, 28, 3))
# include_top=False leaves out the fully-connected classification head, so the
# model outputs convolutional feature maps rather than class scores.
# NOTE(review): no `weights` argument is passed, so the Keras default applies -
# presumably the ImageNet weights mentioned in the intro; confirm.
res = ResNet50(include_top=False, input_tensor=new_input)

2023-03-07 19:11:59.504762: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-03-07 19:11:59.504785: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (KARZA-HW-0270): /proc/driver/nvidia/version does not exist
2023-03-07 19:11:59.505007: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Extract embeddings in batches
# Function to create batches of data from the provided input data


def create_batches(X, batch_size=32):
    """Split a stack of grayscale images into RGB batches for the pretrained model.

    Parameters
    ----------
    X : numpy.ndarray
        Images indexed along the first axis. Each ``X[i]`` must be an array
        that ``PIL.Image.fromarray`` accepts (e.g. the 2-D MNIST digits).
    batch_size : int, default 32
        Maximum number of images per batch. The last batch is smaller when
        ``X.shape[0]`` is not a multiple of ``batch_size``.

    Returns
    -------
    list
        One tensor per batch with the images stacked along a new leading
        axis, in the same order as ``X``. Empty input yields an empty list.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    n_images = X.shape[0]

    # Ceiling division: a trailing partial batch is counted, but no empty
    # extra batch is generated when n_images is an exact multiple.
    n_batches = (n_images + batch_size - 1) // batch_size

    all_batches = []
    for batch in tqdm(range(n_batches), total=n_batches, desc="Creating batches..."):
        # Subset into the batch dimension of the numpy array
        start = batch * batch_size
        images = X[start : start + batch_size]

        # Build a PIL image from each array and convert it to RGB, since the
        # pretrained model was trained on 3-channel images.
        rgb_images = [
            np.array(PIL.Image.fromarray(image).convert("RGB")) for image in images
        ]

        # np.stack adds the batch axis directly, without instantiating a Keras
        # layer per batch. The result is converted to a tensor so each batch
        # can be handed straight to the model.
        #
        # Deliberately no try/except around this: a batch that fails must stop
        # the run, because silently dropping it would leave the embeddings
        # misaligned with the labels they are later stored with.
        all_batches.append(tf.convert_to_tensor(np.stack(rgb_images, axis=0)))

    # Return the list of all the batches
    return all_batches

In [None]:
# Create the train and test batches with the function defined above.
# Both calls rely on the function's default batch size, and each result is a
# list of batches in the same order as the input images.
train_batches = create_batches(X_train)
test_batches = create_batches(X_test)

Creating batches...: 100%|█████████████████| 1876/1876 [00:05<00:00, 349.99it/s]
Creating batches...: 100%|███████████████████| 313/313 [00:00<00:00, 346.77it/s]


In [None]:
# Get the resnet embeddings for the images
def get_embeddings(batched_data):
    """Run every batch through the notebook-level ResNet50 model ``res``.

    Parameters
    ----------
    batched_data : list
        Batches to embed; each element is passed to ``res.predict`` as-is.

    Returns
    -------
    list
        The ``res.predict`` output for each batch, in input order.
    """
    # NOTE(review): batches reach the model unchanged. `preprocess_input` is
    # imported in this notebook but no call to it is visible - confirm whether
    # skipping the ResNet preprocessing step is intentional.
    progress = tqdm(batched_data, total=len(batched_data))
    return [res.predict(chunk, verbose=0) for chunk in progress]

In [None]:
# Train embeddings: embed every training batch, join the per-batch outputs
# along axis 0, then keep index 0 on axes 1 and 2 so each image is left with
# a single feature vector.
train_embeds = np.concatenate(get_embeddings(train_batches), axis=0)[:, 0, 0, :]

100%|███████████████████████████████████████| 1875/1875 [02:56<00:00, 10.63it/s]


In [None]:
# Get the test embeddings: embed every test batch, join the per-batch outputs
# along axis 0, then keep index 0 on axes 1 and 2 so each image is left with
# a single feature vector.
test_embeds = get_embeddings(test_batches)
test_embeds = np.concatenate(test_embeds, axis=0)
test_embeds = test_embeds[:, 0, 0, :]

100%|█████████████████████████████████████████| 313/313 [00:29<00:00, 10.61it/s]


In [None]:
# Display the shapes of the train and test embedding arrays as a pair
train_embeds.shape, test_embeds.shape

((60000, 2048), (10000, 2048))

In [None]:
# Dump the embeddings along with their labels in a pickle file.
# This persists the data so that subsequent analysis can reload it instead of
# re-running the feature extraction.

data = {
    "train": {"embeddings": train_embeds, "labels": y_train},
    "test": {"embeddings": test_embeds, "labels": y_test},
}

# Use a context manager so the file handle is flushed and closed even if
# pickling raises. The "data/" directory must already exist.
with open("data/embeddings_with_labels.pkl", "wb") as pickle_file:
    pickle.dump(data, pickle_file)