## 0. Setup

In [None]:
import os

# Both settings are applied before TensorFlow is imported further down.
os.environ.update(
    {
        # silence TensorFlow's warning output
        "TF_CPP_MIN_LOG_LEVEL": "3",
        # an empty device list forces TensorFlow to use the CPU only
        "CUDA_VISIBLE_DEVICES": "",
    }
)

In [None]:
#download data
from zipfile import ZipFile
#embeddings
from tensorflow.keras.utils import image_dataset_from_directory
import tensorflow as tf
from huggingface_hub import from_pretrained_keras
from PIL import Image
import numpy as np
import pandas as pd
#timing / progress bars (the package and the function are both spelled "tqdm")
from tqdm.auto import tqdm
#vector DB
import kdbai_client as kdbai
from getpass import getpass
import time
#plotting
import umap.umap_ as umap
from matplotlib import pyplot as plt

In [None]:
def show_df(df: pd.DataFrame) -> pd.DataFrame:
    """Print the shape of ``df`` and return its first rows.

    Args:
        df: The DataFrame to preview.

    Returns:
        The result of ``df.head()``.
    """
    dimensions = df.shape
    print(dimensions)
    preview = df.head()
    return preview

def plot_image(axis, source: str, label=None) -> None:
    """Draw the image stored at ``source`` on ``axis``, titled with its file name.

    Args:
        axis: Axes-like object; ``imshow``, ``axis`` and ``set_title`` are
            called on it.
        source: Path of the image file, read with ``plt.imread``.
        label: Optional text shown before the file name as ``"<label>: "``;
            omitted when falsy.
    """
    axis.imshow(plt.imread(source), cmap="gray")
    axis.axis("off")
    # os.path.basename honours the platform's path separators, so paths built
    # with os.path.join are reduced to the bare file name on Windows as well.
    file_name = os.path.basename(source)
    prefix = f"{label}: " if label else ""
    axis.set_title(prefix + file_name)

## 1. Load Image Data

### Define list of paths to the extracted image files

In [None]:
def extract_file_paths_from_folder(parent_dir: str) -> dict:
    """Map each sub-folder of ``parent_dir`` to the paths of the entries inside it.

    Args:
        parent_dir: Directory whose immediate sub-folders each hold one group
            of files.

    Returns:
        A dict keyed by sub-folder name (in sorted order); each value is the
        sorted list of ``parent_dir/sub_folder/entry`` paths.

    Raises:
        OSError: If ``parent_dir`` cannot be listed.
    """
    image_paths = {}
    # Sorting makes the result independent of the arbitrary order in which
    # os.listdir returns entries.
    for sub_folder in sorted(os.listdir(parent_dir)):
        sub_dir = os.path.join(parent_dir, sub_folder)
        # Stray files next to the sub-folders (e.g. ".DS_Store" or a README)
        # would otherwise make os.listdir(sub_dir) raise NotADirectoryError.
        if not os.path.isdir(sub_dir):
            continue
        image_paths[sub_folder] = [
            os.path.join(sub_dir, file) for file in sorted(os.listdir(sub_dir))
        ]
    return image_paths

In [None]:
#map each sub-folder of "COVID-images" to the list of file paths inside it
image_paths_map = extract_file_paths_from_folder("COVID-images")

### Visualize Some of the images

In [None]:
#numeric suffix of the example file to show for every class
image_index = 14
n_cols = 2
#round the row count up so an odd number of classes still gets enough panels
n_rows = max(1, -(-len(image_paths_map) // n_cols))
#squeeze=False always returns a 2D array of axes, whatever the grid size
_, ax = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(10, 8), squeeze=False)
axes = ax.reshape(-1)
suffix = f"{image_index}.png"
for axis, image_paths in zip(axes, image_paths_map.values()):
    #require a non-digit before the suffix so index 14 does not also match "...114.png"
    matching_path = next(
        (
            path
            for path in image_paths
            if path.endswith(suffix)
            and not path[-len(suffix) - 1 : -len(suffix)].isdigit()
        ),
        None,
    )
    if matching_path is None:
        #no file with this index: leave the panel blank rather than plot an unrelated image
        axis.axis("off")
        continue
    plot_image(axis, matching_path)
#hide any panels left over when the grid has more cells than classes
for axis in axes[len(image_paths_map):]:
    axis.axis("off")

In [None]:
#Build a dataset over the images stored under "COVID-images".
#NOTE(review): with labels="inferred" the labels presumably come from the
#sub-folder names - confirm against the Keras documentation.
dataset = image_dataset_from_directory(
    "COVID-images",
    labels="inferred",
    label_mode="categorical",  #labels are later stored as length-4 vectors and decoded with argmax
    shuffle=False,  #keep a fixed order: rows are later paired with the sorted file paths
    seed=1,
    image_size=(299,299),
    batch_size=1,  #one image per batch, so len(dataset) is used as the image count
)

## 2. Create Image Vector Embeddings

To create our image embeddings, we will use a neural network which we have pre-trained on the lung classifications. In this showcase, we will use the ResNet-50 neural network architecture, a popular choice for general image classification.

ResNet-50 was originally trained on the ImageNet dataset. Although this dataset contains millions of images, including X-ray scans, we would like the model to be better suited to our dataset and the classifications we assign to it. Therefore, our model here was made by taking ResNet-50, pre-trained on ImageNet, and re-training it to classify X-ray scan images of lungs.

We trained the model using roughly 70% of the Kaggle dataset, split equally among the 4 classifications. We then use the remaining 30% of the data in this showcase: we embed it as vectors and perform image search on it.

### Load Pre-Trained classification neural network

In [None]:
#load the saved Keras model from disk; its output is later stored as a 2048-value embedding per image
model = tf.keras.models.load_model('saved_model/lungs_model.h5')
#print a summary of the loaded model
model.summary()

### Use embedding network to create image embeddings

In [None]:
#the dataset yields one image per batch (batch_size=1), so its length is the image count
n_images = len(dataset)
#the hard-coded widths assume 2048 model outputs per image and 4 classes
embeddings = np.empty([n_images, 2048])
labels = np.empty([n_images, 4])
#each dataset element is an (image batch, label batch) pair
for i, (image_batch, label_batch) in tqdm(enumerate(dataset), total=n_images):
    embeddings[i, :] = model.predict(image_batch, verbose=0)
    labels[i, :] = label_batch

In [None]:
#sort the disease types in order
lung_types = sorted(image_paths_map.keys())
#for each vector, save the disease type given by the index
class_labels = [lung_types[label.argmax()] for label in labels]
#get a single list of all paths
all_paths = [path for image_paths in image_paths_map.values() for path in image_paths]
#Prefer the file order recorded on the dataset itself, so each path is
#guaranteed to belong to the embedding in the same row; fall back to the
#alphanumerically sorted listing when the attribute is not available.
sorted_all_paths = getattr(dataset, "file_paths", None)
if sorted_all_paths is None:
    sorted_all_paths = sorted(all_paths)
else:
    sorted_all_paths = list(sorted_all_paths)
#fail with a clear message instead of silently pairing images with the wrong vectors
if len(sorted_all_paths) != len(class_labels):
    raise ValueError(
        f"Found {len(sorted_all_paths)} source paths but {len(class_labels)} "
        "embeddings; the image folder and the dataset are out of sync."
    )
#define our DataFrame for insertion into KDB.AI
embedded_df = pd.DataFrame(
    {
        "source": sorted_all_paths,
        "class": class_labels,
        "embedding": embeddings.tolist(),
    }
)

In [None]:
#print the DataFrame's shape and display its first rows
show_df(embedded_df)

### Visualising the embeddings

It is quite difficult to grasp the concept of a high-dimensional vector embedding. One trick to help us visualize it is UMAP: a technique which reduces the number of dimensions so that we can see the clusters in 2D. This will allow us to get an initial grasp of how well the classifications have been separated within the embeddings, and also of some potential misclassifications that may occur.

In [None]:
#configure the UMAP reducer (15 neighbours, minimum distance of 0.0)
_umap = umap.UMAP(min_dist=0.0, n_neighbors=15)
#fit the reducer on the embeddings and project them down to two coordinates
umap_coordinates = _umap.fit_transform(embeddings)
umap_df = pd.DataFrame(umap_coordinates, columns=["u0", "u1"])
show_df(umap_df)

In [None]:
#one colour per class, paired with lung_types in order
class_colors = ["blue", "red", "green", "purple"]
#Create figure for plotting
plt.figure(figsize=(10, 8))
#Scatter plot with 'u0' and 'u1' columns as x and y, color mapped by class labels
for lung_type, color in zip(lung_types, class_colors):
    #row positions of the embeddings that belong to the current class
    indices_to_plot = [i for i, label in enumerate(class_labels) if label == lung_type]
    subset = umap_df.iloc[indices_to_plot]
    plt.scatter(subset["u0"], subset["u1"], label=lung_type, color=color, alpha=0.5)

plt.title("Embeddings Map For X-ray Lung Scan Images")
plt.legend()
plt.show()

## 3. Store Embeddings in KDB.AI

In [None]:
#connect to the KDB.AI server at the local endpoint
session = kdbai.Session(endpoint="http://localhost:8082")
#table layout: image path, class name, and the embedding vector with its search index
image_schema = {
    "columns": [
        {"name": "source", "pytype": "str"},
        #the type key must be spelled "pytype", as on the "source" column
        {"name": "class", "pytype": "str"},
        {
            "name": "embedding",
            #flat index over 2048-value vectors (the embedding width) using the "IP" metric
            "vectorIndex": {"dims": 2048, "metric": "IP", "type": "flat"},
        },
    ]
}

In [None]:
#Start from a clean slate: remove any "lungs" table left over from an earlier run.
try:
    session.table("lungs").drop()
except kdbai.KDBAIException:
    #the drop failed (presumably because the table does not exist yet) - nothing to clean up
    pass
else:
    #pause for 5 seconds after a successful drop before continuing
    time.sleep(5)

In [None]:
#create the "lungs" table using the column layout defined in image_schema
table = session.create_table("lungs", image_schema)

In [None]:
#total memory used by the DataFrame, converted from bytes to MiB (bytes / 1024**2)
embedded_df.memory_usage(deep=True).sum() / (1024**2)

In [None]:
def divide_chunks(l, n):
    """Yield successive ``n``-sized slices of ``l``; the last one may be shorter.

    Args:
        l: Any sized object that supports slicing (e.g. a list or a DataFrame).
        n: Maximum number of elements per chunk; must be positive.

    Yields:
        ``l[i:i + n]`` for ``i = 0, n, 2n, ...`` up to ``len(l)``.

    Raises:
        ValueError: If ``n`` is not positive (raised once iteration starts).
    """
    # A zero step makes range() fail with an unrelated message and a negative
    # step silently yields nothing, so reject both explicitly.
    if n <= 0:
        raise ValueError(f"chunk size must be positive, got {n}")
    for i in range(0, len(l), n):
        yield l[i:i + n]
#How many elements each list should have (at least 1, so fewer than 12 rows still works)
n = max(1, len(embedded_df) // 12)
#Now split the data
embedded_df_split = list(divide_chunks(embedded_df, n))
#and insert the now acceptably sized chunks
for chunk in tqdm(embedded_df_split):
    table.insert(chunk)

In [None]:
#list what the session reports, to check that the new table is present
session.list()

## 4. Query KDB.AI Table

In [None]:
#query the table without any filter
table.query()

In [None]:
#query with a "like" filter on the "class" column using the pattern "*Lung*"
table.query(filter=[("like", "class", "*Lung*")])

## 5. Search for similar images to a target image

In [None]:
#position of the embedded_df row to use as the search query
row_index_1 = 140
#look that row up by position
row_1 = embedded_df.iloc[row_index_1]
#take the last item returned by plt.subplots() (the axes) and draw the query image on it
_, query_axis = plt.subplots()
plot_image(query_axis, row_1["source"], label="Query Image")

In [None]:
#embedding vector of the selected row
sample_embedding_1 = row_1["embedding"]