# Feature Embeddings Extraction Notebook

In [3]:
# In this notebook we generate embeddings using various pre-trained models and upload them to Qdrant. We also use TensorBoard
# to visualize the embeddings. The directory where TensorBoard-related files (checkpoints, logs, etc.) are stored is
# hardcoded for now; the user needs to change it according to their setup. UMAP visualization of the extracted embeddings is achieved
# in this notebook. The dataset used is the DeepPCB dataset, which has 3001 images.

In [4]:
import os
import cv2
import torch
import pickle
import numpy as np
import pandas as pd
from torch import nn
import seaborn as sns
from tqdm import tqdm
from PIL import Image
import multiprocessing
import tensorflow as tf
import umap.umap_ as umap
import matplotlib.pyplot as plt
from torchvision import transforms
from sklearn.cluster import KMeans
import matplotlib.patches as mpatches
from tensorboard.plugins import projector # type: ignore
from sklearn.metrics import silhouette_score
from torch.utils.data import Dataset, DataLoader
%matplotlib inline

## Dataset loading and temp+test images extraction

In [None]:
# Root directory whose sub-directories are scanned for images below.
# Hardcoded absolute path: change it to the location of the local dataset copy.
PATH = "/home/ashgatsy/DeepPCB-master/PCBData"

In [None]:
# Gather file paths from every group directory under PATH (entries whose name
# contains no '.'). Inside each group, the alphabetically first sub-directory
# supplies the 'temp' files (-> normal) and the 'test' files (-> defect); every
# entry of the second sub-directory goes into defectlog.
normal, defect, defectlog = [], [], []
path_2 = [os.path.join(PATH, entry) for entry in os.listdir(PATH) if '.' not in entry]
for p in tqdm(path_2, total=len(path_2)):
    path_3 = os.path.join(p, sorted(os.listdir(p))[0])
    image_names = os.listdir(path_3)
    normal.extend(os.path.join(path_3, name) for name in image_names if 'temp' in name)
    defect.extend(os.path.join(path_3, name) for name in image_names if 'test' in name)
    path_4 = os.path.join(p, sorted(os.listdir(p))[1])
    defectlog.extend(os.path.join(path_4, name) for name in os.listdir(path_4))

# Sort each list so that the ordering is deterministic across runs.
for path_list in (normal, defect, defectlog):
    path_list.sort()

In [None]:
def _load_gray_images(paths, size=(128, 128)):
    """Read each path as a grayscale image, resize it and stack the results.

    Args:
        paths: list of image file paths.
        size: (width, height) passed to ``cv2.resize``.

    Returns:
        numpy array built from the list of resized images.

    Raises:
        OSError: if OpenCV cannot read one of the files.
    """
    images = []
    for img_path in tqdm(paths, total=len(paths)):
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread reports failure by returning None rather than raising;
            # fail here with the offending path instead of inside cv2.resize.
            raise OSError(f"cv2.imread could not read image: {img_path}")
        images.append(cv2.resize(img, size))
    return np.array(images)


# img0: images listed in `normal`; img1: images listed in `defect`.
img0 = _load_gray_images(normal)
img1 = _load_gray_images(defect)

In [None]:
def conv_to_rgb(bnw_images):
    """Turn a stack of single-channel images into channel-first 3-channel images.

    Args:
        bnw_images: array of shape (num_images, height, width).

    Returns:
        uint8 array of shape (num_images, 3, height, width) in which all three
        channels hold the same intensity values as the input image.
    """
    # Unpacking also enforces that the input is exactly 3-dimensional.
    count, rows, cols = bnw_images.shape
    stacked = np.empty((count, rows, cols, 3), dtype=np.uint8)
    # Broadcast the grayscale plane into the red, green and blue channels at once.
    stacked[...] = bnw_images[..., np.newaxis]
    # Move the channel axis in front of the spatial axes.
    return stacked.transpose(0, 3, 1, 2)

# Channel-first 3-channel versions of img0 (the 'temp' images) and img1 (the 'test' images).
rgb_images0 = conv_to_rgb(img0)
rgb_images1 = conv_to_rgb(img1)

In [None]:
# This cell performs the same grayscale -> 3-channel, channel-first conversion
# as conv_to_rgb on img0, so it reuses the helper instead of repeating its loop.
num_images, height, width = img0.shape
rgb_images = conv_to_rgb(img0)

## Import any pre-trained Model and preprocess dataset

In [None]:
# Load the 'resnext101_32x8d_wsl' entry point of the 'facebookresearch/WSL-Images' repo via torch.hub.
model = torch.hub.load('facebookresearch/WSL-Images', 'resnext101_32x8d_wsl')

def slice_model(original_model, from_layer=None, to_layer=None):
    """Build an ``nn.Sequential`` from a slice of a model's direct children.

    Args:
        original_model: module whose ``children()`` are sliced.
        from_layer: start index of the slice (``None`` = from the first child).
        to_layer: end index of the slice, exclusive (``None`` = through the last child).

    Returns:
        ``nn.Sequential`` wrapping the selected child modules (the same module
        objects, not copies).
    """
    child_modules = list(original_model.children())
    selected = child_modules[from_layer:to_layer]
    return nn.Sequential(*selected)

# Keep every direct child of the loaded model except the last one; this truncated
# network is what later cells use as the feature extractor.
model_conv_features = slice_model(model, to_layer=-1)

In [None]:
# Define the required transformations according to the imported model.
# Steps run in order: resize, convert to tensor, then per-channel normalization.
# No CenterCrop(224) step is applied.
_preprocess_steps = [
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
preprocess = transforms.Compose(_preprocess_steps)

In [None]:
def apply_transforms(rgb_images,transform):
    """Apply ``transform`` to every image of a channel-first image stack.

    Args:
        rgb_images: array indexed as (image, channel, height, width).
        transform: callable that receives a PIL image.

    Returns:
        list with one numpy array per image, holding the transformed result.
    """
    # PIL expects channel-last data, so move the channel axis to the end first.
    channels_last = rgb_images.transpose(0, 2, 3, 1)
    return [
        np.array(transform(Image.fromarray(frame)))
        for frame in channels_last
    ]

# Run the preprocessing pipeline over both image stacks; each result is a list of numpy arrays.
transformed_images0 = apply_transforms(rgb_images0,preprocess)
transformed_images1 = apply_transforms(rgb_images1,preprocess)

In [None]:
class MyDataset(Dataset):
    """Index-based dataset over an in-memory sequence.

    Args:
        data: indexable collection of samples.
        targets: optional indexable collection aligned with ``data``.
        transform: optional callable applied to a sample when it is fetched.
    """

    def __init__(self, data, targets=None, transform=None):
        self.data, self.targets, self.transform = data, targets, transform

    def __len__(self):
        """Number of samples, i.e. ``len(data)``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``sample`` or, when targets were given, ``(sample, target)``."""
        item = self.data[idx]
        if self.transform:
            item = self.transform(item)
        # Without targets only the (possibly transformed) sample is returned.
        if self.targets is None:
            return item
        return item, self.targets[idx]

In [None]:
# Wrap the already-preprocessed image lists; no targets and no extra transform are supplied.
my_dataset0 = MyDataset(data=transformed_images0)
my_dataset1 = MyDataset(data=transformed_images1)

In [None]:
# One DataLoader worker per CPU core reported by the OS.
cpu_count = multiprocessing.cpu_count()
batch_size = 8
# shuffle=False: the extracted embeddings are later numbered by their position,
# so batches must follow dataset order for embedding i to belong to image i.
my_dataloader0 = DataLoader(dataset=my_dataset0, batch_size=batch_size, shuffle=False, num_workers=cpu_count)
my_dataloader1 = DataLoader(dataset=my_dataset1, batch_size=batch_size, shuffle=False, num_workers=cpu_count)

In [None]:
def get_features(dataloader,model_conv_features):
    """Run every batch of ``dataloader`` through the model and collect the outputs.

    The model is switched to evaluation mode for the duration of the call, so
    layers such as Dropout and BatchNorm behave deterministically and do not
    depend on the other samples in the batch. The model's previous
    training/eval mode is restored before returning.

    Args:
        dataloader: iterable yielding batches the model accepts as input.
        model_conv_features: ``torch.nn.Module`` used as the feature extractor.

    Returns:
        list with one tensor per batch; each model output is flattened from
        dimension 1 onwards.
    """
    features_list = []
    was_training = model_conv_features.training
    model_conv_features.eval()
    try:
        # Inference only: no gradients are needed.
        with torch.no_grad():
            for image_batch in tqdm(dataloader):
                features_batch = model_conv_features(image_batch).flatten(start_dim=1)
                features_list.append(features_batch)
    finally:
        # Leave the caller's model in the mode it was handed over in.
        model_conv_features.train(was_training)

    return features_list

In [None]:
# Per-batch feature tensors for the two dataloaders (one list entry per batch).
features_list0 = get_features(my_dataloader0,model_conv_features)
features_list1 = get_features(my_dataloader1,model_conv_features)

## Upload/Retrieve the embeddings : qdrant

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
# Qdrant endpoint: taken from the QDRANT_URL environment variable when it is set
# and non-empty, otherwise the original hardcoded server is used.
qdrant = QdrantClient(url=os.environ.get('QDRANT_URL') or 'http://132.186.158.40:6333/')

In [None]:
def concatenate_embeddings(features_list):
    """Merge per-batch feature tensors into one list of per-sample embeddings.

    Each batch is flattened to 2-D (samples x features) and the batches are
    concatenated along the sample axis. The feature width is taken from the
    data rather than fixed at 2048, so extractors with another output size
    produce correctly shaped embeddings.

    Args:
        features_list: sequence of tensors (or array-likes), one per batch.
            A 1-D entry is treated as a single embedding.

    Returns:
        list of lists of floats, one inner list per sample; ``[]`` when
        ``features_list`` is empty.
    """
    if len(features_list) == 0:
        return []

    rows = []
    for batch in features_list:
        batch = torch.as_tensor(batch).detach()
        if batch.dim() <= 1:
            # An unbatched vector is one embedding, not a column of samples.
            rows.append(batch.reshape(1, -1))
        else:
            rows.append(batch.flatten(start_dim=1))

    return torch.cat(rows, dim=0).tolist()

# Flatten the per-batch tensors into plain Python lists, one embedding per sample.
individual_embeddings0 = concatenate_embeddings(features_list0)
individual_embeddings1 = concatenate_embeddings(features_list1)

In [None]:
def qdarnt_upload(collection_name,csize,vectors,batch_size=128):
    """Store ``vectors`` in a Qdrant collection, creating it when necessary.

    Point ids are the positions of the vectors in ``vectors`` and each point
    carries the payload ``{'text': 'Embedding <id>'}``.

    If the collection already exists (for example when the notebook is re-run)
    it is reused and points with the same ids are overwritten. Points with
    higher ids left over from an earlier, larger upload are NOT removed.

    Args:
        collection_name: name of the target collection.
        csize: unused; kept so existing calls keep working.
        vectors: sequence of embeddings (each a list of floats).
        batch_size: number of points sent per upsert request.
    """
    vectors = list(vectors)
    # Use the real embedding width; fall back to 2048 only when there is no data.
    vector_size = len(vectors[0]) if vectors else 2048

    existing = {collection.name for collection in qdrant.get_collections().collections}
    if collection_name not in existing:
        qdrant.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=vector_size,distance=Distance.COSINE),
        )

    # Send the points in chunks instead of one HTTP request per vector.
    for start in range(0, len(vectors), batch_size):
        chunk = vectors[start:start + batch_size]
        qdrant.upsert(collection_name=collection_name, points=[
            {
                'id': start + offset,  # Unique ID: position of the vector in `vectors`
                'vector': embedding,
                'payload': {'text': f'Embedding {start + offset}'},  # Optional payload (metadata)
            }
            for offset, embedding in enumerate(chunk)
        ])
    print("Embeddings inserted successfully!")

# Upload both embedding sets to their own collections.
# NOTE: the second argument (csize) is not used inside qdarnt_upload.
qdarnt_upload("check1",1501,individual_embeddings0)
qdarnt_upload("check2",1500,individual_embeddings1)

In [None]:
def retrieve_points(collection_name,client,num_vectors):
    """Fetch up to ``num_vectors`` points from a collection via ``client.scroll``.

    Args:
        collection_name: collection to read from.
        client: object exposing a ``scroll`` method (e.g. the Qdrant client).
        num_vectors: value passed as the scroll ``limit``.

    Returns:
        Whatever ``client.scroll`` returns, unmodified. Payloads and vectors
        are requested and no filter is applied.
    """
    scroll_kwargs = {
        "collection_name": collection_name,
        "scroll_filter": None,
        "limit": num_vectors,
        "with_payload": True,
        "with_vectors": True,
    }
    return client.scroll(**scroll_kwargs)

# Read the uploaded points back; the limits match the sizes passed at upload time.
ret_points0 = retrieve_points("check1",qdrant,1501)
ret_points1 = retrieve_points("check2",qdrant,1500)

In [None]:
def extract_embeddings(points):
    """Pull the vectors out of a scroll result.

    Args:
        points: indexable whose first element is an iterable of records, each
            exposing a ``vector`` attribute.

    Returns:
        list of the records' ``vector`` values, in iteration order. Ids are
        not collected.
    """
    records = points[0]
    return [record.vector for record in records]

# Vectors of both collections; tot_embeddings holds the "check1" vectors followed by the "check2" vectors.
ret_embeddings0 = extract_embeddings(ret_points0)
ret_embeddings1 = extract_embeddings(ret_points1)
tot_embeddings = ret_embeddings0+ret_embeddings1

## Tensorboard Projection to visualize

In [None]:
def cluster_embeddings(embeddings, num_clusters):
    """Assign each embedding to a K-Means cluster.

    Args:
        embeddings: sequence of equal-length vectors; converted with ``np.array``.
        num_clusters: number of clusters to fit.

    Returns:
        The fitted model's ``labels_`` (one cluster label per embedding).
    """
    embedding_matrix = np.array(embeddings)
    # random_state is fixed so repeated runs give the same clustering.
    fitted = KMeans(n_clusters=num_clusters, random_state=0).fit(embedding_matrix)
    return fitted.labels_

# Cluster all retrieved embeddings into 6 groups.
cluster_labels = cluster_embeddings(tot_embeddings,6)

In [None]:
# Directory passed to the checkpoint save and to the projector configuration below.
# Hardcoded absolute path: change it to suit the local machine.
log_dir='/home/ashgatsy/test/logs/pcb-example1/'

def tensorboard_umap(umap_embeddings,cluster_labels):
    """Write the files the TensorBoard embedding projector needs into ``log_dir``.

    Three things are written into the module-level ``log_dir``:
    a checkpoint holding the embeddings, a ``metadata.tsv`` with one row per
    embedding (index and cluster label), and the projector configuration.

    Args:
        umap_embeddings: embeddings to visualise; stored as given in a
            ``tf.Variable``.
        cluster_labels: iterable of labels, one per embedding, written to the
            metadata file in order.
    """
    # Make sure the target directory exists before anything is written to it.
    os.makedirs(log_dir, exist_ok=True)

    embedding_var = tf.Variable(umap_embeddings,name='embedding')
    checkpoint = tf.train.Checkpoint(embedding=embedding_var)
    checkpoint.save(os.path.join(log_dir,'ts_embedding.ckpt'))

    # The metadata file must live inside log_dir because the projector config
    # refers to it by a path relative to that directory; derive it from log_dir
    # rather than repeating the absolute path.
    metadata_name = 'metadata.tsv'
    with open(os.path.join(log_dir, metadata_name), 'w') as f:
        # Header row, then one "index<TAB>label" row per embedding.
        column_names = ["Id", "Cluster_Label"]
        f.write('\t'.join(column_names) + '\n')
        for i, label in enumerate(cluster_labels):
            f.write(f"{i}\t{label}\n")

    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    # Name under which the checkpoint stores the `embedding` variable.
    embedding.tensor_name = 'embedding/.ATTRIBUTES/VARIABLE_VALUE'
    embedding.metadata_path = metadata_name

    projector.visualize_embeddings(log_dir, config)

# NOTE(review): tot_embeddings are the vectors as retrieved from Qdrant; no UMAP
# reduction is applied to them in the cells shown here before they are exported.
tensorboard_umap(tot_embeddings,cluster_labels)

In [None]:
!tensorboard --logdir /home/ashgatsy/test/logs/pcb-example1/