In [1]:
import numpy as np
import os
from PIL import Image
import tensorflow as tf
from tensorflow.keras import layers, models
from pinecone import Pinecone
import pinecone
from pymilvus import Milvus, DataType, CollectionSchema, FieldSchema, Collection

  from tqdm.autonotebook import tqdm


In [2]:
# Size every image is resized to before it is fed to the autoencoder.
img_size = (80, 80)
# Folder whose images are embedded and indexed, and folder of query images.
image_folder = '../ellipses'
test_image_folder = '../ellipses_test'

# Pinecone and Milvus config
# The API key is a secret and must not live in the notebook: export
# PINECONE_API_KEY in the environment before starting the kernel.
# Any key that was previously committed here should be treated as leaked
# and rotated.
pinecone_api_key = os.environ.get('PINECONE_API_KEY', '')
pinecone_index_name = 'sample'

# Not used yet
milvus_host = 'localhost'
milvus_port = '19530'
milvus_collection_name = 'image_embeddings'

In [3]:
def build_autoencoder(img_shape):
    """Build a dense autoencoder plus the encoder sub-model sharing its layers.

    Args:
        img_shape: Shape of one input image; passed to ``layers.Input`` and
            used to size and reshape the reconstruction.

    Returns:
        An ``(autoencoder, encoder)`` tuple. ``autoencoder`` maps an image to
        its reconstruction and is compiled with the Adam optimizer and
        binary cross-entropy loss. ``encoder`` maps the same input to the
        32-unit bottleneck and is returned uncompiled.
    """
    # Number of values in one image; the last dense layer emits this many.
    n_values = np.prod(img_shape)

    # Encoder: flatten -> 64 -> 32 (bottleneck)
    inputs = layers.Input(shape=img_shape)
    hidden = layers.Flatten()(inputs)
    hidden = layers.Dense(64, activation='relu')(hidden)
    bottleneck = layers.Dense(32, activation='relu')(hidden)

    # Decoder: 32 -> 64 -> n_values, then back to the image shape
    hidden = layers.Dense(64, activation='relu')(bottleneck)
    hidden = layers.Dense(n_values, activation='sigmoid')(hidden)
    reconstruction = layers.Reshape(img_shape)(hidden)

    # Both models are built on the same input tensor, so they share weights.
    autoencoder = models.Model(inputs, reconstruction)
    encoder = models.Model(inputs, bottleneck)

    autoencoder.compile(optimizer='adam', loss='binary_crossentropy')
    return autoencoder, encoder

In [4]:
def load_images_from_folder(folder_path):
    """Load every ``.png`` file in a folder as a normalised grayscale array.

    Each image is converted to 8-bit grayscale, resized to the module-level
    ``img_size`` and divided by 255 so pixel values lie in [0, 1].

    Args:
        folder_path: Directory to scan. Sub-directories are not descended
            into, and only names ending in ``.png`` are loaded.

    Returns:
        An ``(images, files)`` tuple. ``images`` is an array reshaped to
        ``(-1, *img_size, 1)`` and ``files`` is the list of file names in the
        same order, so ``files[i]`` names ``images[i]``.
    """
    # os.listdir returns entries in arbitrary order; sort so that repeated
    # runs produce the same image order.
    files = sorted(
        name for name in os.listdir(folder_path) if name.endswith(".png")
    )

    images = []
    for filename in files:
        img_path = os.path.join(folder_path, filename)
        # The context manager closes the underlying file even if decoding
        # fails; convert() returns an independent, fully loaded image.
        with Image.open(img_path) as source:
            img = source.convert('L')  # Convert to grayscale
        img = img.resize(img_size)  # Resize to the target dimensions
        images.append(np.array(img) / 255.0)  # Normalize pixel values

    images = np.array(images).reshape(-1, *img_size, 1)
    return images, files

In [5]:
def train_autoencoder(images, img_shape, epochs=50, batch_size=32):
    """Train a fresh autoencoder to reconstruct ``images`` and return its encoder.

    Args:
        images: Training images; used as both the input and the target.
        img_shape: Shape of a single image, forwarded to ``build_autoencoder``.
        epochs: Number of training epochs. Defaults to 50.
        batch_size: Samples per gradient update. Defaults to 32.

    Returns:
        The encoder model that shares weights with the trained autoencoder.
    """
    autoencoder, encoder = build_autoencoder(img_shape)
    autoencoder.fit(
        images,
        images,
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
    )
    return encoder

In [6]:
def generate_embeddings(encoder, images):
    """Run ``images`` through ``encoder.predict`` and return the result unchanged."""
    embeddings = encoder.predict(images)
    return embeddings

In [7]:
def insert_into_pinecone(embeddings, files, pinecone_index_name, batch_size=100):
    """Upsert embeddings into a Pinecone index, keyed by file name.

    Vectors are sent in batches rather than one request per vector, so the
    number of network round trips is ``ceil(n / batch_size)`` instead of ``n``.

    Args:
        embeddings: Iterable of embedding vectors, each exposing ``.tolist()``.
        files: Sequence of ids; ``files[i]`` is the id stored for the i-th
            embedding.
        pinecone_index_name: Name of the existing Pinecone index to write to.
        batch_size: Maximum number of vectors per upsert request.
            Defaults to 100.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # The API key is read from the module-level ``pinecone_api_key``.
    pc = Pinecone(api_key=pinecone_api_key)
    index = pc.Index(pinecone_index_name)

    batch = []
    for i, embedding in enumerate(embeddings):
        batch.append((files[i], embedding.tolist()))
        if len(batch) >= batch_size:
            index.upsert(batch)
            batch = []

    # Flush the final, possibly partial, batch.
    if batch:
        index.upsert(batch)

# Disabled Milvus insertion path. It is kept as a bare string literal rather
# than live code, so nothing below is defined or executed. Because the string
# is the last expression in the cell, the notebook echoes its repr as the
# cell output.
"""
def insert_into_milvus(embeddings, milvus_collection_name):
    milvus = Milvus(host=milvus_host, port=milvus_port)

    if milvus_collection_name not in milvus.list_collections():
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=32)
        ]
        schema = CollectionSchema(fields, "Image embedding collection")
        collection = Collection(milvus_collection_name, schema)
    else:
        collection = Collection(milvus_collection_name)

    ids = np.arange(len(embeddings)).tolist()
    entities = [
        ids,
        embeddings.tolist()
    ]
    
    collection.insert(entities)
"""

'\ndef insert_into_milvus(embeddings, milvus_collection_name):\n    milvus = Milvus(host=milvus_host, port=milvus_port)\n\n    if milvus_collection_name not in milvus.list_collections():\n        fields = [\n            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),\n            FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=32)\n        ]\n        schema = CollectionSchema(fields, "Image embedding collection")\n        collection = Collection(milvus_collection_name, schema)\n    else:\n        collection = Collection(milvus_collection_name)\n\n    ids = np.arange(len(embeddings)).tolist()\n    entities = [\n        ids,\n        embeddings.tolist()\n    ]\n    \n    collection.insert(entities)\n'

In [8]:
# Load the images to index and, separately, the held-out query images.
# Each call returns the image array together with the matching file names.
images, files = load_images_from_folder(image_folder)
test_images, test_files = load_images_from_folder(test_image_folder)

In [9]:
# Train on the indexed images only; the input shape is img_size plus a
# trailing single-channel axis. Only the encoder half is kept.
encoder = train_autoencoder(images, (*img_size, 1))

Epoch 1/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 0.6781  
Epoch 2/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.5309 
Epoch 3/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.3961 
Epoch 4/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.2632 
Epoch 5/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.1761
Epoch 6/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.1185 
Epoch 7/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0844 
Epoch 8/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.0694 
Epoch 9/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0599 
Epoch 10/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0553
Epoch 11/50
[1m7/7

In [10]:
# Encode the indexed images; embeddings[i] corresponds to files[i].
embeddings = generate_embeddings(encoder, images)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


In [11]:
# Store every embedding in the Pinecone index under its file name.
insert_into_pinecone(embeddings, files, pinecone_index_name)
# Milvus insertion is currently disabled.
#insert_into_milvus(embeddings, milvus_collection_name)

In [12]:
# Sanity check: how many embeddings were produced.
len(embeddings)

200

In [13]:
def search_pinecone(embedding, pinecone_index_name, top_k):
    """Query a Pinecone index for the nearest neighbours of one embedding.

    Args:
        embedding: A single embedding vector (array-like of floats).
        pinecone_index_name: Name of the Pinecone index to query.
        top_k: Number of matches to request.

    Returns:
        The query response from ``index.query``; matches are available under
        ``result['matches']`` and include stored vector values.
    """
    # The API key is read from the module-level ``pinecone_api_key``.
    pc = Pinecone(api_key=pinecone_api_key)
    index = pc.Index(pinecone_index_name)

    # ``vector`` must be one flat list of floats, not a list holding a list.
    # ravel() also flattens an embedding passed with a leading batch axis.
    query_vector = np.asarray(embedding).ravel().tolist()
    query_result = index.query(vector=query_vector, top_k=top_k, include_values=True)
    return query_result

In [14]:
# Encode the query images with the same encoder; test_embeddings[i]
# corresponds to test_files[i].
test_embeddings = generate_embeddings(encoder, test_images)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 703us/step


In [31]:
# 2x2 confusion matrix, indexed as eval_matrix[true_label][predicted_label].
eval_matrix = [[0,0],[0,0]]
# Number of queries whose class-0 neighbour count was either top_k or 0.
overwhelm = 0
# Number of nearest neighbours requested per query.
top_k = 15

In [32]:
# Reset the tallies in this cell so that re-running it starts from zero
# instead of adding a second pass on top of the previous counts.
eval_matrix = [[0, 0], [0, 0]]  # indexed [true_label][predicted_label]
overwhelm = 0

for i, test_embedding in enumerate(test_embeddings):
    # The class is the last character before the first underscore of the
    # file name, e.g. "ellipse1_46.png" -> 1.
    true_label = int(test_files[i].split("_")[0][-1])  # Gets either 0 or 1
    results = search_pinecone(test_embedding, pinecone_index_name, top_k)

    # Number of returned neighbours whose id marks them as class 0.
    count = sum(
        1 for match in results['matches']
        if match['id'].split("_")[0][-1] == "0"
    )

    # Count queries where the class-0 tally is top_k or 0.
    if count == top_k or count == 0:
        overwhelm += 1

    # Majority vote over top_k; integer arithmetic avoids comparing an int
    # count against a float half. A tie is only possible for even top_k.
    if 2 * count > top_k:
        new_label = 0
    elif 2 * count == top_k:
        new_label = np.random.choice([0, 1])
    else:
        new_label = 1

    eval_matrix[true_label][new_label] += 1

In [33]:
def print_evaluation(eval_matrix, overwhelm):
    """Print accuracy, precision, recall and F1 for a 2x2 confusion matrix.

    Class 1 is treated as the positive class. A metric whose denominator is
    zero (no samples, no predicted positives, no actual positives) is
    reported as 0.0 instead of raising ``ZeroDivisionError``.

    Args:
        eval_matrix: 2x2 nested sequence of counts indexed as
            ``eval_matrix[true_label][predicted_label]``.
        overwhelm: Count printed on the final line next to "# w/ 100% top_k".
    """
    true_pos = eval_matrix[1][1]
    false_pos = eval_matrix[0][1]
    false_neg = eval_matrix[1][0]
    total = np.sum(eval_matrix)

    predicted_pos = true_pos + false_pos
    actual_pos = false_neg + true_pos

    accuracy = (eval_matrix[0][0] + true_pos)/total if total else 0.0
    precision = true_pos/predicted_pos if predicted_pos else 0.0
    recall = true_pos/actual_pos if actual_pos else 0.0
    # The harmonic mean is undefined when either term is 0; report 0.0 then.
    F_1 = 2/(recall**-1 + precision**-1) if precision and recall else 0.0

    print(f"Accuracy:  | {accuracy}")
    print(f"Precision: | {precision}")
    print(f"Recall:    | {recall}")
    print(f"F_1:       | {F_1}")
    print("-------------------------")
    print(f"# w/ 100% top_k: {overwhelm}")

In [34]:
# Report the metrics for the confusion matrix accumulated by the evaluation loop.
print_evaluation(eval_matrix, overwhelm)

Accuracy:  | 1.0
Precision: | 1.0
Recall:    | 1.0
F_1:       | 1.0
-------------------------
# w/ 100% top_k: 200


In [39]:
# Inspect the neighbours returned for the first test embedding: print each
# match id alongside the score Pinecone reports for it.
results = search_pinecone(test_embeddings[0], pinecone_index_name, top_k)
    
for match in results['matches']:
    print(f"{match['id']} {match['score']}")
    #print(match["values"])

ellipse1_46.png 0.0
ellipse1_63.png 43.28125
ellipse1_89.png 121.125
ellipse1_4.png 203.71875
ellipse1_15.png 854.0
ellipse1_76.png 887.59375
ellipse1_83.png 1158.5625
ellipse1_73.png 1813.9375
ellipse1_39.png 2067.34375
ellipse1_67.png 2344.90625
ellipse1_30.png 4604.75
ellipse1_24.png 5561.875
ellipse1_65.png 6118.23438
ellipse1_16.png 6926.39062
ellipse1_14.png 7143.5625
