In [1]:
import numpy as np
import os
from PIL import Image
import tensorflow as tf
from tensorflow.keras import layers, models
from pinecone import Pinecone
import pinecone
from pymilvus import Milvus, DataType, CollectionSchema, FieldSchema, Collection

  from tqdm.autonotebook import tqdm


In [2]:
img_size = (80, 80)
image_folder = '../nonlin_data'
test_image_folder = '../nonlin_data' 

# Pinecone and Milvus config
pinecone_api_key = "55820143-1b89-405c-b2df-fd354001c4cc" 
pinecone_index_name = 'sample'

In [3]:
def build_autoencoder(img_shape):
    encoder_input = layers.Input(shape=img_shape)
    x = layers.Flatten()(encoder_input)
    x = layers.Dense(64, activation='relu')(x)
    encoded = layers.Dense(32, activation='relu')(x)

    # Decoder
    x = layers.Dense(64, activation='relu')(encoded)
    x = layers.Dense(np.prod(img_shape), activation='sigmoid')(x)
    decoded = layers.Reshape(img_shape)(x)

    # Autoencoder model
    autoencoder = models.Model(encoder_input, decoded)
    encoder = models.Model(encoder_input, encoded)

    # Compile model
    autoencoder.compile(optimizer='adam', loss='binary_crossentropy')
    return autoencoder, encoder

In [4]:
def load_images_from_folder(folder_path):
    images = []
    files = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".png"):
            img_path = os.path.join(folder_path, filename)
            img = Image.open(img_path).convert('L')  # Convert to grayscale
            img = img.resize(img_size)  # Resize to the target dimensions
            img = np.array(img) / 255.0  # Normalize pixel values
            images.append(img)
            files.append(filename)
    images = np.array(images).reshape(-1, *img_size, 1)
    return images, files

In [5]:
def train_autoencoder(images, img_shape):
    autoencoder, encoder = build_autoencoder(img_shape)
    autoencoder.fit(images, images, epochs=50, batch_size=32, shuffle=True)
    return encoder

In [6]:
def generate_embeddings(encoder, images):
    return encoder.predict(images)

In [7]:
def insert_into_pinecone(embeddings, files, pinecone_index_name):
    pc = Pinecone(api_key=pinecone_api_key)
    
    index = pc.Index(pinecone_index_name)
    
    for i, embedding in enumerate(embeddings):
        index.upsert([(files[i], embedding.tolist())])

In [8]:
images, files = load_images_from_folder(image_folder)
test_images, test_files = load_images_from_folder(test_image_folder)

In [9]:
encoder = train_autoencoder(images, (*img_size, 1))

Epoch 1/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.6935  
Epoch 2/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.6924 
Epoch 3/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.6911 
Epoch 4/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - loss: 0.6900
Epoch 5/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.6880 
Epoch 6/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.6859 
Epoch 7/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.6839 
Epoch 8/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.6817 
Epoch 9/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.6795 
Epoch 10/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.6757 
Epoch 11/50
[1m5/

In [10]:
embeddings = generate_embeddings(encoder, images)

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


In [11]:
insert_into_pinecone(embeddings, files, pinecone_index_name)
#insert_into_milvus(embeddings, milvus_collection_name)

In [12]:
len(embeddings)

137

In [13]:
def search_pinecone(embedding, pinecone_index_name, top_k):
    pc = Pinecone(api_key=pinecone_api_key)
    index = pc.Index(pinecone_index_name)
    query_result = index.query(vector=[embedding.tolist()], top_k=top_k, include_values=True)
    return query_result

In [14]:
test_embeddings = generate_embeddings(encoder, test_images)

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [15]:
eval_matrix = [[0,0],[0,0]]
overwhelm = 0
top_k = 15

In [16]:
for i in range(len(test_embeddings)):
    true_label = int(test_files[i].split("_")[2]) # Gets either 0 or 1
    results = search_pinecone(test_embeddings[i], pinecone_index_name, top_k)
    
    count = 0
    for match in results['matches']:
        if match['id'].split("_")[2] == "0":
            count += 1

    if count == top_k or count == 0:
        overwhelm += 1
    
    new_label = None
    if count > top_k/2:
        new_label = 0
    elif count == top_k/2:
        new_label = np.random.choice([0, 1])
    else:
        new_label = 1

    eval_matrix[true_label][new_label] += 1

In [17]:
eval_matrix

[[92, 8], [1, 36]]

In [18]:
def print_evaluation(eval_matrix, overwhelm):
    accuracy = (eval_matrix[0][0] + eval_matrix[1][1])/np.sum(eval_matrix)
    precision = eval_matrix[1][1]/(eval_matrix[1][1] + eval_matrix[0][1])
    recall = eval_matrix[1][1]/(eval_matrix[1][0] + eval_matrix[1][1])
    F_1 = 2/(recall**-1 + precision**-1)

    print(f"Accuracy:  | {accuracy}")
    print(f"Precision: | {precision}")
    print(f"Recall:    | {recall}")
    print(f"F_1:       | {F_1}")
    print("-------------------------")
    print(f"# w/ 100% top_k: {overwhelm}")

In [19]:
print_evaluation(eval_matrix, overwhelm)

Accuracy:  | 0.9343065693430657
Precision: | 0.8181818181818182
Recall:    | 0.972972972972973
F_1:       | 0.8888888888888888
-------------------------
# w/ 100% top_k: 49


In [20]:
results = search_pinecone(test_embeddings[0], pinecone_index_name, top_k)
    
for match in results['matches']:
    print(f"{match['id']} {match['score']}")
    #print(match["values"])

nonlinear_cluster_0_image_23.png 1.0
nonlinear_cluster_0_image_89.png 0.991448045
nonlinear_cluster_0_image_28.png 0.988063812
nonlinear_cluster_0_image_9.png 0.98005271
nonlinear_cluster_0_image_62.png 0.979839861
nonlinear_cluster_0_image_85.png 0.977246821
nonlinear_cluster_0_image_54.png 0.976236939
nonlinear_cluster_0_image_79.png 0.964013577
nonlinear_cluster_0_image_20.png 0.961850882
nonlinear_cluster_0_image_19.png 0.956474423
nonlinear_cluster_0_image_44.png 0.956393898
nonlinear_cluster_0_image_2.png 0.951297045
nonlinear_cluster_0_image_83.png 0.946653903
nonlinear_cluster_0_image_69.png 0.946342051
nonlinear_cluster_0_image_7.png 0.945337713


3d
two rings interlocked

2 Countour ellipse
one close to circle one close to needle
sum of image is the same


Generate low dimensional 

    Map them to high dimensional data
        Linear vs non linear methods

    Try to catch the low dimensional stuff

    2 clusters
    not necessarily easily separable

We want to see if they will improve the performance of transformer

speed up transformer inference

python package that can serve an open source LLM

Go to attention mechanism
    Reimplement using vector database

Go to transformer and truncate
    Throw away detail information

Singular value decomposition
    Low rank approximations
    
    Throw away small singular value part and see if its good

Numerical Linear Algebra book

Look up smallest but latest Llama models
    8b 

Run Llama 3.1 locally

Look at their stuff

Pytorch Attention Implementation