# Initial UMAP

This is an initial attempt at running UMAP on my dataset of images. The results will eventually be fed into a basic Dash frontend.

*Aniket Pant, Personal*

In [1]:
import os
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import umap.umap_ as umap
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from sklearn import preprocessing
from concurrent.futures import ThreadPoolExecutor

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [2]:
def process_image(image_path, label, size=(128, 128)):
    """Load one image and convert it into a flat, normalised feature vector.

    Args:
        image_path: Path of the image file to open.
        label: Label associated with the image; returned unchanged.
        size: Target ``(width, height)`` passed to ``Image.resize``.

    Returns:
        A tuple ``(flat_img_array, label, image_path)``. ``flat_img_array``
        is a 1-D float array of RGB pixel values scaled to the range [0, 1],
        with ``size[0] * size[1] * 3`` elements.
    """
    # The context manager closes the underlying file handle as soon as the
    # pixels have been read, instead of leaving it to the garbage collector.
    with Image.open(image_path) as source:
        # Force three channels: grayscale, palette, RGBA or CMYK files would
        # otherwise flatten to a different length than RGB ones, and the
        # vectors could no longer be stacked into a single 2-D matrix.
        img = source.convert("RGB").resize(size)

    # Normalize pixel values to be between 0 and 1
    img_array = np.array(img) / 255.0
    # Flatten the image
    flat_img_array = img_array.flatten()

    return flat_img_array, label, image_path

def load_and_process_images(path, size=(128, 128), max_per_dir=200, max_workers=5):
    """Load the ``.jpg`` images found in each sub-directory of *path*.

    The name of each sub-directory is used as the label of the images it
    contains. Images are processed by ``process_image`` on a thread pool.

    Args:
        path: Root directory whose immediate sub-directories hold the images.
        size: Target ``(width, height)`` forwarded to ``process_image``.
        max_per_dir: Number of directory entries inspected in each
            sub-directory; the cap is applied before the ``.jpg`` filter.
            ``None`` inspects every entry.
        max_workers: Number of worker threads in the pool.

    Returns:
        Three parallel lists ``(image_files, labels, image_paths)`` holding
        the flattened pixel vectors, their labels and their file paths, in
        submission order.

    Raises:
        Any exception raised by ``process_image`` for an individual file is
        re-raised here when its result is collected.
    """
    image_files = []
    labels = []
    image_paths = []

    # Keep only real directories: a stray file next to the class folders
    # (e.g. a ".DS_Store") would otherwise make the inner os.listdir fail.
    dirs = [
        entry for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    ]
    print(f"Found directories: {dirs}")
    print("-–––––-––––––––––-–––––-––––––––––-–––––-––––––––––")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for label in dirs:
            full_path = os.path.join(path, label)
            jpg_paths = [
                os.path.join(full_path, name)
                for name in os.listdir(full_path)[:max_per_dir]
                if name.endswith(".jpg")
            ]

            # Submit the whole directory before collecting anything. Calling
            # result() right after each submit blocks until that single task
            # finishes, which keeps all but one worker idle.
            futures = [
                executor.submit(process_image, image_path, label, size)
                for image_path in jpg_paths
            ]

            # Collect in submission order so the three lists stay aligned.
            for future in tqdm(futures):
                flat_img_array, image_label, image_path = future.result()
                image_files.append(flat_img_array)
                labels.append(image_label)
                image_paths.append(image_path)

    return image_files, labels, image_paths

In [None]:
# Root folder of the image dataset; the names of its sub-directories are
# used as the class labels.
path = "wikiart"  # your path here
# Three parallel lists: flattened pixel vectors, labels, and image file paths.
image_files, labels, image_paths = load_and_process_images(path)

Found directories: ['New_Realism', 'Mannerism_Late_Renaissance', 'Synthetic_Cubism', 'Symbolism', 'Impressionism', 'Fauvism', 'Cubism', 'Romanticism', 'Analytical_Cubism', 'Pointillism', 'Realism', 'Art_Nouveau_Modern', 'Ukiyo_e', 'Abstract_Expressionism', 'Expressionism', 'Contemporary_Realism', 'Action_painting', 'Northern_Renaissance', 'Baroque', 'Post_Impressionism', 'Rococo', 'Early_Renaissance', 'Minimalism', 'Naive_Art_Primitivism', 'High_Renaissance', 'Color_Field_Painting', 'Pop_Art']
-–––––-––––––––––-–––––-––––––––––-–––––-––––––––––


100%|██████████| 314/314 [00:18<00:00, 17.29it/s]
100%|██████████| 1279/1279 [01:16<00:00, 16.73it/s]
100%|██████████| 216/216 [00:09<00:00, 23.24it/s]
100%|██████████| 4528/4528 [03:37<00:00, 20.79it/s]
100%|██████████| 13060/13060 [10:03<00:00, 21.62it/s]
100%|██████████| 934/934 [00:29<00:00, 31.60it/s]
100%|██████████| 2235/2235 [01:23<00:00, 26.92it/s]
100%|██████████| 7019/7019 [05:50<00:00, 20.04it/s]
100%|██████████| 110/110 [00:03<00:00, 34.40it/s]
100%|██████████| 513/513 [00:24<00:00, 21.08it/s]
 86%|████████▌ | 9204/10733 [14:33<6:29:45, 15.29s/it] 

In [None]:
def perform_umap(image_files, n_neighbors=15, min_dist=0.1, n_components=2):
    """Standardise the feature vectors and embed them with UMAP.

    Args:
        image_files: Feature vectors, one per sample, as accepted by
            ``StandardScaler.fit_transform``.
        n_neighbors: Forwarded to ``umap.UMAP``.
        min_dist: Forwarded to ``umap.UMAP``.
        n_components: Forwarded to ``umap.UMAP``.

    Returns:
        The result of ``UMAP.fit_transform`` on the standardised data.
    """
    # Give every feature zero mean and unit variance before embedding.
    standardised = StandardScaler().fit_transform(image_files)

    umap_settings = {
        "n_neighbors": n_neighbors,
        "min_dist": min_dist,
        "n_components": n_components,
    }
    return umap.UMAP(**umap_settings).fit_transform(standardised)

In [None]:
# Embed the flattened images using perform_umap's default settings
# (n_components=2, so the result is a 2-D embedding).
embeddings = perform_umap(image_files)

In [None]:
def plot_embeddings(embeddings, labels):
    """Show a scatter plot of the first two embedding columns, coloured by label.

    Args:
        embeddings: 2-D array; column 0 is plotted on the x axis and
            column 1 on the y axis.
        labels: One label per row of ``embeddings``; encoded to integers
            with ``LabelEncoder`` and used as the point colours.
    """
    plt.figure(figsize=(6, 4))

    # Map each distinct label to an integer so it can drive the colour map.
    class_ids = preprocessing.LabelEncoder().fit_transform(labels)

    xs = embeddings[:, 0]
    ys = embeddings[:, 1]
    # Small, translucent markers; per-point text annotations are not drawn.
    plt.scatter(xs, ys, c=class_ids, s=1, alpha=0.3)

    plt.colorbar()
    plt.axis("off")
    plt.show()

In [None]:
# Display the embedding as a scatter plot coloured by label.
plot_embeddings(embeddings, labels)

In [None]:
# Persist the embedding together with its labels and source paths as .npy
# files in the current working directory.
np.save(file="embeddings.npy", arr=embeddings)
np.save(file="labels.npy", arr=labels)
np.save(file="paths.npy", arr=image_paths)