In [1]:
from glob import glob
import os

import numpy as np
import pandas as pd
import plotly.express as px
import skimage as ski

In [2]:
path = "../../data/images/*/*.png"
file_paths = sorted(glob(path))

In [3]:
type_to_file = []
for file_path in file_paths:
    parts = os.path.normcase(file_path).split(os.sep)
    type_name = parts[-2]
    file_name = parts[-1]
    type_to_file.append({"type": type_name, "file": file_name})
    
type_to_file[:5]

[{'type': 'bug', 'file': '010.png'},
 {'type': 'bug', 'file': '011.png'},
 {'type': 'bug', 'file': '012.png'},
 {'type': 'bug', 'file': '013.png'},
 {'type': 'bug', 'file': '014.png'}]

In [4]:
image_list = [ski.io.imread(filename) for filename in file_paths]

In [5]:
image_list[:5]

[array([[[255, 255, 255,   0],
         [255, 255, 255,   0],
         [255, 255, 255,   0],
         ...,
         [255, 255, 255,   0],
         [255, 255, 255,   0],
         [255, 255, 255,   0]],
 
        [[255, 255, 255,   0],
         [255, 255, 255,   0],
         [255, 255, 255,   0],
         ...,
         [255, 255, 255,   0],
         [255, 255, 255,   0],
         [255, 255, 255,   0]],
 
        [[255, 255, 255,   0],
         [255, 255, 255,   0],
         [255, 255, 255,   0],
         ...,
         [255, 255, 255,   0],
         [255, 255, 255,   0],
         [255, 255, 255,   0]],
 
        ...,
 
        [[255, 255, 255,   0],
         [255, 255, 255,   0],
         [255, 255, 255,   0],
         ...,
         [255, 255, 255,   0],
         [255, 255, 255,   0],
         [255, 255, 255,   0]],
 
        [[255, 255, 255,   0],
         [255, 255, 255,   0],
         [255, 255, 255,   0],
         ...,
         [255, 255, 255,   0],
         [255, 255, 255,   0],
    

In [6]:
for i, img in enumerate(image_list):
    if img.shape != (475, 475, 4):
        # ski.transform.resize(img, (475, 475, 4))
        image_list[i] = ski.transform.resize(img, (475, 475, 4), preserve_range=True)

In [None]:
image_float_scaled_list = [ski.util.img_as_float(image) for image in image_list]

In [7]:
image_array = np.array(image_list, dtype=np.uint8)
image_array.shape

(1025, 475, 475, 4)

In [8]:
image_without_alpha_array = image_array[:, :, :, :3]
image_without_alpha_array.shape

(1025, 475, 475, 3)

In [9]:
image_without_alpha_flattened_array = image_without_alpha_array.reshape(image_without_alpha_array.shape[0], -1)
image_without_alpha_flattened_array.shape

(1025, 676875)

In [10]:
fig = px.imshow(
    image_without_alpha_array[0],
)
fig

In [11]:
from sklearn.preprocessing import LabelEncoder

labels = [tf["type"] for tf in type_to_file]
le = LabelEncoder()
y = le.fit_transform(labels)
y.shape

(1025,)

In [12]:
labeled_data = np.hstack((image_without_alpha_flattened_array, y.reshape(-1, 1))).astype(np.uint8)
labeled_data.shape

(1025, 676876)

In [15]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_reduced = pca.fit_transform(labeled_data[:, :-1])

In [16]:
c = le.inverse_transform(labeled_data[:, -1].astype(int))
c

array(['bug', 'bug', 'bug', ..., 'water', 'water', 'water'],
      shape=(1025,), dtype='<U8')

In [17]:
pca_df = pd.DataFrame(X_reduced, columns=["PCA1", "PCA2"])
pca_df["Type"] = c
pca_df.head()

Unnamed: 0,PCA1,PCA2,Type
0,-36032.865407,-25663.356138,bug
1,874.933076,-43279.255353,bug
2,-21962.108994,1299.777823,bug
3,28363.153798,-34971.725715,bug
4,57116.7227,11590.930353,bug


In [18]:
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "browser"

fig = px.scatter(pca_df, x="PCA1", y="PCA2", color="Type")
# fig.update_layout(height=18 * 200)
fig.show()