In [1]:
!pip install plotly scikit-image scikit-learn



In [2]:
import zipfile

# Unpack the archive that sits next to the notebook into the "data" directory.
# The context manager closes the archive even if extraction fails.
with zipfile.ZipFile("data.zip", mode="r") as archive:
    archive.extractall(path="data")

In [3]:
from glob import glob
import os

import numpy as np
import pandas as pd
import plotly.express as px
import skimage as ski

In [4]:
# Glob pattern for the extracted images: any PNG one directory level below
# the images folder.
path = "./data/data/images/*/*.png"

# glob() makes no ordering promise, so sort in place to get a stable file order.
file_paths = glob(path)
file_paths.sort()

In [5]:
# Build one record per image: the name of the directory that contains the
# file is used as its type label, and the base name as the file name.
type_to_file = []
for file_path in file_paths:
    # os.path.split understands every separator the platform accepts and,
    # unlike os.path.normcase, never lower-cases the components on Windows,
    # so labels and file names keep their original spelling.
    directory, file_name = os.path.split(file_path)
    type_name = os.path.basename(directory)
    type_to_file.append({"type": type_name, "file": file_name})

# Peek at the first few records.
type_to_file[:5]

[{'type': 'bug', 'file': '010.png'},
 {'type': 'bug', 'file': '011.png'},
 {'type': 'bug', 'file': '012.png'},
 {'type': 'bug', 'file': '013.png'},
 {'type': 'bug', 'file': '014.png'}]

In [6]:
# Read every image file into memory, keeping the same order as file_paths.
image_list = list(map(ski.io.imread, file_paths))

In [None]:
# Resize every image to a common (128, 128, 4) shape so the images can later be
# stacked into one array. preserve_range=True keeps the input value scale
# instead of converting to the library's float convention.
# A (475, 475, 4) target was tried earlier and is left disabled.
image_list = [
    ski.transform.resize(image, (128, 128, 4), preserve_range=True)
    for image in image_list
]

In [8]:
# Pack the per-image arrays into a single uint8 array with the image index as
# the leading axis.
# NOTE(review): if image_list holds floats (e.g. after resizing with
# preserve_range=True), this conversion drops the fractional part instead of
# rounding - confirm that is acceptable.
image_array = np.asarray(image_list, dtype=np.uint8)
image_array.shape

(1025, 128, 128, 4)

In [9]:
# Keep only the first three entries of the last axis, i.e. drop the fourth
# (alpha) channel.
image_without_alpha_array = image_array[..., :3]
image_without_alpha_array.shape

(1025, 128, 128, 3)

In [10]:
# Flatten each image into a single feature row: one row per image, one column
# per pixel/channel value.
n_images = len(image_without_alpha_array)
image_without_alpha_flattened_array = image_without_alpha_array.reshape(n_images, -1)
image_without_alpha_flattened_array.shape

(1025, 49152)

In [11]:
# Sanity check: render the first image after resizing and alpha removal.
first_image = image_without_alpha_array[0]
fig = px.imshow(first_image)
fig.show(renderer="colab")

In [12]:
from sklearn.preprocessing import LabelEncoder

# Collect the type name of every image, in the same order as the image rows.
labels = [record["type"] for record in type_to_file]

# Map the type names to integer class ids; `le` is kept so the ids can be
# translated back to names later.
le = LabelEncoder()
encoded_labels = le.fit_transform(labels)

# Column vector (n_images, 1) so it can be stacked next to the pixel matrix.
y = encoded_labels.reshape(-1, 1)
y.shape

(1025, 1)

In [13]:
# Append the encoded label as the last column of the flattened pixel matrix.
#
# The labels are cast to uint8 *before* stacking. Stacking the uint8 pixels
# with the default-integer label column would first promote the whole matrix
# to the wider integer dtype, only for it to be cast back to uint8 afterwards.
#
# A uint8 cast silently wraps values outside 0..255, which would corrupt the
# label column, so fail loudly instead.
assert np.all((y >= 0) & (y <= np.iinfo(np.uint8).max)), "label ids do not fit in uint8"
labeled_data = np.hstack((image_without_alpha_flattened_array, y.astype(np.uint8)))
labeled_data.shape

(1025, 49153)

In [14]:
%%time

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis(n_components=2)
X_reduced = lda.fit_transform(labeled_data[:, :-1], y.reshape(-1))

CPU times: user 1min 6s, sys: 1.3 s, total: 1min 7s
Wall time: 40.1 s


In [15]:
# Inspect the LDA projection: one row per image, two columns (n_components=2).
X_reduced

array([[-9.58686033e-02,  1.20279057e+00],
       [ 8.78356656e-01,  9.39676675e-01],
       [-3.35181580e-01,  1.74493794e+00],
       ...,
       [-5.67942705e+00,  2.58394931e+00],
       [ 3.95149152e-04, -1.24434248e+00],
       [ 2.66310077e+00, -1.52785558e+00]])

In [16]:
# Recover the type name of every row from the label column stored at the end
# of labeled_data; the encoder expects integer class ids.
encoded_types = labeled_data[:, -1].astype(int)
c = le.inverse_transform(encoded_types)
c

array(['bug', 'bug', 'bug', ..., 'water', 'water', 'water'], dtype='<U8')

In [17]:
# Tabulate the two LDA components together with the type name of each image.
lda_df = pd.DataFrame(X_reduced, columns=["LDA1", "LDA2"]).assign(Type=c)
lda_df.head()

Unnamed: 0,LDA1,LDA2,Type
0,-0.095869,1.202791,bug
1,0.878357,0.939677,bug
2,-0.335182,1.744938,bug
3,1.091616,0.521894,bug
4,0.506635,2.055208,bug


In [20]:
# Scatter plot of the two LDA components, one colour per type.
scatter_options = {"x": "LDA1", "y": "LDA2", "color": "Type"}
fig = px.scatter(lda_df, **scatter_options)
# A taller layout (height=18 * 200) was tried earlier and is left disabled.
fig.show(renderer="colab")

In [21]:
# Export the projection with the encoded label appended as a third column.
# ndarray.tofile writes the raw values only (no shape or dtype header), so a
# reader has to know the layout in advance.
lda_with_labels = np.hstack((X_reduced, y.reshape(-1, 1)))
lda_with_labels.tofile("lda_2_128_resolution.dat")