In [1]:
import os

import matplotlib.image as mplimg
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# CSV file loaded into `df`; later cells read its `fig_path` and `fig_name` columns.
DATA_PATH = 'data/figures.csv'
# Seed handed to train_test_split so the train/test partition is repeatable.
RANDOM_STATE = 42

In [3]:
def get_img_vect(path, base_dir='data/output'):
    """Load an image file and return a single 2-D plane of pixel values.

    Parameters
    ----------
    path : str
        File name relative to ``base_dir``.
    base_dir : str, optional
        Directory the image is read from. Defaults to ``'data/output'``,
        the location that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        The first channel of a multi-channel image, or the array itself
        when the file already decodes to two dimensions.
    """
    img = mplimg.imread(f'{base_dir}/{path}')
    # A single-channel file decodes to a 2-D array with no channel axis;
    # slicing it with [:, :, 0] would raise IndexError.
    if img.ndim == 2:
        return img
    return img[:, :, 0]

def to_gray(path, src_dir='data/output', dst_dir='data/gray'):
    """Write a grayscale-with-alpha ('LA') copy of one image.

    Parameters
    ----------
    path : str
        File name relative to ``src_dir``; the copy is saved under the
        same name in ``dst_dir``.
    src_dir : str, optional
        Directory the original is read from (default ``'data/output'``).
    dst_dir : str, optional
        Directory the converted copy is written to (default ``'data/gray'``).

    Returns
    -------
    None
        The function is called for its side effect only.
    """
    target = f'{dst_dir}/{path}'
    # Image.save does not create missing directories, so a fresh checkout
    # without the output folder would otherwise fail on the first file.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    # Image.open keeps the file handle open until the image is closed;
    # the context manager releases it after every conversion.
    with Image.open(f'{src_dir}/{path}') as img:
        # NOTE(review): 'LA' carries an alpha band, so this presumably
        # expects a target format that supports alpha (e.g. PNG) - confirm.
        img.convert('LA').save(target)

In [4]:
# Load the figure index; later cells read its `fig_path` and `fig_name` columns.
df = pd.read_csv(DATA_PATH)

In [5]:
# Write a grayscale copy of every listed figure. to_gray is called only for
# its side effect and returns None, so a plain loop is used instead of
# Series.apply, which would build and display a Series of None values.
# NOTE(review): no visible cell reads the data/gray copies afterwards;
# get_img_vect loads from data/output by default - confirm this step is needed.
for fig_path in df['fig_path']:
    to_gray(fig_path)

In [6]:
# One 2-D pixel array per figure, stored as an object column.
df['vect_fig'] = df['fig_path'].apply(get_img_vect)

# Encode the class names as integers with an explicit lookup table.
# Series.map yields an integer column directly, whereas list-based replace
# on an object column depends on implicit downcasting, which newer pandas
# versions deprecate.
df['target'] = df['fig_name'].map({'Circle': 0, 'Triangle': 1, 'Square': 2})
# Names missing from the table become NaN; fail here rather than passing
# unlabeled rows on to the classifier.
assert df['target'].notna().all(), 'unexpected value in fig_name'

In [7]:
# Exploratory check on the first figure only: its pixel rows are treated as
# samples and projected onto 3 principal components.
# random_state pins the result when PCA's automatic solver selection picks
# the randomized SVD; it has no effect with the exact solver.
pca = PCA(n_components=3, random_state=RANDOM_STATE)
pca.fit_transform(df.loc[0, 'vect_fig'])

array([[-0.01479925, -0.00156214, -0.00100291],
       [-0.01479802, -0.00156192, -0.00100315],
       [-0.01479903, -0.00156176, -0.00100311],
       [-0.01479904, -0.00156205, -0.00100299],
       [-0.01479904, -0.00156206, -0.001003  ],
       [-0.01479904, -0.00156206, -0.001003  ],
       [-0.01479904, -0.00156206, -0.001003  ],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.001

In [8]:
# Fraction of the first figure's variance captured by each of the 3 components
# of the `pca` object fitted in the previous cell.
pca.explained_variance_ratio_

array([0.7668844 , 0.09467451, 0.04083807], dtype=float32)

In [9]:
# Reduce every figure to a (rows x 3) score matrix.
# random_state pins the result when PCA's automatic solver selection picks
# the randomized SVD, so repeated runs give identical features.
# NOTE(review): a separate PCA is fitted for each image, so every sample is
# expressed in its own basis. Consider fitting a single PCA on the flattened
# training images and transforming train and test with it.
df['pca'] = df['vect_fig'].apply(
    lambda img: PCA(n_components=3, random_state=RANDOM_STATE).fit_transform(img)
)
df.loc[0, 'pca']

array([[-0.01479925, -0.00156214, -0.00100291],
       [-0.01479802, -0.00156192, -0.00100315],
       [-0.01479903, -0.00156176, -0.00100311],
       [-0.01479904, -0.00156205, -0.00100299],
       [-0.01479904, -0.00156206, -0.001003  ],
       [-0.01479904, -0.00156206, -0.001003  ],
       [-0.01479904, -0.00156206, -0.001003  ],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.00100299],
       [-0.01479904, -0.00156206, -0.001

In [10]:
# Hold out 20% of the figures for evaluation; the seed keeps the partition fixed.
X_train, X_test, y_train, y_test = train_test_split(
    df['pca'],
    df['target'],
    test_size=0.2,
    random_state=RANDOM_STATE,
)
# Every sample is a 2-D score matrix; unroll each into one row so both
# partitions become plain 2-D feature arrays.
X_train, X_test = (
    np.array([scores.flatten() for scores in split])
    for split in (X_train, X_test)
)

In [11]:
# Distance-weighted 3-nearest-neighbour classifier, fitted on the training rows.
mdl = KNeighborsClassifier(
    n_neighbors=3, weights='distance', leaf_size=20
).fit(X_train, y_train)
# Accuracy on the data the model was fitted on. Presumably near-perfect
# because each training row is among its own nearest neighbours.
mdl.score(X_train, y_train)

0.999375

In [12]:
# Accuracy on the held-out 20% partition.
mdl.score(X_test, y_test)

0.811