# Process generated image dataset 
1. Extract 20 features from the image and store in features.csv
2. Use PCA to get the principal components of the 20 features and store them in components.csv

## 1. Initialization

In [1]:
import os  # imported here because this cell runs before the main import cell

# Dataset variant to process; only used to build the "Dataset_Step=<step>"
# folder names below.
step = 2

# Paths are assembled with os.path.join so the notebook is not tied to the
# Windows path separator. On Windows the resulting strings are unchanged.

# Input: folder with the generated images and the CSV file that lists them.
img_folder = os.path.join(".", "GeneratedDataset", "Dataset_Step=" + str(step))
origin_path = os.path.join(img_folder, "data.csv")

# Output: every file this notebook writes lives under processed_folder.
processed_folder = os.path.join(".", "ProcessedData", "Dataset_Step=" + str(step))
features_path = os.path.join(processed_folder, "features.csv")
calc_components_path = os.path.join(processed_folder, "calc_components.csv")
components_path = os.path.join(processed_folder, "components.csv")
# %load_ext autoreload
# %autoreload 2

In [6]:
import pandas as pd
from enum import Enum
import os
import numpy as np
import cv2
from matplotlib import pyplot as plt
import timeit
from IPython.display import clear_output
from sklearn.decomposition import PCA
from feature_extraction import extract_features
from draw_tools import draw_points, draw_contours

## 2. Extract Features from images
Iteratively read each image, process it, extract its features, and store them in features_pd  
If you already have the features.csv file, jump to [here](#PCA)

### Read CSV data from original data

In [3]:
# Load the table that lists the generated samples (it carries an ImgName
# column used later to locate each image) and preview the first rows.
origin_pd = pd.read_csv(filepath_or_buffer=origin_path)
origin_pd.head(n=10)

Unnamed: 0,Gamma1,Gamma2,Gamma3,Alpha1,Alpha2,Beta,thumb_x,thumb_y,index_x,index_y,ImgName
0,0.0,0.0,0.0,0.0,0.0,0.0,1,2,1,-2,2020_4_2_14_49_53_14_7905.png
1,0.0,0.0,-2.0,0.0,0.0,0.0,1,2,1,-1,2020_4_2_14_49_53_15_1130.png
2,0.0,0.0,0.0,0.0,2.0,0.0,1,2,1,-1,2020_4_2_14_49_53_15_2150.png
3,0.0,0.0,0.0,0.0,-2.0,0.0,1,3,1,-2,2020_4_2_14_49_53_15_2767.png
4,0.0,0.0,0.0,0.0,0.0,2.0,0,3,0,-2,2020_4_2_14_49_53_15_3038.png
5,0.0,0.0,0.0,0.0,0.0,-2.0,3,2,2,-2,2020_4_2_14_49_53_15_3463.png
6,0.0,0.0,-2.0,2.0,0.0,0.0,1,1,1,0,2020_4_2_14_49_53_15_5296.png
7,0.0,0.0,-2.0,0.0,2.0,0.0,1,2,1,-1,2020_4_2_14_49_53_15_6131.png
8,0.0,0.0,-2.0,0.0,0.0,2.0,0,2,0,-1,2020_4_2_14_49_53_15_7131.png
9,0.0,0.0,0.0,-2.0,2.0,0.0,1,3,1,-2,2020_4_2_14_49_54_15_9813.png


### Initialize a features_pd dataframe

In [4]:
class Features(Enum):
    """Names of the 20 feature values: ten points, each stored as x then y.

    Members are declared in the order in which the feature columns are laid
    out; each member's value is its position in that order.
    """
    # Defect points
    Defect_left_x, Defect_left_y = 0, 1          # left defect point
    Defect_right_x, Defect_right_y = 2, 3        # right defect point
    # Finger centroids
    Centroid_up_x, Centroid_up_y = 4, 5          # up finger (thumb) centroid point
    Centroid_down_x, Centroid_down_y = 6, 7      # down finger (index finger) centroid point
    # Corners of the boundary
    Top_left_x, Top_left_y = 8, 9                # top left point of the boundary
    Top_right_x, Top_right_y = 10, 11            # top right point of the boundary
    Bottom_left_x, Bottom_left_y = 12, 13        # bottom left point of the boundary
    Bottom_right_x, Bottom_right_y = 14, 15      # bottom right point of the boundary
    # Extreme points of the fingers
    Lowest_up_x, Lowest_up_y = 16, 17            # lowest point of up finger (thumb)
    Rightest_down_x, Rightest_down_y = 18, 19    # rightest point of down finger (index finger)

class Labels(Enum):
    """Names of the non-feature columns, in the order they follow the features."""
    Thumb_x, Thumb_y = 0, 1    # taken from the thumb_x / thumb_y columns of the original data
    Index_x, Index_y = 2, 3    # taken from the index_x / index_y columns of the original data
    ImgName = 4                # file name of the source image

In [5]:
# Column names are the enum member names: feature columns first, then the
# label columns, both in declaration order. `.name` gives the member name
# directly, without parsing the "Class.Member" string form.
col = [feature.name for feature in Features]
col.extend(label.name for label in Labels)
# Empty frame that only fixes the column layout.
features_pd = pd.DataFrame(None, columns=col)
features_pd.head(10)

Unnamed: 0,Defect_left_x,Defect_left_y,Defect_right_x,Defect_right_y,Centroid_up_x,Centroid_up_y,Centroid_down_x,Centroid_down_y,Top_left_x,Top_left_y,...,Bottom_right_y,Lowest_up_x,Lowest_up_y,Rightest_down_x,Rightest_down_y,Thumb_x,Thumb_y,Index_x,Index_y,ImgName


### Iterate over the images and add their data to the features_pd dataframe

In [7]:
def calc_features_points(pd, row, is_draw=False):
    """Extract the feature points of one image listed in the sample table.

    Parameters
    ----------
    pd : pandas.DataFrame
        Sample table with an 'ImgName' column. The parameter name shadows the
        pandas module inside this function; it is kept unchanged so existing
        calls keep working.
    row : index label
        Label (as used by ``.loc``) of the row whose image is processed.
    is_draw : bool, default False
        When True, the masked image is passed to ``extract_features`` as its
        output image and is then displayed with matplotlib.

    Returns
    -------
    The value returned by ``extract_features`` for the first contour of the
    mask, or None when the mask contains no contour at all.

    Raises
    ------
    FileNotFoundError
        If the image file cannot be read.
    """
    img_name = pd.loc[row, 'ImgName']
    img_path = os.path.join(img_folder, img_name)
    bgr_image = cv2.imread(img_path)
    # cv2.imread reports failure by returning None instead of raising, which
    # would otherwise surface later as an unrelated AttributeError.
    if bgr_image is None:
        raise FileNotFoundError("Cannot read image: " + str(img_path))

    # ---------------------------------------------
    # 1.1 Get the mask and its contour and apply the mask to image
    # ---------------------------------------------
    gray_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)
    im_height, im_width = gray_image.shape
    # Binary mask: gray values above 10 become 1, everything else 0.
    _, mask = cv2.threshold(gray_image, 10, 1, cv2.THRESH_BINARY)
    finger_image = cv2.bitwise_and(bgr_image, bgr_image, mask=mask)
    # Get all contours; CHAIN_APPROX_NONE keeps every contour point.
    # OpenCV 3.x returns (image, contours, hierarchy) while other versions
    # return (contours, hierarchy); the contours are always second from last.
    contours = cv2.findContours(mask, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)[-2]
    if len(contours) == 0:
        # Nothing above the threshold: no features can be extracted.
        return None
    # NOTE(review): the first contour is used as-is; if a mask can contain
    # several contours, confirm that the first one is the intended one.
    contour = contours[0]

    # ---------------------------------------------
    # 1.2 Segment features
    # ---------------------------------------------
    # presumably extract_features draws onto out_image when one is given -
    # TODO confirm against feature_extraction.
    out_image = finger_image if is_draw else None
    features = extract_features(contour, im_height, im_width, out_image)

    # ---------------------------------------------
    # 1.3 Optionally show the image
    # ---------------------------------------------
    if is_draw:
        plt.imshow(cv2.cvtColor(finger_image, cv2.COLOR_BGR2RGB))
        plt.show()

    return features

In [None]:
start_time = timeit.default_timer()

total = len(origin_pd)
# Clearing and re-printing the progress for every sample is costly on a large
# dataset, so the display is refreshed only every `progress_every` samples.
progress_every = 1000
expected_len = len(features_pd.columns)

# Rows are collected in a plain list and converted to a DataFrame once at the
# end. Appending to the DataFrame per row copies the whole frame every time,
# and DataFrame.append is not available in pandas >= 2.0. Building the frame
# from the list also makes this cell safe to re-run (no duplicated rows) and
# keeps numeric values numeric instead of coercing them to strings.
records = []

for row in range(total):
    features = calc_features_points(origin_pd, row)
    if features is not None:
        sample = origin_pd.loc[row]
        labels = [sample['thumb_x'], sample['thumb_y'],
                  sample['index_x'], sample['index_y'],
                  sample['ImgName']]
        record = np.array(features).flatten().tolist() + labels
        # Fail on the offending row instead of at the very end of the run.
        if len(record) != expected_len:
            raise ValueError("Row " + str(row) + " produced " + str(len(record))
                             + " values, expected " + str(expected_len))
        records.append(record)

    done = row + 1
    if done % progress_every == 0 or done == total:
        clear_output(wait=True)
        stop_time = timeit.default_timer()
        elapsed = stop_time - start_time
        print("Current Progress:", np.round(done / total * 100, 2), "%")
        print("Current Run Time:", np.round(elapsed / 60, 2), "minutes")
        # Elapsed time scaled by the share of samples still to be processed.
        print("Expected Run Time:", np.round(elapsed * (total / done - 1) / 60, 2), "minutes")

features_pd = pd.DataFrame(records, columns=features_pd.columns)
print("Generate", len(features_pd), "items from", len(origin_pd), "samples")

In [11]:
# Summary of the extraction: rows kept in features_pd versus samples listed
# in the original table.
generated_count, sample_count = len(features_pd), len(origin_pd)
print("Generate", generated_count, "items from", sample_count, "samples")

Generate 465470 items from 506327 samples


### Save to .csv file

In [12]:
# os.makedirs also creates missing parent folders (os.mkdir fails when the
# parent of processed_folder does not exist yet); exist_ok=True makes the
# cell safe to re-run.
os.makedirs(processed_folder, exist_ok=True)
features_pd.to_csv(features_path, index=False)

<span id="PCA"></span>
## 3. Use PCA for main components of images
This section uses PCA to extract the principal components of the features.  
If you already have the components.csv file, jump to [here](#KNN)

#### Read features data from csv file and pack to numpy array

In [13]:
# Reload the saved features so this section can be run on its own.
features_pd = pd.read_csv(features_path)
# The first 20 columns hold the feature values; the columns after them are
# the labels and the image name, which are not part of the PCA input.
X = features_pd.iloc[:, :20].to_numpy()
print("Shape of feature numpy for PCA", X.shape)

Shape of feature numpy for PCA (465470, 20)


#### Use PCA to get component analysis

In [14]:
# Fit with every component kept; how many to retain is decided afterwards
# from the cumulative explained-variance ratio.
pca = PCA()
pca.fit(X)
# X.shape[1] is the number of input features. It is used instead of
# pca.n_features_, which newer scikit-learn releases no longer provide.
print("Extract", pca.n_components_, "components from", X.shape[1], "features")
print("Percentages of components\n", np.round(pca.explained_variance_ratio_, 3))
# Running total of the ratios, one entry per component, computed in a single
# pass instead of re-summing a growing slice for each component.
accumulated_ratio = np.round(np.cumsum(pca.explained_variance_ratio_), 3).tolist()
print("Accumulated percentages of components\n", list(enumerate(accumulated_ratio)))
# NOTE(review): in the recorded run the first component alone has a ratio of
# 1.0 (rounded). The input is not scaled before fitting, so it is worth
# checking whether a single feature's scale or outliers dominate - confirm.

Extract 20 components from 20 features
Percentages of components
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Accumulated percentages of components
 [(0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0), (9, 1.0), (10, 1.0), (11, 1.0), (12, 1.0), (13, 1.0), (14, 1.0), (15, 1.0), (16, 1.0), (17, 1.0), (18, 1.0), (19, 1.0)]


#### Choose components and save to disk

In [15]:
# Keep the smallest number of leading components whose accumulated ratio
# reaches the threshold (position of the first match, turned into a count).
threshold = 0.95
components_num = np.flatnonzero(np.array(accumulated_ratio) >= threshold)[0] + 1
print("Choose", components_num, "components")

# ---------------------------------------------
# Store calculation method
# ---------------------------------------------
# One row of weights per kept component, one column per feature.
# NOTE(review): pca.transform also subtracts pca.mean_, which is not written
# here - confirm whether consumers of this file need it.
weight_list = pca.components_[:components_num]
calc_component_pd = pd.DataFrame(weight_list, columns=features_pd.columns[:20])
calc_component_pd.to_csv(calc_components_path, index=False)

# ---------------------------------------------
# Store component data
# ---------------------------------------------
col = ["Component_" + str(component_idx) for component_idx in range(components_num)]
component_data = pca.transform(X)[:, :components_num]
component_pd = pd.DataFrame(component_data, columns=col)
# Everything after the 20 feature columns is carried over unchanged and
# joined back on the row index.
labels_pd = features_pd.iloc[:, 20:]
component_pd = component_pd.join(labels_pd)
component_pd.to_csv(components_path, index=False)

Choose 1 components


<span id="KNN"></span>
## 4. Use KNN

In [1]:
import os  # the imports of this cell do not include os

# Built with os.path.join so the paths are not tied to the Windows separator.
# NOTE(review): these replace the paths set in the initialization cell and
# point at models/pca rather than at the processed-data folder that the PCA
# section writes to - confirm the files are expected at this location.
components_path = os.path.join("models", "pca", "components.csv")
calc_components_path = os.path.join("models", "pca", "calc_components.csv")

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [2]:
components_pd = pd.read_csv(components_path)

# Columns are selected by name instead of by position, so the inputs and
# targets stay correct when the PCA step keeps more than one component
# (positional slicing assumed exactly one component column).
component_cols = [name for name in components_pd.columns
                  if str(name).startswith("Component_")]
if not component_cols:
    raise ValueError("No 'Component_*' columns found in " + str(components_path))
X = components_pd[component_cols].to_numpy()
# Targets: thumb x and y, the two columns that follow the components.
Y = components_pd[["Thumb_x", "Thumb_y"]].to_numpy()
print("Shape of X", X.shape, "Shape of Y", Y.shape)

# Fixed random_state keeps the 70/30 split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
print("Train dataset shape")
print("X:", X_train.shape, " Y:", Y_train.shape)
print("Test dataset shape")
print("X:", X_test.shape, " Y:", Y_test.shape)

Shape of X (465470, 1) Shape of Y (465470, 2)
Train dataset shape
X: (325829, 1)  Y: (325829, 2)
Test dataset shape
X: (139641, 1)  Y: (139641, 2)


In [24]:
neighbors = 41
# One classifier per target column: neigh_x learns Y[:, 0], neigh_y learns
# Y[:, 1]. fit() returns the estimator itself, so it can be chained.
neigh_x = KNeighborsClassifier(n_neighbors=neighbors).fit(X_train, Y_train[:, 0])
neigh_y = KNeighborsClassifier(n_neighbors=neighbors).fit(X_train, Y_train[:, 1])
# Scores on the training split first, then on the held-out split.
train_sc1, train_sc2 = (neigh_x.score(X_train, Y_train[:, 0]),
                        neigh_y.score(X_train, Y_train[:, 1]))
test_sc1, test_sc2 = (neigh_x.score(X_test, Y_test[:, 0]),
                      neigh_y.score(X_test, Y_test[:, 1]))
print(train_sc1, train_sc2, test_sc1, test_sc2)

0.31252896457958007 0.2674746569519595 0.2606111385624566 0.2150013248258033


In [22]:
def fit_and_score_knn(n_neighbors):
    """Fit one KNN classifier per target column and score both splits.

    Parameters
    ----------
    n_neighbors : int
        Number of neighbours used by both classifiers.

    Returns
    -------
    tuple
        ``(model_x, model_y, scores)`` where ``scores`` is
        ``(train_x, train_y, test_x, test_y)``: the ``score`` of each model
        on the training split and on the test split.
    """
    model_x = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, Y_train[:, 0])
    model_y = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, Y_train[:, 1])
    scores = (
        model_x.score(X_train, Y_train[:, 0]),
        model_y.score(X_train, Y_train[:, 1]),
        model_x.score(X_test, Y_test[:, 0]),
        model_y.score(X_test, Y_test[:, 1]),
    )
    return model_x, model_y, scores


max_neigh = None
max_test_sc1 = None
# Every evaluated setting is kept as (neighbors, train_sc1, train_sc2,
# test_sc1, test_sc2), so the results gathered so far remain available in
# the kernel if this long-running search is interrupted.
knn_search_log = []
# NOTE(review): the best setting is picked by the first target's score on
# the test split only; the second target is ignored and the test split
# doubles as validation data - confirm this is intended.
for neighbors in range(41, 200):
    neigh_x, neigh_y, (train_sc1, train_sc2, test_sc1, test_sc2) = fit_and_score_knn(neighbors)
    print(train_sc1, train_sc2, test_sc1, test_sc2)
    knn_search_log.append((neighbors, train_sc1, train_sc2, test_sc1, test_sc2))
    if max_test_sc1 is None or test_sc1 > max_test_sc1:
        max_neigh = neighbors
        max_test_sc1 = test_sc1

0.31252896457958007 0.2674746569519595 0.2606111385624566 0.2150013248258033
0.31126449763526265 0.26661837957947265 0.259988112373873 0.2156744795582959
0.30996933974569485 0.265700720316485 0.2597661145365616 0.2158033815283477
0.3087570474083031 0.2652710470829791 0.2593149576413804 0.2157604142049971
0.30842558519959856 0.26398509647698637 0.2588208334228486 0.21530925730981587
0.3072593292800825 0.26323316831835103 0.25887812318731607 0.21524480632478998
0.3063293936389947 0.2625641057118918 0.2580402603819795 0.21545964294154296
0.3055836036694094 0.2616894137722548 0.2579972930586289 0.21507293703138763
0.3049206792520003 0.26080244545451753 0.25766071569238264 0.21523764510423157
0.30360403770075717 0.25981419701745395 0.2570018834010069 0.21492971262021898
0.302431643592191 0.25915127260004484 0.2566509835936437 0.21489390651742682
0.30158764259780435 0.25842696629213485 0.25695891607765625 0.21508725947250448
0.3011886603095489 0.25781928557617645 0.25693027119542255 0.214901

KeyboardInterrupt: 

In [23]:
# Best neighbour count found so far and the test score it achieved.
best_setting = (max_neigh, max_test_sc1)
print(*best_setting)

41 0.2606111385624566


In [None]:
# Manually recorded best setting. Tuple assignment sets both names at once;
# two assignments separated by a comma are a syntax error.
# NOTE(review): these values differ from the 41 / 0.2606 printed by the
# recorded search run - confirm which result is the intended one.
max_neigh, max_test_sc1 = 21, 0.26