# Process Dataset

## 1. Initialization

In [96]:
# Dataset step identifier; selects which "Dataset_Step=<step>" folders are read and written.
step = 3

# Every path fragment is a raw string so the Windows separators stay literal backslashes.
# Non-raw "\G", "\D" and "\d" are invalid escape sequences: they only produce the
# intended text by accident and trigger a warning on current Python versions.
img_folder = r".\GeneratedDataset\Dataset_Step=" + str(step)          # folder holding the images
data_path = img_folder + r"\data.csv"                                 # original label table
processed_folder = r".\ProcessedData\Dataset_Step=" + str(step)       # folder for all derived files
features_path = processed_folder + r"\features.csv"                   # extracted feature points + labels
calc_components_path = processed_folder + r"\calc_components.csv"     # PCA component weights
components_path = processed_folder + r"\components.csv"               # PCA-projected data + labels
# %load_ext autoreload
# %autoreload 2

In [97]:
import pandas as pd
from enum import Enum
import os
import numpy as np
import cv2
from matplotlib import pyplot as plt
import timeit
from IPython.display import clear_output
from sklearn.decomposition import PCA
from tracking_convdef import get_defect_points
from draw_tools import draw_points, draw_contours
from tracking_bound import segment_diff_fingers, get_bound_points
from correct_tracking_convdef import get_centroid, fit_lost_contour

## 2. Extract Features from images
Iteratively read each image, process it, extract its features, and store them in `features_pd`.  
If you already have the features `.csv` file, jump to [section 3](#PCA).

### Read CSV data from original data

In [25]:
# Load the original table from data_path; each row carries the label values and
# the file name (ImgName) of the corresponding image.
origin_pd = pd.read_csv(data_path)
# Preview the first ten rows as the cell output.
origin_pd.head(10)

Unnamed: 0,Gamma1,Gamma2,Gamma3,Alpha1,Alpha2,Beta,thumb_x,thumb_y,index_x,index_y,ImgName
0,0.0,0.0,0.0,0.0,0.0,0.0,1,2,1,-2,2020_3_23_9_01_02_12_5956.png
1,0.0,0.0,0.0,0.0,4.0,0.0,1,1,1,0,2020_3_23_9_01_03_13_9563.png
2,0.0,0.0,0.0,0.0,0.0,4.0,1,2,-4,-2,2020_3_23_9_01_04_14_4567.png
3,0.0,0.0,0.0,0.0,0.0,-4.0,-1,3,6,-2,2020_3_23_9_01_04_14_8969.png
4,0.0,0.0,0.0,4.0,4.0,0.0,1,-1,2,3,2020_3_23_9_01_05_16_0155.png
5,0.0,0.0,0.0,0.0,8.0,0.0,1,0,1,2,2020_3_23_9_01_06_16_5160.png
6,0.0,0.0,0.0,0.0,4.0,4.0,1,1,-4,0,2020_3_23_9_01_06_16_8951.png
7,0.0,0.0,0.0,0.0,4.0,-4.0,-1,2,6,0,2020_3_23_9_01_07_17_2815.png
8,0.0,0.0,0.0,0.0,0.0,8.0,-1,3,-6,-2,2020_3_23_9_01_08_18_7357.png
9,0.0,0.0,0.0,4.0,0.0,-4.0,4,-1,4,1,2020_3_23_9_01_09_19_8759.png


### Initialize a features_pd dataframe

In [26]:
class Features(Enum):
    """Names of the 20 feature coordinates (x and y of 10 points) extracted per image.

    The member names, in declaration order, become the first 20 column names
    of features_pd.
    """
    Defect_left_x   = 0    # left defect point
    Defect_left_y   = 1    # left defect point
    Defect_right_x  = 2    # right defect point
    Defect_right_y  = 3    # right defect point
    Centroid_up_x   = 4    # up finger (thumb) centroid point
    Centroid_up_y   = 5    # up finger (thumb) centroid point
    Centroid_down_x = 6    # down finger (index finger) centroid point
    Centroid_down_y = 7    # down finger (index finger) centroid point
    Top_left_x      = 8    # top left point of the boundary
    Top_left_y      = 9    # top left point of the boundary
    Top_right_x     = 10    # top right point of the boundary
    Top_right_y     = 11   # top right point of the boundary
    Bottom_left_x   = 12   # bottom left point of the boundary
    Bottom_left_y   = 13   # bottom left point of the boundary
    Bottom_right_x  = 14   # bottom right point of the boundary
    Bottom_right_y  = 15   # bottom right point of the boundary
    Lowest_up_x     = 16   # lowest point of up finger (thumb)
    Lowest_up_y     = 17   # lowest point of up finger (thumb)
    Rightest_down_x = 18   # rightest point of down finger (index finger)
    Rightest_down_y = 19   # rightest point of down finger (index finger)

class Labels(Enum):
    """Names of the label columns that follow the feature columns in features_pd.

    The member names, in declaration order, are appended to the feature
    column names when the dataframe is created.
    """
    Thumb_x = 0    # filled from the 'thumb_x' column of the original data
    Thumb_y = 1    # filled from the 'thumb_y' column of the original data
    Index_x = 2    # filled from the 'index_x' column of the original data
    Index_y = 3    # filled from the 'index_y' column of the original data
    ImgName = 4    # image file name the row was extracted from

In [27]:
# Column layout: feature names first, then label names, both in enum declaration order.
# Enum members expose their identifier through .name, so there is no need to parse
# the "ClassName.Member" string representation.
col = [member.name for member in Features]
col.extend(member.name for member in Labels)
# Start from an empty frame that only fixes the column layout.
features_pd = pd.DataFrame(None, columns=col)
features_pd.head(10)

Unnamed: 0,Defect_left_x,Defect_left_y,Defect_right_x,Defect_right_y,Centroid_up_x,Centroid_up_y,Centroid_down_x,Centroid_down_y,Top_left_x,Top_left_y,...,Bottom_right_y,Lowest_up_x,Lowest_up_y,Rightest_down_x,Rightest_down_y,Thumb_x,Thumb_y,Index_x,Index_y,ImgName


### Iterate over the images and add their data to the features_pd dataframe

In [28]:
def calc_features_points(pd, row, is_draw=False):
    """Extract the ten feature points of one image.

    The image named in column ``ImgName`` of row ``row`` is loaded from
    ``img_folder``. Pixels brighter than gray level 10 form the mask, whose
    first contour is analysed to find the defect points, the up/down finger
    centroids, the four boundary points, the lowest point of the up finger
    and the rightmost point of the down finger.

    Parameters
    ----------
    pd : pandas.DataFrame
        Table containing an ``ImgName`` column. NOTE: this parameter name
        shadows the usual ``pd`` pandas alias inside the function; it is kept
        unchanged so existing callers keep working.
    row : index label
        Label of the row (as used by ``DataFrame.loc``) to process.
    is_draw : bool, optional
        When True, draw the points on the masked image and show it.

    Returns
    -------
    list or None
        ``[defect_0, defect_1, up_centroid, down_centroid, top_left,
        top_right, bottom_left, bottom_right, lowest_up, rightest_down]``,
        or None when the image has no contour or any point is missing.

    Raises
    ------
    FileNotFoundError
        If the image file cannot be read.
    """
    file_name = pd.loc[row, 'ImgName']
    file_path = os.path.join(img_folder, file_name)
    bgr_image = cv2.imread(file_path)
    if bgr_image is None:
        # cv2.imread reports a missing or unreadable file by returning None;
        # fail with a clear message instead of an AttributeError on .shape.
        raise FileNotFoundError("Cannot read image: " + str(file_path))
    IM_HEIGHT, IM_WIDTH = bgr_image.shape[:2]

    # ---------------------------------------------
    # 1.1 Get the mask and its contour and apply the mask to image
    # ---------------------------------------------
    gray_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)
    _, mask = cv2.threshold(gray_image, 10, 1, cv2.THRESH_BINARY)
    # The masked image is only needed for drawing. It is built before
    # findContours because some old OpenCV versions modify the input mask.
    finger_image = None
    if is_draw:
        finger_image = cv2.bitwise_and(bgr_image, bgr_image, mask=mask)
    # CHAIN_APPROX_NONE keeps every contour point. Indexing with [-2] selects the
    # contour list under both OpenCV 3 (image, contours, hierarchy) and
    # OpenCV 4 (contours, hierarchy) return conventions.
    contours = cv2.findContours(mask, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)[-2]
    if len(contours) == 0:
        # Nothing above the threshold: no features can be extracted.
        return None
    contour = contours[0]

    # ---------------------------------------------
    # 1.2 Get defect points
    # ---------------------------------------------
    defect_points, _ = get_defect_points(contour)
    if defect_points is None:
        # Without defect points the result is None anyway; bail out before
        # handing None to the helpers below.
        return None

    # ---------------------------------------------
    # 1.3 Divide up and down finger contour
    # ---------------------------------------------
    up_contour, down_contour = segment_diff_fingers(contour, defect_points)
    up_centroid = get_centroid(up_contour)
    down_centroid = get_centroid(down_contour)

    # ---------------------------------------------
    # 1.4 Get boundary points
    # ---------------------------------------------
    top_left, top_right, bottom_left, bottom_right = get_bound_points(
        up_contour, down_contour, IM_HEIGHT, IM_WIDTH)

    # ---------------------------------------------
    # 1.5 Get touch line then lowest up point and rightest down point
    # ---------------------------------------------
    # The touch line is a 4th-degree polynomial fitted by the helper.
    up_touch_line, _ = fit_lost_contour(
                    IS_UP=True,
                    contour=up_contour,
                    bound_points=(top_left, top_right),
                    fitting_curve=lambda X, Y: np.poly1d(np.polyfit(X, Y, 4)),
                    defect_points=defect_points,
                    draw_image=None)
    lowest_up, rightest_down = None, None
    if (up_touch_line is not None and up_contour is not None
            and down_contour is not None and up_centroid is not None):
        # Contour point with the largest y; argmax returns the first such point.
        lowest_index = int(np.argmax(up_contour[:, 0, 1]))
        contour_lowest = tuple(up_contour[lowest_index, 0, :])
        # Point on the fitted touch line at the x position of the up centroid.
        fitted_lowest = (int(up_centroid[0]), int(up_touch_line(up_centroid[0])))
        # Keep whichever candidate has the larger y.
        if contour_lowest[1] > fitted_lowest[1]:
            lowest_up = contour_lowest
        else:
            lowest_up = fitted_lowest
        # Contour point with the largest x on the down finger.
        rightest_index = int(np.argmax(down_contour[:, 0, 0]))
        rightest_down = tuple(down_contour[rightest_index, 0, :])

    # ---------------------------------------------
    # 1.6 Check None and form the feature data
    # ---------------------------------------------
    features = [defect_points[0],
                defect_points[1],
                up_centroid,
                down_centroid,
                top_left,
                top_right,
                bottom_left,
                bottom_right,
                lowest_up,
                rightest_down]

    # Identity test: "None in features" compares with ==, which raises for
    # points stored as NumPy arrays.
    if any(point is None for point in features):
        return None

    # ---------------------------------------------
    # 1.7 If show the points
    # ---------------------------------------------
    if is_draw:
        # Colors are passed as BGR triples to the drawing helper.
        draw_points(finger_image, defect_points, color=[0, 255, 0])
        draw_points(finger_image, up_centroid, color=[255, 0, 0])
        draw_points(finger_image, down_centroid, color=[255, 0, 0])
        draw_points(finger_image, top_left, radius=10, color=[255, 255, 0])
        draw_points(finger_image, top_right, radius=10, color=[255, 255, 0])
        draw_points(finger_image, bottom_left, radius=10, color=[255, 255, 0])
        draw_points(finger_image, bottom_right, radius=10, color=[255, 255, 0])
        draw_points(finger_image, lowest_up, color=[0, 255, 255])
        draw_points(finger_image, rightest_down, color=[0, 255, 255])
        # matplotlib expects RGB, OpenCV images are BGR.
        plt.imshow(cv2.cvtColor(finger_image, cv2.COLOR_BGR2RGB))
        plt.show()

    return features

In [29]:
start_time = timeit.default_timer()

total_rows = len(origin_pd)
label_columns = ['thumb_x', 'thumb_y', 'index_x', 'index_y', 'ImgName']

# Rows are collected in a plain list and turned into a DataFrame once at the end.
# DataFrame.append inside the loop copied the whole frame on every iteration
# (quadratic cost) and no longer exists in pandas 2.x.
feature_rows = []
for row in range(total_rows):
    features = calc_features_points(origin_pd, row)
    if features is not None:
        # Flatten the (x, y) points into one flat list of coordinates and keep
        # them numeric; np.append with the file name turned every value into a string.
        coordinates = np.array(features).flatten().tolist()
        labels = origin_pd.loc[row, label_columns].tolist()
        feature_rows.append(coordinates + labels)

    # Report progress for every row, including rows that produced no features.
    clear_output(wait=True)
    elapsed = timeit.default_timer() - start_time
    done = row + 1
    print("Current Progress:", np.round(done / total_rows * 100, 2), "%")
    print("Current Run Time:", np.round(elapsed / 60, 2), "minutes")
    print("Excepted Run Time:", np.round(elapsed *
                                         (total_rows / done - 1) / 60, 2), "minutes")

# Rebuild the frame from scratch (same column layout) so re-running this cell
# does not duplicate rows.
features_pd = pd.DataFrame(feature_rows, columns=features_pd.columns)
print("Generate", len(features_pd), "items")

Current Progress: 99.99 %
Current Run Time: 5.22 minutes
Excepted Run Time: 0.0 minutes
Generate 8219 items


### Save to .csv file

In [32]:
# Create the output folder first: to_csv does not create missing directories.
# The guard skips makedirs when the path has no directory part.
features_dir = os.path.dirname(features_path)
if features_dir:
    os.makedirs(features_dir, exist_ok=True)
# Write the features and labels without the dataframe index column.
features_pd.to_csv(features_path, index=False)

<span id="PCA"></span>
## 3. Use PCA for features of images

#### Read features data from csv file and pack to numpy array

In [98]:
# Reload the saved features so this section can run without re-extracting them.
features_pd = pd.read_csv(features_path)
# The first 20 columns are the feature coordinates; the remaining ones are labels.
X = features_pd.iloc[:, :20].to_numpy()
print("Shape of feature numpy for PCA", X.shape)

Shape of feature numpy for PCA (76326, 20)


#### Use PCA to get component analysis

In [99]:
# Fit a PCA that keeps every component so the full variance spectrum can be inspected.
# Alternative: PCA(n_components='mle') lets scikit-learn estimate the dimensionality.
pca = PCA()
pca.fit(X)
# The feature count is taken from X itself: pca.n_features_ was deprecated in
# scikit-learn 1.0 and removed in 1.2.
print("Extract", pca.n_components_, "components from", X.shape[1], "features")
print("Percentages of components\n", np.round(pca.explained_variance_ratio_, 3))
# Running total of the explained variance, rounded for display.
accumulated_ratio = list(np.round(np.cumsum(pca.explained_variance_ratio_), 3))
print("Accumulated percentages of components\n", list(enumerate(accumulated_ratio)))

Extract 20 components from 20 features
Percentages of components
 [0.911 0.045 0.015 0.01  0.008 0.004 0.003 0.002 0.001 0.001 0.001 0.
 0.    0.    0.    0.    0.    0.    0.    0.   ]
Accumulated percentages of components
 [(0, 0.911), (1, 0.956), (2, 0.971), (3, 0.981), (4, 0.989), (5, 0.992), (6, 0.995), (7, 0.997), (8, 0.998), (9, 0.998), (10, 0.999), (11, 0.999), (12, 1.0), (13, 1.0), (14, 1.0), (15, 1.0), (16, 1.0), (17, 1.0), (18, 1.0), (19, 1.0)]


#### Choose Components and save in disk

In [100]:
# Minimum share of the total variance the kept components must explain.
threshold = 0.95
# Use the unrounded cumulative ratio: comparing against values rounded to three
# decimals can accept a component count that is still slightly below the threshold.
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
# searchsorted returns the first index whose cumulative ratio is >= threshold.
# The result is capped so an unreachable threshold keeps all components
# instead of raising an IndexError.
components_num = min(int(np.searchsorted(cumulative_ratio, threshold)) + 1,
                     len(cumulative_ratio))
print("Choose", components_num, "components")

# Number of feature columns, taken from the fitted model rather than hard-coded.
n_features = pca.components_.shape[1]

# ---------------------------------------------
# Store calculation method
# ---------------------------------------------
# One row of weights per kept component, one column per feature.
# NOTE(review): pca.transform also subtracts pca.mean_, which is not saved here;
# confirm whether consumers of this file need it.
weight_list = pca.components_[:components_num]
calc_component_pd = pd.DataFrame(weight_list, columns=features_pd.columns[:n_features])
calc_component_pd.to_csv(calc_components_path, index=False)

# ---------------------------------------------
# Store component data
# ---------------------------------------------
component_data = pca.transform(X)[:, :components_num]
col = ["Component_" + str(i) for i in range(components_num)]
# Everything after the feature columns is label data; it is joined back by row index.
labels_pd = features_pd.iloc[:, n_features:]
component_pd = pd.DataFrame(component_data, columns=col).join(labels_pd)
component_pd.to_csv(components_path, index=False)

Choose 2 components
