# Analysis Purpose
>### Explore the images (EDA) and look for preprocessing steps or features that can help the model perform better
>##### Reference:&emsp;[OpenCV Image Thresholding](https://docs.opencv.org/master/d7/d4d/tutorial_py_thresholding.html)

## Table of Contents
* [1. Import Packages](#1.-Import-Packages)
* [2. Set Direction and Find the File Path](#2.-Set-Direction-and-Find-the-File-Path)
* [3. Basic EDA](#3.-Basic-EDA)
    * [3.1 EDA with Color Image](#3.1-EDA-with-Color-Image)
    * [3.2 EDA with Gray Image](#3.2-EDA-with-Gray-Image)
* [4. Final Thought](#4.-Final-Thought)
    * [4.1 Let Middle Image Cover by Right Image's Black Pixel](#4.1-Let-Middle-Image-Cover-by-Right-Image's-Black-Pixel)


## 1. Import Packages
[Go back to the Table of Contents](#Table-of-Contents)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
import cv2 as cv

## 2. Set Direction and Find the File Path
[Go back to the Table of Contents](#Table-of-Contents)

In [2]:
# Deed-type codes, grouped by their first letter for readability.
deed_name = [
    'AA', 'AD', 'AG',
    'BA', 'BD', 'BG',
    'EA', 'ED', 'EG',
    'HA', 'HD', 'HG',
    'LA', 'LG',
    'PA', 'PG',
    'QA', 'QG',
    'RA', 'RG',
    'UA', 'UD', 'UG',
]

In [3]:
# Names of the "OK" sub-folders to look into.
ok_file_name = [
    'P2_OK_P4',
    'P3_OK_P4',
]

In [4]:
# Example directory for a single (deed type, OK folder) pair:
# the second deed type combined with the second OK folder.
ok_deed_dir = "//Srvappweb-t/f$/{}/{}/".format(
    deed_name[1],
    ok_file_name[1],
)

In [5]:
# Build the directory of every (deed type, OK folder) combination.
# Order is deed-major: all OK folders of one deed type come before the
# next deed type, so the list has len(deed_name) * len(ok_file_name) entries.
ok_deed_dir_li = [
    "//Srvappweb-t/f$/{}/{}/".format(deed, ok_folder)
    for deed in deed_name
    for ok_folder in ok_file_name
]

In [6]:
# Gather the path of every .jpg file found in each candidate directory.
# Paths are kept in the order glob returns them, one directory after another.
img_path_list = []
for ok_deed_dir in ok_deed_dir_li:
    jpg_pattern = '{}/*.jpg'.format(ok_deed_dir)
    img_path_list.extend(glob(jpg_pattern))

In [7]:
# Sanity check: display the first collected image path.
img_path_list[0]

'//Srvappweb-t/f$/AA/P2_OK_P4\\AB002-AA0769076.jpg'

In [8]:
# Report how many image paths were collected in total.
print(f'Number of ok deed: {len(img_path_list)}')

Number of ok deed: 69912


## 3. Basic EDA
[Go back to the Table of Contents](#Table-of-Contents)

## 3.1 EDA with Color Image

In [None]:
# Load one sample deed as a 3-channel colour image (OpenCV uses BGR order).
img = cv.imread(img_path_list[16599])
if img is None:
    # cv.imread reports a missing/unreadable file by returning None, not by raising.
    raise FileNotFoundError(f'Cannot read image: {img_path_list[16599]}')

# cv.threshold returns two outputs: the threshold value that was used and the
# thresholded image. The results here are still 3-channel images, which is why
# every panel below is converted from BGR to RGB before display.
# NOTE: Otsu's method (cv.THRESH_BINARY + cv.THRESH_OTSU), which picks the
# threshold automatically, and cv.adaptiveThreshold only accept single-channel
# 8-bit images, so they cannot be applied to this colour image.
ret, thresh1 = cv.threshold(img, 127, 255, cv.THRESH_BINARY)
ret, thresh2 = cv.threshold(img, 127, 255, cv.THRESH_BINARY_INV)
ret, thresh3 = cv.threshold(img, 127, 255, cv.THRESH_TRUNC)
ret, thresh4 = cv.threshold(img, 50, 255, cv.THRESH_TOZERO)
ret, thresh5 = cv.threshold(img, 150, 255, cv.THRESH_TOZERO_INV)

titles = ['Original Image', 'BINARY', 'BINARY_INV', 'TRUNC', 'TOZERO', 'TOZERO_INV']
images = [img, thresh1, thresh2, thresh3, thresh4, thresh5]

plt.rcParams["figure.figsize"] = (20, 20)
# Lay the panels out on a 3x3 grid; subplot positions are 1-based.
for position, (title, image) in enumerate(zip(titles, images), start=1):
    plt.subplot(3, 3, position)
    # Matplotlib expects RGB while OpenCV delivers BGR.
    plt.imshow(cv.cvtColor(image, cv.COLOR_BGR2RGB))
    plt.title(title)
    plt.xticks([])
    plt.yticks([])

plt.tight_layout()

## 3.2 EDA with Gray Image

In [None]:

deed = img_path_list[16599]

# Load the same deed twice: in colour (BGR) for display, and as a
# single-channel grayscale image for thresholding.
img = cv.imread(deed)
img_gray = cv.imread(deed, cv.IMREAD_GRAYSCALE)
if img is None or img_gray is None:
    # cv.imread reports a missing/unreadable file by returning None, not by raising.
    raise FileNotFoundError(f'Cannot read image: {deed}')

# Fixed-level thresholds. Adding cv.THRESH_OTSU to the type flag would let
# OpenCV choose the threshold value automatically on this grayscale image.
ret, thresh1 = cv.threshold(img_gray, 127, 255, cv.THRESH_BINARY)
ret, thresh2 = cv.threshold(img_gray, 127, 255, cv.THRESH_BINARY_INV)
ret, thresh3 = cv.threshold(img_gray, 127, 255, cv.THRESH_TRUNC)
ret, thresh4 = cv.threshold(img_gray, 50, 255, cv.THRESH_TOZERO)
ret, thresh5 = cv.threshold(img_gray, 150, 255, cv.THRESH_TOZERO_INV)

# Adaptive thresholds: Gaussian-weighted neighbourhood, block size 33, constant 8.
thresh6 = cv.adaptiveThreshold(img_gray, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, cv.THRESH_BINARY, 33, 8)
thresh7 = cv.adaptiveThreshold(img_gray, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, cv.THRESH_BINARY_INV, 33, 8)

titles = ['Original Image', 'BINARY', 'BINARY_INV', 'TRUNC', 'TOZERO', 'TOZERO_INV',
          'adaptiveThreshold', 'adaptiveThresholdINV']
images = [img, thresh1, thresh2, thresh3, thresh4, thresh5, thresh6, thresh7]

plt.rcParams["figure.figsize"] = (20, 20)
# Lay the panels out on a 3x3 grid; subplot positions are 1-based.
for position, (title, image) in enumerate(zip(titles, images), start=1):
    plt.subplot(3, 3, position)

    if image.ndim == 3:
        # The original colour image: convert BGR -> RGB for Matplotlib.
        plt.imshow(cv.cvtColor(image, cv.COLOR_BGR2RGB))
    else:
        # Thresholded grayscale images are single-channel.
        plt.imshow(image, 'gray')

    plt.title(title)
    plt.xticks([])
    plt.yticks([])

plt.tight_layout()

## 4. Final Thought
[Go back to the Table of Contents](#Table-of-Contents)

> I think the 'adaptiveThreshold' gray image can be used as a mask on the 'BINARY' color image, and that will give a good image for analysis. It shows the deed on a white background. Handwritten and printed text will both be the same color, black. The other parts will be in colors other than black and white. (By the way, simplifying the image colors while keeping only the important ones is a way to extract the features we need.)

In [11]:
deed = img_path_list[16599]

# Colour (BGR) version for the BINARY threshold, grayscale version for the
# adaptive threshold.
img_color = cv.imread(deed)
img_gray = cv.imread(deed, cv.IMREAD_GRAYSCALE)
if img_color is None or img_gray is None:
    # cv.imread reports a missing/unreadable file by returning None, not by raising.
    raise FileNotFoundError(f'Cannot read image: {deed}')

# Fixed threshold of 95 on the colour image. Otsu's automatic threshold is not
# an option here because it only accepts single-channel 8-bit images.
ret, thresh1 = cv.threshold(img_color, 95, 255, cv.THRESH_BINARY)
# Adaptive threshold on the grayscale image: Gaussian weighting, block size 33, constant 8.
thresh2 = cv.adaptiveThreshold(img_gray, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, cv.THRESH_BINARY, 33, 8)

In [None]:
# Show the original image, the colour BINARY result and the adaptive-threshold
# result next to each other.
titles = ['Original_Image','BINARY', 'adaptiveThreshold']
images = [img_color, thresh1, thresh2]

plt.rcParams["figure.figsize"] = (20,20)
for i, (title, image) in enumerate(zip(titles, images)):
    plt.subplot(3, 3, i + 1)
    if i >= 2:
        # Remaining entry is the single-channel adaptive-threshold image.
        plt.imshow(image, 'gray')
    else:
        # First two entries are BGR colour images; Matplotlib wants RGB.
        img = cv.cvtColor(image, cv.COLOR_BGR2RGB)
        plt.imshow(img)

    plt.title(title)
    plt.xticks([])
    plt.yticks([])

plt.tight_layout()

## 4.1 Let Middle Image Cover by Right Image's Black Pixel

In [13]:
# Wherever the adaptive-threshold image (thresh2) is black, paint the pixel at
# the same position of the colour BINARY image (thresh1) black as well.
# thresh2 is 2-D (rows, columns) and thresh1 is 3-D (rows, columns, channels),
# so the boolean mask selects whole pixels and the assigned value is applied
# to all three channels. thresh1 is modified in place.
black_mask = thresh2 == 0
thresh1[black_mask] = (0, 0, 0)

In [None]:
# Compare the original deed with the final image (colour BINARY result with
# the adaptive-threshold black pixels painted over it).
titles = ['Original_Image', 'Final_Image']
images = [img_color, thresh1]

plt.rcParams["figure.figsize"] = (20, 20)
# Subplot positions are 1-based.
for position, (title, image) in enumerate(zip(titles, images), start=1):
    plt.subplot(3, 3, position)
    # Both images are 3-channel BGR images; Matplotlib expects RGB.
    plt.imshow(cv.cvtColor(image, cv.COLOR_BGR2RGB))

    plt.title(title)
    plt.xticks([])
    plt.yticks([])

plt.tight_layout()