# Analysis Purpose
>### First, use k-means to simplify the image colors and separate them into two types (light and dark).<br><br> Second, count the light and dark pixels to predict whether the image is covered by white paper.

## Table of Contents
* [1. Import Packages](#1.-Import-Packages)
* [2. Useful Function](#2.-Useful-Function)
* [3. Set Direction and Find the File Path](#3.-Set-Direction-and-Find-the-File-Path)
* [4. Model Build by Image Threshold and cv.kmean](#4.-Model-Build-by-Image-Threshold-and-cv.kmean)
    * [4.1 Save Model Result](#4.1-Save-Model-Result)
    * [4.2 Load Model Result](#4.2-Load-Model-Result)    
    * [4.3 Final Dataframe](#4.3-Final-Dataframe)


## 1. Import Packages   
[Go back to the Table of Contents](#Table-of-Contents)

In [1]:
# import packages
import numpy as np
import cv2 as cv
import matplotlib.pyplot as plt
import os
from glob import glob
import pandas as pd
from tqdm import tqdm
import json
import random
# example of pixel normalization
from numpy import asarray
from PIL import Image

## 2. Useful Function
[Go back to the Table of Contents](#Table-of-Contents)

In [2]:
# Order-preserving de-duplication that also works for unhashable elements.
def unique(list1):
    """Return the distinct elements of ``list1`` in first-seen order.

    Membership is tested with ``in`` against a list, i.e. by equality rather
    than by hash, so the elements may be unhashable (for example ``[R, G, B]``
    lists). The price is one linear scan of the result per input element.

    Args:
        list1: Iterable of elements to de-duplicate.

    Returns:
        A new list containing each distinct element exactly once.
    """
    unique_list = []
    for x in list1:
        # Keep only the first occurrence of every value.
        if x not in unique_list:
            unique_list.append(x)
    return unique_list

In [3]:
# Fractional ("average") ranking: tied values share the mean of their ranks.
def rankify(A):
    """Return the 1-based rank of every element of ``A``.

    For each element the function counts how many *other* elements are
    strictly smaller and how many are equal. The rank is
    ``1 + smaller + equal / 2``, so tied elements all receive the average of
    the positions they occupy. Ranks are floats; larger values rank higher.

    Args:
        A: Sequence of mutually comparable values.

    Returns:
        A list of float ranks, aligned with ``A``.
    """
    ranks = []
    for pos, value in enumerate(A):
        below = 0  # other elements strictly smaller than ``value``
        ties = 0   # other elements equal to ``value``
        for other_pos, other in enumerate(A):
            if other_pos == pos:
                continue
            if other < value:
                below += 1
            if other == value:
                ties += 1
        ranks.append(1 + below + ties / 2)
    return ranks

## 3. Set Direction and Find the File Path
[Go back to the Table of Contents](#Table-of-Contents)

In [4]:
# Two-letter deed codes; each one is used as a folder name when the image
# directories are built. Grouped by first letter for readability.
deed_name = [
    'AA', 'AD', 'AG',
    'BA', 'BD', 'BG',
    'EA', 'ED', 'EG',
    'HA', 'HD', 'HG',
    'LA', 'LG',
    'PA', 'PG',
    'QA', 'QG',
    'RA', 'RG',
    'UA', 'UD', 'UG',
]

In [5]:
# Sub-folder names that are combined with every deed code to form a directory.
ok_file_name = [
    'P2_OK_P4',
    'P3_OK_P4',
]

In [6]:
# Example directory built from the second deed code and the second sub-folder.
# NOTE(review): later cells rebind ok_deed_dir, so this value is only a sample.
ok_deed_dir = "//Srvappweb-t/f$/{}/{}/".format(deed_name[1], ok_file_name[1])

In [7]:
# Every "<share>/<deed code>/<sub-folder>/" directory, deed code varying slowest
# (same order as the nested index loops this replaces).
ok_deed_dir_li = [
    "//Srvappweb-t/f$/{}/{}/".format(deed, folder)
    for deed in deed_name
    for folder in ok_file_name
]

In [8]:
# Collect every .jpg found directly inside each OK directory.
#
# glob() makes no promise about the order of its matches, and the result lists
# produced later are matched to these paths purely by position (also after they
# are reloaded from JSON in a new session). Sorting each directory's matches
# makes the path order reproducible.
# NOTE(review): result files saved by an earlier run are only aligned with this
# list if that run saw the files in the same (sorted) order — confirm before
# combining old results with a freshly built path list.
img_path_list = []
for ok_deed_dir in ok_deed_dir_li:
    img_path_list.extend(sorted(glob('{}/*.jpg'.format(ok_deed_dir))))

In [9]:
# Display the first collected path as a quick check of the glob pattern.
img_path_list[0]

'//Srvappweb-t/f$/AA/P2_OK_P4\\AB002-AA0769076.jpg'

In [10]:
# Report how many image paths were collected.
print(f'Number of ok deed: {len(img_path_list)}')

69912

## 4. Model Build by Image Threshold and cv.kmean
[Go back to the Table of Contents](#Table-of-Contents)

### algorithm:
>step1. read the image<br>
step2. use two threshold methods (THRESH_BINARY and ADAPTIVE_THRESH_GAUSSIAN_C + THRESH_BINARY) to create two new images (thresh1 and thresh2)<br>
step3. paint every pixel that is black in thresh2 black in thresh1 as well<br>
step4. resize the image to (675, 900)<br>
step5. crop the margins of the image<br>
step6. use k-means to merge similar colors (k = 3)<br>
step7. collect the RGB values of the 3 resulting colors<br>
step8. calculate the brightness of each of the 3 colors (sum of its RGB values)<br>
step9. according to brightness, label the brightest color "light" and the darkest color "dark" (the color with the middle brightness is dropped)<br>
step10. count how many pixels are light (and how many are dark)<br>
step11. calculate the light/dark ratio (light pixel count / dark pixel count)

In [12]:
# Process every image collected from the category folders.
def _light_dark_pixel_counts(image_path, k=3):
    """Count the pixels of the brightest and darkest k-means colour of an image.

    Steps: binary-threshold the colour image, blacken every pixel that the
    adaptive threshold of the grayscale image marks black, resize, crop,
    reduce the crop to ``k`` colours with k-means, then count the pixels that
    carry the brightest ("light") and the darkest ("dark") colour. Brightness
    is the sum of a colour's three channel values.

    With three colours of distinct brightness the counts are the same as
    ranking the colours and counting the rank-3 and rank-1 colour. Cases that
    used to raise (fewer than three distinct colours, or colours tied in
    brightness) are handled as follows:

    * colours tied for brightest (or darkest) are counted together;
    * if every pixel has the same brightness there is only one group; it is
      counted as light when that brightness is at least half of the maximum
      (3 * 255), otherwise as dark, and the other count is 0.

    Args:
        image_path: Path of the image file to analyse.
        k: Number of k-means clusters.

    Returns:
        ``(light_count, dark_count)`` as ints, or ``None`` when the file could
        not be decoded.
    """
    img_color = cv.imread(image_path)  # 3-channel image
    # NOTE(review): the grayscale image is decoded separately rather than
    # converted from img_color; the two are not guaranteed to give identical
    # pixels, so this is kept to leave results unchanged — confirm before
    # merging the two reads.
    img_gray = cv.imread(image_path, 0)
    if img_color is None or img_gray is None:
        # cv.imread signals an unreadable file by returning None.
        return None

    # cv.threshold returns (threshold used, thresholded image).
    _, thresh1 = cv.threshold(img_color, 95, 255, cv.THRESH_BINARY)
    thresh2 = cv.adaptiveThreshold(img_gray, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv.THRESH_BINARY, 33, 8)

    # Wherever thresh2 is black (0), set all three channels of thresh1 to 0.
    thresh1[thresh2 == 0] = 0

    img = cv.cvtColor(thresh1, cv.COLOR_BGR2RGB)
    img = cv.resize(img, (675, 900))

    # Crop: (x, y) is the top-left corner, (w, h) the width and height,
    # giving a 770-row by 525-column region.
    x, y = 100, 30
    w, h = 525, 770
    cut_img = img[y:y + h, x:x + w]

    # k-means expects one float32 sample per row.
    samples = np.float32(cut_img.reshape((-1, 3)))
    criteria = (cv.TERM_CRITERIA_EPS + cv.TERM_CRITERIA_MAX_ITER, 10, 0.5)
    _, labels, centers = cv.kmeans(samples, k, None, criteria, 10,
                                   cv.KMEANS_RANDOM_CENTERS)

    # Replace every pixel by its (uint8) cluster centre, then list the distinct
    # colours that actually occur together with their pixel counts.
    quantised = np.uint8(centers)[labels.flatten()]
    colours, counts = np.unique(quantised, axis=0, return_counts=True)
    brightness = colours.astype(np.int64).sum(axis=1)

    darkest = brightness.min()
    brightest = brightness.max()
    if darkest == brightest:
        # Single brightness level: classify the whole crop against the midpoint.
        total = int(counts.sum())
        if brightest * 2 >= 3 * 255:
            return total, 0
        return 0, total

    light_count = int(counts[brightness == brightest].sum())
    dark_count = int(counts[brightness == darkest].sum())
    return light_count, dark_count


result_light_pixel_count = []
result_dark_pixel_count = []
result_light_dark_ratio = []
for deed in tqdm(img_path_list):
    pixel_counts = _light_dark_pixel_counts(deed)
    if pixel_counts is None:
        # Keep the three lists aligned with img_path_list: record nulls for an
        # unreadable file instead of aborting the whole run.
        tqdm.write(f'could not read image, recording nulls: {deed}')
        result_light_pixel_count.append(None)
        result_dark_pixel_count.append(None)
        result_light_dark_ratio.append(None)
        continue

    light_count, dark_count = pixel_counts
    # A crop without any dark pixel is "infinitely" light. json.dump writes
    # this as Infinity, which json.load reads back.
    light_dark_ratio = light_count / dark_count if dark_count else float('inf')

    result_light_pixel_count.append(light_count)
    result_dark_pixel_count.append(dark_count)
    result_light_dark_ratio.append(light_dark_ratio)

100%|██████████| 69912/69912 [44:23:43<00:00,  2.29s/it]    


## 4.1 Save Model Result

In [16]:
# Write the per-image light pixel counts to a JSON file.
with open("./result_light_pixel_count_2.json", 'w') as out_file:
    # indent=2 is optional; it only makes the file easier to read.
    json.dump(result_light_pixel_count, out_file, indent=2)

In [17]:
# Write the per-image dark pixel counts to a JSON file.
with open("./result_dark_pixel_count_2.json", 'w') as out_file:
    # indent=2 is optional; it only makes the file easier to read.
    json.dump(result_dark_pixel_count, out_file, indent=2)

In [18]:
# Write the per-image light/dark ratios to a JSON file.
with open("./result_light_dark_ratio_2.json", 'w') as out_file:
    # indent=2 is optional; it only makes the file easier to read.
    json.dump(result_light_dark_ratio, out_file, indent=2)

## 4.2 Load Model Result

In [11]:
# Read the saved dark pixel counts back into a list.
with open('./result_dark_pixel_count_2.json', 'r') as in_file:
    result_dark_pixel_count = json.load(in_file)

In [12]:
# Read the saved light pixel counts back into a list.
with open('./result_light_pixel_count_2.json', 'r') as in_file:
    result_light_pixel_count = json.load(in_file)

In [13]:
# Read the saved light/dark ratios back into a list.
with open('./result_light_dark_ratio_2.json', 'r') as in_file:
    result_light_dark_ratio = json.load(in_file)

## 4.3 Final Dataframe

In [14]:
# One-column frame holding the image paths.
path_df = pd.DataFrame(data=img_path_list, columns=['path'])

In [15]:
# One-column frame holding the light pixel count of every image.
result_light_pixel_count_df = pd.DataFrame(
    data=result_light_pixel_count,
    columns=['result_light_pixel_count'],
)

In [16]:
# One-column frame holding the dark pixel count of every image.
result_dark_pixel_count_df = pd.DataFrame(
    data=result_dark_pixel_count,
    columns=['result_dark_pixel_count'],
)

In [17]:
# One-column frame holding the light/dark ratio of every image.
result_light_dark_ratio_df = pd.DataFrame(
    data=result_light_dark_ratio,
    columns=['result_light_dark_ratio'],
)

In [20]:
# Put paths and results side by side. The frames are joined on their row
# index, so the lists they were built from must be in the same order.
final_df = pd.concat(
    [
        path_df,
        result_light_pixel_count_df,
        result_dark_pixel_count_df,
        result_light_dark_ratio_df,
    ],
    axis=1,
)

In [21]:
final_df.head()

Unnamed: 0,path,result_light_pixel_count,result_dark_pixel_count,result_light_dark_ratio
0,//Srvappweb-t/f$/AA/P2_OK_P4\AB002-AA0769076.jpg,311113,54730,5.684506
1,//Srvappweb-t/f$/AA/P2_OK_P4\AB002-AA0769077.jpg,312220,54334,5.74631
2,//Srvappweb-t/f$/AA/P2_OK_P4\AB002-AA0769078.jpg,311719,55292,5.637687
3,//Srvappweb-t/f$/AA/P2_OK_P4\AB002-AA0769097.jpg,306306,61059,5.016558
4,//Srvappweb-t/f$/AA/P2_OK_P4\AB002-AA0816252.jpg,296854,67558,4.394061


In [18]:
# Dark cut-off: mean dark pixel count minus one standard deviation, as an int.
threshold_dark = int(
    result_dark_pixel_count_df.result_dark_pixel_count.mean()
    - result_dark_pixel_count_df.result_dark_pixel_count.std()
)

In [19]:
# Light cut-off: mean light pixel count plus one standard deviation, as an int.
threshold_light = int(
    result_light_pixel_count_df.result_light_pixel_count.mean()
    + result_light_pixel_count_df.result_light_pixel_count.std()
)

In [31]:
# Strict filter: ratio of at least 30, light count at or above its cut-off and
# dark count at or below its cut-off.
test_df = final_df[
    (final_df.result_light_dark_ratio >= 30)
    & (final_df.result_light_pixel_count >= threshold_light)
    & (final_df.result_dark_pixel_count <= threshold_dark)
]
path_li_with_several_condition = test_df['path'].tolist()

In [33]:
# Ratio-only filter: keep rows whose light/dark ratio is at least 22.5.
test_df = final_df[final_df.result_light_dark_ratio >= 22.5]
path_li_with_several_condition = test_df['path'].tolist()

In [34]:
# Report how many paths passed the filter.
print(f'number of deed which is cover by white paper: {len(path_li_with_several_condition)}')

60

In [None]:
# Preview the flagged images in a two-column grid, in random order.
# The previous if/else assigned len(...) in both branches, so all flagged
# images are shown. NOTE(review): a cap of 20 images may have been intended.
n = len(path_li_with_several_condition)
image = random.sample(path_li_with_several_condition, n)

# Set the figure size once, before the first subplot creates the figure.
plt.rcParams["figure.figsize"] = (20, 80)
n_rows = (n + 1) // 2  # ceil(n / 2): just enough rows for two columns
for i, img_path in enumerate(image):
    img = cv.imread(img_path)
    if img is None:
        # cv.imread returns None for a file it cannot decode; leave that grid
        # cell empty instead of failing in cvtColor.
        print(f'could not read image, skipping: {img_path}')
        continue
    img = cv.cvtColor(img, cv.COLOR_BGR2RGB)
    plt.subplot(n_rows, 2, i + 1)

    plt.imshow(img, aspect='auto')

#     plt.title(titles[i])
#     plt.xticks([])
#     plt.yticks([])


## other method

In [49]:
# test_df = final_df[(final_df.result_light_pixel_count>=(1.4*threshold_light))]
# path_li = test_df.path.tolist()
# final_path_li = []
# for i in range(len(path_li)):
#     if path_li[i].find('/AD/') == -1 and path_li[i].find('/ED/') == -1:
#         final_path_li.append(path_li[i])
#     else:
#         pass
# path_li_with_several_condition = final_path_li

In [29]:
# test_df = final_df[(final_df.result_dark_pixel_count<=(0.4*threshold_dark))]
# path_li_with_several_condition = test_df.path.tolist()

In [32]:
# test_df = final_df[(final_df.result_dark_pixel_count<=(1.15*threshold_dark))]
# path_li = test_df.path.tolist()
# final_path_li = []
# for i in range(len(path_li)):
#     if path_li[i].find('/AD/') == -1 and path_li[i].find('/ED/') == -1:
#         final_path_li.append(path_li[i])
#     else:
#         pass
# path_li_with_several_condition = final_path_li

In [1]:
# test_df = final_df[(final_df.result_light_pixel_count>=threshold_light) & (final_df.result_dark_pixel_count<=0.4*threshold_dark)]
# path_li_with_several_condition = test_df.path.tolist()