In [89]:
# import the mnist data
import keras
from keras.datasets import mnist
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [307]:
# def the function to catalog the hand write number from 0 to 9
# make the catalogues list before using the function
def catalog_number(INPUT_DATA, INPUT_LIST):
    for i in range(len(INPUT_LIST)):
        # identify different number images by labeling files
        # ref: https://stackoverflow.com/questions/49882068/convert-np-where-array-to-list
        INPUT_LIST[i] = np.where(np.isin(INPUT_DATA, i))[0].tolist()

# def the function for image visualization
def plot_input_image(NUMBER, IMG, LABEL):
    plt.imshow(IMG[NUMBER], cmap = 'binary')
    plt.title(LABEL[NUMBER])
    plt.show()

# def the function for saving image
def saving_img_binary(FILE, IMG):
    with open(FILE, 'wb') as f:
        np.save(f, IMG)
    print(os.path.getsize(FILE))

# import data
# ref: https://www.youtube.com/watch?v=L2cAjgc1-bo
(X_TRAIN, Y_TRAIN), (X_TEST, Y_TEST) = mnist.load_data()
#X_TRAIN.shape, Y_TRAIN.shape, X_TEST.shape, Y_TEST.shape

# get training data
# catalog the number 0 to 9
TRAIN_LIST = ['TRAIN_0', 'TRAIN_1', 'TRAIN_2', 'TRAIN_3', 'TRAIN_4', 'TRAIN_5', 'TRAIN_6', 'TRAIN_7', 'TRAIN_8', 'TRAIN_9']
catalog_number(Y_TRAIN, TRAIN_LIST)
# get first 16 images of each number for training
INPUT_TRAIN_INDEX = TRAIN_LIST[0][:16] + TRAIN_LIST[1][:16] + TRAIN_LIST[2][:16] + TRAIN_LIST[3][:16] + TRAIN_LIST[4][:16] + TRAIN_LIST[5][:16] + TRAIN_LIST[6][:16] + TRAIN_LIST[7][:16] + TRAIN_LIST[8][:16] + TRAIN_LIST[9][:16]
# check the image if it is correct (first image of 0 to 9)
#for i in INPUT_TRAIN_INDEX[0:159:16]:
#    plot_input_image(i, X_TRAIN, Y_TRAIN)
# save 160 image into array
TRAIN_IMG_3D = np.empty((160, 28, 28))
for i in range(len(INPUT_TRAIN_INDEX)):
    INDEX = INPUT_TRAIN_INDEX[i]
    TRAIN_IMG_3D[i] = X_TRAIN[INDEX]
TRAIN_IMG_2D = TRAIN_IMG_3D.reshape(160, -1)

# get testing data
TEST_LIST = ['TEST_0', 'TEST_1', 'TEST_2', 'TEST_3', 'TEST_4', 'TEST_5', 'TEST_6', 'TEST_7', 'TEST_8', 'TEST_9']
catalog_number(Y_TEST, TEST_LIST)
TEST_INDEX = TEST_LIST[0][:10] + TEST_LIST[1][:10] + TEST_LIST[2][:10] + TEST_LIST[3][:10] + TEST_LIST[4][:10] + TEST_LIST[5][:10] + TEST_LIST[6][:10] + TEST_LIST[7][:10] + TEST_LIST[8][:10] + TEST_LIST[9][:10]
# check the image if it is correct (first image of 0 to 9)
#for i in TEST_INDEX[0:99:10]:
#    plot_input_image(i, X_TEST, Y_TEST)
TEST_IMG_3D = np.empty((100, 28, 28))
for i in range(len(TEST_INDEX)):
    INDEX = TEST_INDEX[i]
    TEST_IMG_3D[i] = X_TEST[INDEX]
TEST_IMG_2D = TEST_IMG_3D.reshape(100, -1)

# get the PCA eigenvector and eigenvalue
pca = PCA(n_components = 30)
pca.fit(TRAIN_IMG_2D)
EIGENVECTORS = pca.components_
#print(len(EIGENVECTORS))
EIGENVALUES = pca.explained_variance_
#print(len(EIGENVALUES))

# eigenvector visualization
NUM_IMAGES, IMAGE_SIZE = EIGENVECTORS.shape
HEIGHT = int(np.sqrt(IMAGE_SIZE))
WIDTH = int(IMAGE_SIZE / HEIGHT)
EIGENVECTOR_3D = EIGENVECTORS.reshape(NUM_IMAGES, HEIGHT, WIDTH)
#plt.imshow(EIGENVECTOR_3D[0], cmap = 'binary')
fig, axs = plt.subplots(nrows = 5, ncols = 6, figsize = (10, 10))
for i in range(len(EIGENVECTOR_3D)):
    if i < 6:
        axs[0, i].imshow(EIGENVECTOR_3D[i])
        axs[0, i].set_title('µ' + str(i+1))
    elif i >= 6 and i < 12:
        axs[1, i - 6].imshow(EIGENVECTOR_3D[i])
        axs[1, i - 6].set_title('µ' + str(i+1))
    elif i >= 12 and i < 18:
        axs[2, i - 12].imshow(EIGENVECTOR_3D[i])
        axs[2, i - 12].set_title('µ' + str(i+1))
    elif i >= 18 and i < 24:
        axs[3, i - 18].imshow(EIGENVECTOR_3D[i])
        axs[3, i - 18].set_title('µ' + str(i+1))
    elif i >= 24 and i < 30:
        axs[4, i - 24].imshow(EIGENVECTOR_3D[i])
        axs[4, i - 24].set_title('µ' + str(i+1))
plt.savefig('Eigenvectors.png')

# PCA transform for compress data
TEST_IMG_COMPRESS = pca.transform(TEST_IMG_2D)
# save image
#saving_img_binary('Test_IMG.npy', TEST_IMG_2D)
#saving_img_binary('Test_IMG_compress.npy', TEST_IMG_COMPRESS)

# make compress ratio curve
COMPRESS_RATIO = np.empty((160, 2))
for i in range(len(COMPRESS_RATIO) - 1):
    pca = PCA(n_components = i + 1)
    pca.fit(TRAIN_IMG_2D)
    COMPRESS = pca.transform(TEST_IMG_2D)
    saving_img_binary('Test_IMG_compress.npy', COMPRESS)
    FILE_SIZE = os.path.getsize('Test_IMG_compress.npy')
    COMPRESS_RATIO[i][0] = i + 1
    COMPRESS_RATIO[i][1] = FILE_SIZE / 627328 * 100
plt.plot(COMPRESS_RATIO[:, 0], COMPRESS_RATIO[:, 1])
plt.title('Compress Ratio')
plt.xlabel('Number of Eigenvector')
plt.ylabel('Compress Ratio')
plt.savefig('Compression_Ratio.png')

# make eigenvalue curve
EIGENVALUE_ACCUMULATE = np.empty((160, 2), dtype = float)
VALUE = 0
for i in range(len(EIGENVALUES)):
    VALUE += EIGENVALUES[i]
    EIGENVALUE_ACCUMULATE[i][0] = int(i + 1)
    EIGENVALUE_ACCUMULATE[i][1] = VALUE / np.sum(EIGENVALUES)
print(EIGENVALUE_ACCUMULATE)
plt.plot(EIGENVALUE_ACCUMULATE[:, 0], EIGENVALUE_ACCUMULATE[:, 1])
plt.xlabel('Number of Eigenvalues')
plt.ylabel('Accumulate Ratio')
plt.title('Eigenvalues Accumulate curve')
plt.savefig('Eigenvalues_Accumulate_curve.png')

# make reconstruct image
COMPONENT = [10, 50, 160]
RECONSTRUCT_IMG = np.empty((16, 28, 28))
for i in range(len(COMPONENT)):
    pca = PCA(n_components = COMPONENT[i])
    pca.fit(TRAIN_IMG_2D)
    EIGENVECTORS = pca.components_
# using the training data eigenvector to decrease the test data dimension and reconstruct
    TEST_CENTERED = TEST_IMG_2D - pca.mean_
    TEST_TRANSFORM = np.dot(TEST_CENTERED, EIGENVECTORS.T)
    #TEST_TRANSFORM.shape
    TEST_RECONSTRUCT = np.dot(TEST_TRANSFORM, EIGENVECTORS) + pca.mean_
    #TEST_RECONSTRUCT.shape
# turn 2D array back to 3D for image show
    NUM_IMAGES, IMAGE_SIZE = TEST_RECONSTRUCT.shape
    HEIGHT = int(np.sqrt(IMAGE_SIZE))
    WIDTH = int(IMAGE_SIZE / HEIGHT)
    TEST_RECONSTRUCT_3D = TEST_RECONSTRUCT.reshape(NUM_IMAGES, HEIGHT, WIDTH)
    SAVING_IMG = TEST_RECONSTRUCT_3D[:4]
    START = (i + 1) * 4
    RECONSTRUCT_IMG[START: START + 4] = SAVING_IMG
RECONSTRUCT_IMG[0: 4] = TEST_IMG_3D[0: 4]
#plt.imshow(RECONSTRUCT_IMG[4], cmap = 'binary')
fig, axs = plt.subplots(nrows = 4, ncols = 4, figsize = (10, 10))
for i in range(len(RECONSTRUCT_IMG)):
    if i < 4:
        axs[0, i].imshow(RECONSTRUCT_IMG[i], cmap = 'binary')
        axs[0, i].set_title('Origin img')
    if i >= 4 and i < 8:
        axs[1, i - 4].imshow(RECONSTRUCT_IMG[i], cmap = 'binary')
        axs[1, i - 4].set_title('M = 10')
    if i >= 8 and i < 12:
        axs[2, i - 8].imshow(RECONSTRUCT_IMG[i], cmap = 'binary')
        axs[2, i - 8].set_title('M = 50')
    if i >= 12 and i < 16:
        axs[3, i - 12].imshow(RECONSTRUCT_IMG[i], cmap = 'binary')
        axs[3, i - 12].set_title('M = 160')
plt.savefig('Reconstruct.img.png')

# make PSNR curve
COMPONENT = np.array(range(160))
PSNR_LIST = np.empty((160, 2))
for i in range(len(COMPONENT)):
    pca = PCA(n_components = COMPONENT[i])
    pca.fit(TRAIN_IMG_2D)
    EIGENVECTORS = pca.components_
# using the training data eigenvector to decrease the test data dimension and reconstruct
    TEST_CENTERED = TEST_IMG_2D - pca.mean_
    TEST_TRANSFORM = np.dot(TEST_CENTERED, EIGENVECTORS.T)
    #TEST_TRANSFORM.shape
    TEST_RECONSTRUCT = np.dot(TEST_TRANSFORM, EIGENVECTORS) + pca.mean_
    #TEST_RECONSTRUCT.shape
# calculate PSNR
    M, N = TEST_IMG_2D.shape
    MSE = np.sum(np.square(TEST_IMG_2D - TEST_RECONSTRUCT)) / (M * N)
    MAX = 255
    PSNR = 20 * np.log10(MAX) - 10 * np.log10(MSE)
    PSNR_LIST[i][0] = i + 1
    PSNR_LIST[i][1] = PSNR
# plot PSNR curve
plt.plot(PSNR_LIST[:, 0], PSNR_LIST[:, 1])
plt.title('PSNR Curve')
plt.xlabel('Number of Eigenvector')
plt.ylabel('PSNR')
plt.savefig('PSNR_Curve.png')


928
1728
2528
3328
4128
4928
5728
6528
7328
8128
8928
9728
10528
11328
12128
12928
13728
14528
15328
16128
16928
17728
18528
19328
20128
20928
21728
22528
23328
24128
24928
25728
26528
27328
28128
28928
29728
30528
31328
32128
32928
33728
34528
35328
36128
36928
37728
38528
39328
40128
40928
41728
42528
43328
44128
44928
45728
46528
47328
48128
48928
49728
50528
51328
52128
52928
53728
54528
55328
56128
56928
57728
58528
59328
60128
60928
61728
62528
63328
64128
64928
65728
66528
67328
68128
68928
69728
70528
71328
72128
72928
73728
74528
75328
76128
76928
77728
78528
79328
80128
80928
81728
82528
83328
84128
84928
85728
86528
87328
88128
88928
89728
90528
91328
92128
92928
93728
94528
95328
96128
96928
97728
98528
99328
100128
100928
101728
102528
103328
104128
104928
105728
106528
107328
108128
108928
109728
110528
111328
112128
112928
113728
114528
115328
116128
116928
117728
118528
119328
120128
120928
121728
122528
123328
124128
124928
125728
126528
127328
[[ 1.00000000e+000  1.39

LinAlgError: Singular matrix

Error in callback <function _draw_all_if_interactive at 0x119eb2310> (for post_execute):


LinAlgError: Singular matrix

LinAlgError: Singular matrix

<Figure size 1000x1000 with 30 Axes>