In [None]:
import os
import pickle
import struct
import numpy as np
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
def dump_pickle_obj(file_path, obj):
    """Serialize ``obj`` to ``file_path`` using the highest pickle protocol.

    Args:
        file_path: Destination path. The file is opened in binary write
            mode, so any existing content is replaced.
        obj: The Python object to pickle.
    """
    with open(file_path, mode='wb') as sink:
        # Explicit Pickler form of pickle.dump(obj, sink, HIGHEST_PROTOCOL).
        pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL).dump(obj)


def load_pickle_obj(file_path):
    """Read and return the single pickled object stored at ``file_path``.

    Args:
        file_path: Path of a pickle file, opened in binary read mode.

    Returns:
        The unpickled Python object.

    Note:
        Unpickling can execute arbitrary code; only use this on files
        that come from a trusted source.
    """
    with open(file_path, mode='rb') as source:
        restored = pickle.Unpickler(source).load()
    return restored


def parse_mpf_file(file_path):
    """Parse one MPF feature file into feature vectors and their labels.

    The header is consumed in this order (native byte order, as given to
    ``struct``):

    * 4 bytes  - unsigned int, total header size
    * 8 bytes  - skipped (presumably the format code - TODO confirm)
    * ``header size - 62`` bytes - skipped variable-length field
    * 20 bytes - skipped
    * 2 bytes  - short, byte length of each sample's label
    * 20 bytes - skipped
    * 4 bytes  - unsigned int, number of samples
    * 4 bytes  - unsigned int, feature dimensionality

    62 is the combined size of the fixed-length fields above
    (4 + 8 + 20 + 2 + 20 + 4 + 4).

    Each sample record that follows is ``code_length`` label bytes (GBK
    text) and then ``dim`` unsigned bytes of feature values.

    Args:
        file_path: Path of the MPF file to read.

    Returns:
        A ``(data, label)`` tuple of equal-length lists. ``data[i]`` is a
        list of ``dim`` ints in ``0..255``; ``label[i]`` is the sample's
        label re-encoded from GBK to UTF-8.

    Raises:
        struct.error: If the file ends before a header field or a
            sample's feature bytes are complete.
        UnicodeDecodeError: If a label is not valid GBK.
    """
    data, label = [], []

    with open(file_path, 'rb') as f_object:
        size_of_header = struct.unpack('I', f_object.read(4))[0]
        _ = struct.unpack('8s', f_object.read(8))[0]
        _ = f_object.read(size_of_header - 62)
        _ = f_object.read(20)
        code_length = struct.unpack('h', f_object.read(2))[0]
        _ = f_object.read(20)
        sample_number = struct.unpack('I', f_object.read(4))[0]
        dim = struct.unpack('I', f_object.read(4))[0]

        sample_format = str(dim) + 'B'  # loop-invariant, built once

        # Read exactly `sample_number` records. The previous
        # range(1, sample_number) dropped the last sample of every file.
        for _ in range(sample_number):
            sample_label = f_object.read(code_length).decode('GBK').encode('utf-8')
            sample_data = struct.unpack(sample_format, f_object.read(dim))

            data.append(list(sample_data))
            label.append(sample_label)

    return data, label


def read_train_set():
    """Parse every file in ``./OLHWDB1.1/OLHWDB1.1trn`` and cache the result.

    All samples and labels returned by ``parse_mpf_file`` are concatenated
    and written as NumPy arrays to ``./cached_object/train_set.pkl`` and
    ``./cached_object/train_label.pkl``. Paths are resolved against the
    current working directory.

    Returns:
        None. The results are only written to disk.
    """
    base_dir = os.path.abspath('.')
    train_set_path = os.path.join(base_dir, 'OLHWDB1.1', 'OLHWDB1.1trn')

    train_set, train_label = [], []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # cached sample order reproducible across machines and runs.
    for file_name in sorted(os.listdir(train_set_path)):
        mpf_file_path = os.path.join(train_set_path, file_name)
        mpf_data, mpf_label = parse_mpf_file(mpf_file_path)

        train_set += mpf_data
        train_label += mpf_label

    # Create the cache directory on first use so the dumps below do not
    # fail on a fresh checkout.
    cache_dir = os.path.join(base_dir, 'cached_object')
    if not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)

    file_path = os.path.join(cache_dir, 'train_set.pkl')
    dump_pickle_obj(file_path, np.array(train_set))

    file_path = os.path.join(cache_dir, 'train_label.pkl')
    dump_pickle_obj(file_path, np.array(train_label))


def read_test_set():
    """Parse every file in ``./OLHWDB1.1/OLHWDB1.1tst`` and cache the result.

    All samples and labels returned by ``parse_mpf_file`` are concatenated
    and written as NumPy arrays to ``./cached_object/test_set.pkl`` and
    ``./cached_object/test_label.pkl``. Paths are resolved against the
    current working directory.

    Returns:
        None. The results are only written to disk.
    """
    base_dir = os.path.abspath('.')
    test_set_path = os.path.join(base_dir, 'OLHWDB1.1', 'OLHWDB1.1tst')

    test_set, test_label = [], []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # cached sample order reproducible across machines and runs.
    for file_name in sorted(os.listdir(test_set_path)):
        mpf_file_path = os.path.join(test_set_path, file_name)
        mpf_data, mpf_label = parse_mpf_file(mpf_file_path)

        test_set += mpf_data
        test_label += mpf_label

    # Create the cache directory on first use so the dumps below do not
    # fail on a fresh checkout.
    cache_dir = os.path.join(base_dir, 'cached_object')
    if not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)

    file_path = os.path.join(cache_dir, 'test_set.pkl')
    dump_pickle_obj(file_path, np.array(test_set))

    file_path = os.path.join(cache_dir, 'test_label.pkl')
    dump_pickle_obj(file_path, np.array(test_label))

In [None]:
# Load the cached train/test arrays. load_pickle_obj takes ONE path, so
# the path components must be joined first (passing them as three
# separate arguments raises TypeError).
cache_dir = os.path.join(os.path.abspath('.'), 'cached_object')

train_set_pkl = os.path.join(cache_dir, 'train_set.pkl')
train_label_pkl = os.path.join(cache_dir, 'train_label.pkl')
# Rebuild the cache from the raw MPF files only when it is missing, so
# the notebook survives Restart & Run All without editing this cell.
if not (os.path.exists(train_set_pkl) and os.path.exists(train_label_pkl)):
    read_train_set()
train_set = load_pickle_obj(train_set_pkl)
train_label = load_pickle_obj(train_label_pkl)

test_set_pkl = os.path.join(cache_dir, 'test_set.pkl')
test_label_pkl = os.path.join(cache_dir, 'test_label.pkl')
if not (os.path.exists(test_set_pkl) and os.path.exists(test_label_pkl)):
    read_test_set()
test_set = load_pickle_obj(test_set_pkl)
test_label = load_pickle_obj(test_label_pkl)

In [None]:
# Compare PCA and LDA as dimensionality reducers: for each target
# dimension 10, 20, ..., 100, project train/test data with each method,
# refit the same LDA classifier on the projection, and print its mean
# accuracy on the test set.
clf = LinearDiscriminantAnalysis()
for dim in range(10, 110, 10):
    # PCA branch: unsupervised projection, fitted on the training data only.
    pca_model = PCA(n_components=dim).fit(train_set)
    pca_X_train, pca_X_test = (
        pca_model.transform(split) for split in (train_set, test_set)
    )

    score = clf.fit(pca_X_train, train_label).score(pca_X_test, test_label)
    print('PCA reduced %s, mean accuracy: %s. \n' % (dim, score))

    # LDA branch: supervised projection, fitted on training data and labels.
    lda_model = LinearDiscriminantAnalysis(n_components=dim).fit(train_set, train_label)
    lda_X_train, lda_X_test = (
        lda_model.transform(split) for split in (train_set, test_set)
    )

    score = clf.fit(lda_X_train, train_label).score(lda_X_test, test_label)
    print('LDA reduced %s, mean accuracy: %s. \n' % (dim, score))