In [4]:
import numpy as np
import multiprocessing as mp
from multiprocessing import Pool, cpu_count, Value
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA, KernelPCA
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_openml
import os.path
import cv2
import matplotlib.pyplot as plt
from ctypes import *
import sys
import threading

In [5]:
# Directory of shared libraries. NOTE(review): presumably loaded through ctypes;
# it is not used in the visible cells — TODO confirm.
SO_DIRPATH = "../libs/"
# Relative locations of the MNIST image folders and label folders.
TRAIN_IMAGES_DIR = "../../mnist/images/train_images" 
TEST_IMAGES_DIR = "../../mnist/images/test_images"
TRAIN_LABELS_DIR = "../../mnist/labels/train_labels"
TEST_LABELS_DIR =  "../../mnist/labels/test_labels"
# Image file-name template; read() fills the placeholder with a 1-based index.
IMAGE_NAME = "number{}.png"
# Comma-separated label files that readLabels() looks up inside a labels directory.
LABELS_NAME = "labels.txt"
LOOP_LABELS_NAME = "loops.txt"
# Number of samples read() loads; both label arrays are truncated to this length.
N_TRAIN = 60000

In [11]:
def readImages(data):
    """Load one image as a flat grayscale vector with its loop label appended.

    Used as the worker function of the process pool started in ``run``. Each
    successful call increments the shared ``n_finished`` counter so the parent
    process can display progress.

    Args:
        data: ``(path, loop_label)`` pair: the image file path and the value
            appended after the flattened pixels.

    Returns:
        1-D numpy array holding the flattened pixels followed by ``loop_label``.

    Raises:
        OSError: if OpenCV cannot read or decode the file at ``path``.
    """
    global n_finished
    path, loop_label = data
    pixels = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if pixels is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with the offending path rather than with an opaque
        # AttributeError on the reshape below.
        raise OSError("could not read image: {}".format(path))
    flat_pixels = pixels.reshape(-1)
    # The counter is shared between worker processes, so guard the update.
    with n_finished.get_lock():
        n_finished.value += 1
    return np.append(flat_pixels, loop_label)

def readLabels(label_dir, label_name):
    """Read a comma-separated file of integer labels.

    Args:
        label_dir: directory containing the label file.
        label_name: file name inside ``label_dir``.

    Returns:
        1-D integer numpy array with one entry per label. An empty file
        yields an empty integer array.

    Raises:
        ValueError: if a field is not an integer. A blank field in the middle
            of the file also raises, so a missing label cannot silently shift
            every label that follows it.
    """
    with open(os.path.join(label_dir, label_name), "r") as f:
        content = f.read()

    fields = content.split(",")
    # A trailing comma (optionally followed by a newline) and an empty file
    # both leave one blank final field; drop it instead of failing on int("").
    if not fields[-1].strip():
        fields.pop()
    # dtype=int keeps the empty result an integer array (numpy would otherwise
    # default an empty array to float64).
    return np.array([int(field) for field in fields], dtype=int)

def run(images, data):
    """Thread target that loads every image with a process pool.

    Args:
        images: list used as an output slot. A placeholder is appended first,
            then element 0 is overwritten with the list of loaded samples.
        data: iterable of ``(path, loop_label)`` pairs, one per image, passed
            item by item to ``readImages``.
    """
    images.append(None)
    n_workers = cpu_count()
    with Pool(n_workers) as workers:
        loaded = workers.map(readImages, data)
        images[0] = loaded
        
def read(image_dir, label_dir):
    """Load the first ``N_TRAIN`` images and labels, showing live progress.

    The images are loaded by ``run`` on a background thread (which fans the
    work out to a process pool) while this function polls the shared
    ``n_finished`` counter and rewrites a single progress line. ``n_finished``
    must already exist as a ``multiprocessing.Value`` when this is called.

    Args:
        image_dir: directory holding the images named after ``IMAGE_NAME``
            with a 1-based index.
        label_dir: directory holding ``LABELS_NAME`` and ``LOOP_LABELS_NAME``.

    Returns:
        ``(images, labels)`` where ``images`` is a one-element list whose
        element 0 is the list of per-image feature vectors, and ``labels`` is
        the number-label array reshaped to a column of shape ``(n, 1)``.

    Raises:
        RuntimeError: if the background loader stopped without producing a
            result (for example because a worker raised).
    """
    global n_finished
    number_label = readLabels(label_dir, LABELS_NAME)[:N_TRAIN]
    loop_label = readLabels(label_dir, LOOP_LABELS_NAME)[:N_TRAIN]
    images = []
    data = [
        (os.path.join(image_dir, IMAGE_NAME.format(i + 1)), loop_label[i])
        for i in range(N_TRAIN)
    ]

    def show_progress():
        # '\r' rewrites the same console line instead of appending new ones.
        sys.stdout.write('\r' + "n_finished={}".format(n_finished.value))
        sys.stdout.flush()

    t = threading.Thread(target=run, args=(images, data))
    t.start()
    while t.is_alive():
        show_progress()
        # Wait on the thread with a timeout instead of spinning: the original
        # loop had no pause, so it kept one core busy and flooded stdout.
        t.join(timeout=0.1)
    # One last refresh so the displayed count reflects the finished state.
    show_progress()

    # An exception inside the thread is only printed, never re-raised here, so
    # an unfilled slot is the one sign of failure. Fail loudly rather than
    # handing [None] to the caller.
    if not images or images[0] is None:
        raise RuntimeError(
            "image loading failed; see the traceback printed by the loader thread"
        )

    return images, number_label.reshape(-1, 1)
     

In [12]:
# Shared integer counter that readImages() increments under its lock; it has to
# exist before read() is called because both functions look it up as a global.
n_finished = Value("i", 0, lock=True)
X_train, y_train = read(image_dir=TRAIN_IMAGES_DIR, label_dir=TRAIN_LABELS_DIR)

n_finished=60000

In [15]:
# read() returns the samples wrapped in a one-element list; unwrap it once.
# Before unwrapping, element 0 is the list of samples; afterwards it is a single
# image vector. The guard therefore makes re-running this cell a no-op instead
# of replacing the whole dataset with its first image.
if isinstance(X_train[0], list):
    X_train = X_train[0]

In [16]:
# Sanity check: labels come back from read() as a column, one row per image.
y_train.shape

(60000, 1)

In [19]:
# Shuffle images and labels with one shared permutation so every image keeps
# its label. The generator is seeded (42 is an arbitrary fixed seed) so the row
# order, and with it the cross-validation folds below, is the same on every run.
# The length check replaces zip(), which would silently drop unmatched items.
assert len(X_train) == len(y_train), "images and labels have different lengths"
shuffle_order = np.random.RandomState(42).permutation(len(X_train))
# Reorder the image list by reference; the dense array is built in the next cell.
X_train = [X_train[i] for i in shuffle_order]
y_train = np.asarray(y_train)[shuffle_order]

In [28]:
# Materialise the shuffled samples as dense arrays: one row per image in
# X_train, and the labels flattened to a 1-D vector of shape (n,).
X_train = np.array(X_train)
y_train = np.array(y_train).ravel()

In [29]:
# Sanity check: after flattening, the labels are a 1-D vector.
y_train.shape

(60000,)

# Compression

## 1. Linear PCA

In [24]:
# Project the training features onto their first 154 principal components.
# NOTE(review): 154 is presumably the component count that keeps ~95% of the
# variance on MNIST — TODO confirm via pca.explained_variance_ratio_.
# random_state pins the projection on runs where scikit-learn selects a
# randomized SVD solver; the value 42 is an arbitrary fixed seed.
pca = PCA(n_components=154, random_state=42)
X_train_reduced = pca.fit_transform(X_train)

## 2. Kernel PCA

## 3. LLE

# Model Training

## 1. KNN Classifier

In [30]:
# Distance-weighted 4-nearest-neighbour classifier using every available core.
# NOTE(review): this fits on the full X_train, not on X_train_reduced from the
# PCA cell — confirm that is intended.
knn_clf = KNeighborsClassifier(
    n_jobs=-1,
    weights="distance",
    n_neighbors=4,
)
knn_clf.fit(X_train, y_train)

## 2. XGB

# Evaluation

In [None]:
# 3-fold cross-validated accuracy of the KNN classifier on the training set.
cross_val_score(
    estimator=knn_clf,
    X=X_train,
    y=y_train,
    scoring="accuracy",
    cv=3,
)

In [None]:
y