In [1]:
# Import the modules
from sklearn.externals import joblib
from sklearn import datasets
from skimage.feature import hog
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from collections import Counter
from os import system
import random
from sklearn.metrics import f1_score
from sklearn import linear_model

# Hog features extraction
def hog_f(features):
    list_hog_fd = []
    for feature in features:
        fd = hog(feature.reshape((28, 28)), orientations=9, pixels_per_cell=(14, 14), cells_per_block=(1, 1), visualise=False)
        list_hog_fd.append(fd)
    return np.array(list_hog_fd, 'float64')

In [3]:
# Load and prepare the dataset
try:
    dataset = joblib.load("digits.pkl")
except FileNotFoundError:
    dataset = datasets.fetch_mldata("MNIST Original")
    d = [[dataset.data[i], dataset.target[i]] for i in range(len(dataset.data))]
    np.random.shuffle(d)
    d = np.transpose(d)
    dataset.data = [x for x in d[0]]
    dataset.target = d[1]
    joblib.dump(dataset, "digits.pkl", compress=5)

# Extract the features and labels
features = np.array(dataset.data, 'int16')
labels = np.array(dataset.target, 'int')

# Extract the hog features
hog_features = hog_f(features)

size = int(len(hog_features) * 0.8)

In [4]:
# Determine optimal neighbors number according accuracy and F1 score
# Requires a minute on every iteration so can be skipped with hardcoded n_neighbors_opt
list_v = []
idmax = 0
max_v = 0.0
for i in range (1,21):
    clf = KNeighborsClassifier(n_neighbors=i)
    clf.fit(hog_features[:size], labels[:size])
    pred = clf.predict(hog_features[size:])
    list_v.append([f1_score(labels[size:], pred, average="macro"), clf.score(hog_features[size:], labels[size:])])
    if list_v[-1][0]+list_v[-1][-1] > max_v:
        max_v = list_v[-1][0]+list_v[-1][-1]
        idmax = i
n_neighbors_opt = idmax

In [6]:
# Prepare kNN classifier and compute accuracy and F1 score
clf = KNeighborsClassifier(n_neighbors=n_neighbors_opt)
clf.fit(hog_features[:size], labels[:size])
pred = clf.predict(hog_features[size:])
result = [f1_score(labels[size:], pred, average="macro"), clf.score(hog_features[size:], labels[size:])]
print ("KNN accuracy = %s, F1 score = %s" % (result[1], result[0]))

joblib.dump(clf, "digits_cls.pkl", compress=3)

KNN accuracy = 0.922071428571, F1 score = 0.921930710115


['digits_cls.pkl']

In [7]:
# Prepare noise filter
# Works up to 5 minutes in case of generating so expect delays
try:
    joblib.load("filter_cls.pkl")
except FileNotFoundError:
    noise = [[[random.randint(0, 255) if random.randint(0, 2) == 0 else 0 for y in range(len(dataset.data[0]))] 
              for x in range(len(dataset.data))], [0 for x in range(len(dataset.target))]]

    labels_f = [1 for x in range(len(dataset.target))] + noise[1]
    features_f = np.append(np.array(dataset.data, 'int16'), noise[0]).reshape((-1, 28*28))

    hog_features_f = hog_f(features_f)

    clf_f = KNeighborsClassifier(n_neighbors=n_neighbors_opt)
    clf_f.fit(hog_features_f, labels_f)
    joblib.dump(clf_f, "filter_cls.pkl", compress=3)

In [8]:
# Test kNN classifier on pictures
system("python performRecognition.py 3402333_skanirovanie0003.jpg")
system("python performRecognition.py 2448729.jpg")
system("python performRecognition.py 63555_640.jpg")
system("python performRecognition.py 1296635738_4.jpg")

0

In [9]:
# Prepare linear regression and compute accuracy and F1 score
# I decided to choose logistic regression because of multiclass problem
regr = linear_model.LogisticRegression(C=10, solver='newton-cg')
regr.fit(hog_features[:size], labels[:size])
pred = regr.predict(hog_features[size:])
result = [f1_score(labels[size:], pred, average="macro"), regr.score(hog_features[size:], labels[size:])]
print ("Linear regression accuracy = %s, F1 score = %s" % (result[1], result[0]))
joblib.dump(regr, "digits_cls.pkl", compress=3)

Linear regression accuracy = 0.867214285714, F1 score = 0.866711182618


['digits_cls.pkl']

In [10]:
# Test linear regression on pictures
system("python performRecognition.py 3402333_skanirovanie0003.jpg")
system("python performRecognition.py 2448729.jpg")
system("python performRecognition.py 63555_640.jpg")
system("python performRecognition.py 1296635738_4.jpg")

0

Thus, the kNN classifier shows the best results, comparing both the accuracy and the F1 score, and the results in the tests with images.