In [1]:
import numpy as np
from mnist import MNIST
from sklearn.linear_model import LogisticRegression

In [2]:
mnist_loader = MNIST("../data/mnist/")
mnist_loader.gz = True

In [3]:
X, y = mnist_loader.load_training()
_X, _y = mnist_loader.load_testing()

In [4]:
X = np.array(X, dtype='float32') / 255
y = np.array(y, dtype='int16')
_X = np.array(_X, dtype='float32') / 255
_y = np.array(_y, dtype='int16')

In [5]:
def compute_black(X, print_iterval):
    
    # init
    black = []
    
    for idx in range(X.shape[0]):
        
        # init
        b = np.empty((28,28))
        
        # print stat
        if idx % print_iterval == 0:
            print('done', idx)
            
        # reshape image
        x = X[idx].reshape((28,28))
        
        b[0, 0] = 1 if x[0, 0] == 0 else 0
        
        # base case
        for i in range(1, 28):
            t = b[0, i - 1]
            b[0, i] = t + 1 if x[0, i] == 0 else t
            
        # recursive
        for i in range(1, 28):
            for j in range(28):
                if j == 0:
                    t = b[i - 1, j]
                    b[i, j] = t + 1 if x[i, j] == 0 else t
                else:
                    t = b[i - 1, j] + b[i, j - 1] - b[i - 1, j - 1]
                    b[i, j] = t + 1 if x[i, j] == 0 else t
                    
        black.append(b)
        
    return np.array(black, dtype='int')

In [6]:
%%time
X_black = compute_black(X, 10000)

done 0
done 10000
done 20000
done 30000
done 40000
done 50000
CPU times: user 2min 8s, sys: 526 ms, total: 2min 8s
Wall time: 2min 9s


In [7]:
k = 100
np.random.seed(666)
rect = []

while len(rect) != k:
    
    # randomly pick top left point, length, and breadth
    tl = np.random.randint(low=5, high=23, size=2)
    b = np.random.randint(low=5, high=28 - tl[1])
    l = np.random.randint(low=5, high=28 - tl[0])
    if b * l < 130 or b * l >170:
        continue
    
    # other corners
    tr = np.array([tl[0] + l, tl[1]])
    bl = np.array([tl[0], tl[1] + b])
    br = np.array([tl[0] + l, tl[1] + b])
    
    # mid points
    vt = np.array([tl[0] + l//2, tl[1]])
    vb = np.array([tl[0] + l//2, bl[1]])
    hl = np.array([tl[0], tl[1] + b//2])
    hr = np.array([tr[0], tr[1] + b//2])

    rect.append([tl, tr, bl, br, vt, vb, hl, hr])

In [8]:
def HARR(X, b, print_interval):
    _X = np.empty((X.shape[0], 200), dtype='int16')
    for i in range(X.shape[0]):
        
        # debug info
        if i % print_interval == 0:
            print('done', i)
        
        # extract image
        img = b[i]
        
        # compute features
        for j in range(k):
            tl, tr, bl, br, vt, vb, hl, hr = rect[j]
            
            top = img[hr[0], hr[1]] - img[hl[0], hl[1]] - img[tr[0], tr[1]] + img[tl[0], tl[1]]
            bottom = img[br[0], br[1]] - img[bl[0], bl[1]] - img[hr[0], hr[1]] + img[hl[0], hl[1]]
            v_score = top - bottom
            
            left = img[vb[0], vb[1]] - img[bl[0], bl[1]] - img[vt[0], vt[1]] + img[tl[0], tl[1]]
            right = img[br[0], br[1]] - img[vb[0], vb[1]] - img[tr[0], tr[1]] + img[vt[0], vt[1]]
            h_score = left - right

            _X[i][j*2] = v_score
            _X[i][(j*2)+1] = h_score

    return _X

In [9]:
%%time
harr_X = HARR(X, X_black, 10000)

done 0
done 10000
done 20000
done 30000
done 40000
done 50000
CPU times: user 44 s, sys: 964 µs, total: 44 s
Wall time: 44.1 s


In [11]:
mnist_lr = LogisticRegression(n_jobs=-1)

In [12]:
%%time
mnist_lr.fit(harr_X, y)

  " = {}.".format(self.n_jobs))


CPU times: user 7min 51s, sys: 64.8 ms, total: 7min 51s
Wall time: 7min 55s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
yhat = mnist_lr.predict(harr_X)
print('Train acc:', sum(yhat == y)/len(y))

Train acc: 0.9107166666666666


In [15]:
_X_black = compute_black(_X, 1000)

done 0
done 1000
done 2000
done 3000
done 4000
done 5000
done 6000
done 7000
done 8000
done 9000


In [16]:
_harr_X = HARR(_X, _X_black, 1000)

done 0
done 1000
done 2000
done 3000
done 4000
done 5000
done 6000
done 7000
done 8000
done 9000


In [18]:
_yhat = mnist_lr.predict(_harr_X)
print('Test acc:', sum(_yhat == _y)/len(_y))

Test acc: 0.9133
