Class to allow epochs of InfiMNIST to be iterated over and loaded into TensorFlow notebooks

http://leon.bottou.org/projects/infimnist

TODO:  change APIs to only load deformations

use original MNIST for actual train and test

merge classes so MNIST, InfiMNIST, and AlignMNIST all use the same base class


In [None]:
%%writefile  infimnist.py

from __future__ import print_function

import gzip
import os
import shutil
import subprocess
import urllib

import numpy as np

class InfiMNIST(object):
    """Generate and load successive 60000-example epochs of InfiMNIST data.

    Each call to ``next_epoch`` runs the external ``infimnist`` executable to
    produce the next range of example indices, stores the resulting IDX files
    gzip-compressed inside ``data_dir`` and loads them as NumPy arrays.

    The first epoch covers indices 10000-69999; every later epoch covers the
    following 60000 indices.
    """

    # Magic numbers that open IDX image and label files.
    _IMAGE_MAGIC = 2051
    _LABEL_MAGIC = 2049

    # Index range requested for the test split.
    # NOTE(review): assumes the infimnist tool maps indices 0-9999 to the
    # MNIST test examples -- confirm against the infimnist documentation.
    _TEST_START = 0
    _TEST_STOP = 9999

    def __init__(self, filename='alignmnist.npz', infimnist_cmd='./infimnist'):
        """Set up the index window of the first epoch.

        Args:
            filename: Unused; kept so existing callers keep working.
            infimnist_cmd: Path of the ``infimnist`` executable. A relative
                path is resolved against ``data_dir`` because the tool is
                started with that directory as its working directory.
        """
        self.data_dir = "."
        self.infimnist_cmd = infimnist_cmd
        self._infimnist_start = 10000
        self._infimnist_stop = self._infimnist_start + 59999
        # Filled lazily by _load_test_set(); the test split never changes.
        self._test_set = None

    @staticmethod
    def _one_hot(labels_dense, num_classes):
        """Return a (num_labels, num_classes) float array with one 1 per row."""
        # Cast explicitly: uint8 labels must become valid integer indices.
        labels = np.asarray(labels_dense).ravel().astype(np.int64)
        one_hot = np.zeros((labels.shape[0], num_classes))
        one_hot[np.arange(labels.shape[0]), labels] = 1
        return one_hot

    def dense_to_one_hot(self, labels_dense, num_classes=10):
        """Convert class labels from scalars to one-hot vectors.

        Args:
            labels_dense: 1-D array of integer class ids.
            num_classes: Width of each one-hot row.

        Returns:
            Float array of shape (len(labels_dense), num_classes).
        """
        return self._one_hot(labels_dense, num_classes)

    def reshape_images(self, images):
        """Flatten every image to one row and rescale [0, 255] -> [0.0, 1.0].

        Args:
            images: Array whose first axis indexes the images.

        Returns:
            float32 array of shape (num_images, values_per_image).
        """
        images = np.asarray(images)
        per_image = int(np.prod(images.shape[1:]))
        flat = images.reshape(images.shape[0], per_image).astype(np.float32)
        return np.multiply(flat, 1.0 / 255.0)

    def shuffle_epoch(self, images, labels):
        """Apply one shared random permutation to images and labels.

        Returns:
            Tuple (images, labels) reordered along the first axis; the inputs
            are not modified.
        """
        order = np.random.permutation(labels.shape[0])
        return images[order], labels[order]

    @staticmethod
    def _read32(bytestream):
        """Read one big-endian unsigned 32-bit integer and return it as int.

        Raises:
            ValueError: If fewer than four bytes are left in the stream.
        """
        raw = bytestream.read(4)
        if len(raw) != 4:
            raise ValueError('Unexpected end of file while reading a 32-bit header field')
        # Return a plain int so callers can use it as a size or in comparisons.
        return int(np.frombuffer(raw, dtype=np.dtype('>u4'))[0])

    @staticmethod
    def extract_images(filename):
        """Extract the images into a 4D uint8 numpy array [index, y, x, depth].

        Args:
            filename: Path of a gzip-compressed IDX image file.

        Raises:
            ValueError: If the magic number is not 2051 or the file is
                shorter than its header announces.
        """
        with gzip.open(filename) as bytestream:
            magic = InfiMNIST._read32(bytestream)
            if magic != InfiMNIST._IMAGE_MAGIC:
                raise ValueError('Invalid magic number %d in MNIST image file: %s' % (magic, filename))
            num_images = InfiMNIST._read32(bytestream)
            rows = InfiMNIST._read32(bytestream)
            cols = InfiMNIST._read32(bytestream)
            expected = rows * cols * num_images
            buf = bytestream.read(expected)
        if len(buf) != expected:
            raise ValueError('Truncated MNIST image file: %s' % filename)
        data = np.frombuffer(buf, dtype=np.uint8)
        return data.reshape(num_images, rows, cols, 1)

    @staticmethod
    def extract_labels(filename, one_hot=False):
        """Extract the labels into a 1D uint8 numpy array [index].

        Args:
            filename: Path of a gzip-compressed IDX label file.
            one_hot: When true, return a (num_labels, 10) one-hot float array
                instead of the raw class ids.

        Raises:
            ValueError: If the magic number is not 2049 or the file is
                shorter than its header announces.
        """
        with gzip.open(filename) as bytestream:
            magic = InfiMNIST._read32(bytestream)
            if magic != InfiMNIST._LABEL_MAGIC:
                raise ValueError('Invalid magic number %d in MNIST label file: %s' % (magic, filename))
            num_items = InfiMNIST._read32(bytestream)
            # Read while the stream is still open.
            buf = bytestream.read(num_items)
        if len(buf) != num_items:
            raise ValueError('Truncated MNIST label file: %s' % filename)
        labels = np.frombuffer(buf, dtype=np.uint8)
        if one_hot:
            return InfiMNIST._one_hot(labels, 10)
        return labels

    def read_data_sets(self, image_file, label_file):
        """Load one gzip-compressed image/label file pair.

        Returns:
            Tuple (images, labels): the 4D uint8 image array and the 1D uint8
            array of class ids.
        """
        images = self.extract_images(image_file)
        labels = self.extract_labels(label_file)
        return images, labels

    def _generate(self, prefix, start, stop):
        """Run infimnist for indices start..stop and gzip its output.

        Writes ``<prefix>-images.gz`` and ``<prefix>-labels.gz`` into
        ``data_dir`` (overwriting earlier files) and returns their paths as
        (images_path, labels_path).

        Raises:
            subprocess.CalledProcessError: If the tool exits with an error.
            OSError: If the executable cannot be started.
        """
        gz_paths = []
        for mode, suffix in (('pat', 'images'), ('lab', 'labels')):
            raw_path = os.path.join(self.data_dir, '{}-{}'.format(prefix, suffix))
            with open(raw_path, 'wb') as out:
                # Argument list instead of a shell string: nothing is
                # interpreted by a shell, and failures are not ignored.
                subprocess.check_call(
                    [self.infimnist_cmd, mode, str(start), str(stop)],
                    stdout=out, cwd=self.data_dir)
            gz_path = raw_path + '.gz'
            # Compress in-process so the files stay in data_dir whatever the
            # current working directory is.
            with open(raw_path, 'rb') as src:
                with gzip.open(gz_path, 'wb') as dst:
                    shutil.copyfileobj(src, dst)
            os.remove(raw_path)
            gz_paths.append(gz_path)
        return tuple(gz_paths)

    def _load_test_set(self):
        """Return (x_test, y_test), generating and caching them on first use."""
        cached = getattr(self, '_test_set', None)
        if cached is None:
            images_gz, labels_gz = self._generate(
                'infimnist-test', self._TEST_START, self._TEST_STOP)
            images, labels = self.read_data_sets(images_gz, labels_gz)
            # The test split is flattened and one-hot encoded like the
            # training data, but it is not shuffled.
            cached = (self.reshape_images(images), self.dense_to_one_hot(labels))
            self._test_set = cached
        return cached

    def next_epoch(self):
        """Generate, load and shuffle the next 60000 training examples.

        Returns:
            Tuple (x_train, y_train, x_test, y_test). The x arrays are
            flattened float32 images scaled to [0, 1]; the y arrays are
            one-hot label matrices. Only the training pair is regenerated and
            shuffled on every call.

        Raises:
            subprocess.CalledProcessError: If the infimnist tool fails.
            ValueError: If a generated file is not a valid IDX file.
        """
        start, stop = self._infimnist_start, self._infimnist_stop
        images_gz, labels_gz = self._generate('infimnist', start, stop)

        # Advance the window only after this epoch was generated successfully.
        self._infimnist_start = stop + 1
        self._infimnist_stop = self._infimnist_start + 59999

        x_train, y_train = self.read_data_sets(images_gz, labels_gz)
        x_train = self.reshape_images(x_train)
        y_train = self.dense_to_one_hot(y_train)
        x_train, y_train = self.shuffle_epoch(x_train, y_train)

        x_test, y_test = self._load_test_set()
        return x_train, y_train, x_test, y_test

In [None]:
# NOTE(review): this runs alignmnist.py, whereas the earlier %%writefile cell
# creates infimnist.py -- confirm which module is intended here.
%run alignmnist.py

In [None]:
# NOTE(review): AlignMNIST is not defined in this notebook (the class written
# to infimnist.py is InfiMNIST); presumably it comes from alignmnist.py -- confirm.
a = AlignMNIST()

In [None]:
# Unpack the four values returned by a single call to next_epoch().
x_train, y_train, x_test, y_test = a.next_epoch()

In [None]:
# Show the shapes of the four arrays. A single formatted string prints the
# same text under both Python 2 and Python 3.
print("{} {} {} {}".format(x_train.shape, y_train.shape, x_test.shape, y_test.shape))

In [None]:
from matplotlib import pyplot
import matplotlib as mpl
%matplotlib inline  

def show(image):
    """Draw ``image`` in a new figure using the Greys colormap.

    Pixels are rendered with nearest-neighbour interpolation, the x ticks are
    placed on the top edge and the y ticks on the left edge.
    """
    axes = pyplot.figure().add_subplot(1, 1, 1)
    axes.imshow(image, cmap=mpl.cm.Greys, interpolation='nearest')
    axes.xaxis.set_ticks_position('top')
    axes.yaxis.set_ticks_position('left')
    pyplot.show()

In [None]:
# Take the first training row and view it as a 28x28 image.
image = x_train[0, :].reshape(28, 28)
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(y_train[0])
show(image)

In [None]:
# Pull 75 further epochs and report the array shapes after each one.
for i in range(75):
    x_train, y_train, x_test, y_test = a.next_epoch()
    # One formatted string prints the same text on Python 2 and Python 3.
    print("{} {} {} {} {}".format(i, x_train.shape, y_train.shape, x_test.shape, y_test.shape))