In [1]:
# Make sure you can import them before proceeding further.
from __future__ import print_function
import imageio
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
import gzip
import struct
from IPython.display import display, Image
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle

# Config the matplotlib backend as plotting inline in IPython
%matplotlib inline

In [2]:
#MNIST urls
url_base = 'http://yann.lecun.com/exdb/mnist/'
training_set_images = 'train-images-idx3-ubyte.gz' #size in bytes 9912422
training_set_labels = 'train-labels-idx1-ubyte.gz' #size in bytes 28881
test_set_images = 't10k-images-idx3-ubyte.gz' #size in bytes 1648877
test_set_labels = 't10k-labels-idx1-ubyte.gz' #size in bytes 4542
last_percent_reported = None
# Change me to store data elsewhere
data_root = 'D:\\10_work_spaces\\1_Under_VCS\\github\\4_NN_ML\\data_for_trainings'

In [3]:
"""A hook to report the progress of a download. This is mostly intended for users with
   slow internet connections. Reports every 5% change in download progress.
"""
def download_progress_hook(count, blockSize, totalSize):  
  global last_percent_reported
  percent = int(count * blockSize * 100 / totalSize)
  if last_percent_reported != percent:
    if percent % 5 == 0:
      sys.stdout.write("%s%%" % percent)
      sys.stdout.flush()
    else:
      sys.stdout.write(".")
      sys.stdout.flush()
      
    last_percent_reported = percent

In [4]:
def maybe_download(filename, expected_bytes, force=False):
  """Download a file if not present, and make sure it's the right size."""
  dest_filename = os.path.join(data_root, filename)
  if force or not os.path.exists(dest_filename):
    print('Attempting to download:', filename) 
    filename, _ = urlretrieve(url_base + filename, dest_filename, reporthook=download_progress_hook)
    print('\nDownload Complete!')
  statinfo = os.stat(dest_filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified', dest_filename)
  else:
    raise Exception(
      'Failed to verify ' + dest_filename + '. Can you get to it with a browser?')
  return dest_filename

In [5]:
train_filename_images = maybe_download(training_set_images, 9912422)
train_filename_labels = maybe_download(training_set_labels, 28881)
test_filename_images = maybe_download(test_set_images, 1648877)
test_filename_labels = maybe_download(test_set_labels, 4542)

Found and verified D:\10_work_spaces\1_Under_VCS\github\4_NN_ML\data_for_trainings\train-images-idx3-ubyte.gz
Found and verified D:\10_work_spaces\1_Under_VCS\github\4_NN_ML\data_for_trainings\train-labels-idx1-ubyte.gz
Found and verified D:\10_work_spaces\1_Under_VCS\github\4_NN_ML\data_for_trainings\t10k-images-idx3-ubyte.gz
Found and verified D:\10_work_spaces\1_Under_VCS\github\4_NN_ML\data_for_trainings\t10k-labels-idx1-ubyte.gz


In [6]:
#it sets byte stream from IMAGES FOR TRAINING to a right position
gzipFile_TrainImages = gzip.open(train_filename_images)
magicNumberImages = int.from_bytes(gzipFile_TrainImages.read(4), byteorder='big') #2051
numberOfImages = int.from_bytes(gzipFile_TrainImages.read(4), byteorder='big') #60000
numberOfRows = int.from_bytes(gzipFile_TrainImages.read(4), byteorder='big') #28
numberOfColumns = int.from_bytes(gzipFile_TrainImages.read(4), byteorder='big') #28
print (magicNumberImages, '\t',numberOfImages, '\t', numberOfRows, '\t', numberOfColumns)

2051 	 60000 	 28 	 28


In [7]:
#it sets byte stream from LABELS FOR TRAINING to a right position
gzipFile_TrainLabels = gzip.open(train_filename_labels)
magicNumberLabels = int.from_bytes(gzipFile_TrainLabels.read(4), byteorder='big') #2049
numberOfLabels = int.from_bytes(gzipFile_TrainLabels.read(4), byteorder='big') #60000
print (magicNumberLabels, '\t',numberOfLabels)

2049 	 60000


In [8]:
#TODO - it doesn't work check it!
NUMBER_FEATURES = 784; #square 28*28 as from data set -> array 784 items
#it creates n-dim array with picture as 784 features and an appropriate label
datasetMnist = np.ndarray(shape=(numberOfLabels, NUMBER_FEATURES), dtype=np.int32) #it creates a 2d array of 32-bit integers
def read_single_image_with_label():
    #read and create array with 784 features
    imagePixelsAsFeatures = np.ndarray(shape=(NUMBER_FEATURES), dtype=np.int32)
    for index in range(NUMBER_FEATURES):
        imagePixelsAsFeatures[index] = int.from_bytes(gzipFile_TrainImages.read(1), byteorder='big')
    #read label    
    label = 0
    label = int.from_bytes(gzipFile_TrainLabels.read(1), byteorder='big')
    # put label and features to a dataset
    datasetMnist[label, :] = imagePixelsAsFeatures    
    #print (label, '\n', imagePixelsAsFeatures)

In [9]:
#x = struct.unpack('f', b'\xdb\x0fI@')
#print (x[0])
for i in range(numberOfLabels):
    read_single_image_with_label()
    
print (datasetMnist.shape)

(60000, 784)


In [12]:
print (datasetMnist[12000])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [None]:
#todo dump the array to a file?