In [22]:
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import re
import sys
import tarfile
from IPython.display import display, Image
from PIL import Image
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle

%matplotlib inline

In [23]:
# setting constants used
num_classes = 10 # total number of classes
image_size = 28  # Pixel width and height.
pixel_depth = 255.0  # Number of levels per pixel
np.random.seed(133)

In [24]:
def extract_tar_file(filename):
    global num_classes
    root = os.path.splitext(os.path.splitext(filename)[0])[0] # remove tar.gz
    if os.path.isdir(root):
        pass
    else:
        print('Extraction of data for %s this may take while please wait...' % root)
        tar = tarfile.open(filename)
        sys.stdout.flush()
        tar.extractall()
        tar.close()
        
    data_folders = [os.path.join(root, d) for d in sorted(os.listdir(root))
                                    if os.path.isdir(os.path.join(root, d))]
    
    assert len(data_folders) == num_classes
    return data_folders
# End

In [25]:
train_filename = 'notMNIST_large.tar.gz'
test_filename = 'notMNIST_small.tar.gz'

In [26]:
train_folders = extract_tar_file(train_filename)
test_folders = extract_tar_file(test_filename)

In [27]:
def load_letter_data(folder, min_number_of_image):
    # load datafor a single letter label
    image_files = os.listdir(folder)
    #dataset = np.ndarray(shape=(len(image_files), image_size, image_size), dtype=np.float32)
    data = []
    #num_images = 0
    
    for image in image_files:
        image_file = os.path.join(folder, image)
        try:
            # image_data = (ndimage.imread(image_file).astype(float) - pixel_depth / 2) / pixel_depth
            image_data = (np.array(Image.open(image_file)).astype(float) - pixel_depth / 2) / pixel_depth
            if image_data.shape != (image_size, image_size):
                print('UnExpected Image shape {0} skipping file {1}'.format(str(image_data.shape), image))
                continue
            data.append({'ImagePath' : image,
                        'Dimension': image_size,
                        'Feature': image_data,
                        'Label': folder})
            #num_images += 1
        except IOError as er:
            print('couldn \'t read ', image_file, ':', er)
    return data
# End

In [28]:
def maybe_pickle(data_folders: str, min_num_images_per_class: int, force=False) ->list:
    dataset_names = []
    for folder in data_folders:
        set_filename = folder + '.pickle'
        dataset_names.append(set_filename)
        
        if os.path.exists(set_filename) and not force:
            print('%s already present skipping pickling.'%set_filename)
        else:
            print('pickling %s '%set_filename)
            dataset = load_letter_data(folder, min_num_images_per_class)
            #print(dataset)
            try:
                with open(set_filename, 'wb') as f:
                    pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
            except IOError as e:
                print('unable to save the data to %s'%set_filename, ':', e)
    return dataset_names
# End

In [29]:
train_datasets = maybe_pickle(train_folders, 1, True)
test_datasets = maybe_pickle(test_folders, 1, True)

pickling notMNIST_large\A.pickle 
couldn 't read  notMNIST_large\A\RnJlaWdodERpc3BCb29rSXRhbGljLnR0Zg==.png : cannot identify image file 'notMNIST_large\\A\\RnJlaWdodERpc3BCb29rSXRhbGljLnR0Zg==.png'
couldn 't read  notMNIST_large\A\SG90IE11c3RhcmQgQlROIFBvc3Rlci50dGY=.png : cannot identify image file 'notMNIST_large\\A\\SG90IE11c3RhcmQgQlROIFBvc3Rlci50dGY=.png'
couldn 't read  notMNIST_large\A\Um9tYW5hIEJvbGQucGZi.png : cannot identify image file 'notMNIST_large\\A\\Um9tYW5hIEJvbGQucGZi.png'


Exception: Data must be 1-dimensional

In [10]:
def load_dataset(pkl_dataset_file: list) ->dict:
    res = {}
    for dataset in pkl_dataset_file:
        with open(dataset,'rb') as f:
            res[dataset.replace('\\', '.').split('.')[1]] = pickle.load(f)
    return res
# End

In [11]:
res = load_dataset(train_datasets)

In [12]:
def make_array(nb_rows, img_size):
    dataset , label = None, None
    if nb_rows:
        dataset = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32)
        label = np.ndarray(nb_rows, dtype=np.float32)
    return dataset, label
#End

In [13]:
def merge_datasets(pickles_files, train_size, valid_size=0):
    global image_size
    num_classes = len(pickles_files)
    valid_dataset, valid_labels = make_array(valid_size, image_size)
    train_dataset, train_labels = make_array(train_size, image_size)
    vsize_per_class = valid_size // num_classes
    tsize_per_class = train_size // num_classes
    
    start_v, start_t = 0, 0
    end_v, end_t = vsize_per_class, tsize_per_class
    end_l = vsize_per_class+tsize_per_class
    for label, pickle_file in enumerate(pickles_files):
        try:
            with open(pickle_file) as f:
                letter_set = pickle.load(f)
                # let's shuffle the letters to have random validation and training set
                np.random.shuffle(letter_set)
                if valid_dataset is not None:
                    valid_latter = letter_set[:vsize_per_class, :, :]
                    valid_dataset[start_v: end_v ,:, :] = valid_latter
                    valid_labels[start_v: end_v] = label
                    start_v += vsize_per_class
                    end_v += vsize_per_class
                train_latter = letter_set[vsize_per_class: end_l, :, :]
                train_dataset[start_t: end_t ,:, :] = train_latter
                train_labels[start_t: end_t] = label
                start_t += tsize_per_class
                end_t += tsize_per_class
        except Exception as e:
            print("not able to process the data from ", pickle_file,':', e)
    return valid_dataset, valid_labels, train_dataset, train_labels
# End

In [14]:
train_size = 200000
valid_size = 10000
test_size = 10000

valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(
  train_datasets, train_size, valid_size)
_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)

print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
print('Testing:', test_dataset.shape, test_labels.shape)

not able to process the data from  notMNIST_large\A.pickle : 'charmap' codec can't decode byte 0x9d in position 1081: character maps to <undefined>
not able to process the data from  notMNIST_large\B.pickle : 'charmap' codec can't decode byte 0x81 in position 1020: character maps to <undefined>
not able to process the data from  notMNIST_large\C.pickle : 'charmap' codec can't decode byte 0x9d in position 368: character maps to <undefined>
not able to process the data from  notMNIST_large\D.pickle : 'charmap' codec can't decode byte 0x90 in position 881: character maps to <undefined>
not able to process the data from  notMNIST_large\E.pickle : 'charmap' codec can't decode byte 0x90 in position 696: character maps to <undefined>
not able to process the data from  notMNIST_large\F.pickle : 'charmap' codec can't decode byte 0x8d in position 284: character maps to <undefined>
not able to process the data from  notMNIST_large\G.pickle : 'charmap' codec can't decode byte 0x8d in position 3781