In [14]:
import svhnFileReader as sv
import numpy as np
import TrainSVHN as ts
import importlib

## Loading files

In [24]:
# Dataset download and confirming data and functions
# ==================================================
# Smoke test for the svhnFileReader helpers: download the three SVHN splits,
# pull a handful of random training samples and print/plot them so the labels
# and images can be checked by eye.

# reload dependencies so edits made to the helper modules are picked up
importlib.reload(ts)
importlib.reload(sv)

files = ["train", "test", "extra"]

# download svhn files
# (the recorded output prints "<name> already exists" for each split, so this
# looks like a no-op when the data is already present -- TODO confirm in sv)
for file in files:
    sv.maybeDownload(file)

# choose some random indices: 20 distinct values drawn from 0..399
# NOTE: no random seed is set, so the sample differs on every run
data_samples = np.random.permutation(400)[:20]
print("The random file indexes", data_samples) # Looking good?

# get file names and labels associated with the random indices
# (in the recorded output index 114 maps to '115.png', i.e. the indices are
# 0-based while the file names are 1-based)
train_files, train_labels = sv.getLabels(
    'train/digitStruct.mat', data_samples)

# check the file names and labels
print("Train files are", train_files)
print("Label of train files are", train_labels)

# get the data in file names
# NOTE(review): the recorded output reports shape (20, 50, 100, 3), which does
# not correspond to shape=(80, 40); that output looks stale -- re-run to confirm
data = sv.getImage(train_files, 'train/', shape=(80, 40))
print("Shape of train data is",data.shape) # what is the size? does it match?
print("Min and Max of data are", np.min(data), np.max(data))
print("Train labels are before parsing:\n", train_labels[:15])
print("Train labels are after parsing:\n", sv.parseLabels(train_labels[:15],3))

# show the first 15 samples together with their raw labels
# (third argument is presumably the number of images per row -- TODO confirm)
sv.showMultipleArraysHorizontally(data[:15], train_labels[:15], 3)


train already exists
test already exists
extra already exists
The random file indexes [114 342 143 252 321 268 146 394 177  45 173  39 372 340  16 209  55  37
 384 275]
Train files are ['115.png', '343.png', '144.png', '253.png', '322.png', '269.png', '147.png', '395.png', '178.png', '46.png', '174.png', '40.png', '373.png', '341.png', '17.png', '210.png', '56.png', '38.png', '385.png', '276.png']
Label of train files are [[3.0, 2.0], [2.0], [1.0, 9.0, 5.0], [1.0, 9.0], [3.0, 1.0], [3.0, 5.0, 3.0], [1.0, 5.0], [1.0, 2.0, 9.0], [2.0, 1.0], [3.0, 2.0, 7.0], [5.0, 9.0], [2.0, 10.0], [3.0, 8.0], [2.0, 8.0, 9.0], [7.0, 9.0], [2.0, 6.0], [5.0, 6.0], [1.0, 7.0], [2.0, 6.0, 1.0], [1.0, 7.0, 8.0]]
Shape of train data is (20, 50, 100, 3)
Min and Max of data are 0.0 255.0
Train labels are before parsing:
 [[3.0, 2.0], [2.0], [1.0, 9.0, 5.0], [1.0, 9.0], [3.0, 1.0], [3.0, 5.0, 3.0], [1.0, 5.0], [1.0, 2.0, 9.0], [2.0, 1.0], [3.0, 2.0, 7.0], [5.0, 9.0], [2.0, 10.0], [3.0, 8.0], [2.0, 8.0, 9.0], [7.0

In [155]:
# Preprocessing
# =============
# Module-level settings read by preprocess() and by the later cells
# (data generator, model construction).

# dependencies: reload the helper modules so recent edits take effect
importlib.reload(ts)
importlib.reload(sv)

# configurations
# name of the SVHN split; also used as the folder that holds its images
dataset = 'train'
# number of images read, converted and written to disk per chunk
big_batch_size = 2000
# target image size handed to sv.getImage
# (presumably (width, height): the model cell assigns index 0 to image_width
# and index 1 to image_height)
image_shape = (80,40)
# labels are parsed for at most this many digits per image
max_digits_in_label = 4
# output file that receives the preprocessed arrays
pickle_file = dataset+"_preprocessed"

def preprocess():
    """Convert the whole `dataset` split into one file of numpy arrays.

    Reads the notebook-level settings `dataset`, `big_batch_size`,
    `image_shape`, `max_digits_in_label` and `pickle_file`.

    The files of the split are visited in a random order, `big_batch_size`
    at a time. For every chunk two arrays are appended to `pickle_file`
    with `np.save`: first the image values (after `sv.scaleData`), then the
    labels (after `sv.parseLabels`). The output file is overwritten on each
    call, and a progress line is printed after every chunk.

    Returns:
        None
    """
    # read lots of files
    struct_file = dataset+"/digitStruct.mat"
    number_of_files = sv.getNumberOfFiles(struct_file)
#     number_of_files = big_batch_size # just for debug
    data_samples = np.random.permutation(number_of_files)

    # value passed to sv.scaleData (presumably the maximum pixel value)
    pixel_depth = 255

    # the context manager closes the file even when a chunk fails half-way
    with open(pickle_file,"wb") as file_handle:

        # iterate over data in big batches
        for batch_start in range(0,number_of_files, big_batch_size):

            # read the .mat file and parse attributes of data files
            batch_indexes = data_samples[batch_start:batch_start+big_batch_size]

            file_names,train_labels = sv.getLabels(struct_file,batch_indexes)
            train_values = sv.getImage(file_names, dataset,shape=image_shape)

            # form and normalize
            train_values = sv.scaleData(train_values,pixel_depth)
            train_labels = sv.parseLabels(train_labels,max_digits_in_label)

            # save in file: values first, labels second
            np.save(file_handle, train_values)
            np.save(file_handle, train_labels)

            # process status; the last chunk is usually partial, so clamp the
            # processed count to keep the figure from running past 100
            files_done = min(batch_start+big_batch_size, number_of_files)
            completion_percentile = 100*files_done/number_of_files
            print("Compeleted %%%d"%completion_percentile)
    
# perform preprocessing
# (walks over the whole split and rewrites `pickle_file` from scratch, since
# preprocess() opens it in "wb" mode)
preprocess()

Compeleted %5
Compeleted %11
Compeleted %17
Compeleted %23
Compeleted %29
Compeleted %35
Compeleted %41
Compeleted %47
Compeleted %53
Compeleted %59
Compeleted %65
Compeleted %71
Compeleted %77
Compeleted %83
Compeleted %89
Compeleted %95
Compeleted %101


In [44]:
# Data extractor
# ==============

def dataGenerator(batch_size,file_name):
    file_handle = open(file_name, "rb")
    while True:

        # get data array
        try:
            data = np.load(file_handle)
        # if reached end of file
        except OSError:
#             print("in dataGenerator() pointer is at",file_handle.tell(),"... going back.")
            # go to the beginning
            file_handle.seek(0)
            # and try loading again
            data = np.load(file_handle)

        # get label array
        labels = np.load(file_handle)
        
        # randomize
        data,labels = sv.shuffleArrays([data,labels])
        
        # get batches        
        number_of_datapoints = labels.shape[0]
        full_batches = number_of_datapoints//batch_size # few datapoints are going to waste here
        start_point = 0
        for batch_start in range(0,full_batches,batch_size):
            batch_data = data[batch_start:batch_start+batch_size]
            batch_labels = labels[batch_start:batch_start+batch_size]
            
            # yield both
            yield batch_data, batch_labels

In [37]:
# validate dataGenerator and disk data
# ====================================
# Pull one mini-batch of 3 samples from the preprocessed file and inspect it.

importlib.reload(sv)

gen = dataGenerator(3,pickle_file)
sample_data,sample_labels = next(gen)

# one one-hot group of size max_digits_in_label+1 followed by
# max_digits_in_label groups of size 11
# (presumably a digit-count group plus one group per digit position -- TODO confirm)
print(sv.multipleOneHots(sample_labels,[max_digits_in_label+1]+[11]*max_digits_in_label))
# the +.5 presumably undoes a shift applied by sv.scaleData so the images
# display in a non-negative range -- TODO confirm against scaleData
sv.showMultipleArraysHorizontally(sample_data+.5,sample_labels,3)


[[ 0.  0.  1.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  1.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  1.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  1.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]


In [66]:
def oneHotsToLabels(onehots,class_sizes):
    """Turn concatenated one-hot groups back into integer class indices.

    Args:
        onehots: 2-D array with one row per sample; the columns hold the
            groups described by `class_sizes`, laid out back to back.
        class_sizes: width of each group, in column order.

    Returns:
        Integer array of shape (len(onehots), len(class_sizes)); entry
        [r, g] is the position of the largest value inside group g of row r.
    """
    decoded = np.zeros((len(onehots), len(class_sizes)), int)
    start = 0
    for column, width in enumerate(class_sizes):
        stop = start + width
        # winning position inside this group's slice of columns
        decoded[:, column] = np.argmax(onehots[:, start:stop], 1)
        start = stop
    return decoded

# round-trip check: encode the sampled labels as one-hots, then decode them
# again with oneHotsToLabels using the same group sizes
one_hots = sv.multipleOneHots(sample_labels,[max_digits_in_label+1]+[11]*max_digits_in_label)
print(oneHotsToLabels(one_hots,[max_digits_in_label+1]+[11]*max_digits_in_label))

# show the same samples so the decoded labels can be compared by eye
sv.showMultipleArraysHorizontally(sample_data+.5,sample_labels,3)

[[2 2 5 0 0]
 [2 1 9 0 0]
 [2 2 3 0 0]]


In [163]:
# Make model
# ==========
# Configure a ts.SVHNTrainer from the notebook-level settings and build its graph.

# dependencies
importlib.reload(ts)

# configurations
initial_learning_rate = 1e-3
batch_size = 8

# model
# output groups: one of size max_digits_in_label+1 followed by
# max_digits_in_label groups of size 11 (5 + 4*11 = 49 outputs in total)
size_of_classes = [max_digits_in_label+1]+[11]*max_digits_in_label
network = ts.SVHNTrainer()
network.initial_learning_rate = initial_learning_rate
# image_shape is indexed as (width, height) here
network.image_height = image_shape[1]
network.image_width = image_shape[0]
network.batch_size = batch_size
network.class_sizes = size_of_classes
network.num_labels = sum(network.class_sizes)
# all attributes above are set before the graph is built
# (presumably makeGraph() reads them -- TODO confirm in TrainSVHN)
network.makeGraph()

# validation_data,validation_labels=dataMaker(9)
# test_data,test_labels=dataMaker(100)
# mf.showMultipleArraysHorizontally([test_data[i,0:28,0:140,0] for i in range(5)], test_labels,max_per_row=1)

def dataMaker(batch_size):
    """Endless stream of training batches with one-hot encoded labels.

    Wraps `dataGenerator` over the notebook-level `pickle_file` and passes the
    labels of every batch through `sv.multipleOneHots` with the group sizes in
    `size_of_classes`.

    Args:
        batch_size: number of datapoints per yielded batch.

    Yields:
        Tuple `(data, labels)`: the batch data as delivered by
        `dataGenerator`, and the encoded labels.
    """
    for images, raw_labels in dataGenerator(batch_size,pickle_file):
        yield images, sv.multipleOneHots(raw_labels,size_of_classes)

    
# test generator
# --------------
# generator = dataMaker(3)
# sample_gen_data, sample_gen_label = next(generator)
# print(sample_gen_label)
# sv.showMultipleArraysHorizontally(sample_gen_data+.5)



logit shape is [8, 49]
Seperated shapes are [[8, 5], [8, 11], [8, 11], [8, 11], [8, 11]]


In [164]:
# Train Model
# ===========

# dependencies
# NOTE: reloading the module rebinds names inside `ts` only; the `network`
# object built in the model cell is still an instance of the class as it was
# defined when that cell ran
importlib.reload(ts)

# configurations
number_of_steps = 1000
validation_steps = 10
test_steps = 100
# true division: report_step is the float 100.0 here, not an int
network.report_step = number_of_steps/10

# train model
# batches come from the preprocessed file through dataMaker (one-hot labels)
generator = dataMaker(network.batch_size)
# the return value is indexed as [0] (images) and [1] (one-hot predictions)
# by the results cell
prediction_sample = network.train(number_of_steps,generator,validation_steps,test_steps)

self.loss 29.9678
Validation accuracy [ 17.5    3.75  11.25   3.75  30.  ]
self.loss 7.99489
Validation accuracy [ 52.5    5.     8.75  60.    92.5 ]
self.loss 7.53297
Validation accuracy [ 57.5   16.25  15.    73.75  92.5 ]
self.loss 7.78253
Validation accuracy [ 56.25  20.     8.75  68.75  95.  ]
self.loss 9.40606
Validation accuracy [ 60.    27.5   15.    78.75  98.75]
self.loss 7.57804
Validation accuracy [ 53.75  25.    17.5   72.5   96.25]
self.loss 8.58639
Validation accuracy [ 45.    33.75  13.75  65.    92.5 ]
self.loss 7.75013
Validation accuracy [ 52.5   21.25  15.    71.25  98.75]
self.loss 8.51332
Validation accuracy [ 50.    25.    15.    66.25  93.75]
self.loss 8.14799
Validation accuracy [ 61.25  32.5   13.75  72.5   93.75]
self.loss 6.70943
Validation accuracy [ 52.5   17.5   12.5   65.    96.25]
Test accuracy: [ 55.     24.875  13.875  69.25   96.75 ]


In [131]:
# Check test results
# ==================
# Decode the predictions returned by network.train() and show them next to
# the images they belong to.


# prediction_sample[1] holds the one-hot style outputs, decoded per group
predicted_labels = oneHotsToLabels(prediction_sample[1],size_of_classes)
print(predicted_labels)
# prediction_sample[0] holds the images; +.5 presumably shifts them back into
# a displayable range -- TODO confirm against sv.scaleData
sv.showMultipleArraysHorizontally(prediction_sample[0]+.5,predicted_labels,3)


[[0 0 0 0 0]
 [2 1 0 0 0]
 [2 1 0 0 0]
 [0 0 0 0 0]
 [2 0 0 0 0]
 [0 0 0 0 0]
 [2 0 0 0 0]
 [2 0 0 0 0]]
