In [None]:
%%time 

# Python 2.7
%matplotlib nbagg
%matplotlib inline 

import os
import sys
print(sys.version)


import cPickle as pickle
import scipy.io
import numpy as np
# import matplotlib.pyplot as plt
# import theano
# import theano.tensor as T
# import lasagne 
# import lasagne.layers as L

import tools as tls

from sklearn.utils import shuffle
from data_loaders import svhn


In [None]:
### META - HOW THE PROGRAM WORKS
# Switches and constants read by the cells further down.

# Seed passed to sklearn.utils.shuffle so the training order is reproducible.
random_state = 1234

# Target encoding: True -> targets are integer class ids, False -> one-hot rows.
# (The name is misspelled, but later cells refer to it with this exact spelling.)
integet_encoding = False

# Class subset: when enabled, only the first `reduced_classes` classes are kept.
reduced_classes = 3
reduce_dataset_classes = True

# Size cap: when enabled, every split is truncated to `reduced_size` samples.
reduced_size = 10000
reduce_dataset_size = False

In [None]:
%%time

### LOAD DATA!
# Source: http://ufldl.stanford.edu/housenumbers/

# The loader's result is unpacked as three (inputs, targets) pairs in the order
# train, test, validation.
# NOTE(review): that order is taken from this unpacking only -- confirm against
# data_loaders.svhn.load_supervised.
svhn_splits = svhn.load_supervised(conv=False, extra=False, normalize=True)
(x_trai, t_trai), (x_test, t_test), (x_vali, t_vali) = svhn_splits

if reduce_dataset_size:
    # Cap every split at the first `reduced_size` samples.
    print('Reduce size')
    head = slice(None, reduced_size)

    x_trai, t_trai = x_trai[head, :], t_trai[head]
    x_test, t_test = x_test[head, :], t_test[head]
    x_vali, t_vali = x_vali[head, :], t_vali[head]

# Report the (possibly truncated) shapes of each split.
print('train set', x_trai.shape, t_trai.shape)
print('valid set', x_vali.shape, t_vali.shape)
print('test  set', x_test.shape, t_test.shape)



In [None]:
%%time

### REDUCE DATA SET

def bernoullisample(x):
    """Draw one Bernoulli sample per element of ``x``.

    Each entry of ``x`` is used as the success probability of a single
    binomial trial (n=1), so the result contains only 0s and 1s and has the
    same shape as ``x``.

    Parameters
    ----------
    x : numpy.ndarray
        Array of probabilities; ``np.random.binomial`` requires values in
        [0, 1].

    Returns
    -------
    numpy.ndarray
        Array shaped like ``x``, cast to ``theano.config.floatX``.

    Notes
    -----
    Sampling uses NumPy's global random state, so results are only
    reproducible if that state is seeded by the caller.
    """
    # The module-level `import theano` is commented out in the import cell, so
    # import it here; otherwise the floatX lookup below raises NameError.
    import theano

    samples = np.random.binomial(1, x, size=x.shape)
    return samples.astype(theano.config.floatX)

# Optionally keep only the samples whose label is one of the first
# `reduced_classes` classes (ids 0 .. reduced_classes - 1). Afterwards every
# split is cast to float32 inputs / int32 targets, the training split is
# shuffled, and the splits are handed to tls.shared_dataset.
if reduce_dataset_classes:
    # Report how many classes are kept (not the boolean switch itself).
    print('Reduced dataset, with {} classes'.format(reduced_classes))
    num_classes = reduced_classes

    if integet_encoding:  ### IF integer encoding
        print('integer encoding')
        # Targets already hold the class id of each sample.
        int_trai, int_vali, int_test = t_trai, t_vali, t_test
    else:  ### IF one-hot encoding
        print('one-hot encoding')
        # The column index of the 1 in each row is the class id.
        int_trai = np.where(t_trai == 1)[1]
        int_test = np.where(t_test == 1)[1]
        int_vali = np.where(t_vali == 1)[1]

    # Collect sample indices class by class; both encodings share this loop.
    idxs_train = []
    idxs_valid = []
    idxs_test = []
    for i in range(num_classes):
        idxs_train += np.where(int_trai == i)[0].tolist()
        idxs_valid += np.where(int_vali == i)[0].tolist()
        idxs_test += np.where(int_test == i)[0].tolist()

    x_trai, t_trai = x_trai[idxs_train], t_trai[idxs_train]
    x_vali, t_vali = x_vali[idxs_valid], t_vali[idxs_valid]
    x_test, t_test = x_test[idxs_test], t_test[idxs_test]
else:
    print('Full dataset')

# Common post-processing for both paths. To binarize the inputs instead, wrap
# the x_* arrays in bernoullisample(...) before the cast.
x_trai = x_trai.astype('float32')
t_trai = t_trai.astype('int32')
# Only the training split is shuffled; the fixed seed keeps the order stable.
x_trai, t_trai = shuffle(x_trai, t_trai, random_state=random_state)

x_vali = x_vali.astype('float32')
t_vali = t_vali.astype('int32')

x_test = x_test.astype('float32')
t_test = t_test.astype('int32')

# NOTE(review): the names are rebound to whatever tls.shared_dataset returns
# (later cells call .get_value() / .eval() on them), so this cell cannot be
# re-run without first re-running the load cell.
x_trai, t_trai = tls.shared_dataset((x_trai, t_trai), borrow=True)
x_vali, t_vali = tls.shared_dataset((x_vali, t_vali), borrow=True)
x_test, t_test = tls.shared_dataset((x_test, t_test), borrow=True)


In [None]:
# Splits in reporting order, each with the label used for its shape line.
reported_splits = (
    ('Train shape: ', x_trai, t_trai),
    ('Valid shape: ', x_vali, t_vali),
    ('Test shape:  ', x_test, t_test),
)

# Sum the in-memory size of the materialized arrays one split at a time, so
# only one split's arrays are alive at any moment.
total_bytes = 0
for _, xs, ts in reported_splits:
    total_bytes += sys.getsizeof(xs.get_value()) + sys.getsizeof(ts.eval())
print('Size of total dataset: {:.2f} MB'.format(total_bytes/1.0e6))

# Feature count is the size of the last axis of the training inputs.
num_features = x_trai.get_value().shape[-1]
print('num_features {}'.format(num_features))
print('')

for label, xs, ts in reported_splits:
    print(label, xs.get_value().shape, ts.eval().shape)

# Show the wrapper type of each input container.
for _, xs, _ in reported_splits:
    print('{}'.format(type(xs)))



In [None]:
### VISUALIZE DATA, ENSURE THAT EVERYTHING IS OKAY

# Pass the materialized training inputs (x_trai.get_value()) to the project's
# SVHN plotting helper as a visual sanity check.
# NOTE(review): the meaning of t=10 is defined in tools.plot_svhn and is not
# visible here -- presumably the number of images to show; confirm.
tls.plot_svhn(x_trai.get_value(), t=10)

In [None]:
### CREATE DATA NAME
# File stem: 'data', plus '_c<classes>' and/or '_s<size>' for whichever
# reductions are switched on.
name_parts = ['data']
if reduce_dataset_classes:
    name_parts.append('_c' + str(reduced_classes))
if reduce_dataset_size:
    name_parts.append('_s' + str(reduced_size))
name = ''.join(name_parts)

# The pickle lives in a 'data' folder under the current working directory.
full_path = os.path.join(os.getcwd(), 'data', name + '.pkl')
print(full_path)


In [None]:
%%time

### SAVE DATA
# Create the target directory if it is missing; otherwise open() fails on a
# fresh checkout where the output folder has never been created.
out_dir = os.path.dirname(full_path)
if out_dir and not os.path.isdir(out_dir):
    os.makedirs(out_dir)

# Store all six containers as one tuple. The order here must match the
# unpacking order used when the file is loaded again.
with open(full_path, 'wb') as f:
    pickle.dump((x_trai, t_trai, x_vali, t_vali, x_test, t_test),
                f, protocol=pickle.HIGHEST_PROTOCOL)
                

In [None]:
%%time

### LOAD DATA
import cPickle as pickle

# Read back the tuple written by the save cell, in the same order.
# NOTE: unpickling executes code stored in the file -- only load pickles that
# this notebook (or another trusted source) produced.
with open(full_path, 'rb') as f:
    restored = pickle.load(f)
x_trai, t_trai, x_vali, t_vali, x_test, t_test = restored

# For every split, print its shapes followed by the type of its input container.
print('')
for label, xs, ts in (('Train shape: ', x_trai, t_trai),
                      ('Valid shape: ', x_vali, t_vali),
                      ('Test shape:  ', x_test, t_test)):
    print(label,
          xs.get_value().shape, ts.eval().shape)
    print(type(xs))
print('')