In [None]:
%%time
# Python 2.7

%matplotlib nbagg
%matplotlib inline 

import sys
sys.path
sys.path.append('..')
print(sys.version)

import os
import cPickle 
import scipy.io
import numpy as np
import matplotlib.pyplot as plt
import theano
import theano.tensor as T
import lasagne 
import lasagne.layers as L
import parmesan
import cPickle as pickle

from sklearn.utils import shuffle

import tools as tls


from data_loaders import svhn



In [None]:
### GLOBAL PARAMETERS ###

### META - HOW THE PROGRAM WORKS
# Base name of the pickled dataset. The load cell joins it onto <cwd>/data
# and appends the '.pkl' extension.
file_name = 'data_no_share_c3' # assumes '.pkl'
# Seed NumPy's global RNG so everything drawing from np.random (batch
# sampling, labeled-subset selection, dummy test data) is repeatable.
np.random.seed(1234) # reproducibility

### CONSTANTS
# Image side length in pixels.
# NOTE(review): presumably 32x32 RGB images (the smoke-test cell builds
# 32*32*3 features) — confirm against the data loader.
IMG_LEN = 32




In [None]:
%%time
### LOAD DATA

# The dataset is expected at <cwd>/data/<file_name>.pkl
full_path = os.path.join(os.getcwd(), 'data', file_name) + '.pkl'
print(full_path)

# NOTE: unpickling executes arbitrary code embedded in the file; only load
# pickles that come from a trusted source.
with open(full_path, 'rb') as f:
    x_trai, t_trai, x_vali, t_vali, x_test, t_test = pickle.load(f)

# Report the in-memory footprint of the six loaded objects.
dataset_parts = (x_trai, t_trai, x_vali, t_vali, x_test, t_test)
total_bytes = sum(sys.getsizeof(part) for part in dataset_parts)
print('Size of total dataset: {:.2f} MB'.format(total_bytes/1.0e6))

In [None]:
### CHECK DATA
# Number of distinct classes that actually occur in the training targets:
# column indices of every entry equal to 1, de-duplicated.
num_classes = np.unique(np.where(t_trai == 1)[1]).shape[0]
print('Number of classes {}'.format(num_classes))

# Length of one flattened training example.
num_features = x_trai[0].shape[0]
print('Number of features {}'.format(num_features))

print('')
# Use str.format instead of multi-argument print: without
# `from __future__ import print_function`, Python 2 would print a tuple repr.
for split_name, x_split, t_split in (('Train', x_trai, t_trai),
                                     ('Valid', x_vali, t_vali),
                                     ('Test', x_test, t_test)):
    # Inputs and targets must pair up row for row.
    assert x_split.shape[0] == t_split.shape[0], \
        '{} inputs/targets differ in length'.format(split_name)
    print('{:<5} shape: {} {}'.format(split_name, x_split.shape, t_split.shape))

for x_split in (x_trai, x_vali, x_test):
    print('{}'.format(type(x_split)))
print('')


In [None]:
### VISUALIZE

# TODO: once normalize/renormalize helpers exist, extend this plotting
# function to render the images in color.

# Plot training images with the project helper from `tools`.
# NOTE(review): the meaning of t=10 is defined in tools.plot_svhn, which is
# not visible here — presumably the number of images to show; confirm.
tls.plot_svhn(x_trai, t=10)

In [None]:
### HYPER PARAMETERS
# VOLATILE HP
learning_rate = 1e-4   # step size handed to the Adam optimiser
# NOTE(review): L1/L2 look like regularisation weights but are not used by
# any visible cell — confirm before relying on them.
L1 = 0
L2 = 0

batch_size = 60              # unlabeled examples drawn per training step
samples_to_process = 1e5     # training stops after this many unlabeled samples
val_interval       = 600     # evaluate whenever samples_processed is a multiple of this

# ARCHITECTURE
fraction = 0.1   # latent size as a fraction of the hidden size
hid_size = 50    # units per hidden dense layer
# round() returns a float on Python 2; layer sizes must be integers, so cast.
num_latent_1 = int(round(hid_size*fraction))

# STABLE HP
eq_size = 1      # passed to SampleLayer as eq_samples
iw_size = 1      # passed to SampleLayer as iw_samples
# NOTE(review): max_epoch is not referenced by the visible training loop.
max_epoch = 100


In [None]:
### HELPER FUNCTIONS
from lasagne.objectives import squared_error

def onehot(t, num_classes):
    """Return a float32 one-hot encoding of the class labels in `t`.

    Parameters
    ----------
    t : array-like of class indices, one per example. Float-typed labels
        (e.g. float32) are accepted and truncated to integers, since floats
        are not valid array indices.
    num_classes : int, number of columns in the result.

    Returns
    -------
    numpy.ndarray of shape (len(t), num_classes), dtype float32, with a
    single 1 per row at the label's column. Empty input yields a
    (0, num_classes) array.
    """
    labels = np.asarray(t).astype('int64').ravel()
    out = np.zeros((labels.shape[0], num_classes), dtype='float32')
    # Vectorised scatter: row i gets a 1 in column labels[i].
    out[np.arange(labels.shape[0]), labels] = 1
    return out


# c = -0.5 * np.log(2*np.pi)
def clip(x):
    """Clamp `x` element-wise to the interval [-10, 10].

    Used as the nonlinearity of the log-variance layer, which bounds the
    predicted log-variance.
    """
    return T.clip(x, -10, 10)

def log_bernoulli(x, p, eps=1e-32):
    """Element-wise Bernoulli log-likelihood of target `x` under probability `p`.

    Computed as the negated binary cross-entropy between `p` and `x`.
    `p` is first clipped to [eps, 1 - eps] so that probabilities of exactly
    0 or 1 are avoided.

    NOTE(review): with the default eps=1e-32, `1.0 - eps` evaluates to
    exactly 1.0 in floating point, so the upper bound does not clip anything;
    pass a larger eps (e.g. 1e-7) if p can reach 1.
    """
    lower_bound = eps
    upper_bound = 1.0 - eps
    p_safe = T.clip(p, lower_bound, upper_bound)
    return -T.nnet.binary_crossentropy(p_safe, x)



def kl_normal_2_stdnormal(mu, lv):
    """Element-wise KL term between N(mu, exp(lv)) and the standard normal.

    `mu` is the mean and `lv` the log-variance; the result has the same
    shape as the inputs (no reduction is performed here).
    """
    bracket = 1 + lv - mu**2 - T.exp(lv)
    return -0.5 * bracket


def ReconLogLikelihood(mux, x, muq, lvq):
    """Build the lower-bound terms from a reconstruction and latent parameters.

    Parameters
    ----------
    mux : reconstruction produced by the decoder.
    x   : target input the reconstruction is compared against.
    muq : mean of the approximate posterior.
    lvq : log-variance of the approximate posterior.

    Returns
    -------
    (LL, reconstruction_cost, KL_qp) where
      reconstruction_cost is the negated squared error, summed over axis 1
        and then averaged over all rows (a scalar);
      KL_qp is the KL term summed over axis 1 (one value per row);
      LL = reconstruction_cost - KL_qp (one value per row, by broadcasting).

    NOTE(review): the reconstruction term is a batch mean while the KL term
    is per row, so every row of LL shares the same reconstruction value.
    If a per-example bound is intended, the trailing .mean() should go —
    confirm before changing, since callers reshape LL per class.
    """
    # Squared error summed over the feature axis, then averaged over rows.
    row_sq_err = squared_error(x, mux).sum(axis=1)
    reconstruction_cost = -row_sq_err.mean()

    # KL summed over the latent axis; kept per row (no mean).
    KL_qp = kl_normal_2_stdnormal(muq, lvq).sum(axis=1)

    LL = reconstruction_cost - KL_qp
    return LL, reconstruction_cost, KL_qp


def LogSumExp(x, axis=None, keepdims=False):
    """Numerically stable log(sum(exp(x))) along `axis`.

    The maximum along `axis` is subtracted before exponentiating and added
    back afterwards, so large entries do not overflow. `keepdims` controls
    whether the reduced axis is retained in the result.
    """
    peak = T.max(x, axis=axis, keepdims=True)
    shifted_exp = T.exp(x - peak)
    log_total = T.log(T.sum(shifted_exp, axis=axis, keepdims=keepdims))
    # `peak` always keeps its reduced axis; reshape it to match the result.
    return log_total + peak.reshape(log_total.shape)

In [None]:
%%time
### CREATE MODEL
from lasagne.nonlinearities import leaky_rectify, rectify, sigmoid,softmax
from parmesan.layers import SampleLayer
### CLASSIFIER
# Entry points of the network: flattened inputs and class targets.
l_in_x = L.InputLayer(shape=(None, num_features))
l_in_y = L.InputLayer(shape=(None, num_classes))
# NOTE(review): l_in_x_con is not connected to any layer in this notebook.
l_in_x_con = L.InputLayer(shape=(None, num_features))

# Dense classifier: two ReLU hidden layers, then a softmax over the classes.
# Layers are created in the same order as before so weight initialisation
# consumes the RNG identically.
l_cl_1 = L.DenseLayer(l_in_x, num_units=hid_size, nonlinearity=rectify)
l_cl_2 = L.DenseLayer(l_cl_1, num_units=hid_size, nonlinearity=rectify)
l_y = L.DenseLayer(l_cl_2, num_units=num_classes, nonlinearity=softmax)

## CONVOLUTIONAL CLASSIFIER
# NOTE(review): this stack is built but not wired into any training or
# evaluation function in this notebook.
# A 4D input (batch, channels, height, width) keeps the spatial arrangement
# of the pixels, which the convolutions need.
l_in_class = L.InputLayer(shape=(None, 3, IMG_LEN, IMG_LEN))

# Each 3x3 convolution is padded by 1 so it preserves the spatial size; each
# 2x2 max-pool with stride 2 halves it. With IMG_LEN = 32 the spatial sizes
# are 32 -> 16 -> 8 -> 4.
# NOTE(review): with hid_size = 50 the first convolution has 50//32 = 1
# filter — confirm that is intended.
l_conv_cl_1 = L.Conv2DLayer(l_in_class, num_filters=hid_size//32,
                            filter_size=3, pad=(1, 1))
l_conv_cl_1_pool = L.MaxPool2DLayer(l_conv_cl_1, pool_size=(2, 2), stride=2)

l_conv_cl_2 = L.Conv2DLayer(l_conv_cl_1_pool, num_filters=hid_size//16,
                            filter_size=3, pad=(1, 1))
l_conv_cl_2_pool = L.MaxPool2DLayer(l_conv_cl_2, pool_size=(2, 2), stride=2)

l_conv_cl_3 = L.Conv2DLayer(l_conv_cl_2_pool, num_filters=hid_size//8,
                            filter_size=3, pad=(1, 1))
l_conv_cl_3_pool = L.MaxPool2DLayer(l_conv_cl_3, pool_size=(2, 2), stride=2)

# Fourth convolution. It previously re-used the name l_conv_cl_3 and read
# from l_conv_cl_2_pool, which silently bypassed the third conv/pool pair.
l_conv_cl_4 = L.Conv2DLayer(l_conv_cl_3_pool, num_filters=hid_size//4,
                            filter_size=3, pad=(1, 1))

l_conv_cl_dense = L.DenseLayer(l_conv_cl_4, num_units=hid_size//2,
                               nonlinearity=rectify)
l_out_class = L.DenseLayer(l_conv_cl_dense, num_units=num_classes,
                           nonlinearity=softmax)
    
    
### ENCODER
# The encoder sees the input concatenated with the class targets.
l_en_con = L.ConcatLayer([l_in_x, l_in_y])

l_en_1 = L.DenseLayer(l_en_con, num_units=hid_size, nonlinearity=rectify)
l_en_2 = L.DenseLayer(l_en_1, num_units=hid_size, nonlinearity=rectify)

# Latent Gaussian parameters, both read from the last encoder layer:
# an unconstrained mean and a log-variance clamped to [-10, 10] by `clip`.
l_mu_1 = L.DenseLayer(l_en_2, num_units=num_latent_1, nonlinearity=None)
l_lv_1 = L.DenseLayer(l_en_2, num_units=num_latent_1, nonlinearity=clip)

# Sample a latent representation from the mean / log-variance pair:
#    z ~ q(z|x,y) = N(mu, exp(log_var))
l_z_1 = SampleLayer(mean=l_mu_1,
                    log_var=l_lv_1,
                    eq_samples=eq_size,
                    iw_samples=iw_size)

### DECODER
# The decoder has its own latent input layer, so it can be fed either
# encoder samples or externally supplied latent codes.
l_in_z = L.InputLayer(shape=(None, num_latent_1))

# Condition the decoder on the class targets as well.
l_dec_con = L.ConcatLayer([l_in_z, l_in_y])

l_dec_1 = L.DenseLayer(l_dec_con, num_units=hid_size, nonlinearity=rectify)
l_dec_2 = L.DenseLayer(l_dec_1, num_units=hid_size, nonlinearity=rectify)

# Sigmoid output keeps reconstructions in (0, 1).
# NOTE(review): assumes the input images are scaled to [0, 1] — confirm.
l_out = L.DenseLayer(l_dec_2, num_units=num_features, nonlinearity=sigmoid)

In [None]:
%%time
from lasagne.objectives import categorical_crossentropy, categorical_accuracy
################## GENERATING RECONSTRUCTIONS AND CLASSES

sym_x = T.matrix('x')      # unlabeled inputs   (batch_size x num_features)
sym_z = T.matrix('z')      # latent codes       (batch_size x num_latent)
sym_y = T.matrix('y_l')    # labeled targets    (num_labeled x num_classes)
sym_x_l = T.matrix('x_l')  # labeled inputs     (num_labeled x num_features)


####### Replication of the data for unsupervised learning
# Every unlabeled example is paired with every possible class. Both tensors
# are laid out class-major: row c*batch + b holds class c and example b.
t_eye = T.eye(num_classes, k=0)
t_u = t_eye.reshape((num_classes, 1, num_classes)).repeat(sym_x.shape[0], axis=1).reshape((-1, num_classes))
x_u = sym_x.reshape((1, sym_x.shape[0], sym_x.shape[1])).repeat(num_classes, axis=0).reshape((-1, sym_x.shape[1]))


##################### SUPERVISED ###################

# Classifier
y_train_l = L.get_output(l_y, {l_in_x: sym_x_l}, deterministic=False)

# Encoder
z_train_l, mu_train_l, lv_train_l = L.get_output([l_z_1, l_mu_1, l_lv_1],
                                                 {l_in_x: sym_x_l, l_in_y: sym_y},
                                                 deterministic=False)

# Decoder
recon_train_l = L.get_output(l_out, {l_in_z: z_train_l, l_in_y: sym_y}, deterministic=False)

# Likelihood
LL_rec_train_l, log_px_train_l, KL_train_l = ReconLogLikelihood(recon_train_l, sym_x_l, mu_train_l, lv_train_l)

# Bound plus a classification log-likelihood term weighted by 0.1.
# NOTE(review): T.sum has no axis, so the classification term is summed over
# the whole labeled batch before being added to every example's bound; if a
# per-example term was intended this needs axis=1 — confirm.
LL_train_l = T.mean(LL_rec_train_l + 0.1*T.sum(sym_y*T.log(y_train_l + 1e-8)))


############# UNSUPERVISED ####################
# Classifier
y_train_u = L.get_output(l_y, {l_in_x: sym_x}, deterministic=False)

# Encoder, run on every (example, class) pair
z_train_u, mu_train_u, lv_train_u = L.get_output([l_z_1, l_mu_1, l_lv_1],
                                                 {l_in_x: x_u, l_in_y: t_u},
                                                 deterministic=False)

# Decoder
recon_train_u = L.get_output(l_out, {l_in_z: z_train_u, l_in_y: t_u}, deterministic=False)

# Likelihood: reshape the class-major vector to (batch, num_classes) and
# weight each class's bound by the classifier's probability for that class.
LL_rec_train_u, log_px_train_u, KL_train_u = ReconLogLikelihood(recon_train_u, x_u, mu_train_u, lv_train_u)
LL_rec_train_u = LL_rec_train_u.reshape((num_classes, sym_x.shape[0])).T
LL_train_u = T.mean(T.sum(y_train_u*LL_rec_train_u, axis=1))


################## EVALUATION ##############################
# Classifier
y_eval = L.get_output(l_y, {l_in_x: sym_x}, deterministic=True)

# Encoder, using the same replication as the unsupervised path
z_eval, mu_eval, lv_eval = L.get_output([l_z_1, l_mu_1, l_lv_1],
                                        {l_in_x: x_u, l_in_y: t_u},
                                        deterministic=True)

# Decoder
recon_eval = L.get_output(l_out, {l_in_z: z_eval, l_in_y: t_u}, deterministic=True)

# recon_eval has num_classes*batch rows, so it must be compared against the
# replicated input x_u; comparing against sym_x (batch rows) mismatches shapes.
LL_rec_eval, log_px_eval, KL_eval = ReconLogLikelihood(recon_eval, x_u, mu_eval, lv_eval)
LL_rec_eval = LL_rec_eval.reshape((num_classes, sym_x.shape[0])).T

LL_eval = T.mean(T.sum(y_eval*LL_rec_eval, axis=1))

# Debug helper returning the runtime shapes of the evaluation tensors.
# The previous `size_f = theano.function=(...)` was a chained assignment that
# replaced theano.function itself with a tuple, breaking every later compile.
p = [y_eval.shape, LL_rec_eval.shape]
size_f = theano.function([sym_x], p)

######### Training likelihood
LL_train = LL_train_u + LL_train_l


In [None]:
# Define variables to output
# LL_train, LL_train_u, LL_train_l are already defined

# --- unsupervised outputs ---
# Mean of the classifier's highest class probability.
certainty_class_u = T.mean(T.max(y_train_u, axis=-1))
# Softmax over the per-class bounds: a class distribution implied by how
# well each class explains the example.
prob_recon_u = T.exp(LL_rec_train_u - LogSumExp(LL_rec_train_u, axis=-1, keepdims=True))
# Mean of the highest probability in prob_recon_u. This used argmax before,
# which averaged class indices rather than probabilities, unlike the
# parallel certainty_class_* quantities.
certainty_recon_u = T.mean(T.max(prob_recon_u, axis=-1))
# Mean predicted log-variance of the latent code.
mean_var_u = T.mean(lv_train_u)
# Cross-entropy between the reconstruction-implied distribution (as
# predictions) and the classifier output (as targets); one value per example.
recon_class_same_u = categorical_crossentropy(prob_recon_u, y_train_u)

# --- supervised outputs ---
certainty_class_l = T.mean(T.max(y_train_l, axis=-1))
mean_var_l = T.mean(lv_train_l)



In [None]:
%%time
### CREATE TRAINING FUNCTIONS
# Trainable parameters of the encoder (via l_z_1), decoder (l_out) and
# classifier (l_y).
all_params = L.get_all_params([l_z_1, l_out, l_y], trainable=True)
# The bound is maximised, so differentiate its negation.
all_grads  = T.grad(-LL_train, all_params)

updates    = lasagne.updates.adam(all_grads, all_params,
                                  learning_rate=learning_rate)

# Training function: returns monitoring values and updates the weights.
# Output order (the training loop indexes into it):
#   0 LL_train            5 recon_class_same_u (per-example vector)
#   1 LL_train_u          6 certainty_class_l
#   2 LL_train_l          7 mean_var_l
#   3 certainty_class_u   8 prob_recon_u (matrix)
#   4 mean_var_u          9 y_train_u    (matrix)
f_train = theano.function(inputs=[sym_x, sym_x_l, sym_y],
                          outputs=[LL_train,
                                   LL_train_u,
                                   LL_train_l,
                                   certainty_class_u,
                                   mean_var_u,
                                   recon_class_same_u,
                                   certainty_class_l,
                                   mean_var_l,
                                   prob_recon_u,
                                   y_train_u],
                          updates=updates)

# Evaluation function: returns [bound, reconstruction term, mean KL].
# The training loop reads out[0], out[1] and out[2], so all three are
# exposed; out[0] is unchanged for callers that only use the bound.
f_eval = theano.function(inputs=[sym_x],
                         outputs=[LL_eval, log_px_eval, T.mean(KL_eval)])

# Get latent variable values (one row per (class, example) pair)
f_z = theano.function(inputs=[sym_x], outputs=[z_eval])

# Return the reconstruction
# (disabled: `out_eval` is not defined anywhere in this notebook)
#f_reconstruction = theano.function(inputs=[sym_x], outputs=[out_eval])

# Simulate artificial data, given an artificial latent variable
# (disabled: `mux_sample` is not defined anywhere in this notebook)
#f_simulate = theano.function(inputs=[sym_z], outputs=[mux_sample])


In [None]:
# Smoke test: push random dummy data through the training graph to check
# that it compiles and runs. Shapes follow the model's input size.
test_y = np.random.randint(num_classes, size=(50))        # integer class labels
test_y = onehot(test_y, num_classes)
test_x = np.random.normal(0, 1, (200, num_features)).astype('float32')    # dummy unlabeled batch
test_x_l = np.random.normal(0, 1, (50, num_features)).astype('float32')   # dummy labeled batch

# Compiled WITHOUT `updates`: a smoke test on random noise must not take an
# optimiser step on the real model weights before training starts.
f_temp = theano.function(inputs=[sym_x, sym_x_l, sym_y],
                         outputs=[LL_train,
                                  LL_train_u,
                                  LL_train_l,
                                  certainty_class_u,
                                  mean_var_u,
                                  certainty_class_l,
                                  mean_var_l,
                                  prob_recon_u,
                                  y_train_u])
out = f_temp(test_x, test_x_l, test_y)

In [None]:
# Echo the class count derived from the training targets.
print(num_classes)

In [None]:
# Generate a subset of labeled data points: `num_labeled` examples per class.

num_labeled = 3 # You decide on the size of the fraction...

# Class index of every training example, computed once rather than per class.
labels_trai = np.argmax(t_trai, axis=-1)

idxs_train_l = []
for i in range(num_classes):
    idxs = np.where(labels_trai == i)[0]
    # replace=False: draw distinct examples. The default (with replacement)
    # can pick the same example twice and shrink the effective labeled set.
    # Raises ValueError if a class has fewer than num_labeled examples.
    idxs_train_l += np.random.choice(idxs, size=num_labeled, replace=False).tolist()

x_train_l = x_trai[idxs_train_l]
t_train_l = t_trai[idxs_train_l, :]

In [None]:
# Display the target rows of the labeled subset (num_labeled rows per class).
t_train_l

In [None]:
# TRAINING

# History buffers. They carry a `_hist` suffix so they do not overwrite the
# symbolic LL_train expression, which the compile cell needs on a re-run.
LL_train_hist, KL_train_hist, logpx_train_hist = [], [], []
LL_valid_hist, KL_valid_hist, logpx_valid_hist = [], [], []
samples_processed = 0
plt_vals = []      # running average of f_train output 0 (LL_train), per batch
plt_vals_t = []    # running average of f_train output 4 (mean_var_u), per batch
valid_samples_processed = []
scalarInputs = 8       # number of leading f_train outputs that are averaged
NonScalarInputs = 4
out_accum = [0]*(scalarInputs+NonScalarInputs)


def _record_eval(x, histories):
    """Run f_eval on `x` and append each returned value to its history list.

    zip stops at the shorter sequence, so this works whether f_eval returns
    only the bound or the bound plus further terms.
    """
    for history, value in zip(histories, f_eval(x)):
        history.append(value)


try:
    while samples_processed < samples_to_process:
        # Draw a batch of distinct unlabeled examples.
        idxs = np.random.choice(x_trai.shape[0], size=batch_size, replace=False)
        x_batch = x_trai[idxs]
        out = f_train(x_batch, x_train_l, t_train_l)

        # Smoothing factor: 0 on the first batch (take the value as is),
        # 0.99 on every later batch.
        mean_factor = min(0.99, samples_processed // batch_size)
        samples_processed += batch_size

        for i in range(scalarInputs):
            out_accum[i] = out_accum[i]*mean_factor + (1-mean_factor)*out[i]

        plt_vals += [out_accum[0]]
        plt_vals_t += [out_accum[4]]
        print("Samples processed: {} ".format(samples_processed))
        print("Accumulated outs: {}".format(out_accum[0:3]))

        if samples_processed % val_interval == 0:
            valid_samples_processed += [samples_processed]

            # NOTE(review): this evaluates the full training and validation
            # sets, each replicated num_classes times inside the graph;
            # consider a subsample if memory is tight.
            _record_eval(x_trai, (LL_train_hist, logpx_train_hist, KL_train_hist))
            _record_eval(x_vali, (LL_valid_hist, logpx_valid_hist, KL_valid_hist))

            # Stored under a new name so the symbolic z_eval is not overwritten.
            z_vali = f_z(x_vali)[0]

            # TODO: sampling and reconstruction previews were removed here.
            # They called f_sample / f_recon with num_latent_z, size_up_factor
            # and x_valid, none of which exist in this notebook (NameError).
            # Re-add them once the corresponding theano functions are compiled.

except KeyboardInterrupt:
    # Allow stopping training by hand while keeping the collected histories.
    pass


In [None]:
# Display the per-batch running average of the training objective
# (f_train output 0) collected by the training loop.
plt_vals

In [None]:
# Running average of the training objective LL_train, one point per batch.
plt.plot(plt_vals)

In [None]:
# Running average of f_train output 4 (mean_var_u, the mean latent
# log-variance on the unlabeled path), one point per batch.
plt.plot(plt_vals_t)

In [None]:
# NOTE(review): scratch arithmetic with no stated meaning; the result is
# only displayed and never used — consider deleting this cell.
25362*3