In [147]:
from __future__ import division
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
import cPickle as pickle
import numpy as np
import tensorflow as tf
#from sklearn import metrics
import pickle
from lib import accuracy
from lib import labels

In [164]:
data_dir = "../data/"
pickle_file = 'tf_data_mean.pickle'

with open(data_dir + pickle_file, 'rb') as f:
    save = pickle.load(f)
    train_dataset = np.matrix(save['train_data'], dtype='float32')
    train_labels = np.matrix(save['train_labels'], dtype='float32')
    valid_dataset = np.matrix(save['validate_data'], dtype='float32')
    valid_labels = np.matrix(save['validate_labels'], dtype='float32')
    test_dataset = np.matrix(save['test_data'], dtype='float32')
    test_bids = list(save['test_business_ids'])
    #test_bids = np.ravel(test_bids)
    del save  # hint to help gc free up memory
    

#Put all our data (train and valid) together for final training.
train_dataset = np.concatenate((train_dataset, valid_dataset))
train_labels = np.concatenate((train_labels, valid_labels))

print 'Training set', train_dataset.shape, train_labels.shape
print 'Validation set', valid_dataset.shape, valid_labels.shape
print 'Test set', test_dataset.shape, len(test_bids)
    
    
# Convert labels to a dict of binarized labels [1. if true, 1. if false]
# So can be used for softmax per label.
train_labels = labels.binarize_softmax_labels(train_labels)
valid_labels = labels.binarize_softmax_labels(valid_labels)



Training set (2000, 1024) (2000, 9)
Validation set (400, 1024) (400, 9)
Test set (10000, 1024) 10000


In [166]:


# With gradient descent training, even this much data is prohibitive.
# Subset the training data for faster turnaround.

#set to -1 to use all data
train_subset = -1
input_size = 1024
labels_size = 2
label_to_train = 8
layer_1_units = 512
dropout_keep_prob = 0.80
training_rate = 0.5
regularizer_factor = 1e-8

num_steps = 3000

test_results = np.matrix(np.zeros((test_dataset.shape[0], 9)))
valid_results = np.matrix(np.zeros((valid_dataset.shape[0], 9)))



print valid_results

for i in range(9):
    print ""
    print "TRAINING LABEL", i
    print "=================="
    label_to_train = i
    save_weights = dict() 
    
    graph = tf.Graph()
    with graph.as_default():

        # Input data.
        # Load the training, validation and test data into constants that are
        # attached to the graph.
        tf_train_dataset = tf.constant(train_dataset[:train_subset, :])
        tf_train_labels = tf.constant(train_labels[label_to_train][:train_subset, :])
        tf_valid_dataset = tf.constant(valid_dataset)
        tf_valid_labels = tf.constant(valid_labels[label_to_train])
        tf_test_dataset = tf.constant(test_dataset)

        # Variables.
        # These are the parameters that we are going to be training. The weight
        # matrix will be initialized using random valued following a (truncated)
        # normal distribution. The biases get initialized to zero.
        weights1 = tf.Variable(
            tf.truncated_normal([input_size,  layer_1_units]))
        biases1 = tf.Variable(tf.zeros([layer_1_units]))

        #Inputs are of length 128 (number hidden units of layer1), but outputs need to be of size 9 (number of labels))
        weights2 = tf.Variable(
            tf.truncated_normal([layer_1_units, labels_size]))
        biases2 = tf.Variable(tf.zeros([labels_size]))
        

        # Training computation.
        def layers(inputs, dropout=False):
            if dropout:
                inputs = tf.nn.dropout(inputs, dropout_keep_prob)

            logits1 = tf.matmul(inputs, weights1) + biases1
            layer1_out = tf.nn.relu(logits1)

            logits_out = tf.matmul(layer1_out, weights2) + biases2

            return logits_out

        # Training computation.
        # We multiply the inputs with the weight matrix, and add biases. We compute
        # the softmax and cross-entropy (it's one operation in TensorFlow, because
        # it's very common, and it can be optimized). We take the average of this
        # cross-entropy across all training examples: that's our loss.

        loss = tf.reduce_mean(
            #This uses sigmoid cross entropy, which allows for multiple labels
            tf.nn.softmax_cross_entropy_with_logits(layers(tf_train_dataset, True), tf_train_labels))

        # L2 regularization for the fully connected parameters.
        #regularizers = (tf.nn.l2_loss(weights1) + tf.nn.l2_loss(biases1))

        # Add the regularization term to the loss.
        #loss += regularizer_factor * regularizers

        # Optimizer.
        # We are going to find the minimum of this loss using gradient descent.
        #optimizer = tf.train.AdamOptimizer(training_rate).minimize(loss)
        optimizer = tf.train.AdagradOptimizer(training_rate).minimize(loss)



        # Predictions for the training, validation, and test data.
        # These are not part of training, but merely here so that we can report
        # accuracy figures as we train.
        train_prediction = tf.nn.softmax(layers(tf_train_dataset))
        valid_prediction = tf.nn.softmax(layers(tf_valid_dataset))
        test_prediction = tf.nn.softmax(layers(tf_test_dataset))
        
        # Add an op to initialize the variables.
        init_op = tf.initialize_all_variables()
        
        # Add ops to save and restore all the variables.
        saver = tf.train.Saver()


    with tf.Session(graph=graph) as session:
        # This is a one-time operation which ensures the parameters get initialized as
        # we described in the graph: random weights for the matrix, zeros for the
        # biases. 
        session.run(init_op)
        print 'Initialized'
        for step in xrange(num_steps):
            # Run the computations. We tell .run() that we want to run the optimizer,
            # and get the loss value and the training predictions returned as numpy
            # arrays.
            _, l, predictions = session.run([optimizer, loss, train_prediction])
            if (step % 100 == 0) or (step+1 == num_steps):
                #print predictions.shape
                print ""
                print ">---------- step:", step
                print 'Loss at step', step, ':', l
                #print "L2 Loss:", L2_loss
                print "Training Rate:", training_rate
                #print "F1 Score:", (valid_labels)
                print "Training accuracy: {0:.2f}%".format(100 * accuracy.binarized_accuracy(
                   train_labels[label_to_train][:train_subset, :], predictions))
                # Calling .eval() on valid_prediction is basically like calling run(), but
                # just to get that one numpy array. Note that it recomputes all its graph
                # dependencies.
                print "Validation accuracy: {0:.2f}%".format(100 * accuracy.binarized_accuracy(
                     valid_labels[label_to_train], valid_prediction.eval()))
        save_path = saver.save(session, data_dir +"softmax_10K_label_"+str(i)+".ckpt")
        valid_results[:,i] = np.reshape(valid_prediction.eval()[:,0], (valid_results.shape[0], 1))
        test_results[:,i] = np.reshape(test_prediction.eval()[:,0], (test_results.shape[0], 1))

pickle.dump(valid_results, open(data_dir +"softmax_2_mean_valid_results_5K.pickle", "wb"))
pickle.dump(test_results, open(data_dir +"softmax_2_mean_test_results_5K.pickle", "wb"))


[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]

TRAINING LABEL 0
Initialized

>---------- step: 0
Loss at step 0 : 97.1086
Training Rate: 0.5
Training accuracy: 42.67%
Validation accuracy: 67.00%

>---------- step: 100
Loss at step 100 : 0.608354
Training Rate: 0.5
Training accuracy: 67.18%
Validation accuracy: 67.50%

>---------- step: 200
Loss at step 200 : 0.562234
Training Rate: 0.5
Training accuracy: 67.58%
Validation accuracy: 69.50%

>---------- step: 300
Loss at step 300 : 0.544131
Training Rate: 0.5
Training accuracy: 71.69%
Validation accuracy: 68.50%

>---------- step: 400
Loss at step 400 : 0.58247
Training Rate: 0.5
Training accuracy: 68.88%
Validation accuracy: 73.50%

>---------- step: 500
Loss at step 500 : 0.485972
Training Rate: 0.5
Training accuracy: 74.09%
Validation accuracy: 73.75%

>---------- step: 600
Loss at s

In [167]:
# Persist the per-label validation and test score matrices under a second
# set of file names. Context managers ensure each file is flushed and closed.
with open(data_dir +"softmax_valid_results_full_mean_3K.pickle", "wb") as results_file:
    pickle.dump(valid_results, results_file)
with open(data_dir +"softmax_test_results_full_mean_3K.pickle", "wb") as results_file:
    pickle.dump(test_results, results_file)

# ========            
# LABEL 0 : Good for lunch
# ========

#Simple zero hidden layer with no regularization
#AdamOptimizer
#>---------- step: 4900
#Loss at step 4900 : 0.0220133
#Training Rate: 0.1 (or 0.01?)
#Training accuracy: 100%
#Validation accuracy: 74%


# 2 Hidden Layer model with L2 regularization added to the loss, and 20% dropout.
#AdaGradOptimizer
# 512 hidden nodes per layer
#>---------- step: 2900
#Loss at step 2900 : 0.534646
#Training Rate: 0.5
#Training accuracy: 86%
#Validation accuracy: 78%

# 2 Hidden Layer model with L2 regularization added to the loss, and 20% dropout.
#AdaGradOptimizer
#regularizer = 1e-8
#>---------- step: 4900
#Loss at step 4900 : 0.361446
#Training Rate: 0.5
#Training accuracy: 86.37%
#Validation accuracy: 77.75%

# 2 Hidden Layer model with L2 regularization added to the loss, and 10% dropout.
#AdaGradOptimizer
# (USED MAX INSTEAD OF MEAN)
#>---------- step: 4900
#Loss at step 4900 : 0.265733
#Training Rate: 0.1
#Training accuracy: 92.25%
#Validation accuracy: 72.50%

# 2 Hidden Layer model just 20% dropout.
#AdaGradOptimizer
# 1024 hidden nodes per layer
#>---------- step: 9900
#Loss at step 9900 : 0.0885701
#Training Rate: 1.0
#Training accuracy: 98.94%
#Validation accuracy: 75.75%

# 2 Hidden Layer model just 10% dropout.
#AdaGradOptimizer
# 16 hidden nodes per layer
#>---------- step: 4900
#Loss at step 4900 : 0.409952
#Training Rate: 0.1
#Training accuracy: 83.18%
#Validation accuracy: 76.50%

# ========            
# LABEL 8 : Good for kids
# ========

# 2 Hidden Layer model just 10% dropout.
#AdaGradOptimizer
#>---------- step: 4900
#Loss at step 4900 : 0.317328
#Training Rate: 0.1
#Training accuracy: 87.49%
#Validation accuracy: 85.25%

In [168]:
#print test_bids
test_results_old = test_results
test_results_bin = np.array(np.round(test_results), dtype=bool)
output = "business_id,labels\n"

for i in range(len(test_bids)):
    if (i % 500 == 0):
        print i
    tags = np.argwhere(test_results_bin[i])
    tags_str = ' '.join('%d'%F for F in tags[:] )
    output += test_bids[i]+ "," + tags_str + "\n"
    
print output
f = open('test_submission_binary_labels_nn_full_3K_4', 'w')
f.write(output)
f.close()

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
business_id,labels
003sg,1 2 3 5 6 8
00er5,1 2 3 5 6 8
00kad,1 2 3 5 6 8
00mc6,1 2 3 4 5 6 8
00q7x,1 2 3 5 6 7
00v0t,1 2 3 5 6
00y7p,1 2 3 5 6 8
019fg,2 3 5 6 8
019r1,0 3 5 8
01i5j,1 2 3 4 5 6 7
01is9,1 2 3 5 6 7
01mrb,1 2 3 4 5 6
01pyb,1 2 4 5 6 7
01s0p,2 3 5 6 8
01xsq,1 2 3 5 6 8
021oz,1 2 3 5 6 8
026nc,1 2 3 5 6
02bwy,0 3 8
02d9t,1 2 3 5 6 7
02eos,1 2 3 5 6 8
02fio,1 2 3 4 5 6
02pxt,1 2 3 5 6 8
02qrp,1 2 3 5 6
02rfd,1 2 3 5 6
0357u,1 2 3 5 6 8
035x6,1 2 3 5 6 8
038l4,1 2 3 5 6 8
03bbu,1 2 3 5 6 8
03ked,3 6
03m8y,5 6 8
03vx8,1 2 3 5 6 8
03yz9,1 2 3 5 6
040nh,1 2 3 5 6 7
042hy,1 2 3 5 6 8
044sl,1 2 3 5 6 8
045qe,1 2 3 5 6 8
04944,1 2 3 5 6 8
04cy7,1 2 3 5 6 8
04ilw,1 2 3 5 6
04imx,2 5 6 8
04kgm,1 2 3 5 6 8
04ud9,3 8
04wn2,2 3 5 6 8
04zgs,1 2 3 5 6 8
050l6,1 2 3 5 6 8
0573e,1 2 3 5 6 8
057qc,0 3 8
05fb2,1 2 3 5 6 8
05h9r,1 2 3 5 6 7 8
05ihx,1 2 3 4 5 6 7
05jhx,1 2 3 5 6 8
05rwc,1 2 3 5 6
06c

In [None]:
# Echo the submission CSV text assembled in the previous cell.
# NOTE(review): this renders the entire string (one line per test business);
# a slice such as output[:500] would keep the notebook readable.
output

In [116]:
# Sanity check: dimensions of the test feature matrix.
test_dataset.shape

(10000, 1024)

In [114]:
# Spot check: the stored scores for test row 543, one column per label.
test_results[543]

matrix([[ 0.03887118,  0.85299587,  0.93095261,  0.60537016,  0.3645809 ,
          0.95144856,  0.99414945,  0.41221097,  0.45984778]])