Load data in Tensorflow.

In [1]:
import codecs
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


In [2]:
training_data_folder = '../training_data/web-radio/output/rec'
embDir = '../embeddings'
what = 'artist'

uri_file = '%s/%s.emb.u' % (embDir, what)
vector_file = '%s/%s.emb.v' % (embDir, what)
# header_file = '%s/%s.emb.h' % (embDir, what)
training_file = '%s/%s.dat' % (training_data_folder, what)

vectors = np.array([line.strip().split(' ') for line in codecs.open(vector_file, 'r', 'utf-8')])
# heads = np.array([line.strip() for line in codecs.open(header_file, 'r', 'utf-8')])
uris = np.array([line.strip() for line in codecs.open(uri_file, 'r', 'utf-8')])

train_array = np.array([line.strip().split(' ') for line in codecs.open(training_file, 'r', 'utf-8')])
train_array.shape

(11815, 3)

In [3]:
def get_embs(x):
    v = vectors[np.argwhere(uris == x)]
    if v.size == 0:
        result = -2. * np.ones(vectors[0].size)
    else:
        result = v[0][0]
    return result.astype('float32')

In [4]:
col1 = np.array([get_embs(xi) for xi in train_array[:, 0]])
col2 = np.array([get_embs(xi) for xi in train_array[:, 1]])
col3 = np.array(train_array[:, 2]).astype('float32')
col3 = col3.reshape((col3.size, 1))

training_vector = np.concatenate((col1, col2, col3), axis=1)

train, test = train_test_split(training_vector, train_size=0.3)

train_vector = train[:, :-1]
train_label = train[:, -1]

test_vector = test[:, :-1]
test_label = test[:, -1]



In [5]:
print(col1.shape)
print(col2.shape)
print(col3.shape)

print(training_vector.shape)
print(test.shape)

print(train_vector.shape)
print(train_label.shape)

(11815, 14)
(11815, 14)
(11815, 1)
(11815, 29)
(8271, 29)
(3544, 28)
(3544,)


In [6]:
# Parameters
learning_rate = 0.1
num_steps = 1000
batch_size = 128
display_step = 100

# Network Parameters
n_hidden_1 = 256  # 1st layer number of neurons
n_hidden_2 = 256  # 2nd layer number of neurons
num_input = train_vector[0].size
num_output = int(num_input / 2)

# tf Graph input
X = tf.placeholder("float", [None, num_input])
Y = tf.placeholder("float", [None, num_output])

num


In [7]:
# Store layers weight & bias
weights = {
    'h1': tf.Variable(tf.random_normal([num_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_hidden_2, num_output]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'out': tf.Variable(tf.random_normal([num_output]))
}

In [8]:
# Create model
def neural_net(x):
    # Hidden fully connected layer with 256 neurons
    layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
    # Hidden fully connected layer with 256 neurons
    layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
    # Output fully connected layer with a neuron for each class
    out_layer = tf.matmul(layer_2, weights['out']) + biases['out']
    return out_layer

In [9]:
def similarity_loss(_sentinel=None, labels=None, logits=None, dim=-1, name=None):
    l = col1[0].size
    a = train_vector[:, 0:l]
    b = train_vector[:, l:]

    max_distance = weightedL2(np.ones(l), tf.constant(np.ones(l) * -1), logits)

    predicted = compute_sim(a, b, logits, max_distance)
    expected = tf.constant(train_label)
    print(type(predicted))
    print(type(expected))
    print(predicted[0])
    print(expected[0])
    loss = tf.subtract(expected, predicted)
    print(loss.shape)
    print(loss[0])
    loss = tf.reduce_mean(loss)
    print(loss.shape)
    print(loss)
    return loss


def weightedL2(a, b, w=1):
    # https://stackoverflow.com/a/8861999/1218213
    q = tf.subtract(a, b)
    # return np.sqrt((w * q * q).sum())
    pow_q = tf.cast(tf.pow(q, 2), tf.float32)
    _w = tf.reshape(w, [w.shape[1]])

    _sum = tf.reduce_sum(tf.multiply(_w, pow_q), axis=0, keepdims=True)
    return tf.reshape(_sum, [1, 1])


def compute_sim(seed, target, w, max_distance):
    lt = seed.shape[0]
    cost = [compute_similarity(seed[i], target[i], w, max_distance) for i in range(0, lt)]
    return tf.convert_to_tensor(cost, tf.float32)


def compute_similarity(seed, target, w, max_distance):
    b1 = np.argwhere(seed >= -1)
    b1.reshape(b1.size)
    b2 = np.argwhere(target >= -1)
    b2.reshape(b2.size)

    good_pos = np.intersect1d(b1, b2)
    if len(good_pos) == 0:
        return tf.constant(0.0, shape=[1, 1])

    _seed = seed[good_pos]
    _target = target[good_pos]
    _w = tf.gather(w, good_pos, axis=1)

    # distance
    d = weightedL2(_seed, _target, _w)

    # how much info I am not finding
    penalty = (len(b1) - len(good_pos)) / len(seed)
    multiplier = 1. - penalty

    # score
    s = tf.divide(tf.subtract(max_distance, d), max_distance)
    return tf.multiply(s, multiplier)


In [10]:
# Construct model
logits = neural_net(X)

# Define loss and optimizer
loss_op = similarity_loss(logits=logits, labels=Y)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)

# Evaluate model (with test logits, for dropout to be disabled)
correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

<class 'tensorflow.python.framework.ops.Tensor'>
<class 'tensorflow.python.framework.ops.Tensor'>
Tensor("strided_slice:0", shape=(1, 1), dtype=float32)
Tensor("strided_slice_1:0", shape=(), dtype=float32)
(3544, 1, 3544)
Tensor("strided_slice_2:0", shape=(1, 3544), dtype=float32)
()
Tensor("Mean:0", shape=(), dtype=float32)


In [14]:
def next_batch(num, data, labels):

    '''
    Return a total of `num` random samples and labels. 
    '''
    idx = np.arange(0 , len(data))
    np.random.shuffle(idx)
    idx = idx[:num]
    data_shuffle = data[idx]
    labels_shuffle = labels[idx]
    print(labels_shuffle.shape)
    return data_shuffle, labels_shuffle

In [15]:
fake_labels = np.ones((train_vector.shape[0], num_output))
fake_labels.shape

(3544, 14)

In [16]:
with tf.Session() as sess:
    # Run the initializer
    sess.run(init)

    for step in range(1, num_steps + 1):
        batch_x, batch_y = next_batch(batch_size, train_vector, fake_labels)

        # Run optimization op (backprop)
        sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})
        if step % display_step == 0 or step == 1:
            # Calculate batch loss and accuracy
            loss, acc = sess.run([loss_op, accuracy], feed_dict={X: batch_x,
                                                                 Y: batch_y})
            print("Step " + str(step) + ", Minibatch Loss= " + \
                  "{:.4f}".format(loss) + ", Training Accuracy= " + \
                  "{:.3f}".format(acc))

    print("Optimization Finished!")

    # Calculate accuracy for MNIST test images
    print("Testing Accuracy:",
          sess.run(accuracy, feed_dict={X: test_vector, Y: test_labels}))

(128, 14)


InvalidArgumentError: Input to reshape is a tensor with 1792 values, but the requested shape has 14
	 [[Node: Reshape = Reshape[T=DT_FLOAT, Tshape=DT_INT32, _device="/job:localhost/replica:0/task:0/device:CPU:0"](add, Reshape_12/shape)]]

Caused by op 'Reshape', defined at:
  File "/Users/pasquale/anaconda3/lib/python3.5/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/Users/pasquale/anaconda3/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/pasquale/anaconda3/lib/python3.5/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/Users/pasquale/anaconda3/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/Users/pasquale/anaconda3/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 478, in start
    self.io_loop.start()
  File "/Users/pasquale/anaconda3/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/Users/pasquale/anaconda3/lib/python3.5/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/Users/pasquale/anaconda3/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/pasquale/anaconda3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/Users/pasquale/anaconda3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/Users/pasquale/anaconda3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/Users/pasquale/anaconda3/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/pasquale/anaconda3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Users/pasquale/anaconda3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/Users/pasquale/anaconda3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/Users/pasquale/anaconda3/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Users/pasquale/anaconda3/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Users/pasquale/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2705, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Users/pasquale/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2809, in run_ast_nodes
    if self.run_code(code, result):
  File "/Users/pasquale/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2869, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-b4c302094384>", line 5, in <module>
    loss_op = similarity_loss(logits=logits, labels=Y)
  File "<ipython-input-9-3a71b9f56e8f>", line 6, in similarity_loss
    max_distance = weightedL2(np.ones(l), tf.constant(np.ones(l) * -1), logits)
  File "<ipython-input-9-3a71b9f56e8f>", line 28, in weightedL2
    _w = tf.reshape(w, [w.shape[1]])
  File "/Users/pasquale/anaconda3/lib/python3.5/site-packages/tensorflow/python/ops/gen_array_ops.py", line 3997, in reshape
    "Reshape", tensor=tensor, shape=shape, name=name)
  File "/Users/pasquale/anaconda3/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/Users/pasquale/anaconda3/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 3160, in create_op
    op_def=op_def)
  File "/Users/pasquale/anaconda3/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1625, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): Input to reshape is a tensor with 1792 values, but the requested shape has 14
	 [[Node: Reshape = Reshape[T=DT_FLOAT, Tshape=DT_INT32, _device="/job:localhost/replica:0/task:0/device:CPU:0"](add, Reshape_12/shape)]]
