# PointNet with Input and Feature Transformation

In [None]:
import tensorflow as tf
import utils
import numpy as np

In [2]:
# load training data and labels
data0 = utils.load_h5("ply_data_train0.h5")
data1 = utils.load_h5("ply_data_train1.h5")
data2 = utils.load_h5("ply_data_train2.h5")
data3 = utils.load_h5("ply_data_train3.h5")
data4 = utils.load_h5("ply_data_train4.h5")

# aggregate training data, training label
train_data = np.append(data0[0], data1[0], axis=0)
train_data = np.append(train_data, data2[0], axis=0)
train_data = np.append(train_data, data3[0], axis=0)
train_data = np.append(train_data, data4[0], axis=0)
print(np.shape(train_data))

train_labels = np.append(data0[1], data1[1], axis=0)
train_labels = np.append(train_labels, data2[1], axis=0)
train_labels = np.append(train_labels, data3[1], axis=0)
train_labels = np.append(train_labels, data4[1], axis=0)
print(np.shape(train_labels))

(9840, 2048, 3)
(9840, 1)


In [3]:
test0 = utils.load_h5("ply_data_test0.h5")
test1 = utils.load_h5("ply_data_test1.h5")

test_data = np.append(test0[0], test1[0], axis=0)
test_labels = np.append(test0[1], test1[1], axis=0)

In [4]:
train_labels_one_hot = []
for l in train_labels:
    one_hot = np.zeros(40, dtype=np.int)
    one_hot[l[0]] = 1
    train_labels_one_hot.append(one_hot)
train_labels_one_hot = np.array(train_labels_one_hot)

# one hot encode test_labels
test_labels_one_hot = []
for l in test_labels:
    one_hot = np.zeros(40, dtype=np.int)
    one_hot[l[0]] = 1
    test_labels_one_hot.append(one_hot)
test_labels_one_hot = np.array(test_labels_one_hot)

In [5]:
batch_size = 32

In [6]:
def input_transform(cloud):
    # B x 2048 x 3 x 1
    batch_norm = tf.contrib.layers.batch_norm
    # 1st mlp layer
    i_trans_layer_conv1 = tf.contrib.layers.conv2d(inputs=cloud, num_outputs=64, kernel_size=[1, 3], padding="VALID", activation_fn=tf.nn.relu, normalizer_fn=batch_norm)
    # (B, 2048, 64)

    # 2nd mlp layer
    i_trans_layer_conv2 = tf.contrib.layers.conv2d(inputs=i_trans_layer_conv1, num_outputs=128, kernel_size=[1, 1], padding="VALID", activation_fn=tf.nn.relu, normalizer_fn=batch_norm)
    # (B, 2048, 64)

    # 3rd mlp layer
    i_trans_layer_conv3 = tf.contrib.layers.conv2d(inputs=i_trans_layer_conv2, num_outputs=1024, kernel_size=[1, 1], padding="VALID", activation_fn=tf.nn.relu, normalizer_fn=batch_norm)
    # (B, 2048, 64)

    # pooling
    i_max_pool = tf.contrib.layers.max_pool2d(inputs=i_trans_layer_conv3, kernel_size=[2048, 1], stride=1, padding="VALID")

    i_max_pool = tf.reshape(i_max_pool, [batch_size, -1])
    
    # fnn1
    i_layer_fnn1 = tf.contrib.layers.fully_connected(inputs=i_max_pool, num_outputs=512, activation_fn=tf.nn.relu, normalizer_fn=batch_norm)

    # fnn2
    i_layer_fnn2 = tf.contrib.layers.fully_connected(inputs=i_layer_fnn1, num_outputs=256, activation_fn=tf.nn.relu,normalizer_fn=batch_norm)

    initial = np.eye(3).flatten()
    transform = tf.contrib.layers.fully_connected(inputs=i_layer_fnn2, num_outputs = 3*3, biases_initializer=tf.constant_initializer(initial))

    return tf.reshape(transform, (-1, 3, 3))

In [7]:
tf.reset_default_graph()
cloud_origin = tf.placeholder(tf.float32, [batch_size, 2048, 3])
cloud = tf.reshape(cloud_origin, (-1, 2048, 3, 1))
print(cloud)

Tensor("Reshape:0", shape=(32, 2048, 3, 1), dtype=float32)


## Apply input transformation

In [8]:
# apply input transformation
t = input_transform(cloud)
print(np.shape(t))
transformed_input = tf.reshape(tf.matmul(cloud_origin, t), (-1, 2048, 3, 1))
print(transformed_input)

(32, 3, 3)
Tensor("Reshape_3:0", shape=(32, 2048, 3, 1), dtype=float32)


In [9]:
def feature_transform(net):
    # B x 2048 x 3 x 1

    # 1st mlp layer
    f_trans_layer_conv1 = tf.contrib.layers.conv2d(inputs=net, num_outputs=64, kernel_size=1, padding="VALID", activation_fn=tf.nn.relu,normalizer_fn=batch_norm)
    # (B, 2048, 1, 64)

    # 2nd mlp layer
    f_trans_layer_conv2 = tf.contrib.layers.conv2d(inputs=f_trans_layer_conv1, num_outputs=128, kernel_size=1, padding="VALID", activation_fn=tf.nn.relu, normalizer_fn=batch_norm)
    # (B, 2048, 1, 64)

    # 3rd mlp layer
    f_trans_layer_conv3 = tf.contrib.layers.conv2d(inputs=f_trans_layer_conv2, num_outputs=1024, kernel_size=1, padding="VALID", activation_fn=tf.nn.relu, normalizer_fn=batch_norm)
    # (B, 2048, 1, 64)

    # pooling
    f_max_pool = tf.contrib.layers.max_pool2d(inputs=f_trans_layer_conv3, kernel_size=[2048, 1], stride=1, padding="VALID")

    f_max_pool = tf.reshape(f_max_pool, [batch_size, -1])
    
    # fnn1
    f_layer_fnn1 = tf.contrib.layers.fully_connected(inputs=f_max_pool, num_outputs=512, activation_fn=tf.nn.relu, normalizer_fn=batch_norm)

    # fnn2
    f_layer_fnn2 = tf.contrib.layers.fully_connected(inputs=f_layer_fnn1, num_outputs=256, activation_fn=tf.nn.relu, normalizer_fn=batch_norm)

    K = 64
    initial = np.eye(K).flatten()
    transform = tf.contrib.layers.fully_connected(inputs=f_layer_fnn2, num_outputs = K*K, biases_initializer=tf.constant_initializer(initial))

    return tf.reshape(transform, (-1, K, K))

## Main network, apply feature transformation after conv2

In [10]:
# main network
# placeholder for one-hot labels
y = tf.placeholder(tf.float32, [None, 40])
learning_rate = tf.placeholder(tf.float32, shape=[])
decay_rate = tf.placeholder(tf.float32, shape=[])
batch_norm = tf.contrib.layers.batch_norm

# placeholder for labels
y_labels = tf.placeholder(tf.int64, [None])

# 1st mlp layer
layer_conv1 = tf.contrib.layers.conv2d(inputs=transformed_input, num_outputs=64, kernel_size=[1, 3], padding="VALID", activation_fn=tf.nn.relu, normalizer_fn=batch_norm, normalizer_params = {'decay':decay_rate})


# 2nd mlp layer
layer_conv2 = tf.contrib.layers.conv2d(inputs=layer_conv1, num_outputs=64, kernel_size=1, padding="VALID", activation_fn=tf.nn.relu, normalizer_fn=batch_norm, normalizer_params = {'decay':decay_rate})


# apply feature transformation
f_transform = feature_transform(layer_conv2)
layer_conv2 = tf.matmul(tf.squeeze(layer_conv2, [2]), f_transform)
layer_conv2 = tf.expand_dims(layer_conv2, [2])

# 3rd mlp layer
layer_conv3 = tf.contrib.layers.conv2d(inputs=layer_conv2, num_outputs=64, kernel_size=1, padding="VALID", activation_fn=tf.nn.relu, normalizer_fn=batch_norm, normalizer_params = {'decay':decay_rate})

# 4th cnn
layer_conv4 = tf.contrib.layers.conv2d(inputs=layer_conv3, num_outputs=128, kernel_size=[1, 1], padding="VALID", activation_fn=tf.nn.relu, normalizer_fn=batch_norm, normalizer_params = {'decay':decay_rate})

# 5th cnn
layer_conv5 = tf.contrib.layers.conv2d(inputs=layer_conv4, num_outputs=1024, kernel_size=[1, 1], padding="VALID", activation_fn=tf.nn.relu, normalizer_fn=batch_norm, normalizer_params = {'decay':decay_rate})

# max pooling
max_pool = tf.contrib.layers.max_pool2d(inputs=layer_conv5, kernel_size=[2048, 1], stride = 1, padding="VALID")

# fnn1
layer_fnn1 = tf.contrib.layers.fully_connected(inputs=max_pool, num_outputs=512, activation_fn=tf.nn.relu, normalizer_fn=batch_norm, normalizer_params = {'decay':decay_rate})

# fnn2
layer_fnn2 = tf.contrib.layers.fully_connected(inputs=layer_fnn1, num_outputs=256, activation_fn=tf.nn.relu, normalizer_fn=batch_norm, normalizer_params = {'decay':decay_rate})

layer_fnn2 = tf.contrib.layers.dropout(inputs=layer_fnn2, keep_prob=0.7)

# fnn3
logits = tf.contrib.layers.fully_connected(inputs=layer_fnn2, num_outputs=40, activation_fn=tf.nn.relu)
logits = tf.squeeze(logits, [1, 2])

# softmax
output = tf.nn.softmax(logits)
output_class = tf.reshape(output, (-1,40))

In [11]:
# loss function
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits = logits, labels = y))

In [12]:
from random import *
import math
# only rotate against y axis
def rotate(pt_cloud):
    angle = np.deg2rad(randint(0, 360))
    R = np.array([[math.cos(angle), 0, math.sin(angle)], [0, 1, 0], [-math.sin(angle), 0, math.cos(angle)]])
    rotated_pt_cloud = np.matmul(R, pt_cloud.T).T
    return rotated_pt_cloud

def jitter(pt_cloud):
    return pt_cloud + np.random.normal(0, 0.02, None)

def augment(pt_cloud):
    seed = np.random.randint(10)
    if seed == 0:
        return rotate(jitter(pt_cloud))
    elif seed == 1:
        return rotate(pt_cloud)
    elif seed == 2:
        return jitter(pt_cloud)
    else:
        return pt_cloud

In [13]:
optim = tf.train.AdamOptimizer(learning_rate)
optimizer = optim.minimize(loss)

In [14]:
def get_accuracy():
    right_count = 0
    i = 0
    while i < len(test_data):
        j = min(i + 32, len(test_data))
        correct_labels = tf.equal(tf.argmax(output_class, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_labels, tf.float32))
    # compute accuracy on test data
        labels = np.array([label.argmax() for label in test_labels_one_hot[i:j]])
    #print(np.shape(labels))
        accuracy = sess.run([accuracy],feed_dict = {cloud_origin: test_data[i:j], y: test_labels_one_hot[i:j]})
        right_count = right_count + accuracy[0] * (j - i)
        i += 32
    final_accuracy = right_count / len(test_data)
    return final_accuracy

In [15]:
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()

l_rate = 0.001
dr = 0.5

epoch = 200
num_iter = 307

## Training, manually terminated after 170 epoches

In [16]:
train_data = np.array([augment(x) for x in train_data])
for e in range(epoch):
    if dr < 0.99:
        dr += 0.01
    for i in range(num_iter):
        idx = np.random.choice(9840, [batch_size], False)
        batch_img_vanilla = train_data[idx][:]
        batch_y = train_labels_one_hot[idx][:]
        _, l, lrate = sess.run([optimizer, loss, optim._lr],feed_dict = {cloud_origin: batch_img_vanilla , y: batch_y, learning_rate: l_rate, decay_rate : dr})
    if (e + 1) % 10 == 0:
        ax = get_accuracy()
        print ("epoch " + str(e+1) + "; loss = " + str(l) + "; accuracy = " + str(ax) + "; lr = " + str(lrate))
    if e % 20 == 0 and e > 0:
        l_rate /= 2

epoch 10; loss = 1.30526; accuracy = 0.71875; lr = 0.0010000000474974513
epoch 20; loss = 0.758977; accuracy = 0.752435064935; lr = 0.0010000000474974513
epoch 30; loss = 0.834988; accuracy = 0.79586038961; lr = 0.0005000000237487257
epoch 40; loss = 0.975209; accuracy = 0.798701298701; lr = 0.0005000000237487257
epoch 50; loss = 0.675526; accuracy = 0.826298701299; lr = 0.0002500000118743628
epoch 60; loss = 0.596541; accuracy = 0.821022727273; lr = 0.0002500000118743628
epoch 70; loss = 0.415493; accuracy = 0.836444805195; lr = 0.0001250000059371814
epoch 80; loss = 0.422969; accuracy = 0.843344155844; lr = 0.0001250000059371814
epoch 90; loss = 0.721235; accuracy = 0.842938311688; lr = 6.25000029685907e-05
epoch 100; loss = 0.575737; accuracy = 0.840097402597; lr = 6.25000029685907e-05
epoch 110; loss = 0.50492; accuracy = 0.845779220779; lr = 3.125000148429535e-05
epoch 120; loss = 0.426225; accuracy = 0.846996753247; lr = 3.125000148429535e-05
epoch 130; loss = 0.494703; accuracy 

KeyboardInterrupt: 

In [17]:
print("Final accuracy", get_accuracy())

Final accuracy 0.852272727273
