Commit: tweak benchmarking scripts

AngusG committed Jul 21, 2017
1 parent 917a7be commit 31df1a9

Showing 4 changed files with 79 additions and 22 deletions.
79 changes: 58 additions & 21 deletions smoke_test.py
@@ -1,38 +1,75 @@
import time
import numpy as np
import tensorflow as tf

N = 8192

gemm_module = tf.load_op_library('./libs/gemm_op.so')

A = tf.placeholder(tf.float32, [N, N])
B = tf.placeholder(tf.float32, [N, N])
sess = tf.InteractiveSession()

a = tf.cast(
    2 * (tf.random_normal(shape=[N, N], seed=1).eval() > 0) - 1, tf.float32)

# b = tf.cast(
#     2 * (tf.random_normal(shape=[N, N], seed=2).eval() > 0) - 1, tf.float32)

N_RUNS = 5
xnor_timings = np.zeros(N_RUNS)
base_timings = np.zeros(N_RUNS)

for i in range(N_RUNS):
    start_time = time.time()
    gemm_module.gemm(a, a).eval()
    xnor_timings[i] = time.time() - start_time
    print("xnor_gemm %d took %f" % (i, xnor_timings[i]))
print("Avg XNOR kernel execution time over %d runs: %f +/- %f" % (N_RUNS, xnor_timings.mean(), xnor_timings.std()))

for i in range(N_RUNS):
    start_time = time.time()
    matmul_result = tf.matmul(a, a).eval()
    base_timings[i] = time.time() - start_time
    print(matmul_result)  # print outside the timed region so I/O isn't measured
    print("matmul %d took %f" % (i, base_timings[i]))
print("Avg MatMul execution time over %d runs: %f +/- %f" % (N_RUNS, base_timings.mean(), base_timings.std()))


'''
# Alternative feed_dict-based version, kept for reference; uses the A, B
# placeholders defined above.
# For benchmarking on GPU w/only 4GB memory
a = 2 * tf.cast(tf.random_normal(shape=[N, N], seed=1) > 0, tf.float32) - 1
b = 2 * tf.cast(tf.random_normal(shape=[N, N], seed=2) > 0, tf.float32) - 1
xnor_gemm = gemm_module.gemm(A, B)
matmul = tf.matmul(A, B)
N_RUNS = 5
xnor_timings = np.zeros(N_RUNS)
base_timings = np.zeros(N_RUNS)

with tf.Session() as sess:
    a_f32 = sess.run(a)
    b_f32 = sess.run(b)

    ########### benchmark xnor ############
    for i in range(N_RUNS):
        start_time = time.time()
        xnor_gemm_result = sess.run(xnor_gemm, feed_dict={A: a_f32, B: b_f32})
        xnor_timings[i] = time.time() - start_time
        print("xnor_gemm %d took %f" % (i, xnor_timings[i]))
        print(xnor_gemm_result)
    #######################################
    print("Avg XNOR kernel execution time over %d runs: %f +/- %f" % (N_RUNS, xnor_timings.mean(), xnor_timings.std()))

    ########### benchmark matmul ##########
    for i in range(N_RUNS):
        start_time = time.time()
        matmul_result = sess.run(matmul, feed_dict={A: a_f32, B: b_f32})
        base_timings[i] = time.time() - start_time
        print("matmul %d took %f" % (i, base_timings[i]))
        print(matmul_result)
    #######################################
    print("Avg MatMul execution time over %d runs: %f +/- %f" % (N_RUNS, base_timings.mean(), base_timings.std()))
'''
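A methodological aside (not part of this commit): in the loops above, gemm_module.gemm(a, a) and tf.matmul(a, a) are constructed inside the timed region, so every iteration also pays graph-construction cost, and the first iteration absorbs one-time CUDA initialization. A minimal sketch of a harness that builds both ops once and discards an untimed warm-up run, assuming the same gemm_module and N as above:

import time
import numpy as np
import tensorflow as tf

N = 8192
N_RUNS = 5
gemm_module = tf.load_op_library('./libs/gemm_op.so')

A = tf.placeholder(tf.float32, [N, N])
ops = {'xnor_gemm': gemm_module.gemm(A, A),  # built once, outside the timed loop
       'matmul': tf.matmul(A, A)}

with tf.Session() as sess:
    a_f32 = sess.run(2 * tf.cast(tf.random_normal([N, N], seed=1) > 0, tf.float32) - 1)
    for name, op in ops.items():
        sess.run(op, feed_dict={A: a_f32})  # warm-up: CUDA init, kernel load
        timings = np.zeros(N_RUNS)
        for i in range(N_RUNS):
            start = time.time()
            sess.run(op, feed_dict={A: a_f32})  # blocks until the result is fetched
            timings[i] = time.time() - start
        print("%s: %f +/- %f over %d runs" % (name, timings.mean(), timings.std(), N_RUNS))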
4 changes: 4 additions & 0 deletions src/concatenate_kernel.cu.cc
@@ -83,7 +83,9 @@ __global__ void deconcatenate_rows_kernel(int *a, float *b, int size)
template <typename T>
struct ConcatenateRowsFunctor<GPUDevice, T> {
  void operator()(const GPUDevice& d, const float* fA, int* Aconc, const int N) {
#ifdef DEBUG
    printf("\n\nConcatenateRowsFunctor\n\n");
#endif
    int block = BLOCK_SIZE * 4, grid = N * N / (block * 32) + 1;
    concatenate_rows_kernel<T>
        <<<grid, block, 0, d.stream()>>>(fA, Aconc, N * N / 32);
@@ -93,7 +95,9 @@ struct ConcatenateRowsFunctor<GPUDevice, T> {
template <typename T>
struct ConcatenateColsFunctor<GPUDevice, T> {
  void operator()(const GPUDevice& d, const float* fB, int* Bconc, const int N) {
#ifdef DEBUG
    printf("\n\nConcatenateColsFunctor\n\n");
#endif
    int block = BLOCK_SIZE * 4;
    int grid = N / block + 1;
    concatenate_cols_kernel<T>
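A note on the pattern introduced here (and repeated in gemm_op.cc and xnor_gemm_kernel.cu.cc below): the printf tracing now compiles only when DEBUG is defined. The commented-out //#define DEBUG at the top of gemm_op.cc toggles it for that file; for the CUDA sources the macro would presumably have to come from the build instead (e.g. a -DDEBUG compiler flag), which is an assumption about the build setup rather than something this commit configures.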
16 changes: 15 additions & 1 deletion src/gemm_op.cc
@@ -1,4 +1,5 @@
// gemm_op.cc
//#define DEBUG
#define EIGEN_USE_THREADS

#include <stdio.h>
@@ -152,15 +153,18 @@ class XnorGemmOp : public OpKernel {
  Status allocate_temp(DataType type, const TensorShape& shape,
                       Tensor* out_temp);
  */

#ifdef DEBUG
    printf("\n\nXnorGemmOp -- allocated output\n\n");
#endif

    Tensor Aconc;  // = nullptr;
    Tensor Bconc;  // = nullptr;
    OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_INT32, out_shape, &Aconc));
    OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_INT32, out_shape, &Bconc));

#ifdef DEBUG
    printf("\n\nXnorGemmOp -- allocated temp\n\n");
#endif

    if (out->NumElements() == 0) {
      // If a has shape [0, x] or b has shape [x, 0], the output shape
@@ -187,33 +191,41 @@ class XnorGemmOp : public OpKernel {
    const int32 k = a.dim_size(dim_pair[0].first);
    const int32 n = b.dim_size(1 - dim_pair[0].second);

#ifdef DEBUG
    printf("\n\nXnorGemmOp -- created m,n,k\n\n");
#endif

    auto a_flat = a.flat<T>().data();
    auto b_flat = b.flat<T>().data();
    auto Aconc_flat = Aconc.flat<int32>().data();
    auto Bconc_flat = Bconc.flat<int32>().data();
    auto c_flat = out->flat<T>().data();

#ifdef DEBUG
    printf("\n\nXnorGemmOp -- created a_flat, Aconc_flat\n\n");
#endif

#if 1
    ConcatenateRowsFunctor<Device, T>()(
        ctx->eigen_device<Device>(),
        a_flat,
        Aconc_flat,
        m);
#ifdef DEBUG
    printf("\n\nXnorGemmOp -- ran ConcatenateRowsFunctor\n\n");
#endif
#endif

#if 1
    ConcatenateColsFunctor<Device, T>()(
        ctx->eigen_device<Device>(),
        b_flat,
        Bconc_flat,
        m);
#ifdef DEBUG
    printf("\n\nXnorGemmOp -- ran ConcatenateColsFunctor\n\n");
#endif
#endif

#if 1
    XnorGemmFunctor<Device, T>()(
@@ -224,8 +236,10 @@ class XnorGemmOp : public OpKernel {
        m,
        n,
        k);
#ifdef DEBUG
    printf("\n\nXnorGemmOp -- ran XnorGemmFunctor\n\n");
#endif
#endif

#if 0 /* For testing base kernel */
    XnorGemmFunctor<Device, T>()(
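As background on what ConcatenateRowsFunctor, ConcatenateColsFunctor, and XnorGemmFunctor compute together (an illustrative sketch, not code from this repository): once the ±1 values are packed 32 per word, each K-element dot product reduces to XOR plus popcount, via dot(a, b) = K - 2 * popcount(bits(a) XOR bits(b)). A NumPy check of that identity:

import numpy as np

K = 64  # vector length, a multiple of 32
a = np.random.choice([-1.0, 1.0], K).astype(np.float32)
b = np.random.choice([-1.0, 1.0], K).astype(np.float32)

def pack_bits(v):
    # +1 -> bit 1, -1 -> bit 0; 32 values per uint32 word,
    # mirroring what the concatenate kernels do on the GPU.
    bits = (v > 0).astype(np.uint32).reshape(-1, 32)
    return (bits << np.arange(32, dtype=np.uint32)).sum(axis=1).astype(np.uint32)

popcount = lambda w: bin(int(w)).count('1')
mismatches = sum(popcount(x ^ y) for x, y in zip(pack_bits(a), pack_bits(b)))

# For +/-1 vectors: dot = matches - mismatches = K - 2 * mismatches
assert int(a.dot(b)) == K - 2 * mismatches

An XNOR-based formulation is equivalent up to complementing: dot(a, b) = 2 * popcount(bits(a) XNOR bits(b)) - K.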
2 changes: 2 additions & 0 deletions src/xnor_gemm_kernel.cu.cc
@@ -93,7 +93,9 @@ struct XnorGemmFunctor<GPUDevice, T> {
    //
    // See core/util/cuda_kernel_helper.h for example of computing
    // block count and thread_per_block count.
#ifdef DEBUG
    printf("\n\nInt32 input -- using XnorGemmFunctor\n\n");
#endif
    /*
    int block_count = BLOCK_SIZE;
    int thread_per_block = 512;
