Permalink
Browse files

Optimizations to the matrix computation. Only compute the top right o…

…f the matrix, above (and not including) the diagonal.
  • Loading branch information...
1 parent b9a3322 commit 0c2f4a5d38bd1f84423f4cd78c293fa6cdea8c14 Bob Somers committed Dec 7, 2011
Showing with 53 additions and 138 deletions.
  1. +3 −3 biogpu/correlation.py
  2. +23 −17 biogpu/pearson.cu
  3. +27 −118 pearsonBenchmark.py
@@ -34,15 +34,15 @@ def pearson(pyroprints, num_buckets, show_progress = True):
remain_A = n - (s * tile_size * block_size)
num_A = num_A if num_A < remain_A else remain_A
- A = numpy.zeros(shape=(num_A, m), dtype=numpy.int32, order='C')
+ A = numpy.zeros(shape=(num_A, m), dtype=numpy.float32, order='C')
for i in range(num_A):
numpy.put(A[i], range(m), pyroprints[(s * tile_size * block_size) + i])
num_B = tile_size * block_size
remain_B = n - (t * tile_size * block_size)
num_B = num_B if num_B < remain_B else remain_B
- B = numpy.zeros(shape=(num_B, m), dtype=numpy.int32, order='C')
+ B = numpy.zeros(shape=(num_B, m), dtype=numpy.float32, order='C')
for i in range(num_B):
numpy.put(B[i], range(m), pyroprints[(t * tile_size * block_size) + i])
@@ -55,7 +55,7 @@ def pearson(pyroprints, num_buckets, show_progress = True):
grid=(tile_size, tile_size))
if show_progress:
- progress = (s * num_tiles + t) * 100.0 / (num_tiles * num_tiles)
+ progress = (s * num_tiles + t) * 100 / (num_tiles * num_tiles)
sys.stdout.write('\rComputing correlations %.3f%%' % progress)
sys.stdout.flush()
View
@@ -1,28 +1,34 @@
+#include <stdint.h>
+
__global__ void pearson(int *buckets, int num_buckets,
- int *A, int num_A, int *B, int num_B,
+ float *A, int num_A, float *B, int num_B,
int s, int t, int n, int m) {
- // calculate relative <i, j> coords within this tile
- int i = blockIdx.y * blockDim.y + threadIdx.y; // row
- int j = blockIdx.x * blockDim.x + threadIdx.x; // column
+ // Calculate relative <i, j> coords within this tile.
+ uint32_t i = blockIdx.y * blockDim.y + threadIdx.y; // row
+ uint32_t j = blockIdx.x * blockDim.x + threadIdx.x; // column
+
+ // Calculate the offsets based on the tile number.
+ uint32_t i_offset = s * gridDim.y * blockDim.y;
+ uint32_t j_offset = t * gridDim.x * blockDim.x;
- // calculate the offsets based on the tile number
- int i_offset = s * gridDim.y * blockDim.y;
- int j_offset = t * gridDim.x * blockDim.x;
+ // Calculate the absolute <i, j> coords within the matrix.
+ uint64_t i_abs = i_offset + i;
+ uint64_t j_abs = j_offset + j;
- // make sure this thread is inside the matrix
- if (i + i_offset >= n ||
- j + j_offset >= n) {
+ // Quick checks to bail out. Only compute values inside the bounds of the
+ // matrix, and above the diagonal.
+ if (i_abs >= n || j_abs >= n || i_abs >= j_abs) {
return;
}
- // initialize accumulators and result
+ // Initialize accumulators and the result.
float sum_x, sum_y, sum_x2, sum_y2, sum_xy, coeff;
sum_x = sum_y = sum_x2 = sum_y2 = sum_xy = coeff = 0.0f;
- // compute the sums
+ // Compute the sums.
for (int k = 0; k < m; k++) {
- int x = A[i * m + k];
- int y = B[j * m + k];
+ float x = A[i * m + k];
+ float y = B[j * m + k];
sum_x += x;
sum_y += y;
@@ -31,12 +37,12 @@ __global__ void pearson(int *buckets, int num_buckets,
sum_xy += x * y;
}
- // compute the pearson coefficient using the "sometimes numerically
- // unstable" method because it's way more computationally efficient
+ // Compute the Pearson coefficient using the "sometimes numerically
+ // unstable" method because it's way more computationally efficient.
coeff = (m * sum_xy - sum_x * sum_y) /
sqrtf((m * sum_x2 - sum_x * sum_x) * (m * sum_y2 - sum_y * sum_y));
- // dump it in the appropriate bucket
+ // Dump it in the appropriate bucket.
int bucket = (int)(coeff * num_buckets);
if (bucket >= num_buckets) {
atomicAdd(&(buckets[num_buckets - 1]), 1);
View
@@ -1,59 +1,56 @@
-import pycuda.autoinit
-import pycuda.driver as cuda
-import pycuda.compiler
-import pycuda.gpuarray
+import sys
import numpy
from scipy.stats.stats import pearsonr
+import biogpu.correlation
import time
def main():
- n = 2000 # number of pyroprints
+ n = 512 # number of pyroprints
m = 104 # pyroprint length
+ #n = 10
+ #m = 104
- pyroprints = numpy.zeros(shape=(n, m), dtype=numpy.int32, order='C')
+ pyroprints = numpy.zeros(shape=(n, m), dtype=numpy.float32, order='C')
for i in range(n):
numpy.put(pyroprints[i], range(m),
- numpy.random.random_integers(0, 30, 10).astype(numpy.int32))
+ numpy.random.rand(m).astype(numpy.float32))
- print('fake pyroprints:')
+ print('Fake Pyroprints:')
print(pyroprints)
- print('\n');
+ print('')
- print('=== computing with python ===')
+ print('=== Computing with Python/SciPy ===')
python_start = time.time()
python_buckets = compute_python(pyroprints, 10000)
python_end = time.time()
- #print('buckets (abridged):')
+ #print('Buckets (abridged):')
#for i in range(10000):
# if python_buckets[i] > 0:
# print('\t[%d] = %d' % (i, python_buckets[i]))
#print('\n')
python_time = python_end - python_start
- print('computed in %f seconds' % python_time)
- print('\n')
+ print('Computed in %f seconds.\n' % python_time)
- print('=== computing with cuda ===')
+ print('=== Computing with CUDA ===')
cuda_start = time.time()
- cuda_buckets = compute_cuda(pyroprints, 10000)
+ cuda_buckets = biogpu.correlation.pearson(pyroprints, 10000)
cuda_end = time.time()
- #print('buckets (abridged):')
+ #print('Buckets (abridged):')
#for i in range(10000):
# if cuda_buckets[i] > 0:
# print('\t[%d] = %d' % (i, cuda_buckets[i]))
#print('\n')
cuda_time = cuda_end - cuda_start
- print('computed in %f seconds' % cuda_time)
- print('\n')
+ print('Computed in %f seconds.\n' % cuda_time)
speedup = python_time / cuda_time
- print('speedup of %fx' % speedup)
- print('\n')
+ print('Speedup of %.2fx.\n' % speedup)
- print('done')
+ print('Done.')
def compute_python(pyroprints, num_buckets):
n = len(pyroprints)
@@ -62,114 +59,26 @@ def compute_python(pyroprints, num_buckets):
matrix = numpy.zeros(shape=(n, n), dtype=numpy.float32, order='C')
buckets = numpy.zeros(shape=(num_buckets, 1), dtype=numpy.int32, order='C')
+ num_cells = n * n * 0.5 - n
+ cell_count = 0
for i in range(n):
for j in range(n):
+ if i >= j:
+ continue
coeff, _ = pearsonr(pyroprints[i], pyroprints[j])
matrix[i][j] = coeff
bucket = int(coeff * num_buckets)
if bucket >= num_buckets:
buckets[num_buckets - 1] += 1
elif bucket >= 1:
buckets[bucket - 1] += 1
+ cell_count += 1
- progress = ((i * n) * 100) / (n * n)
- print('%d%% complete' % progress)
-
- return buckets
-
+ progress = cell_count / num_cells * 100
+ sys.stdout.write('\rComputing correlations %.3f%%' % progress)
+ sys.stdout.flush()
-def compute_cuda(pyroprints, num_buckets):
- kernel = pycuda.compiler.SourceModule('''
- __global__ void pearson(int *buckets, int num_buckets,
- int *A, int num_A, int *B, int num_B,
- int s, int t, int n, int m) {
-
- // calculate relative <i, j> coords within this tile
- int i = blockIdx.y * blockDim.y + threadIdx.y; // row
- int j = blockIdx.x * blockDim.x + threadIdx.x; // column
-
- // calculate the offsets based on the tile number
- int i_offset = s * gridDim.y * blockDim.y;
- int j_offset = t * gridDim.x * blockDim.x;
-
- // make sure this thread is inside the matrix
- if (i + i_offset >= n ||
- j + j_offset >= n) {
- return;
- }
-
- // initialize accumulators and result
- float sum_x, sum_y, sum_x2, sum_y2, sum_xy, coeff;
- sum_x = sum_y = sum_x2 = sum_y2 = sum_xy = coeff = 0.0f;
-
- // compute the sums
- for (int k = 0; k < m; k++) {
- int x = A[i * m + k];
- int y = B[j * m + k];
-
- sum_x += x;
- sum_y += y;
- sum_x2 += x * x;
- sum_y2 += y * y;
- sum_xy += x * y;
- }
-
- // compute the pearson coefficient using the "sometimes numerically
- // unstable" because it's waaaay more computationally efficient
- coeff = (m * sum_xy - sum_x * sum_y) /
- sqrtf((m * sum_x2 - sum_x * sum_x) * (m * sum_y2 - sum_y * sum_y));
-
- // dump it in the appropriate bucket
- int bucket = (int)(coeff * num_buckets);
- if (bucket >= num_buckets) {
- atomicAdd(&(buckets[num_buckets - 1]), 1);
- } else if (bucket >= 1) {
- atomicAdd(&(buckets[bucket - 1]), 1);
- }
- }
- ''')
- pearson_kernel = kernel.get_function('pearson')
-
- n = len(pyroprints)
- m = len(pyroprints[0])
-
- block_size = 16
- tile_size = 64
- num_tiles = (n / (tile_size * block_size)) + 1
-
- buckets = numpy.zeros(shape=(num_buckets, 1), dtype=numpy.int32, order='C')
- buckets_gpu = pycuda.gpuarray.to_gpu(buckets)
-
- for s in range(num_tiles):
- for t in range(num_tiles):
- num_A = tile_size * block_size
- remain_A = n - (s * tile_size * block_size)
- num_A = num_A if num_A < remain_A else remain_A
-
- A = numpy.zeros(shape=(num_A, m), dtype=numpy.int32, order='C')
- for i in range(num_A):
- numpy.put(A[i], range(m), pyroprints[(s * tile_size * block_size) + i])
-
- num_B = tile_size * block_size
- remain_B = n - (t * tile_size * block_size)
- num_B = num_B if num_B < remain_B else remain_B
-
- B = numpy.zeros(shape=(num_B, m), dtype=numpy.int32, order='C')
- for i in range(num_B):
- numpy.put(B[i], range(m), pyroprints[(t * tile_size * block_size) + i])
-
- pearson_kernel(buckets_gpu.gpudata, numpy.int32(num_buckets),
- cuda.In(A), numpy.int32(num_A),
- cuda.In(B), numpy.int32(num_B),
- numpy.int32(s), numpy.int32(t),
- numpy.int32(n), numpy.int32(m),
- block=(block_size, block_size, 1),
- grid=(tile_size, tile_size))
-
- progress = ((s * num_tiles + t) * 100) / (num_tiles * num_tiles)
- print('%d%% complete' % progress)
-
- buckets_gpu.get(buckets)
+ print('\rComputing correlations 100.000%')
return buckets
if __name__ == '__main__':

0 comments on commit 0c2f4a5

Please sign in to comment.