Skip to content

# Drin/PyroprintSimulation (public), forked from bobsomers/CSC570

### Subversion checkout URL

You can clone with HTTPS or Subversion.

Optimizations to the matrix computation. Only compute the top right of the matrix, above (and not including) the diagonal.
• Loading branch information...
commit 0c2f4a5d38bd1f84423f4cd78c293fa6cdea8c14 1 parent b9a3322
authored

Showing 3 changed files with 53 additions and 138 deletions.

1. biogpu/correlation.py
2. biogpu/pearson.cu
3. pearsonBenchmark.py
6  biogpu/correlation.py
 `@@ -34,7 +34,7 @@ def pearson(pyroprints, num_buckets, show_progress = True):` 34 34 ` remain_A = n - (s * tile_size * block_size)` 35 35 ` num_A = num_A if num_A < remain_A else remain_A` 36 36 ` ` 37 `- A = numpy.zeros(shape=(num_A, m), dtype=numpy.int32, order='C')` 37 `+ A = numpy.zeros(shape=(num_A, m), dtype=numpy.float32, order='C')` 38 38 ` for i in range(num_A):` 39 39 ` numpy.put(A[i], range(m), pyroprints[(s * tile_size * block_size) + i])` 40 40 ` ` `@@ -42,7 +42,7 @@ def pearson(pyroprints, num_buckets, show_progress = True):` 42 42 ` remain_B = n - (t * tile_size * block_size)` 43 43 ` num_B = num_B if num_B < remain_B else remain_B` 44 44 ` ` 45 `- B = numpy.zeros(shape=(num_B, m), dtype=numpy.int32, order='C')` 45 `+ B = numpy.zeros(shape=(num_B, m), dtype=numpy.float32, order='C')` 46 46 ` for i in range(num_B):` 47 47 ` numpy.put(B[i], range(m), pyroprints[(t * tile_size * block_size) + i])` 48 48 ` ` `@@ -55,7 +55,7 @@ def pearson(pyroprints, num_buckets, show_progress = True):` 55 55 ` grid=(tile_size, tile_size))` 56 56 ` ` 57 57 ` if show_progress:` 58 `- progress = (s * num_tiles + t) * 100.0 / (num_tiles * num_tiles)` 58 `+ progress = (s * num_tiles + t) * 100 / (num_tiles * num_tiles)` 59 59 ` sys.stdout.write('\rComputing correlations %.3f%%' % progress)` 60 60 ` sys.stdout.flush()` 61 61 ` `
40  biogpu/pearson.cu
 ... ... `@@ -1,28 +1,34 @@` 1 `+#include <stdint.h>` 2 `+` 1 3 ` __global__ void pearson(int *buckets, int num_buckets,` 2 `- int *A, int num_A, int *B, int num_B,` 4 `+ float *A, int num_A, float *B, int num_B,` 3 5 ` int s, int t, int n, int m) {` 4 `- // calculate relative coords within this tile` 5 `- int i = blockIdx.y * blockDim.y + threadIdx.y; // row` 6 `- int j = blockIdx.x * blockDim.x + threadIdx.x; // column` 6 `+ // Calculate relative coords within this tile.` 7 `+ uint32_t i = blockIdx.y * blockDim.y + threadIdx.y; // row` 8 `+ uint32_t j = blockIdx.x * blockDim.x + threadIdx.x; // column` 9 `+` 10 `+ // Calculate the offsets based on the tile number.` 11 `+ uint32_t i_offset = s * gridDim.y * blockDim.y;` 12 `+ uint32_t j_offset = t * gridDim.x * blockDim.x;` 7 13 ` ` 8 `- // calculate the offsets based on the tile number` 9 `- int i_offset = s * gridDim.y * blockDim.y;` 10 `- int j_offset = t * gridDim.x * blockDim.x;` 14 `+ // Calculate the absolute coords within the matrix.` 15 `+ uint64_t i_abs = i_offset + i;` 16 `+ uint64_t j_abs = j_offset + j;` 11 17 ` ` 12 `- // make sure this thread is inside the matrix` 13 `- if (i + i_offset >= n ||` 14 `- j + j_offset >= n) {` 18 `+ // Quick checks to bail out. 
Only compute values inside the bounds of the` 19 `+ // matrix, and above the diagonal.` 20 `+ if (i_abs >= n || j_abs >= n || i_abs >= j_abs) {` 15 21 ` return;` 16 22 ` }` 17 23 ` ` 18 `- // initialize accumulators and result` 24 `+ // Initialize accumulators and the result.` 19 25 ` float sum_x, sum_y, sum_x2, sum_y2, sum_xy, coeff;` 20 26 ` sum_x = sum_y = sum_x2 = sum_y2 = sum_xy = coeff = 0.0f;` 21 27 ` ` 22 `- // compute the sums` 28 `+ // Compute the sums.` 23 29 ` for (int k = 0; k < m; k++) {` 24 `- int x = A[i * m + k];` 25 `- int y = B[j * m + k];` 30 `+ float x = A[i * m + k];` 31 `+ float y = B[j * m + k];` 26 32 ` ` 27 33 ` sum_x += x;` 28 34 ` sum_y += y;` `@@ -31,12 +37,12 @@ __global__ void pearson(int *buckets, int num_buckets,` 31 37 ` sum_xy += x * y;` 32 38 ` }` 33 39 ` ` 34 `- // compute the pearson coefficient using the "sometimes numerically` 35 `- // unstable" method because it's way more computationally efficient` 40 `+ // Compute the Pearson coefficient using the "sometimes numerically` 41 `+ // unstable" method because it's way more computationally efficient.` 36 42 ` coeff = (m * sum_xy - sum_x * sum_y) /` 37 43 ` sqrtf((m * sum_x2 - sum_x * sum_x) * (m * sum_y2 - sum_y * sum_y));` 38 44 ` ` 39 `- // dump it in the appropriate bucket` 45 `+ // Dump it in the appropriate bucket.` 40 46 ` int bucket = (int)(coeff * num_buckets);` 41 47 ` if (bucket >= num_buckets) {` 42 48 ` atomicAdd(&(buckets[num_buckets - 1]), 1);`
145  pearsonBenchmark.py
 ... ... `@@ -1,59 +1,56 @@` 1 `-import pycuda.autoinit` 2 `-import pycuda.driver as cuda` 3 `-import pycuda.compiler` 4 `-import pycuda.gpuarray` 1 `+import sys` 5 2 ` import numpy` 6 3 ` from scipy.stats.stats import pearsonr` 4 `+import biogpu.correlation` 7 5 ` import time` 8 6 ` ` 9 7 ` def main():` 10 `- n = 2000 # number of pyroprints` 8 `+ n = 512 # number of pyroprints` 11 9 ` m = 104 # pyroprint length` 10 `+ #n = 10` 11 `+ #m = 104` 12 12 ` ` 13 `- pyroprints = numpy.zeros(shape=(n, m), dtype=numpy.int32, order='C')` 13 `+ pyroprints = numpy.zeros(shape=(n, m), dtype=numpy.float32, order='C')` 14 14 ` for i in range(n):` 15 15 ` numpy.put(pyroprints[i], range(m),` 16 `- numpy.random.random_integers(0, 30, 10).astype(numpy.int32))` 16 `+ numpy.random.rand(m).astype(numpy.float32))` 17 17 ` ` 18 `- print('fake pyroprints:')` 18 `+ print('Fake Pyroprints:')` 19 19 ` print(pyroprints)` 20 `- print('\n');` 20 `+ print('')` 21 21 ` ` 22 `- print('=== computing with python ===')` 22 `+ print('=== Computing with Python/SciPy ===')` 23 23 ` python_start = time.time()` 24 24 ` python_buckets = compute_python(pyroprints, 10000)` 25 25 ` python_end = time.time()` 26 26 ` ` 27 `- #print('buckets (abridged):')` 27 `+ #print('Buckets (abridged):')` 28 28 ` #for i in range(10000):` 29 29 ` # if python_buckets[i] > 0:` 30 30 ` # print('\t[%d] = %d' % (i, python_buckets[i]))` 31 31 ` #print('\n')` 32 32 ` ` 33 33 ` python_time = python_end - python_start` 34 `- print('computed in %f seconds' % python_time)` 35 `- print('\n')` 34 `+ print('Computed in %f seconds.\n' % python_time)` 36 35 ` ` 37 `- print('=== computing with cuda ===')` 36 `+ print('=== Computing with CUDA ===')` 38 37 ` cuda_start = time.time()` 39 `- cuda_buckets = compute_cuda(pyroprints, 10000)` 38 `+ cuda_buckets = biogpu.correlation.pearson(pyroprints, 10000)` 40 39 ` cuda_end = time.time()` 41 40 ` ` 42 `- #print('buckets (abridged):')` 41 `+ #print('Buckets (abridged):')` 43 42 ` #for i in 
range(10000):` 44 43 ` # if cuda_buckets[i] > 0:` 45 44 ` # print('\t[%d] = %d' % (i, cuda_buckets[i]))` 46 45 ` #print('\n')` 47 46 ` ` 48 47 ` cuda_time = cuda_end - cuda_start` 49 `- print('computed in %f seconds' % cuda_time)` 50 `- print('\n')` 48 `+ print('Computed in %f seconds.\n' % cuda_time)` 51 49 ` ` 52 50 ` speedup = python_time / cuda_time` 53 `- print('speedup of %fx' % speedup)` 54 `- print('\n')` 51 `+ print('Speedup of %.2fx.\n' % speedup)` 55 52 ` ` 56 `- print('done')` 53 `+ print('Done.')` 57 54 ` ` 58 55 ` def compute_python(pyroprints, num_buckets):` 59 56 ` n = len(pyroprints)` `@@ -62,8 +59,12 @@ def compute_python(pyroprints, num_buckets):` 62 59 ` matrix = numpy.zeros(shape=(n, n), dtype=numpy.float32, order='C')` 63 60 ` buckets = numpy.zeros(shape=(num_buckets, 1), dtype=numpy.int32, order='C')` 64 61 ` ` 62 `+ num_cells = n * n * 0.5 - n` 63 `+ cell_count = 0` 65 64 ` for i in range(n):` 66 65 ` for j in range(n):` 66 `+ if i >= j:` 67 `+ continue` 67 68 ` coeff, _ = pearsonr(pyroprints[i], pyroprints[j])` 68 69 ` matrix[i][j] = coeff` 69 70 ` bucket = int(coeff * num_buckets)` `@@ -71,105 +72,13 @@ def compute_python(pyroprints, num_buckets):` 71 72 ` buckets[num_buckets - 1] += 1` 72 73 ` elif bucket >= 1:` 73 74 ` buckets[bucket - 1] += 1` 75 `+ cell_count += 1` 74 76 ` ` 75 `- progress = ((i * n) * 100) / (n * n)` 76 `- print('%d%% complete' % progress)` 77 `- ` 78 `- return buckets` 79 `-` 77 `+ progress = cell_count / num_cells * 100` 78 `+ sys.stdout.write('\rComputing correlations %.3f%%' % progress)` 79 `+ sys.stdout.flush()` 80 80 ` ` 81 `-def compute_cuda(pyroprints, num_buckets):` 82 `- kernel = pycuda.compiler.SourceModule('''` 83 `- __global__ void pearson(int *buckets, int num_buckets,` 84 `- int *A, int num_A, int *B, int num_B,` 85 `- int s, int t, int n, int m) {` 86 `-` 87 `- // calculate relative coords within this tile` 88 `- int i = blockIdx.y * blockDim.y + threadIdx.y; // row` 89 `- int j = blockIdx.x * 
blockDim.x + threadIdx.x; // column` 90 `-` 91 `- // calculate the offsets based on the tile number` 92 `- int i_offset = s * gridDim.y * blockDim.y;` 93 `- int j_offset = t * gridDim.x * blockDim.x;` 94 `-` 95 `- // make sure this thread is inside the matrix` 96 `- if (i + i_offset >= n ||` 97 `- j + j_offset >= n) {` 98 `- return;` 99 `- }` 100 `-` 101 `- // initialize accumulators and result` 102 `- float sum_x, sum_y, sum_x2, sum_y2, sum_xy, coeff;` 103 `- sum_x = sum_y = sum_x2 = sum_y2 = sum_xy = coeff = 0.0f;` 104 `-` 105 `- // compute the sums` 106 `- for (int k = 0; k < m; k++) {` 107 `- int x = A[i * m + k];` 108 `- int y = B[j * m + k];` 109 `-` 110 `- sum_x += x;` 111 `- sum_y += y;` 112 `- sum_x2 += x * x;` 113 `- sum_y2 += y * y;` 114 `- sum_xy += x * y;` 115 `- }` 116 `-` 117 `- // compute the pearson coefficient using the "sometimes numerically` 118 `- // unstable" because it's waaaay more computationally efficient` 119 `- coeff = (m * sum_xy - sum_x * sum_y) /` 120 `- sqrtf((m * sum_x2 - sum_x * sum_x) * (m * sum_y2 - sum_y * sum_y));` 121 `-` 122 `- // dump it in the appropriate bucket` 123 `- int bucket = (int)(coeff * num_buckets);` 124 `- if (bucket >= num_buckets) {` 125 `- atomicAdd(&(buckets[num_buckets - 1]), 1);` 126 `- } else if (bucket >= 1) {` 127 `- atomicAdd(&(buckets[bucket - 1]), 1);` 128 `- }` 129 `- }` 130 `- ''')` 131 `- pearson_kernel = kernel.get_function('pearson')` 132 `-` 133 `- n = len(pyroprints)` 134 `- m = len(pyroprints[0])` 135 `- ` 136 `- block_size = 16` 137 `- tile_size = 64` 138 `- num_tiles = (n / (tile_size * block_size)) + 1` 139 `-` 140 `- buckets = numpy.zeros(shape=(num_buckets, 1), dtype=numpy.int32, order='C')` 141 `- buckets_gpu = pycuda.gpuarray.to_gpu(buckets)` 142 `-` 143 `- for s in range(num_tiles):` 144 `- for t in range(num_tiles):` 145 `- num_A = tile_size * block_size` 146 `- remain_A = n - (s * tile_size * block_size)` 147 `- num_A = num_A if num_A < remain_A else remain_A` 148 `-` 149 `- A = 
numpy.zeros(shape=(num_A, m), dtype=numpy.int32, order='C')` 150 `- for i in range(num_A):` 151 `- numpy.put(A[i], range(m), pyroprints[(s * tile_size * block_size) + i])` 152 `-` 153 `- num_B = tile_size * block_size` 154 `- remain_B = n - (t * tile_size * block_size)` 155 `- num_B = num_B if num_B < remain_B else remain_B` 156 `-` 157 `- B = numpy.zeros(shape=(num_B, m), dtype=numpy.int32, order='C')` 158 `- for i in range(num_B):` 159 `- numpy.put(B[i], range(m), pyroprints[(t * tile_size * block_size) + i])` 160 `-` 161 `- pearson_kernel(buckets_gpu.gpudata, numpy.int32(num_buckets),` 162 `- cuda.In(A), numpy.int32(num_A),` 163 `- cuda.In(B), numpy.int32(num_B),` 164 `- numpy.int32(s), numpy.int32(t),` 165 `- numpy.int32(n), numpy.int32(m),` 166 `- block=(block_size, block_size, 1),` 167 `- grid=(tile_size, tile_size))` 168 `-` 169 `- progress = ((s * num_tiles + t) * 100) / (num_tiles * num_tiles)` 170 `- print('%d%% complete' % progress)` 171 `-` 172 `- buckets_gpu.get(buckets)` 81 `+ print('\rComputing correlations 100.000%')` 173 82 ` return buckets` 174 83 ` ` 175 84 ` if __name__ == '__main__':`

## 0 notes on commit `0c2f4a5`

Please sign in to comment.
Something went wrong with that request. Please try again.