Skip to content
This repository
Browse code

Optimizations to the matrix computation. Only compute the top right of the matrix, above (and not including) the diagonal.
  • Loading branch information...
commit 0c2f4a5d38bd1f84423f4cd78c293fa6cdea8c14 1 parent b9a3322
authored
6  biogpu/correlation.py
@@ -34,7 +34,7 @@ def pearson(pyroprints, num_buckets, show_progress = True):
34 34
             remain_A = n - (s * tile_size * block_size)
35 35
             num_A = num_A if num_A < remain_A else remain_A
36 36
 
37  
-            A = numpy.zeros(shape=(num_A, m), dtype=numpy.int32, order='C')
  37
+            A = numpy.zeros(shape=(num_A, m), dtype=numpy.float32, order='C')
38 38
             for i in range(num_A):
39 39
                 numpy.put(A[i], range(m), pyroprints[(s * tile_size * block_size) + i])
40 40
 
@@ -42,7 +42,7 @@ def pearson(pyroprints, num_buckets, show_progress = True):
42 42
             remain_B = n - (t * tile_size * block_size)
43 43
             num_B = num_B if num_B < remain_B else remain_B
44 44
 
45  
-            B = numpy.zeros(shape=(num_B, m), dtype=numpy.int32, order='C')
  45
+            B = numpy.zeros(shape=(num_B, m), dtype=numpy.float32, order='C')
46 46
             for i in range(num_B):
47 47
                 numpy.put(B[i], range(m), pyroprints[(t * tile_size * block_size) + i])
48 48
 
@@ -55,7 +55,7 @@ def pearson(pyroprints, num_buckets, show_progress = True):
55 55
                            grid=(tile_size, tile_size))
56 56
 
57 57
             if show_progress:
58  
-                progress = (s * num_tiles + t) * 100.0 / (num_tiles * num_tiles)
  58
+                progress = (s * num_tiles + t) * 100 / (num_tiles * num_tiles)
59 59
                 sys.stdout.write('\rComputing correlations %.3f%%' % progress)
60 60
                 sys.stdout.flush()
61 61
 
40  biogpu/pearson.cu
... ...
@@ -1,28 +1,34 @@
  1
+#include <stdint.h>
  2
+
1 3
 __global__ void pearson(int *buckets, int num_buckets,
2  
-                        int *A, int num_A, int *B, int num_B,
  4
+                        float *A, int num_A, float *B, int num_B,
3 5
                         int s, int t, int n, int m) {
4  
-    // calculate relative <i, j> coords within this tile
5  
-    int i = blockIdx.y * blockDim.y + threadIdx.y; // row
6  
-    int j = blockIdx.x * blockDim.x + threadIdx.x; // column
  6
+    // Calculate relative <i, j> coords within this tile.
  7
+    uint32_t i = blockIdx.y * blockDim.y + threadIdx.y; // row
  8
+    uint32_t j = blockIdx.x * blockDim.x + threadIdx.x; // column
  9
+
  10
+    // Calculate the offsets based on the tile number.
  11
+    uint32_t i_offset = s * gridDim.y * blockDim.y;
  12
+    uint32_t j_offset = t * gridDim.x * blockDim.x;
7 13
 
8  
-    // calculate the offsets based on the tile number
9  
-    int i_offset = s * gridDim.y * blockDim.y;
10  
-    int j_offset = t * gridDim.x * blockDim.x;
  14
+    // Calculate the absolute <i, j> coords within the matrix.
  15
+    uint64_t i_abs = i_offset + i;
  16
+    uint64_t j_abs = j_offset + j;
11 17
 
12  
-    // make sure this thread is inside the matrix
13  
-    if (i + i_offset >= n ||
14  
-        j + j_offset >= n) {
  18
+    // Quick checks to bail out. Only compute values inside the bounds of the
  19
+    // matrix, and above the diagonal.
  20
+    if (i_abs >= n || j_abs >= n || i_abs >= j_abs) {
15 21
         return;
16 22
     }
17 23
 
18  
-    // initialize accumulators and result
  24
+    // Initialize accumulators and the result.
19 25
     float sum_x, sum_y, sum_x2, sum_y2, sum_xy, coeff;
20 26
     sum_x = sum_y = sum_x2 = sum_y2 = sum_xy = coeff = 0.0f;
21 27
 
22  
-    // compute the sums
  28
+    // Compute the sums.
23 29
     for (int k = 0; k < m; k++) {
24  
-        int x = A[i * m + k];
25  
-        int y = B[j * m + k];
  30
+        float x = A[i * m + k];
  31
+        float y = B[j * m + k];
26 32
 
27 33
         sum_x += x;
28 34
         sum_y += y;
@@ -31,12 +37,12 @@ __global__ void pearson(int *buckets, int num_buckets,
31 37
         sum_xy += x * y;
32 38
     }
33 39
 
34  
-    // compute the pearson coefficient using the "sometimes numerically
35  
-    // unstable" method because it's way more computationally efficient
  40
+    // Compute the Pearson coefficient using the "sometimes numerically
  41
+    // unstable" method because it's way more computationally efficient.
36 42
     coeff = (m * sum_xy - sum_x * sum_y) /
37 43
             sqrtf((m * sum_x2 - sum_x * sum_x) * (m * sum_y2 - sum_y * sum_y));
38 44
 
39  
-    // dump it in the appropriate bucket
  45
+    // Dump it in the appropriate bucket.
40 46
     int bucket = (int)(coeff * num_buckets);
41 47
     if (bucket >= num_buckets) {
42 48
         atomicAdd(&(buckets[num_buckets - 1]), 1);
145  pearsonBenchmark.py
... ...
@@ -1,59 +1,56 @@
1  
-import pycuda.autoinit
2  
-import pycuda.driver as cuda
3  
-import pycuda.compiler
4  
-import pycuda.gpuarray
  1
+import sys
5 2
 import numpy
6 3
 from scipy.stats.stats import pearsonr
  4
+import biogpu.correlation
7 5
 import time
8 6
 
9 7
 def main():
10  
-    n = 2000 # number of pyroprints
  8
+    n = 512 # number of pyroprints
11 9
     m = 104 # pyroprint length
  10
+    #n = 10
  11
+    #m = 104
12 12
 
13  
-    pyroprints = numpy.zeros(shape=(n, m), dtype=numpy.int32, order='C')
  13
+    pyroprints = numpy.zeros(shape=(n, m), dtype=numpy.float32, order='C')
14 14
     for i in range(n):
15 15
         numpy.put(pyroprints[i], range(m),
16  
-                  numpy.random.random_integers(0, 30, 10).astype(numpy.int32))
  16
+                  numpy.random.rand(m).astype(numpy.float32))
17 17
 
18  
-    print('fake pyroprints:')
  18
+    print('Fake Pyroprints:')
19 19
     print(pyroprints)
20  
-    print('\n');
  20
+    print('')
21 21
 
22  
-    print('=== computing with python ===')
  22
+    print('=== Computing with Python/SciPy ===')
23 23
     python_start = time.time()
24 24
     python_buckets = compute_python(pyroprints, 10000)
25 25
     python_end = time.time()
26 26
 
27  
-    #print('buckets (abridged):')
  27
+    #print('Buckets (abridged):')
28 28
     #for i in range(10000):
29 29
     #    if python_buckets[i] > 0:
30 30
     #        print('\t[%d] = %d' % (i, python_buckets[i]))
31 31
     #print('\n')
32 32
 
33 33
     python_time = python_end - python_start
34  
-    print('computed in %f seconds' % python_time)
35  
-    print('\n')
  34
+    print('Computed in %f seconds.\n' % python_time)
36 35
 
37  
-    print('=== computing with cuda ===')
  36
+    print('=== Computing with CUDA ===')
38 37
     cuda_start = time.time()
39  
-    cuda_buckets = compute_cuda(pyroprints, 10000)
  38
+    cuda_buckets = biogpu.correlation.pearson(pyroprints, 10000)
40 39
     cuda_end = time.time()
41 40
 
42  
-    #print('buckets (abridged):')
  41
+    #print('Buckets (abridged):')
43 42
     #for i in range(10000):
44 43
     #    if cuda_buckets[i] > 0:
45 44
     #        print('\t[%d] = %d' % (i, cuda_buckets[i]))
46 45
     #print('\n')
47 46
 
48 47
     cuda_time = cuda_end - cuda_start
49  
-    print('computed in %f seconds' % cuda_time)
50  
-    print('\n')
  48
+    print('Computed in %f seconds.\n' % cuda_time)
51 49
 
52 50
     speedup = python_time / cuda_time
53  
-    print('speedup of %fx' % speedup)
54  
-    print('\n')
  51
+    print('Speedup of %.2fx.\n' % speedup)
55 52
 
56  
-    print('done')
  53
+    print('Done.')
57 54
 
58 55
 def compute_python(pyroprints, num_buckets):
59 56
     n = len(pyroprints)
@@ -62,8 +59,12 @@ def compute_python(pyroprints, num_buckets):
62 59
     matrix = numpy.zeros(shape=(n, n), dtype=numpy.float32, order='C')
63 60
     buckets = numpy.zeros(shape=(num_buckets, 1), dtype=numpy.int32, order='C')
64 61
 
  62
+    num_cells = n * n * 0.5 - n
  63
+    cell_count = 0
65 64
     for i in range(n):
66 65
         for j in range(n):
  66
+            if i >= j:
  67
+                continue
67 68
             coeff, _ = pearsonr(pyroprints[i], pyroprints[j])
68 69
             matrix[i][j] = coeff
69 70
             bucket = int(coeff * num_buckets)
@@ -71,105 +72,13 @@ def compute_python(pyroprints, num_buckets):
71 72
                 buckets[num_buckets - 1] += 1
72 73
             elif bucket >= 1:
73 74
                 buckets[bucket - 1] += 1
  75
+            cell_count += 1
74 76
 
75  
-        progress = ((i * n) * 100) / (n * n)
76  
-        print('%d%% complete' % progress)
77  
-    
78  
-    return buckets
79  
-
  77
+        progress = cell_count / num_cells * 100
  78
+        sys.stdout.write('\rComputing correlations %.3f%%' % progress)
  79
+        sys.stdout.flush()
80 80
 
81  
-def compute_cuda(pyroprints, num_buckets):
82  
-    kernel = pycuda.compiler.SourceModule('''
83  
-        __global__ void pearson(int *buckets, int num_buckets,
84  
-                                int *A, int num_A, int *B, int num_B,
85  
-                                int s, int t, int n, int m) {
86  
-
87  
-            // calculate relative <i, j> coords within this tile
88  
-            int i = blockIdx.y * blockDim.y + threadIdx.y; // row
89  
-            int j = blockIdx.x * blockDim.x + threadIdx.x; // column
90  
-
91  
-            // calculate the offsets based on the tile number
92  
-            int i_offset = s * gridDim.y * blockDim.y;
93  
-            int j_offset = t * gridDim.x * blockDim.x;
94  
-
95  
-            // make sure this thread is inside the matrix
96  
-            if (i + i_offset >= n ||
97  
-                j + j_offset >= n) {
98  
-                return;
99  
-            }
100  
-
101  
-            // initialize accumulators and result
102  
-            float sum_x, sum_y, sum_x2, sum_y2, sum_xy, coeff;
103  
-            sum_x = sum_y = sum_x2 = sum_y2 = sum_xy = coeff = 0.0f;
104  
-
105  
-            // compute the sums
106  
-            for (int k = 0; k < m; k++) {
107  
-                int x = A[i * m + k];
108  
-                int y = B[j * m + k];
109  
-
110  
-                sum_x += x;
111  
-                sum_y += y;
112  
-                sum_x2 += x * x;
113  
-                sum_y2 += y * y;
114  
-                sum_xy += x * y;
115  
-            }
116  
-
117  
-            // compute the pearson coefficient using the "sometimes numerically
118  
-            // unstable" because it's waaaay more computationally efficient
119  
-            coeff = (m * sum_xy - sum_x * sum_y) /
120  
-                    sqrtf((m * sum_x2 - sum_x * sum_x) * (m * sum_y2 - sum_y * sum_y));
121  
-
122  
-            // dump it in the appropriate bucket
123  
-            int bucket = (int)(coeff * num_buckets);
124  
-            if (bucket >= num_buckets) {
125  
-                atomicAdd(&(buckets[num_buckets - 1]), 1);
126  
-            } else if (bucket >= 1) {
127  
-                atomicAdd(&(buckets[bucket - 1]), 1);
128  
-            }
129  
-        }
130  
-    ''')
131  
-    pearson_kernel = kernel.get_function('pearson')
132  
-
133  
-    n = len(pyroprints)
134  
-    m = len(pyroprints[0])
135  
-    
136  
-    block_size = 16
137  
-    tile_size = 64
138  
-    num_tiles = (n / (tile_size * block_size)) + 1
139  
-
140  
-    buckets = numpy.zeros(shape=(num_buckets, 1), dtype=numpy.int32, order='C')
141  
-    buckets_gpu = pycuda.gpuarray.to_gpu(buckets)
142  
-
143  
-    for s in range(num_tiles):
144  
-        for t in range(num_tiles):
145  
-            num_A = tile_size * block_size
146  
-            remain_A = n - (s * tile_size * block_size)
147  
-            num_A = num_A if num_A < remain_A else remain_A
148  
-
149  
-            A = numpy.zeros(shape=(num_A, m), dtype=numpy.int32, order='C')
150  
-            for i in range(num_A):
151  
-                numpy.put(A[i], range(m), pyroprints[(s * tile_size * block_size) + i])
152  
-
153  
-            num_B = tile_size * block_size
154  
-            remain_B = n - (t * tile_size * block_size)
155  
-            num_B = num_B if num_B < remain_B else remain_B
156  
-
157  
-            B = numpy.zeros(shape=(num_B, m), dtype=numpy.int32, order='C')
158  
-            for i in range(num_B):
159  
-                numpy.put(B[i], range(m), pyroprints[(t * tile_size * block_size) + i])
160  
-
161  
-            pearson_kernel(buckets_gpu.gpudata, numpy.int32(num_buckets),
162  
-                           cuda.In(A), numpy.int32(num_A),
163  
-                           cuda.In(B), numpy.int32(num_B),
164  
-                           numpy.int32(s), numpy.int32(t),
165  
-                           numpy.int32(n), numpy.int32(m),
166  
-                           block=(block_size, block_size, 1),
167  
-                           grid=(tile_size, tile_size))
168  
-
169  
-            progress = ((s * num_tiles + t) * 100) / (num_tiles * num_tiles)
170  
-            print('%d%% complete' % progress)
171  
-
172  
-    buckets_gpu.get(buckets)
  81
+    print('\rComputing correlations 100.000%')
173 82
     return buckets
174 83
 
175 84
 if __name__ == '__main__':

0 notes on commit 0c2f4a5

Please sign in to comment.
Something went wrong with that request. Please try again.