/
reduce.go
99 lines (85 loc) · 2.79 KB
/
reduce.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
package cuda
import (
"math"
"unsafe"
"github.com/mumax/3/cuda/cu"
"github.com/mumax/3/data"
"github.com/mumax/3/util"
)
//#include "reduce.h"
import "C"
// REDUCE_BLOCKSIZE is the block size for reduce kernels.
// Its value is taken from the C constant of the same name, which is made
// available through the cgo include of reduce.h.
const REDUCE_BLOCKSIZE = C.REDUCE_BLOCKSIZE
// Sum returns the sum of all elements of in.
// in must have exactly one component, otherwise the argument check fails.
func Sum(in *data.Slice) float32 {
	util.Argument(in.NComp() == 1)

	// Take a pooled result buffer, initialized to 0, for the kernel to write into.
	buf := reduceBuf(0)
	src, n := in.DevPtr(0), in.Len()
	k_reducesum_async(src, buf, 0, n, reducecfg)

	// Fetch the result and hand the buffer back to the pool.
	return copyback(buf)
}
// Dot returns the dot product of a and b, accumulated over all components.
// a and b must have the same number of components and the same length,
// otherwise the argument check fails.
func Dot(a, b *data.Slice) float32 {
	nComp := a.NComp()
	util.Argument(nComp == b.NComp())

	// n is passed to the kernel as the element count for both inputs,
	// so the two slices must agree on their length.
	n := a.Len()
	util.Argument(n == b.Len())

	out := reduceBuf(0)
	// not async over components
	for c := 0; c < nComp; c++ {
		k_reducedot_async(a.DevPtr(c), b.DevPtr(c), out, 0, n, reducecfg) // all components add to out
	}
	return copyback(out)
}
// MaxAbs returns the maximum of the absolute values of all elements of in.
// in must have exactly one component, otherwise the argument check fails.
func MaxAbs(in *data.Slice) float32 {
	util.Argument(in.NComp() == 1)

	// Take a pooled result buffer, initialized to 0, for the kernel to write into.
	buf := reduceBuf(0)
	src, n := in.DevPtr(0), in.Len()
	k_reducemaxabs_async(src, buf, 0, n, reducecfg)

	// Fetch the result and hand the buffer back to the pool.
	return copyback(buf)
}
// MaxVecNorm returns the maximum of the norms of all vectors (x[i], y[i], z[i]),
// where x, y, z are components 0, 1 and 2 of v:
//
//	max_i sqrt( x[i]*x[i] + y[i]*y[i] + z[i]*z[i] )
func MaxVecNorm(v *data.Slice) float64 {
	buf := reduceBuf(0)
	x, y, z := v.DevPtr(0), v.DevPtr(1), v.DevPtr(2)
	k_reducemaxvecnorm2_async(x, y, z, buf, 0, v.Len(), reducecfg)

	// The square root is applied on the host to the reduced value
	// (presumably the maximum squared norm, going by the kernel name).
	res := copyback(buf)
	return math.Sqrt(float64(res))
}
// MaxVecDiff returns the maximum of the norms of the differences between
// the vectors (x1, y1, z1) of x and (x2, y2, z2) of y:
//
//	(dx, dy, dz) = (x1, y1, z1) - (x2, y2, z2)
//	max_i sqrt( dx[i]*dx[i] + dy[i]*dy[i] + dz[i]*dz[i] )
//
// x and y must have the same length, otherwise the argument check fails.
func MaxVecDiff(x, y *data.Slice) float64 {
	n := x.Len()
	util.Argument(n == y.Len())

	buf := reduceBuf(0)
	k_reducemaxvecdiff2_async(
		x.DevPtr(0), x.DevPtr(1), x.DevPtr(2),
		y.DevPtr(0), y.DevPtr(1), y.DevPtr(2),
		buf, 0, n, reducecfg)

	// The square root is applied on the host to the reduced value.
	res := copyback(buf)
	return math.Sqrt(float64(res))
}
// reduceBuffers is a pool of 1-float CUDA buffers for reduce.
// It is nil until reduceBuf is first called, which then fills it
// through initReduceBuf.
var reduceBuffers chan unsafe.Pointer
// reduceBuf takes a 1-float CUDA reduction buffer from the pool and
// returns it after queueing a memset to initVal on stream0.
// The pool is created on first use. The returned buffer is meant to be
// given back to the pool via copyback.
func reduceBuf(initVal float32) unsafe.Pointer {
	if reduceBuffers == nil {
		initReduceBuf()
	}

	ptr := <-reduceBuffers

	// The 32-bit memset takes the raw bit pattern of the float.
	bits := math.Float32bits(initVal)
	dst := cu.DevicePtr(uintptr(ptr))
	cu.MemsetD32Async(dst, bits, 1, stream0)

	return ptr
}
// copyback copies the single float32 result stored in buf from the GPU
// to the host, puts buf back into the reduce buffer pool and returns
// the copied value.
func copyback(buf unsafe.Pointer) float32 {
	var val float32
	dst := unsafe.Pointer(&val)
	MemCpyDtoH(dst, buf, cu.SIZEOF_FLOAT32)

	// The value is on the host now, so the buffer can be recycled.
	reduceBuffers <- buf
	return val
}
// initReduceBuf creates the pool of 1-float CUDA reduction buffers:
// a buffered channel filled to capacity with freshly allocated buffers.
func initReduceBuf() {
	const poolSize = 128

	reduceBuffers = make(chan unsafe.Pointer, poolSize)
	for left := poolSize; left > 0; left-- {
		buf := MemAlloc(1 * cu.SIZEOF_FLOAT32)
		reduceBuffers <- buf
	}
}
// reducecfg is the launch configuration for reduce kernels.
// The grid size of 8 is the typical number of multiprocessors; the block
// size is REDUCE_BLOCKSIZE. This could be improved, but the reductions
// take hardly ~1% of execution time.
var reducecfg = &config{
	Grid:  cu.Dim3{X: 8, Y: 1, Z: 1},
	Block: cu.Dim3{X: REDUCE_BLOCKSIZE, Y: 1, Z: 1},
}