/
madd.go
131 lines (121 loc) · 4.97 KB
/
madd.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
package cuda
import (
"github.com/mumax/3/data"
"github.com/mumax/3/util"
)
// Mul computes the point-wise product dst[i] = a[i] * b[i], one component at a time.
// dst, a and b must all have the same length and the same number of components;
// util.Assert is called with that condition before any work is issued.
func Mul(dst, a, b *data.Slice) {
	n, comps := dst.Len(), dst.NComp()

	// Both operands have to match the destination's shape exactly.
	sameShape := a.Len() == n && a.NComp() == comps && b.Len() == n && b.NComp() == comps
	util.Assert(sameShape)

	launch := make1DConf(n)
	for i := 0; i < comps; i++ {
		out, lhs, rhs := dst.DevPtr(i), a.DevPtr(i), b.DevPtr(i)
		k_mul_async(out, lhs, rhs, n, launch)
	}
}
// Div computes the point-wise quotient dst[i] = a[i] / b[i], one component at a time.
// Divide-by-zero yields zero.
// dst, a and b must all have the same length and the same number of components.
func Div(dst, a, b *data.Slice) {
	n, comps := dst.Len(), dst.NComp()

	// Numerator and denominator have to match the destination's shape exactly.
	sameShape := a.Len() == n && a.NComp() == comps && b.Len() == n && b.NComp() == comps
	util.Assert(sameShape)

	launch := make1DConf(n)
	for i := 0; i < comps; i++ {
		out, num, den := dst.DevPtr(i), a.DevPtr(i), b.DevPtr(i)
		k_pointwise_div_async(out, num, den, n, launch)
	}
}
// Add computes dst = src1 + src2 by delegating to Madd2 with both weights set to 1.
func Add(dst, src1, src2 *data.Slice) {
	const unit = 1 // plain sum: each source is weighted by one
	Madd2(dst, src1, src2, unit, unit)
}
// Madd2 computes the weighted sum
//
//	dst[i] = src1[i]*factor1 + src2[i]*factor2
//
// one component at a time. Both sources must have the same length and the
// same number of components as dst; each condition is checked with util.Assert.
func Madd2(dst, src1, src2 *data.Slice, factor1, factor2 float32) {
	n, comps := dst.Len(), dst.NComp()

	// Lengths are checked first, then component counts.
	lensMatch := src1.Len() == n && src2.Len() == n
	util.Assert(lensMatch)
	compsMatch := src1.NComp() == comps && src2.NComp() == comps
	util.Assert(compsMatch)

	launch := make1DConf(n)
	for i := 0; i < comps; i++ {
		k_madd2_async(
			dst.DevPtr(i),
			src1.DevPtr(i), factor1,
			src2.DevPtr(i), factor2,
			n, launch)
	}
}
// Madd3 computes the weighted sum
//
//	dst[i] = src1[i]*factor1 + src2[i]*factor2 + src3[i]*factor3
//
// one component at a time. Every source must have the same length and the
// same number of components as dst; each condition is checked with util.Assert.
func Madd3(dst, src1, src2, src3 *data.Slice, factor1, factor2, factor3 float32) {
	n, comps := dst.Len(), dst.NComp()

	// Lengths are checked first, then component counts.
	lensMatch := src1.Len() == n && src2.Len() == n && src3.Len() == n
	util.Assert(lensMatch)
	compsMatch := src1.NComp() == comps && src2.NComp() == comps && src3.NComp() == comps
	util.Assert(compsMatch)

	launch := make1DConf(n)
	for i := 0; i < comps; i++ {
		k_madd3_async(
			dst.DevPtr(i),
			src1.DevPtr(i), factor1,
			src2.DevPtr(i), factor2,
			src3.DevPtr(i), factor3,
			n, launch)
	}
}
// Madd4 computes the weighted sum
//
//	dst[i] = src1[i]*factor1 + src2[i]*factor2 + src3[i]*factor3 + src4[i]*factor4
//
// one component at a time. Every source must have the same length and the
// same number of components as dst; each condition is checked with util.Assert.
func Madd4(dst, src1, src2, src3, src4 *data.Slice, factor1, factor2, factor3, factor4 float32) {
	n, comps := dst.Len(), dst.NComp()

	// Lengths are checked first, then component counts.
	lensMatch := src1.Len() == n && src2.Len() == n &&
		src3.Len() == n && src4.Len() == n
	util.Assert(lensMatch)
	compsMatch := src1.NComp() == comps && src2.NComp() == comps &&
		src3.NComp() == comps && src4.NComp() == comps
	util.Assert(compsMatch)

	launch := make1DConf(n)
	for i := 0; i < comps; i++ {
		k_madd4_async(
			dst.DevPtr(i),
			src1.DevPtr(i), factor1,
			src2.DevPtr(i), factor2,
			src3.DevPtr(i), factor3,
			src4.DevPtr(i), factor4,
			n, launch)
	}
}
// Madd5 computes the weighted sum
//
//	dst[i] = src1[i]*factor1 + src2[i]*factor2 + src3[i]*factor3 +
//	         src4[i]*factor4 + src5[i]*factor5
//
// one component at a time. Every source must have the same length and the
// same number of components as dst; each condition is checked with util.Assert.
func Madd5(dst, src1, src2, src3, src4, src5 *data.Slice, factor1, factor2, factor3, factor4, factor5 float32) {
	n, comps := dst.Len(), dst.NComp()
	srcs := [...]*data.Slice{src1, src2, src3, src4, src5}

	// All lengths are compared first, stopping at the first mismatch.
	lensMatch := true
	for _, s := range srcs {
		if s.Len() != n {
			lensMatch = false
			break
		}
	}
	util.Assert(lensMatch)

	// Then all component counts, again stopping at the first mismatch.
	compsMatch := true
	for _, s := range srcs {
		if s.NComp() != comps {
			compsMatch = false
			break
		}
	}
	util.Assert(compsMatch)

	launch := make1DConf(n)
	for i := 0; i < comps; i++ {
		k_madd5_async(
			dst.DevPtr(i),
			src1.DevPtr(i), factor1,
			src2.DevPtr(i), factor2,
			src3.DevPtr(i), factor3,
			src4.DevPtr(i), factor4,
			src5.DevPtr(i), factor5,
			n, launch)
	}
}
// Madd6 computes the weighted sum
//
//	dst[i] = src1[i]*factor1 + src2[i]*factor2 + src3[i]*factor3 +
//	         src4[i]*factor4 + src5[i]*factor5 + src6[i]*factor6
//
// one component at a time. Every source must have the same length and the
// same number of components as dst; each condition is checked with util.Assert.
func Madd6(dst, src1, src2, src3, src4, src5, src6 *data.Slice, factor1, factor2, factor3, factor4, factor5, factor6 float32) {
	n, comps := dst.Len(), dst.NComp()
	srcs := [...]*data.Slice{src1, src2, src3, src4, src5, src6}

	// All lengths are compared first, stopping at the first mismatch.
	lensMatch := true
	for _, s := range srcs {
		if s.Len() != n {
			lensMatch = false
			break
		}
	}
	util.Assert(lensMatch)

	// Then all component counts, again stopping at the first mismatch.
	compsMatch := true
	for _, s := range srcs {
		if s.NComp() != comps {
			compsMatch = false
			break
		}
	}
	util.Assert(compsMatch)

	launch := make1DConf(n)
	for i := 0; i < comps; i++ {
		k_madd6_async(
			dst.DevPtr(i),
			src1.DevPtr(i), factor1,
			src2.DevPtr(i), factor2,
			src3.DevPtr(i), factor3,
			src4.DevPtr(i), factor4,
			src5.DevPtr(i), factor5,
			src6.DevPtr(i), factor6,
			n, launch)
	}
}
// Madd7 computes the weighted sum
//
//	dst[i] = src1[i]*factor1 + src2[i]*factor2 + src3[i]*factor3 +
//	         src4[i]*factor4 + src5[i]*factor5 + src6[i]*factor6 +
//	         src7[i]*factor7
//
// one component at a time. Every source must have the same length and the
// same number of components as dst; each condition is checked with util.Assert.
func Madd7(dst, src1, src2, src3, src4, src5, src6, src7 *data.Slice, factor1, factor2, factor3, factor4, factor5, factor6, factor7 float32) {
	n, comps := dst.Len(), dst.NComp()
	srcs := [...]*data.Slice{src1, src2, src3, src4, src5, src6, src7}

	// All lengths are compared first, stopping at the first mismatch.
	lensMatch := true
	for _, s := range srcs {
		if s.Len() != n {
			lensMatch = false
			break
		}
	}
	util.Assert(lensMatch)

	// Then all component counts, again stopping at the first mismatch.
	compsMatch := true
	for _, s := range srcs {
		if s.NComp() != comps {
			compsMatch = false
			break
		}
	}
	util.Assert(compsMatch)

	launch := make1DConf(n)
	for i := 0; i < comps; i++ {
		k_madd7_async(
			dst.DevPtr(i),
			src1.DevPtr(i), factor1,
			src2.DevPtr(i), factor2,
			src3.DevPtr(i), factor3,
			src4.DevPtr(i), factor4,
			src5.DevPtr(i), factor5,
			src6.DevPtr(i), factor6,
			src7.DevPtr(i), factor7,
			n, launch)
	}
}