-
Notifications
You must be signed in to change notification settings - Fork 4
/
cuda_kernel_sum_all.go
123 lines (106 loc) · 3.49 KB
/
cuda_kernel_sum_all.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
package mt
// KER_MATRIX_SUM_ALL holds the PTX text of the matrixSumAll kernel. Per the
// header embedded in the string it was generated by the NVIDIA NVVM compiler
// (CUDA tools release 6.0) for PTX .version 4.0, .target sm_30, 64-bit
// addresses. The string is compiler output: do not edit it by hand, since any
// change to the literal changes the program that is loaded.
//
// Kernel parameters, in declaration order:
//
//	matrixSumAll_param_0 (.u64)  address of the input; it is converted with
//	                             cvta.to.global and f64 values are loaded
//	                             from it at 8-byte strides.
//	matrixSumAll_param_1 (.u32)  number of consecutive elements each thread
//	                             visits; compared as a signed value, and a
//	                             value <= 0 skips the accumulation loop.
//	matrixSumAll_param_2 (.u32)  exclusive upper bound on the element index;
//	                             an index >= this value (signed compare) is
//	                             skipped rather than loaded.
//	matrixSumAll_param_3 (.u64)  address of the output; thread 0 stores one
//	                             f64 there.
//
// What the kernel does:
//
//  1. A thread with %tid.x = t starts at element index t*param_1 and walks
//     param_1 consecutive indices, adding every in-bounds element to an f64
//     partial sum that starts at 0.
//  2. The partial sum is stored into slot t (8 bytes per slot) of an
//     8192-byte .shared buffer, followed by bar.sync 0.
//  3. The thread with %tid.x == 0 then adds slots 1..1023 onto slot 0. The
//     loop is unrolled 11 adds per iteration and runs until its counter
//     reaches 1023; the running total is also written back to slot 0 along
//     the way. The final total is stored to param_3 with st.global.f64.
//     All other threads return right after the barrier.
//
// NOTE(review): step 3 always reads all 1024 slots of the shared buffer,
// regardless of how many threads wrote one, and the kernel reads only %tid.x
// (no block index or block size). This looks like it expects to be launched
// as a single block of exactly 1024 threads in x -- confirm against the
// launch code; with fewer threads the unwritten slots would be read
// uninitialized.
const KER_MATRIX_SUM_ALL = `
//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Thu Mar 13 19:31:35 2014 (1394735495)
// Cuda compilation tools, release 6.0, V6.0.1
//
.version 4.0
.target sm_30
.address_size 64
// matrixSumAll$__cuda_local_var_34215_33_non_const_res has been demoted
.visible .entry matrixSumAll(
.param .u64 matrixSumAll_param_0,
.param .u32 matrixSumAll_param_1,
.param .u32 matrixSumAll_param_2,
.param .u64 matrixSumAll_param_3
)
{
.reg .pred %p<6>;
.reg .s32 %r<16>;
.reg .s64 %rd<17>;
.reg .f64 %fd<34>;
// demoted variable
.shared .align 8 .b8 matrixSumAll$__cuda_local_var_34215_33_non_const_res[8192];
ld.param.u64 %rd7, [matrixSumAll_param_0];
ld.param.u32 %r9, [matrixSumAll_param_1];
ld.param.u32 %r10, [matrixSumAll_param_2];
ld.param.u64 %rd8, [matrixSumAll_param_3];
mov.u32 %r1, %tid.x;
setp.gt.s32 %p1, %r9, 0;
@%p1 bra BB0_2;
mov.f64 %fd32, 0d0000000000000000;
bra.uni BB0_6;
BB0_2:
cvta.to.global.u64 %rd9, %rd7;
mul.lo.s32 %r13, %r1, %r9;
mul.wide.s32 %rd10, %r13, 8;
add.s64 %rd15, %rd9, %rd10;
mov.f64 %fd32, 0d0000000000000000;
mov.u32 %r14, 0;
BB0_3:
setp.ge.s32 %p2, %r13, %r10;
@%p2 bra BB0_5;
ld.global.f64 %fd10, [%rd15];
add.rn.f64 %fd32, %fd32, %fd10;
BB0_5:
add.s64 %rd15, %rd15, 8;
add.s32 %r13, %r13, 1;
add.s32 %r14, %r14, 1;
setp.lt.s32 %p3, %r14, %r9;
@%p3 bra BB0_3;
BB0_6:
mul.wide.u32 %rd11, %r1, 8;
mov.u64 %rd16, matrixSumAll$__cuda_local_var_34215_33_non_const_res;
add.s64 %rd13, %rd16, %rd11;
st.shared.f64 [%rd13], %fd32;
bar.sync 0;
setp.ne.s32 %p4, %r1, 0;
@%p4 bra BB0_10;
cvta.to.global.u64 %rd4, %rd8;
ld.shared.f64 %fd33, [matrixSumAll$__cuda_local_var_34215_33_non_const_res];
mov.u32 %r15, 0;
BB0_8:
mov.u64 %rd5, %rd16;
ld.shared.f64 %fd11, [%rd5+8];
add.rn.f64 %fd12, %fd33, %fd11;
ld.shared.f64 %fd13, [%rd5+16];
st.shared.f64 [matrixSumAll$__cuda_local_var_34215_33_non_const_res], %fd12;
add.rn.f64 %fd14, %fd12, %fd13;
st.shared.f64 [matrixSumAll$__cuda_local_var_34215_33_non_const_res], %fd14;
ld.shared.f64 %fd15, [%rd5+24];
add.rn.f64 %fd16, %fd14, %fd15;
ld.shared.f64 %fd17, [%rd5+32];
st.shared.f64 [matrixSumAll$__cuda_local_var_34215_33_non_const_res], %fd16;
add.rn.f64 %fd18, %fd16, %fd17;
st.shared.f64 [matrixSumAll$__cuda_local_var_34215_33_non_const_res], %fd18;
ld.shared.f64 %fd19, [%rd5+40];
add.rn.f64 %fd20, %fd18, %fd19;
ld.shared.f64 %fd21, [%rd5+48];
st.shared.f64 [matrixSumAll$__cuda_local_var_34215_33_non_const_res], %fd20;
add.rn.f64 %fd22, %fd20, %fd21;
st.shared.f64 [matrixSumAll$__cuda_local_var_34215_33_non_const_res], %fd22;
ld.shared.f64 %fd23, [%rd5+56];
add.rn.f64 %fd24, %fd22, %fd23;
ld.shared.f64 %fd25, [%rd5+64];
st.shared.f64 [matrixSumAll$__cuda_local_var_34215_33_non_const_res], %fd24;
add.rn.f64 %fd26, %fd24, %fd25;
st.shared.f64 [matrixSumAll$__cuda_local_var_34215_33_non_const_res], %fd26;
ld.shared.f64 %fd27, [%rd5+72];
add.rn.f64 %fd28, %fd26, %fd27;
ld.shared.f64 %fd29, [%rd5+80];
st.shared.f64 [matrixSumAll$__cuda_local_var_34215_33_non_const_res], %fd28;
add.rn.f64 %fd30, %fd28, %fd29;
st.shared.f64 [matrixSumAll$__cuda_local_var_34215_33_non_const_res], %fd30;
add.s64 %rd16, %rd5, 88;
ld.shared.f64 %fd31, [%rd5+88];
add.rn.f64 %fd33, %fd30, %fd31;
st.shared.f64 [matrixSumAll$__cuda_local_var_34215_33_non_const_res], %fd33;
add.s32 %r15, %r15, 11;
setp.ne.s32 %p5, %r15, 1023;
@%p5 bra BB0_8;
st.global.f64 [%rd4], %fd33;
BB0_10:
ret;
}
`