@@ -52,7 +52,7 @@ function vectorized_solve(probs, prob::ODEProblem, alg;
52
52
53
53
# Handle tstops
54
54
tstops = cu (tstops)
55
- dev = CUDADevice {#=prefer_blocks=#true} ()
55
+ dev = CUDADevice {true} () #= prefer_blocks=#
56
56
if alg isa GPUTsit5
57
57
kernel = tsit5_kernel (dev)
58
58
elseif alg isa GPUVern7
@@ -61,7 +61,7 @@ function vectorized_solve(probs, prob::ODEProblem, alg;
61
61
kernel = vern9_kernel (dev)
62
62
end
63
63
event = kernel (probs, us, ts, dt, callback, tstops, nsteps, saveat, Val (save_everystep);
64
- ndrange= length (probs), dependencies= Event (dev))
64
+ ndrange = length (probs), dependencies = Event (dev))
65
65
wait (dev, event)
66
66
67
67
# we build the actual solution object on the CPU because the GPU would create one
@@ -94,27 +94,19 @@ function vectorized_solve(probs, prob::SDEProblem, alg;
94
94
us = CuMatrix {typeof(prob.u0)} (undef, (length (saveat), length (probs)))
95
95
end
96
96
97
+ dev = CUDADevice {true} () #= prefer_blocks=#
98
+
97
99
if alg isa GPUEM
98
- kernel = @cuda launch= false em_kernel (probs, us, ts, dt,
99
- saveat, Val (save_everystep))
100
+ kernel = em_kernel (dev)
100
101
elseif alg isa Union{GPUSIEA}
101
102
SciMLBase. is_diagonal_noise (prob) ? nothing :
102
103
error (" The algorithm is not compatible with the chosen noise type. Please see the documentation on the solver methods" )
103
- kernel = @cuda launch= false siea_kernel (probs, us, ts, dt,
104
- saveat, Val (save_everystep))
105
- end
106
- if debug
107
- @show CUDA. registers (kernel)
108
- @show CUDA. memory (kernel)
104
+ kernel = siea_kernel (dev)
109
105
end
110
106
111
- config = launch_configuration (kernel. fun)
112
- threads = min (length (probs), config. threads)
113
- # XXX : this kernel performs much better with all blocks active
114
- blocks = max (cld (length (probs), threads), config. blocks)
115
- threads = cld (length (probs), blocks)
116
-
117
- kernel (probs, us, ts, dt, saveat; threads, blocks)
107
+ event = kernel (probs, us, ts, dt, saveat, Val (save_everystep);
108
+ ndrange = length (probs), dependencies = Event (dev))
109
+ wait (dev, event)
118
110
119
111
ts, us
120
112
end
@@ -147,7 +139,7 @@ function vectorized_asolve(probs, prob::ODEProblem, alg;
147
139
end
148
140
149
141
tstops = cu (tstops)
150
- dev = CUDADevice {#=prefer_blocks=#true} ()
142
+ dev = CUDADevice {true} () #= prefer_blocks=#
151
143
if alg isa GPUTsit5
152
144
kernel = atsit5_kernel (dev)
153
145
elseif alg isa GPUVern7
@@ -157,7 +149,7 @@ function vectorized_asolve(probs, prob::ODEProblem, alg;
157
149
end
158
150
event = kernel (probs, us, ts, dt, callback, tstops,
159
151
abstol, reltol, saveat, Val (save_everystep);
160
- ndrange= length (probs), dependencies= Event (dev))
152
+ ndrange = length (probs), dependencies = Event (dev))
161
153
wait (dev, event)
162
154
163
155
# we build the actual solution object on the CPU because the GPU would create one
0 commit comments