In [1]:
using CUDA
# Print the CUDA toolkit, driver, library, toolchain and device summary for
# this session (useful context for the timings recorded further down).
CUDA.versioninfo()

CUDA toolkit 11.7, artifact installation
Unknown NVIDIA driver, for CUDA 11.6
CUDA driver 11.6

Libraries: 
- CUBLAS: 11.10.1
- CURAND: 10.2.10
- CUFFT: 10.7.1
- CUSOLVER: 11.3.5
- CUSPARSE: 11.7.3
- CUPTI: 17.0.0
- NVML: missing
- CUDNN: 8.30.2 (for CUDA 11.5.0)
- CUTENSOR: 1.4.0 (for CUDA 11.5.0)

Toolchain:
- Julia: 1.7.2
- LLVM: 12.0.1
- PTX ISA support: 3.2, 4.0, 4.1, 4.2, 4.3, 5.0, 6.0, 6.1, 6.3, 6.4, 6.5, 7.0
- Device capability support: sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80

1 device:
  0: NVIDIA GeForce GTX 1070 (sm_61, 7.074 GiB / 8.000 GiB available)


In [2]:
# Advance the state `(x, y)` by one classical fourth-order Runge–Kutta step of
# size `h` and return the new `(x, y)` as a tuple.
# `odefcn` is defined outside this block; it is called with a 2-element vector
# and its result is indexed at [1] and [2] (presumably dx/dt and dy/dt —
# confirm against its definition).
function _rk4_step(x, y, h)
    k1 = odefcn([x, y])
    k2 = odefcn([x + k1[1] * h/2, y + k1[2] * h/2])
    k3 = odefcn([x + k2[1] * h/2, y + k2[2] * h/2])
    k4 = odefcn([x + k3[1] * h, y + k3[2] * h])

    xnext = x + (h/6) * (k1[1] + 2*k2[1] + 2*k3[1] + k4[1])
    ynext = y + (h/6) * (k1[2] + 2*k2[2] + 2*k3[2] + k4[2])
    return xnext, ynext
end

"""
    RK4(z0, h)

Integrate the system given by `odefcn` from the initial state
`(z0[1], z0[2])` with fixed step size `h`, and return the `x` value at which
`y` changes sign for the second time.

The first step away from `z0` is taken without a sign check. After that, steps
continue until two of them have produced a strict sign change in `y`
(`y * yprev < 0`). The returned value is the linear interpolation, between the
last two points, of the `x` at which `y == 0`.

# Arguments
- `z0`: indexable container; only `z0[1]` (initial `x`) and `z0[2]`
  (initial `y`) are read.
- `h`: step size.

# Notes
- The loop has no iteration limit: if `y` never changes sign twice, this
  function does not return.
- A step that lands exactly on `y == 0` is not counted, because the test is a
  strict inequality.
"""
function RK4(z0, h)
    xprev, yprev = z0[1], z0[2]

    # Initial step: deliberately excluded from the sign-change count.
    x, y = _rk4_step(xprev, yprev, h)

    crossings = 0
    while crossings < 2
        xprev, yprev = x, y
        x, y = _rk4_step(xprev, yprev, h)

        if y*yprev < 0
            crossings += 1
        end
    end

    # Linear interpolation between (xprev, yprev) and (x, y) for y == 0.
    xzero = xprev - yprev*(x-xprev)/(y-yprev)
    return xzero
end

RK4 (generic function with 1 method)

In [3]:
"""
    RK4loop(x, diff, h)

For every `i` in `1:length(x)`, run `RK4` from the initial state `(x[i], 0)`
with step size `h` and store the result in `diff[i]`. `diff` is modified in
place and returned.

# Arguments
- `x`: collection of initial `x` values.
- `diff`: output array; must be indexable at every `i` in `1:length(x)`.
- `h`: step size forwarded to `RK4`.

# Throws
- `BoundsError` if `diff` cannot be indexed over `1:length(x)`. This is
  checked once, before any work is done.
"""
function RK4loop(x, diff, h)
    n = length(x)
    # The store below is `@inbounds`; validate the whole index range once so a
    # too-short `diff` raises an error instead of being written out of bounds.
    checkbounds(diff, 1:n)
    for i in 1:n
        xi = x[i]
        # A tuple avoids allocating a 1×2 matrix per iteration; `zero(xi)`
        # keeps the same element type the `[x[i] 0]` literal promoted to.
        z0 = (xi, zero(xi))
        @inbounds diff[i] = RK4(z0, h)
    end
    return diff
end

RK4loop (generic function with 1 method)

In [6]:
# Problem size: 1 << 20 == 2^20 == 1_048_576 elements per vector.
N = 1 << 20
# Two Float32 CuArrays of length N, filled with ones and twos respectively.
x_d = CUDA.fill(1f0, N)
y_d = CUDA.fill(2f0, N)

1048576-element CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}:
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 ⋮
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0

In [7]:
"""
    gpu_add1!(y, x)

Add `x[i]` to `y[i]` in place for every `i` in `1:length(y)`, visiting the
elements one after another in a single loop. Returns `nothing`.

Indexing is done under `@inbounds`, so the caller must make sure `x` has at
least `length(y)` elements.
"""
function gpu_add1!(y, x)
    n = length(y)
    for idx in Base.OneTo(n)
        @inbounds y[idx] = y[idx] + x[idx]
    end
    return nothing
end

# Reset y_d to all twos, launch gpu_add1! through @cuda (no threads/blocks
# arguments are given), and verify that every element became 3.
fill!(y_d, 2)
@cuda gpu_add1!(y_d, x_d)
using Test
# Array(y_d) converts the CuArray to a plain Array before the comparison.
@test all(Array(y_d) .== 3.0f0)

[32m[1mTest Passed[22m[39m
  Expression: all(Array(y_d) .== 3.0f0)

In [9]:
"""
    bench_gpu1!(y, x)

Launch `gpu_add1!(y, x)` via `@cuda`, wrapped in `CUDA.@sync` so that the
synchronisation performed by that macro is part of the call (and therefore
part of anything that times it). Returns the value of the `@cuda` expression.
"""
function bench_gpu1!(y, x)
    kernel = CUDA.@sync @cuda gpu_add1!(y, x)
    return kernel
end

bench_gpu1! (generic function with 1 method)

In [10]:
using BenchmarkTools
# `$` interpolates the global arrays into the benchmark expression, so @btime
# times the call itself rather than access to untyped global variables.
@btime bench_gpu1!($y_d, $x_d)

  141.953 ms (52 allocations: 3.58 KiB)


CUDA.HostKernel{typeof(gpu_add1!), Tuple{CuDeviceVector{Float32, 1}, CuDeviceVector{Float32, 1}}}(gpu_add1!, CuFunction(Ptr{Nothing} @0x000000007f061290, CuModule(Ptr{Nothing} @0x000000007ef6d3b0, CuContext(0x000000007d3e50e0, instance bd0865608a9b1c07))), CUDA.KernelState(Ptr{Nothing} @0x0000000603e00000))

In [13]:
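# Shell command (note the `$` prompt), not Julia code: run it from a system
# terminal rather than a notebook cell, followed by the program to profile
# (presumably the `julia` executable):
#   $ nvprof --profile-from-start off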

LoadError: syntax: invalid operator "--"