In [3]:
using CUDA

In [4]:
"""
    matrizTranspuesta!(entrada, salida)

CUDA kernel that writes the transpose of the matrix `entrada` into `salida`.

Each thread handles at most one element: the thread at global position
`(fila, columna)` copies `entrada[fila, columna]` to `salida[columna, fila]`.

# Arguments
- `entrada`: device matrix of size `(m, n)` to read from.
- `salida`: device matrix of size `(n, m)` that receives the transpose.

The launch configuration (threads × blocks) must cover at least `m` positions
along `x` and `n` positions along `y`; surplus threads are ignored. The kernel
always returns `nothing`.
"""
function matrizTranspuesta!(entrada, salida)
    # Global 2-D position of this thread. With a single block this is just
    # threadIdx(), so a one-block launch of size(entrada) behaves as before.
    fila    = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    columna = threadIdx().y + (blockIdx().y - 1) * blockDim().y

    # Guard against threads that fall outside the matrix, so the launch shape
    # no longer has to match the matrix shape exactly.
    if fila <= size(entrada, 1) && columna <= size(entrada, 2)
        salida[columna, fila] = entrada[fila, columna]
    end
    return nothing
end

matrizTranspuesta! (generic function with 1 method)

In [6]:
# Host input: a 6×10 Int32 matrix whose entries equal their own column-major
# linear index (1 … 60), which makes the transpose easy to verify by eye.
entrada_hst = reshape(collect(Int32, 1:60), 6, 10)
entrada_hst

6×10 Matrix{Int32}:
 1   7  13  19  25  31  37  43  49  55
 2   8  14  20  26  32  38  44  50  56
 3   9  15  21  27  33  39  45  51  57
 4  10  16  22  28  34  40  46  52  58
 5  11  17  23  29  35  41  47  53  59
 6  12  18  24  30  36  42  48  54  60

In [7]:
# Upload the input to the device, then create the 10×6 Int64 output buffer,
# zero-filled, directly on the device.
entrada_dev = CuArray(entrada_hst)
salida_dev = CUDA.zeros(Int64, 10, 6)

10×6 CuArray{Int64, 2, CUDA.Mem.DeviceBuffer}:
 0  0  0  0  0  0
 0  0  0  0  0  0
 0  0  0  0  0  0
 0  0  0  0  0  0
 0  0  0  0  0  0
 0  0  0  0  0  0
 0  0  0  0  0  0
 0  0  0  0  0  0
 0  0  0  0  0  0
 0  0  0  0  0  0

In [8]:
# Launch one block with one thread per element of the input matrix.
# `CUDA.@sync` (not the unqualified `@sync`, which is Base's task-level macro)
# blocks the host until the kernel has actually finished on the device.
CUDA.@sync @cuda threads=size(entrada_dev) matrizTranspuesta!(entrada_dev, salida_dev)

CUDA.HostKernel{typeof(matrizTranspuesta!), Tuple{CuDeviceMatrix{Int32, 1}, CuDeviceMatrix{Int64, 1}}}(matrizTranspuesta!, CuContext(0x00000000662be020, instance cf70d06f966b64f1), CuModule(Ptr{Nothing} @0x0000000066789190, CuContext(0x00000000662be020, instance cf70d06f966b64f1)), CuFunction(Ptr{Nothing} @0x00000000668a43a0, CuModule(Ptr{Nothing} @0x0000000066789190, CuContext(0x00000000662be020, instance cf70d06f966b64f1))), CUDA.KernelState(Ptr{Nothing} @0x0000000604000000))

In [9]:
# Copy the kernel's output from the device array back into a host Array.
salida_hst = Array(salida_dev)

10×6 Matrix{Int64}:
  1   2   3   4   5   6
  7   8   9  10  11  12
 13  14  15  16  17  18
 19  20  21  22  23  24
 25  26  27  28  29  30
 31  32  33  34  35  36
 37  38  39  40  41  42
 43  44  45  46  47  48
 49  50  51  52  53  54
 55  56  57  58  59  60

In [3]:
"""
    matrix_transpose_shared!(input, output)

CUDA kernel that writes the transpose of the matrix `input` into `output`,
staging each tile in shared memory.

Every block copies one tile of `input` into shared memory, synchronizes, and
then writes the tile back to `output` at the transposed tile position, reading
the shared tile with swapped indices.

# Arguments
- `input`: device matrix of size `(m, n)` to read from.
- `output`: device matrix of size `(n, m)` that receives the transpose.

# Launch requirements
- Thread blocks must be square and at most 32×32, because the shared tile is
  statically sized for that and the transposed read swaps the thread indices.
- The grid must cover `m` positions along `x` and `n` along `y`; threads that
  fall outside the matrix skip their load/store.

The kernel always returns `nothing`.
"""
function matrix_transpose_shared!(input, output)
    # 32×32 tile plus one extra row of padding in the leading dimension; the
    # padding is the usual trick to avoid shared-memory bank conflicts on the
    # transposed read below.
    tile = CUDA.CuStaticSharedArray(eltype(input), (32 + 1, 32))

    nrows, ncols = size(input)

    # Position of this thread inside its block.
    tx = threadIdx().x
    ty = threadIdx().y

    # Zero-based offset of this block's tile within `input`.
    row_offset = (blockIdx().x - 1) * blockDim().x
    col_offset = (blockIdx().y - 1) * blockDim().y

    # Load phase: consecutive threadIdx().x values read consecutive rows of one
    # column, i.e. neighbouring addresses in column-major storage.
    row = row_offset + tx
    col = col_offset + ty
    if row <= nrows && col <= ncols
        tile[tx, ty] = input[row, col]
    end

    # Every thread of the block must reach the barrier, hence guarded
    # load/store blocks instead of an early return.
    CUDA.sync_threads()

    # Store phase: the tile lands at the transposed tile position and is read
    # with swapped local indices, so consecutive threadIdx().x values again
    # write neighbouring addresses of `output`.
    out_row = col_offset + tx
    out_col = row_offset + ty
    if out_row <= ncols && out_col <= nrows
        output[out_row, out_col] = tile[ty, tx]
    end
    return nothing
end


matrix_transpose_shared! (generic function with 1 method)

In [5]:
using CUDA

# Host input: a random 64×64 Float64 matrix. `b` is an uninitialised host
# buffer of the same shape, kept in case other cells refer to it.
a = rand(64, 64)
b = similar(a)

# Device buffers. The output is allocated directly on the device with the
# transposed shape; uploading an uninitialised host array would only add a
# pointless host-to-device copy.
a_d = CuArray(a)
b_d = similar(a_d, reverse(size(a_d)))

# 32×32 threads per block, and enough blocks (rounded up) to cover the matrix.
blockSize = (32, 32, 1)
gridSize  = (cld(size(a, 1), blockSize[1]), cld(size(a, 2), blockSize[2]), 1)

# `CUDA.@sync` waits for the kernel to finish on the device; the unqualified
# `@sync` is Base's task-level macro and does not synchronize the GPU.
CUDA.@sync @cuda threads=blockSize blocks=gridSize matrix_transpose_shared!(a_d, b_d)





CUDA.HostKernel{typeof(matrix_transpose_shared!), Tuple{CuDeviceMatrix{Float64, 1}, CuDeviceMatrix{Float64, 1}}}(matrix_transpose_shared!, CuFunction(Ptr{Nothing} @0x0000000005257560, CuModule(Ptr{Nothing} @0x0000000005021f00, CuContext(0x000000000387b6b0, instance ac19b0f8f975c44a))), CUDA.KernelState(Ptr{Nothing} @0x00007f123aa00000))

In [10]:
# Bring the device result back to the host and compare it element-wise with
# the transpose of `a` computed on the host.
permutedims(a) == Array(b_d)

true