In [1]:
import numpy as np
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom
# loopy currently requires on pyopencl 
import loopy as lp
lp.set_caching_enabled(False)
from warnings import filterwarnings, catch_warnings
filterwarnings('error', category=lp.LoopyWarning)
ctx = cl.create_some_context(interactive=False)
queue = cl.CommandQueue(ctx)
# Set up pyopencl.Context & CommandQueue

In [2]:
n = 16*16
x_vec_dev = cl.clrandom.rand(queue, n, dtype=np.float32) # device side
y_vec_dev = cl.clrandom.rand(queue, n, dtype=np.float32)
z_vec_dev = cl.clrandom.rand(queue, n, dtype=np.float32)
a_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=np.float32)
b_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=np.float32)
x_vec_host = np.random.randn(n).astype(np.float32) # host side
y_vec_host = np.random.randn(n).astype(np.float32)

In [3]:
knl = lp.make_kernel(
     "{ [i,j,ii,jj]: 0<=i,j,ii,jj<n }",
     """
     out[j,i] = a[i,j] {id=transpose}
     out[ii,jj] = 2*out[ii,jj]  {dep=transpose}
     """)
knl = lp.prioritize_loops(knl, "i,j,ii,jj")
knl = lp.set_options(knl, "write_cl")
print(knl) # Kernel info, including loop domain, instructions and arguments

---------------------------------------------------------------------------
KERNEL: loopy_kernel
---------------------------------------------------------------------------
ARGUMENTS:
a: GlobalArg, type: <runtime>, shape: (n, n), dim_tags: (N1:stride:n, N0:stride:1)
n: ValueArg, type: <runtime>
out: GlobalArg, type: <runtime>, shape: (n, n), dim_tags: (N1:stride:n, N0:stride:1)
---------------------------------------------------------------------------
DOMAINS:
[n] -> { [i, j, ii, jj] : 0 <= i < n and 0 <= j < n and 0 <= ii < n and 0 <= jj < n }
---------------------------------------------------------------------------
INAME IMPLEMENTATION TAGS:
i: None
ii: None
j: None
jj: None
---------------------------------------------------------------------------
INSTRUCTIONS:
  for j, i
↱     [36mout[j, i][0m = [35ma[i, j][0m  {id=[32mtranspose[0m}
│ end j, i
│ for ii, jj
└     [36mout[ii, jj][0m = [35m2*out[ii, jj][0m  {id=[32minsn[0m}
  end ii, jj
--------------------------------

In [4]:
evt, (out,) = knl(queue, a=a_mat_dev)
assert (out.get() == a_mat_dev.get().T*2).all()

[36m#[39;49;00m[36mdefine lid(N) ((int) get_local_id(N))[39;49;00m[36m[39;49;00m
[36m#[39;49;00m[36mdefine gid(N) ((int) get_group_id(N))[39;49;00m[36m[39;49;00m

__kernel [36mvoid[39;49;00m [32m__attribute__[39;49;00m ((reqd_work_group_size([34m1[39;49;00m, [34m1[39;49;00m, [34m1[39;49;00m))) loopy_kernel(__global [36mfloat[39;49;00m [34mconst[39;49;00m *__restrict__ a, [36mint[39;49;00m [34mconst[39;49;00m n, __global [36mfloat[39;49;00m *__restrict__ out)
{
  [34mfor[39;49;00m ([36mint[39;49;00m i = [34m0[39;49;00m; i <= -[34m1[39;49;00m + n; ++i)
    [34mfor[39;49;00m ([36mint[39;49;00m j = [34m0[39;49;00m; j <= -[34m1[39;49;00m + n; ++j)
      out[n * j + i] = a[n * i + j];
  [34mfor[39;49;00m ([36mint[39;49;00m ii = [34m0[39;49;00m; ii <= -[34m1[39;49;00m + n; ++ii)
    [34mfor[39;49;00m ([36mint[39;49;00m jj = [34m0[39;49;00m; jj <= -[34m1[39;49;00m + n; ++jj)
      out[n * ii + jj] = [34m2.0f[39;49;00m * out[n * 

In [5]:
knl = lp.set_options(knl, write_wrapper=True, write_cl=False) # peek at generated code

In [6]:
evt, (out,) = knl(queue, a=x_vec_host)

[34mfrom[39;49;00m [04m[36m__future__[39;49;00m [34mimport[39;49;00m division

[34mimport[39;49;00m [04m[36mnumpy[39;49;00m [34mas[39;49;00m [04m[36m_lpy_np[39;49;00m
[34mimport[39;49;00m [04m[36mpyopencl[39;49;00m [34mas[39;49;00m [04m[36m_lpy_cl[39;49;00m
[34mimport[39;49;00m [04m[36mpyopencl.array[39;49;00m [34mas[39;49;00m [04m[36m_lpy_cl_array[39;49;00m
[34mimport[39;49;00m [04m[36mpyopencl.tools[39;49;00m [34mas[39;49;00m [04m[36m_lpy_cl_tools[39;49;00m

[34mdef[39;49;00m [32m_lpy_host_loopy_kernel[39;49;00m(_lpy_cl_kernels, queue, a, n, out, wait_for=[36mNone[39;49;00m, allocator=[36mNone[39;49;00m):
    [34mfrom[39;49;00m [04m[36mstruct[39;49;00m [34mimport[39;49;00m pack [34mas[39;49;00m _lpy_pack
    [34mimport[39;49;00m [04m[36mpyopencl[39;49;00m [34mas[39;49;00m [04m[36m_lpy_cl[39;49;00m
    [34mimport[39;49;00m [04m[36mpyopencl.tools[39;49;00m

    [34mif[39;49;00m allocator [35mis[39;49;00m

TypeError: shape mismatch on argument 'a' (got: (256,), expected: (256, 256))

In [7]:
lp.show_dependency_graph(knl)

  % (type(e).__name__, e))
