In [4]:
from __future__ import division
import pyopencl as cl
import numpy as np
import pylab
from PIL import Image

def round_up(global_size, group_size):
    r = global_size % group_size
    if r == 0:
        return global_size
    return global_size + group_size - r


if __name__ == '__main__':
    # List our platforms
    platforms = cl.get_platforms()
    print 'The platforms detected are:'
    print '---------------------------'
    for platform in platforms:
        print platform.name, platform.vendor, 'version:', platform.version

    # List devices in each platform
    for platform in platforms:
        print 'The devices detected on platform', platform.name, 'are:'
        print '---------------------------'
        for device in platform.get_devices():
            print device.name, '[Type:', cl.device_type.to_string(device.type), ']'
            print 'Maximum clock Frequency:', device.max_clock_frequency, 'MHz'
            print 'Maximum allocable memory size:', int(device.max_mem_alloc_size / 1e6), 'MB'
            print 'Maximum work group size', device.max_work_group_size
            print '---------------------------'

    # Create a context with all the devices
    devices = platforms[0].get_devices()
    context = cl.Context(devices)
    print 'This context is associated with ', len(context.devices), 'devices'

    # Create a queue for transferring data and launching computations.
    # Turn on profiling to allow us to check event times.
    queue = cl.CommandQueue(context, context.devices[0],
                            properties=cl.command_queue_properties.PROFILING_ENABLE)
    print 'The queue is using the device:', queue.device.name

    program = cl.Program(context, open('hdr.cl').read()).build(options='')

    im_0 = np.array(Image.open("../orig_0.jpg").getdata())
    im0 = im_0.astype(np.uint8).copy()
    im_1 = np.array(Image.open("../orig_1.jpg").getdata())
    im1 = im_1.astype(np.uint8).copy()
    im_2 = np.array(Image.open("../orig_2.jpg").getdata())
    im2 = im_2.astype(np.uint8).copy()
    im_3 = np.array(Image.open("../orig_3.jpg").getdata())
    im3 = im_3.astype(np.uint8).copy()

    assert im0.shape==im1.shape==im2.shape==im3.shape
    out = np.empty_like(im0)

    gpu_0 = cl.Buffer(context, cl.mem_flags.READ_ONLY, im0.size)
    gpu_1 = cl.Buffer(context, cl.mem_flags.READ_ONLY, im1.size)
    gpu_2 = cl.Buffer(context, cl.mem_flags.READ_ONLY, im2.size)
    gpu_3 = cl.Buffer(context, cl.mem_flags.READ_ONLY, im3.size)
    gpu_out = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, im0.size)

    local_size = (8, 8)  # 64 pixels per work group
    global_size = tuple([round_up(g, l) for g, l in zip(im0.shape[::-1], local_size)])

    print 'Original Image shape', im0.shape
    
    
    width = np.int32(im0.shape[1])
    height = np.int32(im0.shape[0])


    cl.enqueue_copy(queue, gpu_0, im0, is_blocking=False)
    cl.enqueue_copy(queue, gpu_1, im1, is_blocking=False)
    cl.enqueue_copy(queue, gpu_2, im2, is_blocking=False)
    cl.enqueue_copy(queue, gpu_3, im3, is_blocking=False)

    event = program.hdr(queue, global_size, local_size,
                               gpu_0, gpu_1, gpu_2, gpu_3, gpu_out,
                               width, height)

    cl.enqueue_copy(queue, out, gpu_out, is_blocking=True)

    seconds = (event.profile.end - event.profile.start) / 1e9
    print("{} Million Complex FMAs in {} seconds, {} million Complex FMAs / second".format(out.sum() / 1e6, seconds, (out.sum() / seconds) / 1e6))


    id_comp2 = np.reshape(out, (612,816,3)).astype(np.uint8)
    print 'shape', id_comp2.shape
    print id_comp2[:20]
    im_comp = Image.fromarray(id_comp2, 'RGB')

    print 'shape', id_comp2.shape
    print id_comp2[:20]
    im_comp.show()

The platforms detected are:
---------------------------
Apple Apple version: OpenCL 1.2 (Sep 20 2014 22:01:02)
The devices detected on platform Apple are:
---------------------------
Intel(R) Core(TM) i5-4258U CPU @ 2.40GHz [Type: CPU ]
Maximum clock Frequency: 2400 MHz
Maximum allocable memory size: 2147 MB
Maximum work group size 1024
---------------------------
Iris [Type: GPU ]
Maximum clock Frequency: 1100 MHz
Maximum allocable memory size: 402 MB
Maximum work group size 512
---------------------------
This context is associated with  2 devices
The queue is using the device: Iris
Original Image shape (499392, 3)
157.486399 Million Complex FMAs in 0.0040984 seconds, 38426.3124634 million Complex FMAs / second
shape (612, 816, 3)
[[[102  98  24]
  [ 38  33  23]
  [ 40  35 152]
  ..., 
  [ 57 103 143]
  [248  38 142]
  [248 230  14]]

 [[ 32  28 210]
  [ 33 221  18]
  [ 38  33  22]
  ..., 
  [ 57 103 143]
  [248  38 142]
  [248 230  14]]

 [[ 29 153 206]
  [ 31 219 144]
  [163  30 21

In [8]:
from __future__ import division
import pyopencl as cl
import numpy as np
import pylab
from PIL import Image

def round_up(global_size, group_size):
    r = global_size % group_size
    if r == 0:
        return global_size
    return global_size + group_size - r


if __name__ == '__main__':
    # List our platforms
    platforms = cl.get_platforms()
    print 'The platforms detected are:'
    print '---------------------------'
    for platform in platforms:
        print platform.name, platform.vendor, 'version:', platform.version

    # List devices in each platform
    for platform in platforms:
        print 'The devices detected on platform', platform.name, 'are:'
        print '---------------------------'
        for device in platform.get_devices():
            print device.name, '[Type:', cl.device_type.to_string(device.type), ']'
            print 'Maximum clock Frequency:', device.max_clock_frequency, 'MHz'
            print 'Maximum allocable memory size:', int(device.max_mem_alloc_size / 1e6), 'MB'
            print 'Maximum work group size', device.max_work_group_size
            print '---------------------------'

    # Create a context with all the devices
    devices = platforms[0].get_devices()
    context = cl.Context(devices)
    print 'This context is associated with ', len(context.devices), 'devices'

    # Create a queue for transferring data and launching computations.
    # Turn on profiling to allow us to check event times.
    queue = cl.CommandQueue(context, context.devices[0],
                            properties=cl.command_queue_properties.PROFILING_ENABLE)
    print 'The queue is using the device:', queue.device.name

    program = cl.Program(context, open('hdr.cl').read()).build(options='')

    #in_coords, out_counts = make_coords()
    #real_coords = np.real(in_coords).copy()
    #imag_coords = np.imag(in_coords).copy()

    im0 = np.array(Image.open("../orig_0.jpg").getdata())
    him0 = im0.astype(np.float32).copy()

    im1 = np.array(Image.open("../orig_1.jpg").getdata())
    him1 = im1.astype(np.float32).copy()
    im2 = np.array(Image.open("../orig_2.jpg").getdata())
    him2 = im2.astype(np.float32).copy()
    im3 = np.array(Image.open("../orig_3.jpg").getdata())
    him3 = im3.astype(np.float32).copy()

    out = np.empty_like(him0)

    gpu_0 = cl.Buffer(context, cl.mem_flags.READ_ONLY, him0.size * 4)
    gpu_1 = cl.Buffer(context, cl.mem_flags.READ_ONLY, him1.size * 4)
    gpu_2 = cl.Buffer(context, cl.mem_flags.READ_ONLY, him2.size * 4)
    gpu_3 = cl.Buffer(context, cl.mem_flags.READ_ONLY, him3.size * 4)
    gpu_out = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, him0.size * 4)

    local_size = (8, 8)  # 64 pixels per work group, test and see
    global_size = tuple([round_up(g, l) for g, l in zip(him0.shape[::-1], local_size)])

    print him0.shape
    width = np.int32(him0.shape[1])
    height = np.int32(him0.shape[0])

    #max_iters = np.int32(1024)

    cl.enqueue_copy(queue, gpu_0, him0, is_blocking=False)
    cl.enqueue_copy(queue, gpu_1, him1, is_blocking=False)
    cl.enqueue_copy(queue, gpu_2, him2, is_blocking=False)
    cl.enqueue_copy(queue, gpu_3, him3, is_blocking=False)

    event = program.hdr(queue, global_size, local_size,
                               gpu_0, gpu_1, gpu_2, gpu_3, gpu_out,
                               width, height)

    cl.enqueue_copy(queue, out, gpu_out, is_blocking=True)

    seconds = (event.profile.end - event.profile.start) / 1e9
    print("{} Million Complex FMAs in {} seconds, {} million Complex FMAs / second".format(out.sum() / 1e6, seconds, (out.sum() / seconds) / 1e6))


    id_comp2 = np.reshape(out, (612,816,3)).astype(np.uint8)
    print 'shape', id_comp2.shape
    print id_comp2[:20]
    im_comp = Image.fromarray(id_comp2, 'RGB')

    print 'shape', id_comp2.shape
    print id_comp2[:20]
    im_comp.show()

The platforms detected are:
---------------------------
Apple Apple version: OpenCL 1.2 (Sep 20 2014 22:01:02)
The devices detected on platform Apple are:
---------------------------
Intel(R) Core(TM) i5-4258U CPU @ 2.40GHz [Type: CPU ]
Maximum clock Frequency: 2400 MHz
Maximum allocable memory size: 2147 MB
Maximum work group size 1024
---------------------------
Iris [Type: GPU ]
Maximum clock Frequency: 1100 MHz
Maximum allocable memory size: 402 MB
Maximum work group size 512
---------------------------
This context is associated with  2 devices
The queue is using the device: Iris
(499392, 3)
194.435568 Million Complex FMAs in 0.00408264 seconds, 47624.9603198 million Complex FMAs / second
shape (612, 816, 3)
[[[ 38  34  24]
  [ 38  33  23]
  [ 40  35  24]
  ..., 
  [120 103  79]
  [120 102  78]
  [120 102  78]]

 [[ 32  28  18]
  [ 33  29  18]
  [ 38  33  22]
  ..., 
  [120 103  79]
  [120 102  78]
  [120 102  78]]

 [[ 29  25  14]
  [ 31  27  16]
  [ 35  30  20]
  ..., 
  [120 10

In [10]:
tempimg = Image.open("../orig_3.jpg")
tempimg.show()