@@ -0,0 +1,155 @@
#include "include/cl_wrapper.hpp"
#include "include/oclUtils.h"

void CLSetup::init(char* deviceName)
{
getPlatformID();
getDeviceID(deviceName);
getContextnQueue();
}

void CLSetup::getPlatformID()
{

cl_uint num_of_platforms = 0;
// get total number of available platforms:
cl_int err = CL_SUCCESS;
err = clGetPlatformIDs(0, 0, &num_of_platforms);

cl_platform_id* platforms = new cl_platform_id[num_of_platforms];
// get IDs for all platforms:
err = clGetPlatformIDs(num_of_platforms, platforms, 0);

cl_uint selected_platform_index = num_of_platforms;

for (cl_uint i = 0; i < num_of_platforms; ++i)
{
// Get the length for the i-th platform name
size_t platform_name_length = 0;
err = clGetPlatformInfo(
platforms[i],
CL_PLATFORM_NAME,
0,
0,
&platform_name_length
);

// Get the name itself for the i-th platform
char* platform_name = new char[platform_name_length];
err = clGetPlatformInfo(
platforms[i],
CL_PLATFORM_NAME,
platform_name_length,
platform_name,
0
);

// decide if this i-th platform is what we are looking for
// we select the first one matched skipping the next one if any
if (strstr(platform_name, /*"AMD"*/ "NVIDIA" /*"Intel(R) OpenCL"*/) &&
selected_platform_index == num_of_platforms)
{
selected_platform_index = i;
_platformID = platforms[i];
// do not stop here, just see all available platforms
}

delete[] platform_name;
}


/// !TODO: Multiple Platforms
// cl_platform_id* _platformID;
// _status = clGetPlatformIDs(NUMBER_OF_PLATFORMS, NULL, &_numPlatforms);
// DEBUG_CL(_status);
// _platformID = (cl_platform_id *)malloc(sizeof(cl_platform_id) * _numPlatforms);
// _status =clGetPlatformIDs(_numPlatforms, _platformID, NULL);
// DEBUG_CL(_status);
// _platformIDsVector.assign(_platformID[0], _platformID[_numPlatforms]);
}

void CLSetup::getDeviceID(char *devName)
{
/// !TODO: For Multiple Devices
_status = clGetDeviceIDs(_platformID,CL_DEVICE_TYPE_GPU, 1, NULL, &_numDevices);
DEBUG_CL(_status);
std::cout<<"CL_COMPUTE DEVICES: "<<_numDevices<<std::endl;
_status = clGetDeviceIDs(_platformID, CL_DEVICE_TYPE_GPU, 1, &_deviceID, NULL);
DEBUG_CL(_status);
std::cout<<"CL_DEVICE_ID: "<<_deviceID<<std::endl;

char device_string[1024];
clGetDeviceInfo(_deviceID, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
strcpy(devName, device_string);

// Getting some information about the device
// Getting some information about the device

oclPrintDevInfo(LOGCONSOLE, _deviceID);
clGetDeviceInfo(_deviceID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &_maxComputeUnits, NULL);
std::cout<<"CL_DEVICE_MAX_COMPUTE_UNITS: "<<_maxComputeUnits<<std::endl;
clGetDeviceInfo(_deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &_maxWorkGroupSize, NULL);
clGetDeviceInfo(_deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &_maxMemAllocSize, NULL);
clGetDeviceInfo(_deviceID, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &_globalMemSize, NULL);
clGetDeviceInfo(_deviceID, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(cl_ulong), &_constMemSize, NULL);
clGetDeviceInfo(_deviceID, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &_localMemSize, NULL);
///!TODO:Add get KernelInfo APIs

}

void CLSetup::getContextnQueue()
{
cl_command_queue_properties queueProps = NULL;// CL_QUEUE_PROFILING_ENABLE;
_context = clCreateContext(NULL, 1, &_deviceID, NULL, NULL, &_status);
DEBUG_CL(_status);
_queue = clCreateCommandQueue(_context, _deviceID, queueProps, &_status);
DEBUG_CL(_status);
}

///
/// \brief CLSetup::createProgram
/// \param kernelFilePath
/// \return
///
Program *CLSetup::createProgram(std::vector<std::string> kernelFilePath)
//Program *CLSetup::createProgram(std::string& kernelFilePath)
{
///!TODO: Add support for char** along with string
Program* tmp = new Program(kernelFilePath, &_context, &_queue,
&_deviceID);
return tmp;
}

OCLBuffer* CLSetup::createBuffer(const size_t size, const cl_mem_flags flags,
void *hostMem)
{
cl_mem buff = clCreateBuffer(_context,flags, size, hostMem ,&_status);
if(_status == CL_SUCCESS)
{
OCLBuffer* ret = new OCLBuffer(buff, &_queue);
return ret;
}
DEBUG_CL(_status);
if(_status != CL_SUCCESS)
printf("createBuffer error : %s", getCLErrorString(_status));
return NULL; //TODO: Return custom status value
}



















@@ -0,0 +1,235 @@
#include "include/clutils.h"
#include <stdlib.h>
#include <stdio.h>
#include <CL/opencl.h>

char* utils_cl_enum_to_string (cl_int value)
{
switch (value)
{
/* cl_channel_order */
case 0x10B0: return "CL_R";
case 0x10B1: return "CL_A";
case 0x10B2: return "CL_RG";
case 0x10B3: return "CL_RA";
case 0x10B4: return "CL_RGB";
case 0x10B5: return "CL_RGBA";
case 0x10B6: return "CL_BGRA";
case 0x10B7: return "CL_ARGB";
case 0x10B8: return "CL_INTENSITY";
case 0x10B9: return "CL_LUMINANCE";
case 0x10BA: return "CL_Rx";
case 0x10BB: return "CL_RGx";
case 0x10BC: return "CL_RGBx";

/* cl_channel_type */
case 0x10D0: return "CL_SNORM_INT8";
case 0x10D1: return "CL_SNORM_INT16";
case 0x10D2: return "CL_UNORM_INT8";
case 0x10D3: return "CL_UNORM_INT16";
case 0x10D4: return "CL_UNORM_SHORT_565";
case 0x10D5: return "CL_UNORM_SHORT_555";
case 0x10D6: return "CL_UNORM_INT_101010";
case 0x10D7: return "CL_SIGNED_INT8";
case 0x10D8: return "CL_SIGNED_INT16";
case 0x10D9: return "CL_SIGNED_INT32";
case 0x10DA: return "CL_UNSIGNED_INT8";
case 0x10DB: return "CL_UNSIGNED_INT16";
case 0x10DC: return "CL_UNSIGNED_INT32";
case 0x10DD: return "CL_HALF_FLOAT";
case 0x10DE: return "CL_FLOAT";
default: return "Unknown value!";
}
}

char* utils_get_ocl_error (cl_int err_code)
{
switch (err_code)
{
case CL_SUCCESS: return "Success!";
case CL_DEVICE_NOT_FOUND: return "Device not found.";
case CL_DEVICE_NOT_AVAILABLE: return "Device not available";
case CL_COMPILER_NOT_AVAILABLE: return "Compiler not available";
case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "Memory object allocation failure";
case CL_OUT_OF_RESOURCES: return "Out of resources";
case CL_OUT_OF_HOST_MEMORY: return "Out of host memory";
case CL_PROFILING_INFO_NOT_AVAILABLE: return "Profiling information not available";
case CL_MEM_COPY_OVERLAP: return "Memory copy overlap";
case CL_IMAGE_FORMAT_MISMATCH: return "Image format mismatch";
case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "Image format not supported";
case CL_BUILD_PROGRAM_FAILURE: return "Program build failure";
case CL_MAP_FAILURE: return "Map failure";
case CL_INVALID_VALUE: return "Invalid value";
case CL_INVALID_DEVICE_TYPE: return "Invalid device type";
case CL_INVALID_PLATFORM: return "Invalid platform";
case CL_INVALID_DEVICE: return "Invalid device";
case CL_INVALID_CONTEXT: return "Invalid context";
case CL_INVALID_QUEUE_PROPERTIES: return "Invalid queue properties";
case CL_INVALID_COMMAND_QUEUE: return "Invalid command queue";
case CL_INVALID_HOST_PTR: return "Invalid host pointer";
case CL_INVALID_MEM_OBJECT: return "Invalid memory object";
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "Invalid image format descriptor";
case CL_INVALID_IMAGE_SIZE: return "Invalid image size";
case CL_INVALID_SAMPLER: return "Invalid sampler";
case CL_INVALID_BINARY: return "Invalid binary";
case CL_INVALID_BUILD_OPTIONS: return "Invalid build options";
case CL_INVALID_PROGRAM: return "Invalid program";
case CL_INVALID_PROGRAM_EXECUTABLE: return "Invalid program executable";
case CL_INVALID_KERNEL_NAME: return "Invalid kernel name";
case CL_INVALID_KERNEL_DEFINITION: return "Invalid kernel definition";
case CL_INVALID_KERNEL: return "Invalid kernel";
case CL_INVALID_ARG_INDEX: return "Invalid argument index";
case CL_INVALID_ARG_VALUE: return "Invalid argument value";
case CL_INVALID_ARG_SIZE: return "Invalid argument size";
case CL_INVALID_KERNEL_ARGS: return "Invalid kernel arguments";
case CL_INVALID_WORK_DIMENSION: return "Invalid work dimension";
case CL_INVALID_WORK_GROUP_SIZE: return "Invalid work group size";
case CL_INVALID_WORK_ITEM_SIZE: return "Invalid work item size";
case CL_INVALID_GLOBAL_OFFSET: return "Invalid global offset";
case CL_INVALID_EVENT_WAIT_LIST: return "Invalid event wait list";
case CL_INVALID_EVENT: return "Invalid event";
case CL_INVALID_OPERATION: return "Invalid operation";
case CL_INVALID_GL_OBJECT: return "Invalid OpenGL object";
case CL_INVALID_BUFFER_SIZE: return "Invalid buffer size";
case CL_INVALID_MIP_LEVEL: return "Invalid mip-map level";
default: return "Unknown error";
}
}

char* utils_read_file (const char* filename)
{
// locals
FILE* f = NULL;
size_t file_size;

f = fopen (filename, "r");
if (f == NULL)
{
return NULL;
}

// get the length
fseek (f, 0, SEEK_END);
file_size = ftell (f);
fseek (f, 0, SEEK_SET);

// allocate a buffer and fill it
char* contents = (char *)malloc (file_size + 1);
if (fread (contents, file_size, sizeof (char), f) != 1)
{
fclose (f);
free (contents);
return NULL;
}

fclose (f);
contents[file_size] = '\0';

return contents;
}

int utils_get_platform_and_device (cl_device_type dev_type,
cl_platform_id *platform,
cl_device_id *device_id,
int just_print)
{
cl_int err_code;
cl_uint num_plat;
cl_int p = clGetPlatformIDs (0, NULL, &num_plat);

if (just_print)
{
if (num_plat == 1) printf ("There is 1 platform available\n");
else printf ("There are %u platforms available\n", num_plat);
}
if (num_plat <= 0) return 0;

cl_platform_id *ids = (cl_platform_id *)malloc (sizeof (cl_platform_id) * num_plat);
clGetPlatformIDs (num_plat, ids, NULL);

for (unsigned int i=0; i<num_plat; i++)
{
if (just_print) utils_print_platform_info (ids[i]);

cl_uint num_devs;
err_code = clGetDeviceIDs (ids[i], dev_type, 0, NULL, &num_devs);
if (err_code != CL_SUCCESS || num_devs <= 0) continue;

cl_device_id *dev_ids = (cl_device_id *)malloc (sizeof (cl_device_id) * num_devs);
clGetDeviceIDs (ids[i], dev_type, num_devs, dev_ids, NULL);
for (unsigned int j=0; j<num_devs; j++)
{
if (just_print) utils_print_device_info (dev_ids[j]);
}

if (!just_print)
{
*platform = ids[i];
*device_id = dev_ids[0];
return 1;
}
}

return 0;
}

void utils_print_platform_info (cl_platform_id platform)
{
char buffer[8*1024];

clGetPlatformInfo (platform, CL_PLATFORM_NAME, sizeof (buffer), buffer, NULL);
printf ("PLATFORM_NAME: %s\n", buffer);
clGetPlatformInfo (platform, CL_PLATFORM_VERSION, sizeof (buffer), buffer, NULL);
printf (" VERSION: %s\n", buffer);
clGetPlatformInfo (platform, CL_PLATFORM_VENDOR, sizeof (buffer), buffer, NULL);
printf (" VENDOR: %s\n", buffer);
clGetPlatformInfo (platform, CL_PLATFORM_PROFILE, sizeof (buffer), buffer, NULL);
printf (" PROFILE: %s\n", buffer);
}

void utils_print_device_info (cl_device_id dev_id)
{
cl_ulong ul_prop;
cl_uint ui_prop;
size_t size_prop;
char buffer[8*1024];

clGetDeviceInfo (dev_id, CL_DEVICE_NAME, sizeof (buffer), buffer, NULL);
printf (" DEVICE: %s\n", buffer);
clGetDeviceInfo (dev_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof (ui_prop), &ui_prop, NULL);
printf (" MAX COMPUTE UNITS: %u\n", ui_prop);
clGetDeviceInfo (dev_id, CL_DEVICE_VERSION, sizeof (buffer), buffer, NULL);
printf (" DEVICE VERSION: %s\n", buffer);
clGetDeviceInfo (dev_id, CL_DRIVER_VERSION, sizeof (buffer), buffer, NULL);
printf (" DRIVER VERSION: %s\n", buffer);

clGetDeviceInfo (dev_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof (ul_prop), &ul_prop, NULL);
printf (" GLOBAL MEM SIZE: %lu\n", (unsigned long)ul_prop);
clGetDeviceInfo (dev_id, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof (ul_prop), &ul_prop, NULL);
printf (" GLOBAL MEM CACHE SIZE: %lu\n", (unsigned long)ul_prop);
clGetDeviceInfo (dev_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof (ul_prop), &ul_prop, NULL);
printf (" LOCAL MEM SIZE: %lu\n", (unsigned long)ul_prop);
clGetDeviceInfo (dev_id, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof (ui_prop), &ui_prop, NULL);
printf (" MAX CLOCK FREQUENCY: %u\n", ui_prop);
clGetDeviceInfo (dev_id, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof (size_prop), &size_prop, NULL);
printf (" TIMER RESOLUTION: %lu\n", (unsigned long)size_prop);
}

cl_ulong utils_get_event_time (cl_event event, cl_profiling_info param)
{
cl_ulong result;
cl_int err_code;

err_code = clGetEventProfilingInfo (event, param,
sizeof (result), &result, NULL);
CHECK_OCL_ERROR (clGetEventProfilingInfo, err_code);

return result;
}

cl_ulong utils_get_event_execution_time (cl_event event)
{
return utils_get_event_time (event, CL_PROFILING_COMMAND_END) -
utils_get_event_time (event, CL_PROFILING_COMMAND_START);
}

@@ -0,0 +1,16 @@
#include "include/helpers.h"

std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems) {
std::stringstream ss(s);
std::string item;
while (std::getline(ss, item, delim)) {
elems.push_back(item);
}
return elems;
}

std::vector<std::string> split(const std::string &s, char delim) {
std::vector<std::string> elems;
split(s, delim, elems);
return elems;
}
@@ -0,0 +1,98 @@
#include "include/image2D.h"

/**
* @brief
*
* @param mem
* @param rowStep Width/Column * Num of channels
* @param queue
*/
Image2D::Image2D(cl_mem mem, cl_command_queue *queue, int rowStep)
:OCLBuffer(mem, queue)
{
//Needed to copy the right amount of data back to the memory
DEBUG_VALUE("Image2D::Image2D Constructor: ", mem);
this->_rowPitch = rowStep;
}

void Image2D::read(void *hostMem, const size_t origin[], const size_t region[], cl_bool blocking)
{
cl_int err = 0;
DEBUG_STRING("Image2D::read");
DEBUG_VALUE("memory :",_memory);

DEBUG_STRING("Image Region:");
DEBUG_VALUE("Width :" , region[0]);
DEBUG_VALUE("Height :", region[1]);
DEBUG_VALUE("Depth :" , region[2]);
DEBUG_VALUE("Row pitch is: ", _rowPitch);

DEBUG_STRING("Image Origin:");
DEBUG_VALUE("Width :" , origin[0]);
DEBUG_VALUE("Height :", origin[1]);
DEBUG_VALUE("Depth :" , origin[2]);

if(blocking)
DEBUG_STRING("BLOCKING READ");

err = clEnqueueReadImage(*_pQueue, _memory, blocking,
origin, region, _rowPitch, 0,
hostMem, 0, NULL, NULL);

DEBUG_CL(err);
}


/**
* @brief
*
* @param hostMem
* @param size[] region
* @param offset[] origin
* @param blocking
*/
void Image2D:: write(void *hostMem, const size_t origin[], const size_t region[], cl_bool blocking)
{
cl_int err = 0;
DEBUG_STRING("Image2D:: write");
DEBUG_VALUE("Image2D _memory write:",_memory);
err = clEnqueueWriteImage(*_pQueue, _memory, blocking, origin,
region, _rowPitch, 0,
hostMem, 0, NULL, NULL);
DEBUG_CL(err);
}


void *Image2D::map(cl_map_flags flags, const size_t size[], const size_t offset[], size_t &_rowPitch, cl_bool blocking)
{
size_t slicePitch;
cl_int err = 0;
void* ret = clEnqueueMapImage(*_pQueue, _memory, blocking, flags, offset, size, &_rowPitch, &slicePitch, 0, NULL, NULL, &err);
DEBUG_CL(err);
return ret;
}


void Image2D::copyToBuffer(OCLBuffer &dst, const size_t size[], const size_t srcOffset[], const size_t dstOffset)
{
cl_int err = 0;
err = clEnqueueCopyImageToBuffer(*_pQueue, _memory, dst.getMem(), srcOffset, size, dstOffset, 0, NULL, NULL);
DEBUG_CL(err);
}


void *Image2D::getInfo(const cl_image_info paramName)
{
cl_int err = 0;
size_t size;
err = clGetImageInfo (_memory, paramName, 0, NULL, &size);
DEBUG_CL(err);

if(size > 0) {
void* info = malloc(size);
err = clGetImageInfo (_memory, paramName, size, info, &size);
DEBUG_CL(err);
return info;
}
else return NULL;
}
@@ -0,0 +1,65 @@

/*Copyright 2017 Sateesh Pedagadi
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http ://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.*/


#ifndef INCLUDES_H_
#define INCLUDES_H_


//#include <Windows.h>
#include <string.h>
#include <stdlib.h>
#include "CL/cl.h"

//General defines
#define MAX_STR_LEN 256
#define HALF_STR_LEN MAX_STR_LEN/2
#define Q_STR_LEN HALF_STR_LEN/2

//OpenCL related
#define OCL_STATUS_READY 0
#define OCL_STATUS_INITIALIZED 1
#define OCL_STATUS_PROGRAM_ERROR 2
#define OCL_STATUS_KERNEL_ERROR 3
#define OCL_STATUS_MUTEX_ERROR 4
#define OCL_STATUS_FINALIZED 5

#define OCL_LOCK_SET 1
#define OCL_LOCK_RELEASE 2


/*
//Deep NN related
#define NN_MAX_KERNEL_COUNT 9 // 13
#define NN_KERNEL_IDX_IM2COL3X3 0
#define NN_KERNEL_IDX_IM2COL1X1 1
#define NN_KERNEL_IDX_NORMARR 2
#define NN_KERNEL_IDX_SCALEBIAS 3
#define NN_KERNEL_IDX_ADDBIAS 4
#define NN_KERNEL_IDX_SCALEADDBIAS 5
#define NN_KERNEL_IDX_NORMSCALEADDBIAS 6
#define NN_KERNEL_IDX_LEAKY_ACTIVATE 7
#define NN_KERNEL_IDX_LINEAR_ACTIVATE 8
#define NN_KERNEL_IDX_FLATARR 9
#define NN_KERNEL_IDX_SOFTMAX 10
#define NN_KERNEL_IDX_MAXPOOL 11
#define NN_KERNEL_IDX_RESETARR 12
*/



#endif /* INCLUDES_H_ */


@@ -0,0 +1,111 @@

/*Copyright 2017 Sateesh Pedagadi
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http ://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.*/


#ifndef OCLWRAPPER_H_
#define OCLWRAPPER_H_

#include <chrono>
#include <algorithm>
#include <string>
#include <iostream>
#include "CL/cl.h"
#include "cl_wrapper.hpp"
#include "GlobalDefines.h"
#include "clblast.h"
//#include "clblast_half.h"

#define PINNED_MEM_OUTPUT

#define PROFILE_KERNELS 0
#define BLOCK_KERNEL_EXEC 0


using namespace std;;

string ExePath();


typedef struct {

void *m_PinnedMemory;
OCLBuffer *m_OCLBuffer;

}StructPinnedOCLBuffer;

float sec(clock_t clocks);

enum {
NN_KERNEL_IDX_IM2COL3X3,
NN_KERNEL_IDX_IM2COL1X1,
NN_KERNEL_IDX_ADDBIAS,
NN_KERNEL_IDX_NORMSCALEADDBIAS,
NN_KERNEL_IDX_LEAKY_ACTIVATE,
NN_KERNEL_IDX_FLATARR,
NN_KERNEL_IDX_SOFTMAX,
NN_KERNEL_IDX_MAXPOOL,
NN_KERNEL_IDX_RESETARR,
NN_MAX_KERNEL_COUNT
};

static const char* NN_KERNEL_NAMES[NN_MAX_KERNEL_COUNT] = {

"image2columarray3x3",
"image2columarray1x1",
"addbias",
"normscaleaddbias",
"leakyactivatearray",
"flattenarray",
"softmax",
"maxpool",
"resetarray"
};

class OCLManager {

public:

OCLManager();
~OCLManager();
int Initialize();
int Finalize();
void ReleaseLock();
void SetLock();

float SoftMax(OCLBuffer *input, int n, int offset, int groups, float temp, OCLBuffer *output, int base);
float OCLManager::ResetArray(int N, OCLBuffer *inArray, OCLBuffer *biasArray, int filtSize);

StructPinnedOCLBuffer* InitializePinnedFloatArray(size_t numItems);
void FinalizePinnedFloatArray(StructPinnedOCLBuffer*);

const char *GetDeviceName() { return m_DeviceName; };

//private:

Program* m_OpenCLProgram;
void *m_RefObject;
int m_Status;
int m_LockStatus;
int m_CallerId;
//HANDLE m_LockMutex;
CLSetup m_OpenCLSetup;
KernelLauncher* m_OpenCLKernels[NN_MAX_KERNEL_COUNT];
char m_DeviceName[256];
};



#endif /* OCLWRAPPER_H_ */

@@ -0,0 +1,173 @@
/*
****************************************************************************
BSD License
Copyright (c) 2014, i-Vizon
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. All advertising materials mentioning features or use of this software
must display the following acknowledgement:
This product includes software developed by the i-Vizon.
4. Neither the name of the i-Vizon nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY Mageswaran.D ''AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL Mageswaran.D BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
****************************************************************************
*/
/*
* =============================================================================
*
*
* FILENAME : Buffer.h
*
* DESCRIPTION : A wrapper library for OpenCL and its native counter part
* intialization. With boost thread support.
*
* AUTHOR : Mageswaran D
*
*
* CHANGE HISTORY :
*
* DATE : 17th Mar 2014
*
* File Created.
*
* =============================================================================
*/
#ifndef BUFFER_H
#define BUFFER_H

#include "iv_common.h"
#include "CL/cl.h"

class Image2D;
/**
* @brief
*
*/
class OCLBuffer
{
public:
/**
* @brief
*
* @param tmp
* @param queue
*/
OCLBuffer(cl_mem tmp, cl_command_queue *queue);

/**
* @brief
*
* @return cl_mem
*/
cl_mem getMem();

/**
* @brief
*
* @param hostMem
* @param size
* @param offset
* @param blocking
*/
void read(void* hostMem, const size_t size, const size_t offset=0, const cl_bool blocking=CL_TRUE);

/**
* @brief
*
* @param hostMem
* @param size
* @param offset
* @param blocking
*/
void write(const void* hostMem, const size_t size, const size_t offset=0, const cl_bool blocking=CL_TRUE);

/**
* @brief
*
* @param dst
* @param size
* @param srcOffset
* @param dstOffset
*/
void copy(OCLBuffer& dst, const size_t size, const size_t srcOffset=0, const size_t dstOffset=0);


/**
* @brief
*
* @param flags
* @param size
* @param offset
* @param blocking
*/
void* map(const cl_map_flags flags, const size_t size, const size_t offset, const cl_bool blocking=CL_TRUE);

void FillBuffer(const void *hostMem, const size_t size, const size_t offset);

/**
* @brief
*
* @param mappedPtr
*/
void unmap(void* mappedPtr);

/**
* @brief
*
* @param dst
* @param size[]
* @param srcOffset
* @param dstOffset[]
*/
void copyToImage2D(Image2D& dst, const size_t size[2], const size_t srcOffset, const size_t dstOffset[2]);

/**
* @brief
*
* @param dst
* @param size[]
* @param srcOffset
* @param dstOffset[]
*/
void copyToImage3D(Image2D& dst, const size_t size[3], const size_t srcOffset, const size_t dstOffset[3]);

/**
* @brief
*
* @param paramName
*/
void* getMemInfo(const cl_mem_info paramName);

virtual ~OCLBuffer()
{
clReleaseMemObject(_memory);
//DEBUG_STRING("Releasing GPU Memory Buffers");
}

protected:
cl_mem _memory; /**< TODO */
cl_command_queue* _pQueue; /**< TODO */
private:

};

#endif // BUFFER_H
@@ -0,0 +1,185 @@
#ifndef CL_WRAPPER_H
#define CL_WRAPPER_H

/**
* Enable CL_INFO_PRINT macro to see OpenCL Device and Kernel Info
*
*/
#include "iv_common.h"
#include "CL/cl.h"
//#include "CL/cl.hpp"

#include "program.h"
#include "buffer.h"
#include "image2D.h"
#include "sampler.h"


#define NUMBER_OF_PLATFORMS 5
#define NUMBER_OF_DEVICES 5
class CLSetup
{
public:
void init(char* deviceName);
void getPlatformID();
void getDeviceID(char* deviceName);
void getContextnQueue();
//Program* createProgram(std::string &kernelFilePath);
Program* createProgram(std::vector<std::string> kernelFilePath);
OCLBuffer* createBuffer(const size_t size,
const cl_mem_flags flags,
void* hostMem);
Image2D* createImage2D(const size_t width,
const size_t height,
const cl_image_format* format,
const cl_mem_flags flags = 0,
const size_t rowPitch = 0,
void* hostMem = NULL);
Sampler* createSampler(cl_bool normalizedCoords,
cl_addressing_mode addrMode,
cl_filter_mode filterMode);

cl_command_queue* getQueue() {

return &_queue;
}

cl_context* getContext() {

return &_context;
}

//>>>>>>>>>>>>>>>>>Get Info
int getNumberOfPlatforms()
{
return _numPlatforms;
}
void getDeviceName()
{
char* infoName;
size_t infoCLSize;
clGetPlatformInfo(_platformID,CL_PLATFORM_NAME, NULL, NULL, &infoCLSize);
infoName = (char*)malloc(sizeof(char)*infoCLSize);
clGetPlatformInfo(_platformID,CL_PLATFORM_NAME, sizeof(char) * infoCLSize, infoName, NULL );
std::cout<<"CL_PLATFORM_NAME : "<<infoName<<std::endl;
free(infoName);
}
int getNumberOfDevices()
{
return _numDevices;
}
///
/// \brief Compute units are equivalent of MultiProcessor
/// (which can have either 8, 32, 48 or even 192 cores),
/// and these are designed to be able to simultanesouly
/// run up to 8 work groups (blocks in CUDA) each.
/// Eg: GeForce GT 640 : 2 * 196 = 384 cores
/// Eg: Vivante GC2000 : 4 *
/// \return Maximum Compute Units available in your GPU in size_t

cl_uint getMaxComputeUnits()
{
return _maxComputeUnits;
}
size_t getPerferredWorkGroupSize()
{
return _perferredWrkGrpSize; ///!TODO:Need to get after from kernel init
}
///
/// \brief getMaxWorkGroupSize
/// Returns maximum number of work items in a workgroup
/// Eg: GeForce GT 640 : 48KBytes
/// Eg: Vivante GC2000 : 1KBytes
/// \return
///
size_t getMaxWorkGroupSize()
{
return _maxWorkGroupSize;
}
///
/// \brief Returns the local memory size of device kernel groups
///
/// \return
///
cl_ulong getLocalMemSize()
{
return _localMemSize;
}
size_t getPrefferedWorkGroupSize()
{
return _preferredWorkGrpSize;
}

void getSupportedExtensions()
{
size_t infoCLSize;
char* infoExtensions;
clGetPlatformInfo(_platformID,CL_PLATFORM_EXTENSIONS, NULL, NULL, &infoCLSize);
infoExtensions = (char*)malloc(sizeof(char)*infoCLSize);
clGetPlatformInfo(_platformID,CL_PLATFORM_EXTENSIONS, sizeof(char) * infoCLSize, infoExtensions, NULL );
std::cout<<"CL_PLATFORM_EXTENSIONS : "<<infoExtensions<<std::endl;
free(infoExtensions);
}

~CLSetup()
{
clReleaseCommandQueue(_queue);
clReleaseContext(_context);
}

protected:
private:
//>>>>>>>>>>>>>>>>>Info Members
cl_int _status;
cl_long _infoValue;

//>>>>>>>>>>>>>>>>>Platform Members
cl_uint _numPlatforms;
cl_platform_id _platformID;
/**
* @brief _platformIDsVector
* _platformIDsVector[interestedPlatformNum]
*/
std::vector<cl_platform_id> _platformIDsVector; //!TODO:

//>>>>>>>>>>>>>>>>>Device Members
cl_uint _numDevices;
cl_device_id _deviceID;

cl_uint _maxComputeUnits;
size_t _maxWorkGroupSize;
cl_ulong _maxMemAllocSize;
cl_ulong _globalMemSize;
cl_ulong _constMemSize;
cl_ulong _localMemSize;
size_t _preferredWorkGrpSize;



//>>>>>>>>>>>>>>>>>Context Members
cl_context _context;

//>>>>>>>>>>>>>>>>>Queue Members
cl_command_queue _queue;

//>>>>>>>>>>>>>>>>>Program Members
cl_program _program;

//>>>>>>>>>>>>>>>>>Kernel Members
cl_kernel _kernel;
size_t _compileWrkGrpSize;
size_t _wrkGrpSize;
size_t _perferredWrkGrpSize;
cl_ulong _localMem;
cl_ulong _privateMem;

//>>>>>>>>>>>>>>>>>Image2D Members
cl_mem im2d;


};

//const char * get_error_string(cl_int err);


#endif // CL_WRAPPER_H
@@ -0,0 +1,33 @@

#ifndef __CL_UTILS__H__
#define __CL_UTILS__H__

#include <CL/opencl.h>

char* utils_cl_enum_to_string (cl_int value);

char* utils_get_ocl_error (cl_int err_code);

int utils_get_platform_and_device (cl_device_type dev_type,
cl_platform_id *platform,
cl_device_id *device_id,
int just_print);

void utils_print_device_info (cl_device_id dev_id);

void utils_print_platform_info (cl_platform_id dev_id);

char* utils_read_file (const char* filename);

cl_ulong utils_get_event_time (cl_event event,
cl_profiling_info param);

cl_ulong utils_get_event_execution_time (cl_event event);

#define CHECK_OCL_ERROR(op_name, err_code) \
if (err_code != CL_SUCCESS) \
{\
printf ("%s:%d: " #op_name " failed! %s\n", __FILE__, __LINE__, utils_get_ocl_error (err_code));\
}

#endif
@@ -0,0 +1,24 @@

#ifndef HELPERS__H
#define HELPERS__H

#include <string>
#include <vector>
#include <sstream>

std::vector<std::string> split(const std::string &s, char delim);

template <class T>
std::string vector_join( const std::vector<T>& v, const std::string& token ) {
std::stringstream result;
for (typename std::vector<T>::const_iterator i = v.begin(); i != v.end(); i++) {
if ( i != v.begin() )
result << token;
result << *i;
}
return result.str();
}

std::string randUuid();

#endif // HELPERS__H
@@ -0,0 +1,83 @@

#ifndef IMAGE_2D_H
#define IMAGE_2D_H

#include "CL/cl.h"
#include "buffer.h"

class Image2D : public OCLBuffer
{
public:
/**
* @brief
*
* @param mem
* @param queue
* @param rowPitch
*/
Image2D(cl_mem mem, cl_command_queue* queue, int rowPitch = 0);
/**
* @brief
*
* @param hostMem
* @param size[]
* @param offset[]
* @param blocking
*/
void read(void* hostMem, const size_t size[2],
const size_t offset[2], cl_bool blocking = CL_TRUE);
/**
* @brief
*
* @param hostMem
* @param size[]
* @param offset[]
* @param blocking
*/
void write(void* hostMem, const size_t size[2],
const size_t offset[2], cl_bool blocking = CL_TRUE);
/**
* @brief
*
* @param flags
* @param size[]
* @param offset[]
* @param rowPitch
* @param blocking
*/
void* map(cl_map_flags flags, const size_t size[2],
const size_t offset[2], size_t& rowPitch, cl_bool blocking = CL_TRUE);
/**
* @brief
*
* @param dst
* @param size[]
* @param srcOffset[]
* @param dstOffset
*/
void copyToBuffer(OCLBuffer& dst, const size_t size[2],
const size_t srcOffset[2], const size_t dstOffset = 0);
/**
* @brief
*
* @param paramName
*/
void* getInfo(const cl_image_info paramName);
/**
* @brief
*
*/
~Image2D()
{

}

protected:
size_t _rowPitch;

private:
};



#endif //IMAGE_2D_H
@@ -0,0 +1,160 @@


#ifndef COMMON_H
#define COMMON_H

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <iostream>
#include <fstream>
#include <iostream>
#include <iterator>
#include <map>
#include <iostream>
#include <vector>

#include "CL/cl.h"

///!TODO: Use bits as enum value
typedef enum {
IV_CHAR_FILE_OPEN_FAIL = 0,
IV_CHAR_FILE_STATUS_FAIL
}IV_ERRORS_INFO;

const char* getCLErrorString(int err);
const char* getCustomErrorString(int err, IV_ERRORS_INFO info);
std::string getImgType(int imgTypeInt);


/*#define DEBUG_CL(err) \
if(err< 0) { \
std::cout<<"FILE: "<<__FILE__<<std::cout<<" Line Number: "<<__LINE__\
<<" Function Name : "<<__func__<<"Error Name:" \
<<getCLErrorString(err) \
<<std::endl; \
exit(err); }*/

#define DEBUG_IV(err, info) \
if(err< 0) { \
std::cout<<"Line Number: "<<__LINE__<<" Function Name : "<<__func__<<"Error Name:" \
<<getCustomErrorString(err, info) \
<<std::endl; \
exit(err); }

//#define DEBUG_VALUE(dbgMsg, value) \
// std::cout<<"=====>"<<" " <<dbgMsg<<"......."<<value<<std::endl;

//#define DEBUG_STRING(dbgMsg) \
//std::cout<<">>>>>>"<<dbgMsg<<std::endl;

#define ERROR_PRINT_VALUE(dbgMsg, value) \
{ \
std::cout<<"\n=====> Line Number: "<<__LINE__<<" Function Name :"<<__func__\
<<"\n "<<dbgMsg<<" "<<value<<std::endl; \
exit(0); \
};

#define ERROR_PRINT_STRING(dbgMsg) \
{ \
std::cout<<"\n=====> Line Number: "<<__LINE__<<" Function Name: "<<__func__\
<<"\n "<<dbgMsg<<std::endl; \
exit(0); \
};

#ifdef IVIZON_DEBUG
#define F_LOG LogBlock _l(__func__)
struct LogBlock {
const char *mLine;
LogBlock(const char *line) : mLine(line) {
std::cout<<mLine <<" ----->#### Enter \n";
}
~LogBlock() {
std::cout<<mLine <<" <-----#### Leave \n";
}
};
#else
#define F_LOG {}
#define DEBUG_CL(err) {}
#define DEBUG_STRING(dbgMsg) {}
#define DEBUG_VALUE(dbgMsg, value) {}
#endif



// CV Defines
#define CV_8U 0
#define CV_8S 1
#define CV_16U 2
#define CV_16S 3
#define CV_32S 4
#define CV_32F 5
#define CV_64F 6

#define CV_CN_MAX 512

#define CV_MAT_DEPTH_MASK (CV_DEPTH_MAX - 1)

#define CV_CN_SHIFT 3
#define CV_MAT_DEPTH(flags) ((flags) & CV_MAT_DEPTH_MASK)
#define CV_DEPTH_MAX (1 << CV_CN_SHIFT)

#define CV_MAKETYPE(depth,cn) (CV_MAT_DEPTH(depth) + (((cn)-1) << CV_CN_SHIFT))
#define CV_MAKE_TYPE CV_MAKETYPE

#define CV_8UC1 CV_MAKETYPE(CV_8U,1)
#define CV_8UC2 CV_MAKETYPE(CV_8U,2)
#define CV_8UC3 CV_MAKETYPE(CV_8U,3)
#define CV_8UC4 CV_MAKETYPE(CV_8U,4)
#define CV_8UC(n) CV_MAKETYPE(CV_8U,(n))

#define CV_8SC1 CV_MAKETYPE(CV_8S,1)
#define CV_8SC2 CV_MAKETYPE(CV_8S,2)
#define CV_8SC3 CV_MAKETYPE(CV_8S,3)
#define CV_8SC4 CV_MAKETYPE(CV_8S,4)
#define CV_8SC(n) CV_MAKETYPE(CV_8S,(n))

#define CV_16UC1 CV_MAKETYPE(CV_16U,1)
#define CV_16UC2 CV_MAKETYPE(CV_16U,2)
#define CV_16UC3 CV_MAKETYPE(CV_16U,3)
#define CV_16UC4 CV_MAKETYPE(CV_16U,4)
#define CV_16UC(n) CV_MAKETYPE(CV_16U,(n))

#define CV_16SC1 CV_MAKETYPE(CV_16S,1)
#define CV_16SC2 CV_MAKETYPE(CV_16S,2)
#define CV_16SC3 CV_MAKETYPE(CV_16S,3)
#define CV_16SC4 CV_MAKETYPE(CV_16S,4)
#define CV_16SC(n) CV_MAKETYPE(CV_16S,(n))

#define CV_32SC1 CV_MAKETYPE(CV_32S,1)
#define CV_32SC2 CV_MAKETYPE(CV_32S,2)
#define CV_32SC3 CV_MAKETYPE(CV_32S,3)
#define CV_32SC4 CV_MAKETYPE(CV_32S,4)
#define CV_32SC(n) CV_MAKETYPE(CV_32S,(n))

#define CV_32FC1 CV_MAKETYPE(CV_32F,1)
#define CV_32FC2 CV_MAKETYPE(CV_32F,2)
#define CV_32FC3 CV_MAKETYPE(CV_32F,3)
#define CV_32FC4 CV_MAKETYPE(CV_32F,4)
#define CV_32FC(n) CV_MAKETYPE(CV_32F,(n))

#define CV_64FC1 CV_MAKETYPE(CV_64F,1)
#define CV_64FC2 CV_MAKETYPE(CV_64F,2)
#define CV_64FC3 CV_MAKETYPE(CV_64F,3)
#define CV_64FC4 CV_MAKETYPE(CV_64F,4)
#define CV_64FC(n) CV_MAKETYPE(CV_64F,(n))


/// IV Data Types
//This is done to have more control on memory and range on numbers

typedef unsigned char IV_8U;
typedef char IV_8S;
typedef unsigned IV_16U;
typedef signed IV_16S;
typedef int IV_32S;
typedef unsigned int IV_32U;
typedef float IV_32F;
typedef double IV_64F;
#endif
@@ -0,0 +1,115 @@


#ifndef KERNELLAUNCHER_H
#define KERNELLAUNCHER_H

#include "iv_common.h"



class KernelLauncher
{
public:
KernelLauncher(cl_device_id _device_id, cl_kernel* kernel, cl_command_queue* queue, std::string kernelNAme);

///For a continuous aggignment using an object
KernelLauncher& global(const int g);
KernelLauncher& global(const int gx, const int gy);
KernelLauncher& global(const int gx, const int gy, const int gz);
KernelLauncher& local(const int l);
KernelLauncher& local(const int lx, const int ly);
KernelLauncher& local(const int lx, const int ly, const int lz);
///For a continuous aggignment using an pointer object
KernelLauncher* pGlobal(const int g);
KernelLauncher* pGlobal(const int gx, const int gy);
KernelLauncher* pGlobal(const int gx, const int gy, const int gz);
KernelLauncher* pLocal(const int l);
KernelLauncher* pLocal(const int lx, const int ly);
KernelLauncher* pLocal(const int lx, const int ly, const int lz);

int countArgs();

///For a continuous aggignment using an object
template<class T>
KernelLauncher& arg(const int index, T x) {
if (index >= _numArgs || index < 0) {
std::cout << "Error: argument index out of range" << std::endl;
exit(-1);///!TODO: Custom exit code
}
cl_int status = clSetKernelArg(*_pKernel, index, sizeof(x), &x);
DEBUG_CL(status);
_argListData[index] = true;
return *this;
}
///For a continuous aggignment using an pointer object
template<class T>
KernelLauncher& arg(T x) {
int nArgs = countArgs();
if (nArgs >= _numArgs) {
std::cout << "Error trying to enqueue too much arguments" << std::endl;
std::cout << "Expected " << _numArgs << ", got " << nArgs << std::endl;
exit(-1);///!TODO: Custom exit code
}
for(int i=0; i<_numArgs; i++)
if(!_argListData[i])
return arg(i, x);
return *this;
}

///For a continuous aggignment using an object
template<class T>
KernelLauncher* pArg(const int index, T &x) {
if (index >= _numArgs || index < 0) {
std::cout << "Error: argument index out of range" << std::endl;
exit(-1);///!TODO: Custom exit code
}
cl_int status = clSetKernelArg(*_pKernel, index, sizeof(x), &x);
DEBUG_VALUE("Setting Kernel Argument: ", index);
DEBUG_VALUE("Value/Address: ", x);
DEBUG_VALUE("Size : ", sizeof(x));
DEBUG_CL(status);
_argListData[index] = true;
return this;
}
///For a continuous aggignment using an pointer object
template<class T>
KernelLauncher* pArg(T &x) {
int nArgs = countArgs();
if (nArgs > _numArgs) {
std::cout << "Error trying to enqueue too much arguments" << std::endl;
std::cout << "Expected " << _numArgs << ", got " << nArgs << std::endl;
exit(-1);///!TODO: Custom exit code
}
for(int i=0; i<_numArgs; i++)
if(!_argListData[i])
return pArg(i, x);
return this;
}
float run(bool profile, bool block);

~KernelLauncher()
{
clReleaseKernel(*_pKernel);
}

size_t GetOptimalLWGSize() {

return _optimal_local_workgroup_size;
}

protected:
private:
cl_kernel* _pKernel;
cl_command_queue* _pQueue;
cl_int _numArgs;
size_t _globalWorkSize[3];
size_t _localWorkSize[3];
cl_int _dimensions;
std::string _kernel_name;
cl_bool* _argListData;
cl_device_id _device_id;
size_t _optimal_local_workgroup_size;
};


#endif // KERNELLAUNCHER_H
@@ -0,0 +1,188 @@
#ifndef OCL_UTILS_H
#define OCL_UTILS_H

// *********************************************************************
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
// *********************************************************************

// Common headers: Cross-API utililties and OpenCL header
#include "shrUtils.h"

// All OpenCL headers
#if defined (__APPLE__) || defined(MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif

// Includes
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

// For systems with CL_EXT that are not updated with these extensions, we copied these
// extensions from <CL/cl_ext.h>
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
#define CL_DEVICE_WARP_SIZE_NV 0x4003
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
#endif

// reminders for build output window and log
#ifdef _WIN32
#pragma message ("Note: including shrUtils.h")
#pragma message ("Note: including opencl.h")
#endif

// SDK Revision #
#define OCL_SDKREVISION "7027912"

// Error and Exit Handling Macros...
// *********************************************************************
// Full error handling macro with Cleanup() callback (if supplied)...
// (Companion Inline Function lower on page)
#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__)

// Short version without Cleanup() callback pointer
// Both Input (a) and Reference (b) are specified as args
#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0)

//////////////////////////////////////////////////////////////////////////////
//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
//!
//! @return the id
//! @param clSelectedPlatformID OpenCL platform ID
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);

//////////////////////////////////////////////////////////////////////////////
//! Print info about the device
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);

//////////////////////////////////////////////////////////////////////////////
//! Get and return device capability
//!
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" int oclGetDevCap(cl_device_id device);

//////////////////////////////////////////////////////////////////////////////
//! Print the device name
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);

//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the first device from the context
//!
//! @return the id
//! @param cxGPUContext OpenCL context
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);

//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the nth device from the context
//!
//! @return the id or -1 when out of range
//! @param cxGPUContext OpenCL context
//! @param device_idx index of the device of interest
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);

//////////////////////////////////////////////////////////////////////////////
//! Gets the id of device with maximal FLOPS from the context
//!
//! @return the id
//! @param cxGPUContext OpenCL context
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);

//////////////////////////////////////////////////////////////////////////////
//! Loads a Program file and prepends the cPreamble to the code.
//!
//! @return the source string if succeeded, 0 otherwise
//! @param cFilename program filename
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
//! @param szFinalLength returned length of the code string
//////////////////////////////////////////////////////////////////////////////
extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);

//////////////////////////////////////////////////////////////////////////////
//! Get the binary (PTX) of the program associated with the device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//! @param binary returned code
//! @param length length of returned code
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);

//////////////////////////////////////////////////////////////////////////////
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//! @param const char* cPtxFileName optional PTX file name
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);

//////////////////////////////////////////////////////////////////////////////
//! Get and log the Build Log from the OpenCL compiler for the requested program & device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);

// Helper function for De-allocating cl objects
// *********************************************************************
extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);

// Helper function to get OpenCL error string from constant
// *********************************************************************
extern "C" const char* oclErrorString(cl_int error);

// Helper function to get OpenCL image format string (channel order and type) from constant
// *********************************************************************
extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);

// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
// *********************************************************************
inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
// An error condition is defined by the sample/test value not equal to the reference
if (iReference != iSample)
{
// If the sample/test value isn't equal to the ref, it's an error by defnition, so override 0 sample/test value
iSample = (iSample == 0) ? -9999 : iSample;

// Log the error info
shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);

// Cleanup and exit, or just exit if no cleanup function pointer provided. Use iSample (error code in this case) as process exit code.
if (pCleanup != NULL)
{
pCleanup(iSample);
}
else
{
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
exit(iSample);
}
}
}

#endif

@@ -0,0 +1,40 @@
#include <CL/cl.h>
#include "iv_common.h"
#include "kernel_launcher.h"



#ifndef PROGRAM_H
#define PROGRAM_H

class Program
{
public:
//Program(std::string &kernelFilePath, cl_context* context, cl_command_queue* queue, cl_device_id* device);
Program(std::vector<std::string> &kernelFilePath, cl_context* context, cl_command_queue* queue, cl_device_id* device);
void createProgram(std::string filePath);
void buildProgram();
KernelLauncher* createKernelLauncher(std::string kernelName);
~Program()
{
clReleaseProgram(_program);
}

protected:

private:
cl_program _program;
cl_kernel _kernel;
cl_int _numKernels;
std::map<std::string, cl_kernel> _kernels;
std::string _filesPath;

cl_context* _pContext;
cl_command_queue* _pQueue;
cl_device_id* _pDeviceID;

cl_int _status;
cl_bool _buildState;
};

#endif // PROGRAM_H
@@ -0,0 +1,19 @@
#ifndef SAMPLER_H
#define SAMPLER_H
#include <CL/cl.h>

class Sampler
{
public:
Sampler(cl_context* /* context */,
cl_bool /* normalized_coords */,
cl_addressing_mode /* addressing_mode */,
cl_filter_mode /* filter_mode */);
cl_sampler& getSampler();
protected:
private:
cl_context* _pContext;
cl_sampler _sampler;
};

#endif // SAMPLER_H

Large diffs are not rendered by default.

@@ -0,0 +1,104 @@
#include "include/iv_common.h"

const char * getCLErrorString(int err)
{
// /F_LOG;
switch(err)
{
case 0: return "CL_SUCCESS";
case -1: return "CL_DEVICE_NOT_FOUND";
case -2: return "CL_DEVICE_NOT_AVAILABLE";
case -3: return "CL_COMPILER_NOT_AVAILABLE";
case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
case -5: return "CL_OUT_OF_RESOURCES";
case -6: return "CL_OUT_OF_HOST_MEMORY";
case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE";
case -8: return "CL_MEM_COPY_OVERLAP";
case -9: return "CL_IMAGE_FORMAT_MISMATCH";
case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
case -11: return "CL_BUILD_Program_FAILURE";
case -12: return "CL_MAP_FAILURE";

case -30: return "CL_INVALID_VALUE";
case -31: return "CL_INVALID_DEVICE_TYPE";
case -32: return "CL_INVALID_PLATFORM";
case -33: return "CL_INVALID_DEVICE";
case -34: return "CL_INVALID_mContext";
case -35: return "CL_INVALID_mQueue_PROPERTIES";
case -36: return "CL_INVALID_COMMAND_mQueue";
case -37: return "CL_INVALID_HOST_PTR";
case -38: return "CL_INVALID_MEM_OBJECT";
case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
case -40: return "CL_INVALID_IMAGE_SIZE";
case -41: return "CL_INVALID_SAMPLER";
case -42: return "CL_INVALID_BINARY";
case -43: return "CL_INVALID_BUILD_OPTIONS";
case -44: return "CL_INVALID_Program";
case -45: return "CL_INVALID_Program_EXECUTABLE";
case -46: return "CL_INVALID_KERNEL_NAME";
case -47: return "CL_INVALID_KERNEL_DEFINITION";
case -48: return "CL_INVALID_KERNEL";
case -49: return "CL_INVALID_ARG_mINDEX";
case -50: return "CL_INVALID_ARG_VALUE";
case -51: return "CL_INVALID_ARG_SIZE";
case -52: return "CL_INVALID_KERNEL_ARGS";
case -53: return "CL_INVALID_WORK_DIMENSION";
case -54: return "CL_INVALID_WORK_GROUP_SIZE";
case -55: return "CL_INVALID_WORK_mITEM_SIZE";
case -56: return "CL_INVALID_GLOBAL_OFFSET";
case -57: return "CL_INVALID_EVENT_WAIT_LIST";
case -58: return "CL_INVALID_EVENT";
case -59: return "CL_INVALID_OPERATION";
case -60: return "CL_INVALID_GL_OBJECT";
case -61: return "CL_INVALID_mBuffer_SIZE";
case -62: return "CL_INVALID_MIP_LEVEL";
case -63: return "CL_INVALID_GLOBAL_WORK_SIZE";
default: return "Unknown OpenCL error";
}
}


const char* getCustomErrorString(int err, IV_ERRORS_INFO info)
{
switch(info)
{
case IV_CHAR_FILE_OPEN_FAIL: return "IV_CHAR_FILE_OPEN_FAIL";
case IV_CHAR_FILE_STATUS_FAIL: return "IV_CHAR_FILE_STATUS_FAIL";
default:
return "IV_UNKNOWN_ERROR";
}
}


/**
* @brief CLRuntime::getImgType // take number
* image type number (from cv::Mat.type()), get OpenCV's enum string.
* @param imgTypeInt
* @return
*/
std::string getImgType(int imgTypeInt)
{
int numImgTypes = 35; // 7 base types, with five channel options each (none or C1, ..., C4)

int enum_Ints[] = {CV_8U, CV_8UC1, CV_8UC2, CV_8UC3, CV_8UC4,
CV_8S, CV_8SC1, CV_8SC2, CV_8SC3, CV_8SC4,
CV_16U, CV_16UC1, CV_16UC2, CV_16UC3, CV_16UC4,
CV_16S, CV_16SC1, CV_16SC2, CV_16SC3, CV_16SC4,
CV_32S, CV_32SC1, CV_32SC2, CV_32SC3, CV_32SC4,
CV_32F, CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4,
CV_64F, CV_64FC1, CV_64FC2, CV_64FC3, CV_64FC4};

std::string enum_strings[] = {"CV_8U", "CV_8UC1", "CV_8UC2", "CV_8UC3", "CV_8UC4",
"CV_8S", "CV_8SC1", "CV_8SC2", "CV_8SC3", "CV_8SC4",
"CV_16U", "CV_16UC1", "CV_16UC2", "CV_16UC3", "CV_16UC4",
"CV_16S", "CV_16SC1", "CV_16SC2", "CV_16SC3", "CV_16SC4",
"CV_32S", "CV_32SC1", "CV_32SC2", "CV_32SC3", "CV_32SC4",
"CV_32F", "CV_32FC1", "CV_32FC2", "CV_32FC3", "CV_32FC4",
"CV_64F", "CV_64FC1", "CV_64FC2", "CV_64FC3", "CV_64FC4"};

for(int i=0; i<numImgTypes; i++)
{
if(imgTypeInt == enum_Ints[i]) return enum_strings[i];
}
return "unknown image type";
}
@@ -0,0 +1,206 @@
#include "include/kernel_launcher.h"

KernelLauncher::KernelLauncher(cl_device_id device_id, cl_kernel *kernel, cl_command_queue *queue, std::string kernelName)
{
cl_int status;
this->_pKernel = kernel;
this->_pQueue = queue;
this->_dimensions = -1;
_device_id = device_id;
_globalWorkSize[0] = _globalWorkSize[1] = _globalWorkSize[2] =
_localWorkSize[0] = _localWorkSize[1] = _localWorkSize[2] = NULL;

//Finding number of arguments in given kernel and making
//an bool array to track its data content
status = clGetKernelInfo(*_pKernel, CL_KERNEL_NUM_ARGS, sizeof(cl_int), &_numArgs, NULL);
DEBUG_CL(status);
printf("Number of kernel Arguments : %d %s \n",_numArgs, kernelName.c_str());
this->_argListData = (cl_bool*) malloc(_numArgs*sizeof(cl_bool));//new cl_bool[numArgs];
for(int i=0; i<_numArgs; i++)
this->_argListData[i] = false;

cl_int err = clGetKernelWorkGroupInfo(*_pKernel, _device_id,
CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &_optimal_local_workgroup_size, NULL);
//err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);

size_t compileWorkGroupSize[3];
err = clGetKernelWorkGroupInfo(*_pKernel, _device_id,
CL_KERNEL_COMPILE_WORK_GROUP_SIZE,
sizeof(size_t) * 3,
compileWorkGroupSize,
NULL);

_kernel_name = kernelName;

}

int KernelLauncher::countArgs()
{
int ret=0;

for(int i=0; i<_numArgs; i++)
if(_argListData[i])
ret++;

return ret;
}

float KernelLauncher::run(bool profile, bool block)
{
double total = 0;
cl_int status = CL_SUCCESS;
cl_event execEvent = NULL;

status = clEnqueueNDRangeKernel(*_pQueue, *_pKernel, _dimensions,
NULL, _globalWorkSize, _localWorkSize, 0,
NULL, (profile? &execEvent: NULL));

if (status == CL_SUCCESS && block)
clFinish(*_pQueue);

if (profile && status == CL_SUCCESS) {

clWaitForEvents(1, &execEvent);
long long start, end;
status = clGetEventProfilingInfo(execEvent, CL_PROFILING_COMMAND_START,
sizeof(start), &start, NULL);
status = clGetEventProfilingInfo(execEvent, CL_PROFILING_COMMAND_END,
sizeof(end), &end, NULL);

total = (double)(end - start) / 1e6; /* Convert nanoseconds to msecs */
printf("Total kernel time was {%5.3f} msecs - %s \n", total, _kernel_name.c_str());

clReleaseEvent(execEvent);
}

if (status != CL_SUCCESS) {

char * kernelName = (char*)_kernel_name.c_str();
int z = 0;
z++;
}

//DEBUG_CL(status);
return (float)total;
}
KernelLauncher& KernelLauncher::global(const int g) {
if (_dimensions == -1) _dimensions = 1;
else if (_dimensions != 1) {
std::cerr << "Work group dimension incoherence" << std::endl;
}
_globalWorkSize[0] = g;
return *this;
}

KernelLauncher& KernelLauncher::global(const int gx, const int gy) {
if (_dimensions == -1) _dimensions = 2;
else if (_dimensions != 2) {
std:: cerr << "Work group dimension incoherence" << std::endl;
}
_globalWorkSize[0] = gx;
_globalWorkSize[1] = gy;
return *this;
}

KernelLauncher& KernelLauncher::global(const int gx, const int gy, const int gz) {
if (_dimensions == -1) _dimensions = 3;
else if (_dimensions != 3) {
std::cerr << "Work group dimension incoherence" << std::endl;
}
_globalWorkSize[0] = gx;
_globalWorkSize[1] = gy;
_globalWorkSize[2] = gz;
return *this;
}

KernelLauncher& KernelLauncher::local(const int l) {
if (_dimensions == -1) _dimensions = 1;
else if (_dimensions != 1) {
std::cerr << "Work group dimension incoherence" << std::endl;
}
_localWorkSize[0] = l;
return *this;
}

KernelLauncher& KernelLauncher::local(const int lx, const int ly) {
if (_dimensions == -1) _dimensions = 2;
else if (_dimensions != 2) {
std::cerr << "Work group dimension incoherence" << std::endl;
}
_localWorkSize[0] = lx;
_localWorkSize[1] = ly;
return *this;
}

KernelLauncher& KernelLauncher::local(const int lx, const int ly, const int lz) {
if (_dimensions == -1) _dimensions = 3;
else if (_dimensions != 3) {
std::cerr << "Work group dimension incoherence" << std::endl;
}
_localWorkSize[0] = lx;
_localWorkSize[1] = ly;
_localWorkSize[2] = lz;
return *this;
}

//////////////////////////
KernelLauncher* KernelLauncher::pGlobal(const int g) {
if (_dimensions == -1) _dimensions = 1;
else if (_dimensions != 1) {
std::cerr << "Work group dimension incoherence" << std::endl;
}
_globalWorkSize[0] = g;
return this;
}

KernelLauncher* KernelLauncher::pGlobal(const int gx, const int gy) {
if (_dimensions == -1) _dimensions = 2;
else if (_dimensions != 2) {
std::cerr << "Work group dimension incoherence" << std::endl;
}
_globalWorkSize[0] = gx;
_globalWorkSize[1] = gy;
return this;
}

KernelLauncher* KernelLauncher::pGlobal(const int gx, const int gy, const int gz) {
if (_dimensions == -1) _dimensions = 3;
else if (_dimensions != 3) {
std::cerr << "Work group dimension incoherence" << std::endl;
}
_globalWorkSize[0] = gx;
_globalWorkSize[1] = gy;
_globalWorkSize[2] = gz;
return this;
}

KernelLauncher* KernelLauncher::pLocal(const int l) {
if (_dimensions == -1) _dimensions = 1;
else if (_dimensions != 1) {
std::cerr << "Work group dimension incoherence" << std::endl;
}
_localWorkSize[0] = l;
return this;
}

KernelLauncher* KernelLauncher::pLocal(const int lx, const int ly) {
if (_dimensions == -1) _dimensions = 2;
else if (_dimensions != 2) {
std::cerr << "Work group dimension incoherence" << std::endl;
}
_localWorkSize[0] = lx;
_localWorkSize[1] = ly;
return this;
}

KernelLauncher* KernelLauncher::pLocal(const int lx, const int ly, const int lz) {
if (_dimensions == -1) _dimensions = 3;
else if (_dimensions != 3) {
std::cerr << "Work group dimension incoherence" << std::endl;
}
_localWorkSize[0] = lx;
_localWorkSize[1] = ly;
_localWorkSize[2] = lz;
return this;
}

Large diffs are not rendered by default.

@@ -0,0 +1,142 @@
#include "include/program.h"
#include "include/oclUtils.h"


int ReadSourceFromFile(const char* fileName, char** source, size_t* sourceSize)
{
int errorCode = CL_SUCCESS;

FILE* fp = fopen(fileName, "rb");
if (fp == NULL)
{
printf("Error: Couldn't find program source file '%s'.\n", fileName);
errorCode = CL_INVALID_VALUE;
}
else {
fseek(fp, 0, SEEK_END);
*sourceSize = ftell(fp);
fseek(fp, 0, SEEK_SET);

*source = new char[*sourceSize];
if (*source == NULL)
{
printf("Error: Couldn't allocate %d bytes for program source from file '%s'.\n", (int)(*sourceSize), fileName);
errorCode = CL_OUT_OF_HOST_MEMORY;
}
else {
fread(*source, 1, *sourceSize, fp);
}
}
return errorCode;
}


void checkErr( cl_int err,int line, const char *n, bool verbosity=false ) {
if( err != CL_SUCCESS ) {
std::cerr << n << "\r\t\t\t\t\t\tline:" << line<<" "<<oclErrorString(err) << std::endl;
//assert(0);
}
else if( n != NULL ) {
if( verbosity) std::cerr << n << "\r\t\t\t\t\t\t" << "OK" <<std::endl;

}
}

Program::Program(std::vector<std::string> &kernelFilePath, cl_context *context, cl_command_queue *queue, cl_device_id *device)
//Program::Program(std::string &kernelFilePath, cl_context *context, cl_command_queue *queue, cl_device_id *device)
{
this->_pContext = context;
this->_pQueue = queue;
this->_pDeviceID = device;

std::ifstream programFile(kernelFilePath[0].c_str());
//std::ifstream programFile(kernelFilePath.c_str());
std::string programBuffer(std::istreambuf_iterator<char>(programFile),
(std::istreambuf_iterator<char>()));
if(programBuffer.empty())
{
std::cout<<"Kernel File Not Found in specified location!"<<std::endl;
}
size_t programSize = programBuffer.size();


char* source = NULL;
size_t src_size = 0;
cl_int err = CL_SUCCESS;
err = ReadSourceFromFile(kernelFilePath[0].c_str(), &source, &src_size);

//_program = clCreateProgramWithSource((*_pContext), 1,(const char **)&programBuffer, &programSize, &_status);
_program = clCreateProgramWithSource((*_pContext), 1, (const char **)&source, &src_size, &_status);
//DEBUG_CL(_status);
checkErr(_status, __LINE__,"clCreateProgramWithSource");
if( _status != CL_SUCCESS )
printf("clCreateProgramWithSource ERROR - %s\n", oclErrorString(_status));
else
printf("clCreateProgramWithSource success\n");



_buildState = false;
}


void Program::buildProgram()
{
char *programLog;
size_t programLogSize;
//const char options[] = "-cl-std=CL1.0 -cl-mad-enable -Werror";
//const char options[] = "-cl-mad-enable -Werror";
const char options[] = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-fast-relaxed-math -cl-single-precision-constant -cl-no-signed-zeros";
_status= clBuildProgram(_program, 1, _pDeviceID, options, NULL, NULL);
if(_status<0)
{
clGetProgramBuildInfo(_program, *_pDeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &programLogSize );
programLog = (char*)malloc(sizeof(char)*programLogSize+1);
clGetProgramBuildInfo(_program, *_pDeviceID, CL_PROGRAM_BUILD_LOG, programLogSize+1, programLog, NULL);
printf("\nBuild Log :%s\n",programLog);
free(programLog);
// exit(0); ///!TODO: Custom Code
}
DEBUG_CL(_status);
if( _status != CL_SUCCESS )
printf("clGetProgramBuildInfo() ERROR - %s\n", oclErrorString(_status));
else
printf("clGetProgramBuildInfo() success\n");

//_kernel = clCreateKernel(_program, kernelName.c_str(), &_status);
//DEBUG_CL(_status);
// Creates the kernels
// Needs to verify if the file compiled is actually a kernel
_status = clCreateKernelsInProgram(_program, 0, NULL, (cl_uint*)&(_numKernels));
cl_kernel* k = new cl_kernel[_numKernels];
_status = clCreateKernelsInProgram(_program, _numKernels, k, NULL);
DEBUG_CL(_status);

// Creates the hash with the kernels
for (int i = 0; i < _numKernels; i++) {
char name[256];
_status = clGetKernelInfo(k[i], CL_KERNEL_FUNCTION_NAME, sizeof(char)*256, (void*) name, NULL);
//DEBUG_CL(_status);
if( _status != CL_SUCCESS )
printf("buildProgram ERROR - %s\n", oclErrorString(_status));
else
printf("buildProgram kernels success\n");
_kernels[name] = k[i];
printf("Kernel No: %d, name - %s\n", i+1, name);
//DEBUG_VALUE("Kernel Name: ", name);
}

_buildState = true;

}

KernelLauncher* Program::createKernelLauncher(std::string kernelName)
{
if(!_buildState)
ERROR_PRINT_STRING("You forgot to build the kernel");

/// @TIPS: Always use a pointer to an variable that needs to be returned
KernelLauncher *kl = new KernelLauncher(*this->_pDeviceID, &_kernels[kernelName], _pQueue, kernelName);
return kl;
}

@@ -0,0 +1,21 @@
#include "include/sampler.h"
#include "include/iv_common.h"

Sampler::Sampler(cl_context* context, cl_bool normalizedCoords, cl_addressing_mode addrMode, cl_filter_mode filterMode)
{
// clCreateSampler(cl_context /* context */,
// cl_bool /* normalized_coords */,
// cl_addressing_mode /* addressing_mode */,
// cl_filter_mode /* filter_mode */,
// cl_int * /* errcode_ret */)
// Create the image sampler
cl_int status;
_sampler = clCreateSampler(*context, normalizedCoords,
addrMode, filterMode, &status);
DEBUG_CL(status);
}

cl_sampler& Sampler::getSampler()
{
return _sampler;
}

Large diffs are not rendered by default.

@@ -1,6 +1,10 @@
#include "additionally.h"
#include "gpu.h"

#ifdef OPENCL
#include "ocl.h"
#endif

#ifdef CUDNN
#pragma comment(lib, "cudnn.lib")
#endif
@@ -726,8 +730,13 @@ softmax_layer make_softmax_layer(int batch, int inputs, int groups)
//l.backward_gpu = backward_softmax_layer_gpu;

l.output_gpu = cuda_make_array(l.output, inputs*batch);
l.delta_gpu = cuda_make_array(l.delta, inputs*batch);
//l.delta_gpu = cuda_make_array(l.delta, inputs*batch);
#endif

#ifdef OPENCL
l.output_ocl = ocl_make_array(l.output, inputs*batch);
#endif

return l;
}

@@ -770,7 +779,11 @@ layer make_reorg_layer(int batch, int w, int h, int c, int stride, int reverse)
//l.backward_gpu = backward_reorg_layer_gpu;

l.output_gpu = cuda_make_array(l.output, output_size);
l.delta_gpu = cuda_make_array(l.delta, output_size);
//l.delta_gpu = cuda_make_array(l.delta, output_size);
#endif

#ifdef OPENCL
l.output_ocl = ocl_make_array(l.output, output_size);
#endif
return l;
}
@@ -807,9 +820,13 @@ route_layer make_route_layer(int batch, int n, int *input_layers, int *input_siz
//l.forward_gpu = forward_route_layer_gpu;
//l.backward_gpu = backward_route_layer_gpu;

l.delta_gpu = cuda_make_array(l.delta, outputs*batch);
//l.delta_gpu = cuda_make_array(l.delta, outputs*batch);
l.output_gpu = cuda_make_array(l.output, outputs*batch);
#endif

#ifdef OPENCL
l.output_ocl = ocl_make_array(l.output, outputs*batch);
#endif
return l;
}

@@ -848,7 +865,11 @@ region_layer make_region_layer(int batch, int w, int h, int n, int classes, int
//l.forward_gpu = forward_region_layer_gpu;
//l.backward_gpu = backward_region_layer_gpu;
l.output_gpu = cuda_make_array(l.output, batch*l.outputs);
l.delta_gpu = cuda_make_array(l.delta, batch*l.outputs);
//l.delta_gpu = cuda_make_array(l.delta, batch*l.outputs);
#endif

#ifdef OPENCL
l.output_ocl = ocl_make_array(l.output, batch*l.outputs);
#endif

fprintf(stderr, "detection\n");
@@ -890,7 +911,12 @@ maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int s
//l.backward_gpu = backward_maxpool_layer_gpu;
l.indexes_gpu = cuda_make_int_array(output_size);
l.output_gpu = cuda_make_array(l.output, output_size);
l.delta_gpu = cuda_make_array(l.delta, output_size);
//l.delta_gpu = cuda_make_array(l.delta, output_size);
#endif

#ifdef OPENCL
l.indexes_ocl = ocl_make_int_array(output_size);
l.output_ocl = ocl_make_array(l.output, output_size);
#endif
fprintf(stderr, "max %d x %d / %d %4d x%4d x%4d -> %4d x%4d x%4d\n", size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c);
return l;
@@ -1046,43 +1072,43 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
//l.update_gpu = update_convolutional_layer_gpu;

if (gpu_index >= 0) {
if (adam) {
l.m_gpu = cuda_make_array(l.m, c*n*size*size);
l.v_gpu = cuda_make_array(l.v, c*n*size*size);
}
//if (adam) {
// l.m_gpu = cuda_make_array(l.m, c*n*size*size);
// l.v_gpu = cuda_make_array(l.v, c*n*size*size);
//}

l.weights_gpu = cuda_make_array(l.weights, c*n*size*size);
l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size);
//l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size);

l.biases_gpu = cuda_make_array(l.biases, n);
l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);
//l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);

l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n);
//l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n);
l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);

if (binary) {
l.binary_weights_gpu = cuda_make_array(l.weights, c*n*size*size);
}
if (xnor) {
l.binary_weights_gpu = cuda_make_array(l.weights, c*n*size*size);
l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch);
}
//if (binary) {
// l.binary_weights_gpu = cuda_make_array(l.weights, c*n*size*size);
//}
//if (xnor) {
// l.binary_weights_gpu = cuda_make_array(l.weights, c*n*size*size);
// l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch);
//}

if (batch_normalize) {
l.mean_gpu = cuda_make_array(l.mean, n);
l.variance_gpu = cuda_make_array(l.variance, n);
//l.mean_gpu = cuda_make_array(l.mean, n);
//l.variance_gpu = cuda_make_array(l.variance, n);

l.rolling_mean_gpu = cuda_make_array(l.mean, n);
l.rolling_variance_gpu = cuda_make_array(l.variance, n);

l.mean_delta_gpu = cuda_make_array(l.mean, n);
l.variance_delta_gpu = cuda_make_array(l.variance, n);
//l.mean_delta_gpu = cuda_make_array(l.mean, n);
//l.variance_delta_gpu = cuda_make_array(l.variance, n);

l.scales_gpu = cuda_make_array(l.scales, n);
l.scale_updates_gpu = cuda_make_array(l.scale_updates, n);
//l.scale_updates_gpu = cuda_make_array(l.scale_updates, n);

l.x_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
//l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
}
#ifdef CUDNN
cudnnCreateTensorDescriptor(&l.srcTensorDesc);
@@ -1096,6 +1122,24 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
#endif
}
#endif

#ifdef OPENCL
//if (gpu_index >= 0) {

l.weights_ocl = ocl_make_array(l.weights, c*n*size*size);
l.biases_ocl = ocl_make_array(l.biases, n);
l.output_ocl = ocl_make_array(l.output, l.batch*out_h*out_w*n);

if (batch_normalize) {
l.rolling_mean_ocl = ocl_make_array(l.rolling_mean, n); // l.mean
l.rolling_variance_ocl = ocl_make_array(l.rolling_variance, n); // l.variance
l.scales_ocl = ocl_make_array(l.scales, n);

l.x_ocl = ocl_make_array(l.output, l.batch*out_h*out_w*n);
}
//}
#endif

l.workspace_size = get_workspace_size(l);
l.activation = activation;

@@ -1685,6 +1729,12 @@ void load_convolutional_weights_cpu(layer l, FILE *fp)
push_convolutional_layer(l);
}
#endif

#ifdef OPENCL
//if (gpu_index >= 0) {
ocl_push_convolutional_layer(l);
//}
#endif
}

// parser.c
@@ -2121,6 +2171,14 @@ network parse_network_cfg(char *filename)
#else
net.workspace = calloc(1, workspace_size);
#endif

#ifdef OPENCL
//if (gpu_index >= 0) {
net.workspace_ocl = ocl_make_array(0, workspace_size/sizeof(float));
//net.workspace_ocl = ocl_make_array(0, (workspace_size - 1) / sizeof(float) + 1);
//net.workspace_ocl = ocl_make_array(NULL, 1024*1024*1024);
//}
#endif
}
return net;
}
@@ -23,6 +23,10 @@
#include "cublas_v2.h"
#endif

#ifdef OPENCL
#include "CL/cl.h"
#endif

#ifdef OPENCV
#include "opencv2/highgui/highgui_c.h"
#include "opencv2/imgproc/imgproc_c.h"
@@ -471,6 +475,18 @@ struct layer {
cudnnConvolutionBwdFilterAlgo_t bf_algo;
#endif
#endif

#ifdef OPENCL
cl_mem weights_ocl;
cl_mem biases_ocl;
cl_mem scales_ocl;
cl_mem rolling_mean_ocl;
cl_mem rolling_variance_ocl;

cl_mem output_ocl;
cl_mem indexes_ocl;
cl_mem x_ocl;
#endif
};

typedef layer local_layer;
@@ -538,6 +554,10 @@ typedef struct network {
float **input_gpu;
float **truth_gpu;
#endif

#ifdef OPENCL
cl_mem workspace_ocl;
#endif
} network;

typedef struct network_state {
@@ -548,6 +568,10 @@ typedef struct network_state {
int train;
int index;
network net;
#ifdef OPENCL
cl_mem input_ocl;
cl_mem workspace_ocl;
#endif
} network_state;


@@ -563,6 +587,10 @@ void cuda_set_device(int n);
#endif
#endif

#ifdef OPENCL
bool ocl_initialize();
#endif

// network.c
void set_batch_network(network *net, int b);

@@ -686,6 +714,14 @@ float *network_predict_cpu(network net, float *input);
float *network_predict_gpu_cudnn(network net, float *input);
#endif

// -------------- yolov2_forward_network_ocl.c --------------------

#ifdef OPENCL
// detect using OpenCL: yolov2_forward_network_gpu.cpp
float *network_predict_opencl(network net, float *input);
#endif


// -------------- gettimeofday for Windows--------------------

#if defined(_MSC_VER)
@@ -154,34 +154,22 @@ void pull_convolutional_layer(convolutional_layer layer)
{
cuda_pull_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size);
cuda_pull_array(layer.biases_gpu, layer.biases, layer.n);
cuda_pull_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size);
cuda_pull_array(layer.bias_updates_gpu, layer.bias_updates, layer.n);
if (layer.batch_normalize) {
cuda_pull_array(layer.scales_gpu, layer.scales, layer.n);
cuda_pull_array(layer.rolling_mean_gpu, layer.rolling_mean, layer.n);
cuda_pull_array(layer.rolling_variance_gpu, layer.rolling_variance, layer.n);
}
if (layer.adam) {
cuda_pull_array(layer.m_gpu, layer.m, layer.c*layer.n*layer.size*layer.size);
cuda_pull_array(layer.v_gpu, layer.v, layer.c*layer.n*layer.size*layer.size);
}
}

void push_convolutional_layer(convolutional_layer layer)
{
cuda_push_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size);
cuda_push_array(layer.biases_gpu, layer.biases, layer.n);
cuda_push_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size);
cuda_push_array(layer.bias_updates_gpu, layer.bias_updates, layer.n);
if (layer.batch_normalize) {
cuda_push_array(layer.scales_gpu, layer.scales, layer.n);
cuda_push_array(layer.rolling_mean_gpu, layer.rolling_mean, layer.n);
cuda_push_array(layer.rolling_variance_gpu, layer.rolling_variance, layer.n);
}
if (layer.adam) {
cuda_push_array(layer.m_gpu, layer.m, layer.c*layer.n*layer.size*layer.size);
cuda_push_array(layer.v_gpu, layer.v, layer.c*layer.n*layer.size*layer.size);
}
}

// -------------------- CUDA functions -------------------
@@ -10,14 +10,16 @@
#include "opencv2/highgui/highgui_c.h"
#include "opencv2/core/core_c.h"
#include "opencv2/core/version.hpp"

#ifndef CV_VERSION_EPOCH
// for OpenCV 3.x
#pragma comment(lib, "opencv_world320.lib")
#include "opencv2/videoio/videoio_c.h"
#define OPENCV_VERSION CVAUX_STR(CV_VERSION_MAJOR)""CVAUX_STR(CV_VERSION_MINOR)""CVAUX_STR(CV_VERSION_REVISION)
#pragma comment(lib, "opencv_world" OPENCV_VERSION ".lib")
#else
// for OpenCV 2.4.x
#pragma comment(lib, "opencv_core2413.lib")
#pragma comment(lib, "opencv_imgproc2413.lib")
#pragma comment(lib, "opencv_highgui2413.lib")
#define OPENCV_VERSION CVAUX_STR(CV_VERSION_EPOCH)""CVAUX_STR(CV_VERSION_MAJOR)""CVAUX_STR(CV_VERSION_MINOR)
#pragma comment(lib, "opencv_core" OPENCV_VERSION ".lib")
#pragma comment(lib, "opencv_imgproc" OPENCV_VERSION ".lib")
#pragma comment(lib, "opencv_highgui" OPENCV_VERSION ".lib")
#endif

#endif
@@ -118,8 +120,12 @@ void test_detector_cpu(char **names, char *cfgfile, char *weightfile, char *file
//network_predict(net, X);
#ifdef GPU
network_predict_gpu_cudnn(net, X);
#else
#ifdef OPENCL
network_predict_opencl(net, X);
#else
network_predict_cpu(net, X);
#endif
#endif
printf("%s: Predicted in %f seconds.\n", input, (float)(clock() - time) / CLOCKS_PER_SEC); //sec(clock() - time));
get_region_boxes_cpu(l, 1, 1, thresh, probs, boxes, 0, 0); // get_region_boxes(): region_layer.c
@@ -272,8 +278,12 @@ static void *detect_in_thread(void *ptr)
//float *prediction = network_predict(net, X);
#ifdef GPU
network_predict_gpu_cudnn(net, X);
#else
#ifdef OPENCL
network_predict_opencl(net, X);
#else
network_predict_cpu(net, X);
#endif
#endif

free_image(det_s);
@@ -466,6 +476,9 @@ int main(int argc, char **argv)
if (gpu_index >= 0) {
cuda_set_device(gpu_index);
}
#endif
#ifdef OPENCL
ocl_initialize();
#endif
run_detector(argc, argv);
return 0;
@@ -0,0 +1,28 @@
#pragma once
#ifndef OCL_H
#define OCL_H

#ifdef OPENCL
#include "CL/cl.h"


//#include "additionally.h"

#ifdef __cplusplus
extern "C" {
#endif

bool ocl_initialize();
void ocl_push_array(cl_mem x_gpu, float *x, size_t n);
cl_mem ocl_make_array(float *x, size_t n);
cl_mem ocl_make_int_array(size_t n);
void ocl_push_convolutional_layer(convolutional_layer layer);


#ifdef __cplusplus
}
#endif

#endif // OPENCL

#endif // OCL_H

Large diffs are not rendered by default.

@@ -52,7 +52,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 8.0.props" />
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 9.0.props" />
</ImportGroup>
<ImportGroup Label="Shared">
</ImportGroup>
@@ -144,6 +144,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 8.0.targets" />
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 9.0.targets" />
</ImportGroup>
</Project>
@@ -0,0 +1,28 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 14
VisualStudioVersion = 14.0.25420.1
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "yolo_ocl", "yolo_ocl.vcxproj", "{58803E1F-DBC5-4332-87B4-FF90E6E1E2A8}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Debug|x86 = Debug|x86
Release|x64 = Release|x64
Release|x86 = Release|x86
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{58803E1F-DBC5-4332-87B4-FF90E6E1E2A8}.Debug|x64.ActiveCfg = Debug|x64
{58803E1F-DBC5-4332-87B4-FF90E6E1E2A8}.Debug|x64.Build.0 = Debug|x64
{58803E1F-DBC5-4332-87B4-FF90E6E1E2A8}.Debug|x86.ActiveCfg = Debug|Win32
{58803E1F-DBC5-4332-87B4-FF90E6E1E2A8}.Debug|x86.Build.0 = Debug|Win32
{58803E1F-DBC5-4332-87B4-FF90E6E1E2A8}.Release|x64.ActiveCfg = Release|x64
{58803E1F-DBC5-4332-87B4-FF90E6E1E2A8}.Release|x64.Build.0 = Release|x64
{58803E1F-DBC5-4332-87B4-FF90E6E1E2A8}.Release|x86.ActiveCfg = Release|Win32
{58803E1F-DBC5-4332-87B4-FF90E6E1E2A8}.Release|x86.Build.0 = Release|Win32
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal
@@ -0,0 +1,157 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|Win32">
<Configuration>Release</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{58803E1F-DBC5-4332-87B4-FF90E6E1E2A8}</ProjectGuid>
<RootNamespace>yolo_cpu</RootNamespace>
<WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CharacterSet>MultiByte</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>MultiByte</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CharacterSet>MultiByte</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>MultiByte</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Label="Shared">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<OutDir>bin\</OutDir>
<IntDir>$(Platform)\ocl_$(Configuration)\</IntDir>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<OutDir>bin\</OutDir>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<SDLCheck>true</SDLCheck>
</ClCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<SDLCheck>true</SDLCheck>
</ClCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<SDLCheck>true</SDLCheck>
</ClCompile>
<Link>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<SDLCheck>true</SDLCheck>
<AdditionalIncludeDirectories>C:\opencv_3.0\opencv\build\include;3rdparty\include;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\include;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\include;3rdparty\CLBlast\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<PreprocessorDefinitions>OPENCL;OPENCV;_TIMESPEC_DEFINED;_CRT_SECURE_NO_WARNINGS;WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<OpenMPSupport>true</OpenMPSupport>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
</ClCompile>
<Link>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalLibraryDirectories>C:\opencv_3.0\opencv\build\x64\vc14\lib;C:\opencv_2.4.13\opencv\build\x64\vc12\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
<AdditionalDependencies>3rdparty\lib\x64\pthreadVC2.lib;3rdparty/CLBlast/Release/clblast.lib;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\lib\x64\OpenCL.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="src\additionally.c" />
<ClCompile Include="src\box.c" />
<ClCompile Include="src\main.c" />
<ClCompile Include="src\OpenCL\buffer.cpp" />
<ClCompile Include="src\OpenCL\clutils.cpp" />
<ClCompile Include="src\OpenCL\cl_wrapper.cpp" />
<ClCompile Include="src\OpenCL\helpers.cpp" />
<ClCompile Include="src\OpenCL\image2D.cpp" />
<ClCompile Include="src\OpenCL\iv_common.cpp" />
<ClCompile Include="src\OpenCL\kernel_launcher.cpp" />
<ClCompile Include="src\OpenCL\OCLManager.cpp" />
<ClCompile Include="src\OpenCL\oclUtils.cpp" />
<ClCompile Include="src\OpenCL\program.cpp" />
<ClCompile Include="src\OpenCL\sampler.cpp" />
<ClCompile Include="src\OpenCL\shrUtils.cpp" />
<ClCompile Include="src\yolov2_forward_network.c" />
<ClCompile Include="src\yolov2_forward_network_ocl.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="src\additionally.h" />
<ClInclude Include="src\box.h" />
<ClInclude Include="src\ocl.h" />
<ClInclude Include="src\OpenCL\include\buffer.h" />
<ClInclude Include="src\stb_image.h" />
<ClInclude Include="src\stb_image_write.h" />
</ItemGroup>
<ItemGroup>
<None Include="bin\DeepNNFP32.cl" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>