Skip to content
24 changes: 24 additions & 0 deletions 26_Blur/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Pull in the shared `common` helper module. RES receives the path of the
# included file, or NOTFOUND when the module could not be located.
include(common RESULT_VARIABLE RES)
if(NOT RES)
    message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
endif()

# Create the example's executable target. The first four arguments are left
# empty; only the precompiled-header target is forwarded.
nbl_create_executable_project(
    ""
    ""
    ""
    ""
    "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}"
)

# Optionally embed everything under app_resources/ into the executable.
if(NBL_EMBED_BUILTIN_RESOURCES)
    # NOTE(review): EXECUTABLE_NAME is not set in this file; presumably it is
    # provided by nbl_create_executable_project() above - confirm.
    set(RESOURCE_DIR "app_resources")
    set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)

    # Absolute locations of the search root and of the generated sources/headers.
    get_filename_component(
        _SEARCH_DIRECTORIES_
        "${CMAKE_CURRENT_SOURCE_DIR}"
        ABSOLUTE
    )
    get_filename_component(
        _OUTPUT_DIRECTORY_SOURCE_
        "${CMAKE_CURRENT_BINARY_DIR}/src"
        ABSOLUTE
    )
    get_filename_component(
        _OUTPUT_DIRECTORY_HEADER_
        "${CMAKE_CURRENT_BINARY_DIR}/include"
        ABSOLUTE
    )

    # Collect every file below the resource directory, expressed relative to it.
    # CONFIGURE_DEPENDS makes the build re-check the glob for added/removed files.
    file(
        GLOB_RECURSE BUILTIN_RESOURCE_FILES
        RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}"
        CONFIGURE_DEPENDS
        "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*"
    )

    # Register each discovered file in the RESOURCES_TO_EMBED list.
    foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
        LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
    endforeach()

    # Generate the builtin-resource target and link it into the executable.
    ADD_CUSTOM_BUILTIN_RESOURCES(
        ${_BR_TARGET_}
        RESOURCES_TO_EMBED
        "${_SEARCH_DIRECTORIES_}"
        "${RESOURCE_DIR}"
        "nbl::this_example::builtin"
        "${_OUTPUT_DIRECTORY_HEADER_}"
        "${_OUTPUT_DIRECTORY_SOURCE_}"
    )

    LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
endif()
11 changes: 11 additions & 0 deletions 26_Blur/app_resources/common.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#include "nbl/builtin/hlsl/cpp_compat.hlsl"
#include "nbl/builtin/hlsl/type_traits.hlsl"

// Number of blur invocations the compute shader runs per channel (inner loop bound in its `main`).
static const uint16_t PASSES = 2;

// Push-constant block consumed by the blur compute shader.
struct PushConstants
{
// Blur radius; the shader assigns it to the blur object and derives the
// normalization factor `1 / (2 * radius + 1)` from it.
nbl::hlsl::float32_t radius;
// Axis the scanline runs along (2-bit field). The shader uses it to index a
// 2D position and XORs it with 1 for the other axis, so only 0 and 1 are meaningful.
uint32_t activeAxis : 2;
// Edge wrap mode (6-bit field) forwarded to the box sampler's `wrapMode`.
// NOTE(review): the meaning of individual values is defined by the sampler, not visible here.
uint32_t edgeWrapMode : 6;
};
162 changes: 162 additions & 0 deletions 26_Blur/app_resources/shader.comp.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
#include "nbl/builtin/hlsl/prefix_sum_blur/blur.hlsl"
#include "nbl/builtin/hlsl/prefix_sum_blur/box_sampler.hlsl"
#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl"
#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
#include "nbl/builtin/hlsl/colorspace/OETF.hlsl"
#include "common.hlsl"

using namespace nbl::hlsl;

// Defines `glsl::gl_WorkGroupSize()`: a one-dimensional workgroup of WORKGROUP_SIZE invocations along X.
// NOTE(review): WORKGROUP_SIZE is not defined in this file; presumably injected as a compile-time define - confirm.
uint32_t3 glsl::gl_WorkGroupSize() { return uint32_t3(WORKGROUP_SIZE, 1, 1); }

// Source image; read texel-by-texel in `TextureProxy::load()` and queried for its size in `linearSize()`.
[[vk::binding(0)]]
Texture2D<float32_t4> input;
// Destination image; written in `TextureProxy::store()`.
[[vk::binding(1)]]
RWTexture2D<float32_t4> output;

// Per-dispatch parameters (radius, active axis, edge wrap mode) declared in common.hlsl.
[[vk::push_constant]] PushConstants pc;

// Per-invocation accessor that stages one scanline of `input` in a private
// `spill` array and writes the result back to `output`.
//
// Each invocation owns the scanline elements `ix = invocationIndex + k * WORKGROUP_SIZE`
// and keeps element `k` of that sequence in `spill[k]`.
//
// Template parameter:
//   Chnls - number of channels kept per texel.
template<uint16_t Chnls>
struct TextureProxy
{
    NBL_CONSTEXPR uint16_t Channels = Chnls;
    using texel_t = vector<float32_t, Channels>;

    // Returns `channel` of the spilled texel that corresponds to scanline index `uv`.
    // divisions by PoT constant will optimize out nicely
    template<typename T>
    T get(const uint16_t channel, const uint16_t uv)
    {
        return spill[uv / WORKGROUP_SIZE][channel];
    }

    // Overwrites `channel` of the spilled texel that corresponds to scanline index `uv`.
    template<typename T>
    void set(const uint16_t channel, const uint16_t uv, T value)
    {
        spill[uv / WORKGROUP_SIZE][channel] = value;
    }

    // Fills every `spill` slot: texels inside the scanline come from `input`,
    // slots past the end of the scanline are zeroed.
    void load()
    {
        const uint16_t end = linearSize();
        uint16_t ix = workgroup::SubgroupContiguousIndex();
        // because workgroups do scans cooperatively all spill values need sane defaults
        for (uint16_t i=0; i < SpillSize; ix += WORKGROUP_SIZE)
            spill[i++] = ix < end ? (texel_t)input[position(ix)] : promote<texel_t>(0.f);
    }

    // Writes the spilled texels back to `output`. Channels beyond `Channels`
    // keep the defaults of (0, 0, 0, 1).
    void store()
    {
        const uint16_t end = linearSize();
        uint16_t ix = workgroup::SubgroupContiguousIndex();
        // `ix < end` makes sure that we don't store out of range of the image;
        // `i < SpillSize` (same bound as `load()`) makes sure we never index past
        // the end of `spill` when the scanline is longer than MAX_SCANLINE_SIZE.
        for (uint16_t i = 0; i < SpillSize && ix < end; i++, ix += WORKGROUP_SIZE)
        {
            float32_t4 tmp = float32_t4(0, 0, 0, 1);
            for (uint16_t ch=0; ch < Channels; ch++)
                tmp[ch] = spill[i][ch];
            output[position(ix)] = tmp;
        }
    }

    // Length of the scanline: the extent of mip 0 of `input` along `activeAxis`.
    uint16_t linearSize()
    {
        // x = width, y = height; z receives the third output of GetDimensions and is not used for axes 0/1
        uint32_t3 dims;
        input.GetDimensions(0, dims.x, dims.y, dims.z);
        return _static_cast<uint16_t>(dims[activeAxis]);
    }

    // Maps scanline index `ix` to a 2D texel coordinate: `ix` goes on the
    // active axis, the workgroup's X id selects the coordinate on the other axis.
    uint16_t2 position(uint16_t ix)
    {
        uint16_t2 pos;
        pos[activeAxis] = ix;
        pos[activeAxis ^ 0x1] = _static_cast<uint16_t>(glsl::gl_WorkGroupID().x);
        return pos;
    }

    // whether we pass along X or Y
    uint16_t activeAxis;
    // number of texels each invocation holds: ceil(MAX_SCANLINE_SIZE / WORKGROUP_SIZE)
    NBL_CONSTEXPR uint16_t SpillSize = (MAX_SCANLINE_SIZE - 1) / WORKGROUP_SIZE + 1;
    texel_t spill[SpillSize];
};

// Element count of `prefix_smem`, the array behind `ScanSharedMemoryProxy`.
// NOTE(review): the reason for the extra `+ 2` on top of `scratch_size_arithmetic` is not visible here - confirm against the blur implementation.
static const uint16_t MAX_SCAN_SCRATCH_SIZE = workgroup::scratch_size_arithmetic<WORKGROUP_SIZE, MAX_SUBGROUP_SIZE>::value + 2;

// we always use `uint32_t`
// Both arrays are declared as `uint32_t`; the proxy structs `bit_cast` other 32-bit types in and out.
// `smem` backs `SharedMemoryProxy`, `prefix_smem` backs `ScanSharedMemoryProxy`.
groupshared uint32_t smem[MAX_SCANLINE_SIZE];
groupshared uint32_t prefix_smem[MAX_SCAN_SCRATCH_SIZE];

// Accessor over the `smem` groupshared array. `main` hands this type to
// `prefix_sum_blur::BoxSampler` as its accessor type.
struct SharedMemoryProxy
{
    // Number of `uint32_t` slots available through this accessor.
    NBL_CONSTEXPR uint16_t Size = MAX_SCANLINE_SIZE;

    // Reads slot `slot` and reinterprets its bits as `T`.
    // Only participates in overload resolution for 32-bit `T`.
    template<typename T, typename IndexT = uint16_t>
    enable_if_t<sizeof(T) == sizeof(uint32_t), T> get(const IndexT slot)
    {
        const uint32_t raw = smem[slot];
        return bit_cast<T>(raw);
    }

    // Reinterprets `value` as `uint32_t` bits and writes them to slot `slot`.
    // Only participates in overload resolution for 32-bit `T`.
    template<typename T, typename IndexT = uint16_t>
    enable_if_t<sizeof(T) == sizeof(uint32_t), void> set(const IndexT slot, T value)
    {
        const uint32_t raw = bit_cast<uint32_t>(value);
        smem[slot] = raw;
    }

    // Synchronisation hook for users of this accessor; forwards to `glsl::barrier()`.
    void workgroupExecutionAndMemoryBarrier()
    {
        glsl::barrier();
    }
};

// Accessor over the `prefix_smem` groupshared array. `main` passes an instance
// of this type to `prefix_sum_blur::Blur1D` as its scan scratch accessor.
struct ScanSharedMemoryProxy
{
    // Number of `uint32_t` slots available through this accessor.
    NBL_CONSTEXPR uint16_t Size = MAX_SCAN_SCRATCH_SIZE;

    // Reads slot `slot` and reinterprets its bits as `T`.
    // Only participates in overload resolution for 32-bit `T`.
    template<typename T, typename IndexT = uint16_t>
    enable_if_t<sizeof(T) == sizeof(uint32_t), T> get(const IndexT slot)
    {
        const uint32_t raw = prefix_smem[slot];
        return bit_cast<T>(raw);
    }

    // Reinterprets `value` as `uint32_t` bits and writes them to slot `slot`.
    // Only participates in overload resolution for 32-bit `T`.
    template<typename T, typename IndexT = uint16_t>
    enable_if_t<sizeof(T) == sizeof(uint32_t), void> set(const IndexT slot, T value)
    {
        const uint32_t raw = bit_cast<uint32_t>(value);
        prefix_smem[slot] = raw;
    }

    // Synchronisation hook for users of this accessor; forwards to `glsl::barrier()`.
    void workgroupExecutionAndMemoryBarrier()
    {
        glsl::barrier();
    }
};

// Compute entry point: each workgroup blurs one scanline of `input` along
// `pc.activeAxis` and writes the result to `output`.
[numthreads(WORKGROUP_SIZE, 1, 1)]
void main()
{
    ScanSharedMemoryProxy scanSmemAccessor;

    // Stage this invocation's share of the scanline in registers.
    TextureProxy<CHANNELS> texAccessor;
    texAccessor.activeAxis = (uint16_t)pc.activeAxis;
    texAccessor.load();

    // Box sampler configured from the push constants; a box of radius r
    // covers 2r + 1 texels, hence the normalization factor.
    prefix_sum_blur::BoxSampler<SharedMemoryProxy, float32_t> boxSampler;
    boxSampler.wrapMode = uint16_t(pc.edgeWrapMode);
    boxSampler.linearSize = texAccessor.linearSize();
    boxSampler.normalizationFactor = 1 / (2 * pc.radius + 1);

    prefix_sum_blur::Blur1D<decltype(texAccessor), decltype(scanSmemAccessor), decltype(boxSampler), WORKGROUP_SIZE, jit::device_capabilities> blur;
    blur.radius = pc.radius;
    blur.borderColor = float32_t4(0, 1, 0, 1);

    // Run PASSES blur invocations for every channel.
    for (uint16_t ch=0; ch < CHANNELS; ch++)
    for (uint16_t pass=0; pass < PASSES; pass++)
    {
        // its the `SharedMemoryProxy` that gets aliased and reused so we need to barrier on its memory
        // before every blur invocation except the very first one (ch == 0 && pass == 0).
        // The condition is uniform across the workgroup, so the barrier is reached by all invocations.
        if (ch != 0 || pass != 0)
            boxSampler.prefixSumAccessor.workgroupExecutionAndMemoryBarrier();
        blur(texAccessor, scanSmemAccessor, boxSampler, ch);
    }

    texAccessor.store();
}
Loading