Skip to content

Commit

Permalink
cu RAM optimization & ETA countdown
Browse files Browse the repository at this point in the history
  • Loading branch information
0xTiger committed Aug 28, 2020
1 parent 15112a5 commit d6bb98a
Show file tree
Hide file tree
Showing 5 changed files with 90 additions and 25 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore

# My files
clumpFinderCUDA/recordFile.txt
clumpFinderOpenCL/recordFile.txt

# User-specific files
*.rsuser
*.suo
Expand Down
56 changes: 41 additions & 15 deletions clumpFinderCUDA/clumpFinderCUDA.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@
#include "PrecomputedRandAdvance.h"
#include <fstream>
#include <math.h>
#include <chrono>
#include <iomanip>
#include <tuple>

typedef std::chrono::high_resolution_clock Clock;

#define THREADSPERBLOCK_X 16
#define THREADSPERBLOCK_Y 16
Expand Down Expand Up @@ -240,7 +244,22 @@ __global__ void reduction(int* inData, int* outData, int* outIdData) {
}*/
}

// Streams a microsecond duration as zero-padded "HH:MM:SS".
// Any sub-second remainder is discarded; the hour field widens past two digits
// when needed. Returns `os` so the call can be chained.
std::ostream& operator<<(std::ostream& os, const std::chrono::microseconds& v) {
    // Split with chrono arithmetic instead of int math: 1000 * 1000 * 60 * 60
    // (3.6e9) does not fit in an int, and narrowing v.count() to int wraps once
    // the duration exceeds roughly 35 minutes.
    const auto h = std::chrono::duration_cast<std::chrono::hours>(v);
    const auto m = std::chrono::duration_cast<std::chrono::minutes>(v - h);
    const auto s = std::chrono::duration_cast<std::chrono::seconds>(v - h - m);

    // The fill character is sticky stream state; put the caller's one back
    // after the padded fields have been written.
    const char previousFill = os.fill('0');
    os << std::setw(2) << static_cast<long long>(h.count()) << ':'
       << std::setw(2) << static_cast<long long>(m.count()) << ':'
       << std::setw(2) << static_cast<long long>(s.count());
    os.fill(previousFill);
    return os;
}


std::vector<int> spiral(int n) {
Expand Down Expand Up @@ -290,13 +309,10 @@ int main(int argc, char* argv[])
const size_t start = atoi(argv[1]); // 0;
const size_t end = atoi(argv[2]); // start + 15;

std::vector<int> bedrock(len * len, 0);
std::vector<int> offset = { 0, 0};

std::vector<int> final((len * len) / 256, 0);
std::vector<int> finalIds((len * len) / 256, 0);
std::vector<int> labels(len * len, 0);
std::vector<int> freq(len * len, 0);


cudaError_t err;
Expand All @@ -312,22 +328,22 @@ int main(int argc, char* argv[])
err = cudaMalloc(&a_d, sizeof(int64_t) * A_OW_112.size());
err = cudaMalloc(&b_d, sizeof(int64_t) * B_OW_112.size());
err = cudaMalloc(&off_d, sizeof(int) * offset.size());
err = cudaMalloc(&bedrock_d, sizeof(int) * bedrock.size());
err = cudaMalloc(&labels_d, sizeof(int) * labels.size());
err = cudaMalloc(&bedrock_d, sizeof(int) * len * len);
err = cudaMalloc(&labels_d, sizeof(int) * len * len);
err = cudaMalloc(&final_d, sizeof(int) * final.size());
err = cudaMalloc(&finalIds_d, sizeof(int) * finalIds.size());
err = cudaMalloc(&freq_d, sizeof(int) * freq.size());
err = cudaMalloc(&freq_d, sizeof(int) * len * len);


err = cudaMemcpy(a_d, A_OW_112.data(), sizeof(int64_t) * A_OW_112.size(), cudaMemcpyHostToDevice);
err = cudaMemcpy(b_d, B_OW_112.data(), sizeof(int64_t) * B_OW_112.size(), cudaMemcpyHostToDevice);
err = cudaMemcpy(bedrock_d, bedrock.data(), sizeof(int) * bedrock.size(), cudaMemcpyHostToDevice);

err = cudaMemset(bedrock_d, 0, sizeof(int) * len * len);

err = cudaMemcpy(final_d, final.data(), sizeof(int) * final.size(), cudaMemcpyHostToDevice);
err = cudaMemcpy(finalIds_d, finalIds.data(), sizeof(int) * finalIds.size(), cudaMemcpyHostToDevice);



std::tuple<size_t, int, int> best = { 0, 0, 0 };

//1000 (*1000*1000) takes 7500ms before new shiny kernel
Expand All @@ -347,9 +363,11 @@ int main(int argc, char* argv[])
dim3 DimBlock2(THREADSPERBLOCK_2);

for (int i = start; i < end; i++) {
std::cout << i << ' ';
auto t1 = Clock::now();

offset = { spiral(i)[0] * len , spiral(i)[1] * len };
//std::cout << offset[0] << ' ' << offset[1] << std::endl;


err = cudaMemcpy(off_d, offset.data(), sizeof(int) * offset.size(), cudaMemcpyHostToDevice);
getBedrockTile << <DimGrid, DimBlock >> > (a_d, b_d, off_d, bedrock_d);
Expand All @@ -358,7 +376,7 @@ int main(int argc, char* argv[])


//begin labeling clumps
err = cudaMemset(labels_d, 0, sizeof(int) * labels.size());
err = cudaMemset(labels_d, 0, sizeof(int) * len * len);

Init << <DimGrid, DimBlock >> > (bedrock_d, labels_d);
Analyze << <DimGrid, DimBlock >> > (labels_d);
Expand All @@ -369,7 +387,7 @@ int main(int argc, char* argv[])
//err = cudaMemcpy(bedrock.data(), bedrock_d, sizeof(int) * bedrock.size(), cudaMemcpyDeviceToHost);
//err = cudaMemcpy(labels.data(), labels_d, sizeof(int) * labels.size(), cudaMemcpyDeviceToHost);

err = cudaMemset(freq_d, 0, sizeof(int) * freq.size());
err = cudaMemset(freq_d, 0, sizeof(int) * len * len);

getFrequency << <DimGrid2, DimBlock2 >> > (labels_d, freq_d);

Expand All @@ -390,7 +408,15 @@ int main(int argc, char* argv[])
int recordX = finalIds[recordi] / len;
int recordZ = finalIds[recordi] % len;

std::cout << ' ' << record << " @ (" << recordX + offset[0] << ", " << recordZ + offset[1] << ')' << std::endl;
auto t2 = Clock::now();

std::cout << i << ' ';
std::cout << ' ' << record << " @ (" << recordX + offset[0] << ", " << recordZ + offset[1] << ')' << " " << std::endl;

std::chrono::microseconds ms = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
int per_sec = (float)(1000000) / ms.count();

std::cout << per_sec << "tiles/s" << " ETA: " << ms * (end - i) << '\r';

std::tuple<size_t, int, int> result = { record, recordX + offset[0], recordZ + offset[1] };

Expand All @@ -399,7 +425,7 @@ int main(int argc, char* argv[])
}
}

std::cout << "Best found: " << std::endl;
std::cout << "Best found: " << " " << std::endl;
std::cout << std::get<0>(best) << " @ (" << std::get<1>(best) << ", " << std::get<2>(best) << ')' << std::endl;

cudaFree(a_d);
Expand Down
2 changes: 2 additions & 0 deletions clumpFinderCUDA/clumpFinderCUDA.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@
<SDLCheck>true</SDLCheck>
<ConformanceMode>true</ConformanceMode>
<AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<LanguageStandard>Default</LanguageStandard>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
Expand Down Expand Up @@ -136,6 +137,7 @@
<SDLCheck>true</SDLCheck>
<ConformanceMode>true</ConformanceMode>
<AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<LanguageStandard>Default</LanguageStandard>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
Expand Down
45 changes: 39 additions & 6 deletions clumpFinderOpenCL/clumpFinderOpenCL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,31 @@
#include <vector>
#include <math.h>
#include "PrecomputedRandAdvance.h"
#include <chrono>
#include <tuple>
#include <iomanip>

typedef std::chrono::high_resolution_clock Clock;

#define MAXPASS 10

// Writes a microsecond duration to `os` in zero-padded "HH:MM:SS" form.
// The fractional second is dropped; hours beyond 99 simply print with more
// digits. Returns `os` for chaining.
std::ostream& operator<<(std::ostream& os, const std::chrono::microseconds& v) {
    // Work in 64 bits throughout: the microseconds-per-hour constant (3.6e9)
    // and any count above about 35 minutes both exceed what an int can hold.
    long long totalSeconds = static_cast<long long>(
        std::chrono::duration_cast<std::chrono::seconds>(v).count());

    const long long hours = totalSeconds / 3600;
    totalSeconds -= hours * 3600;
    const long long minutes = totalSeconds / 60;
    const long long seconds = totalSeconds - minutes * 60;

    // setfill is persistent on the stream, so restore the caller's fill
    // character once the three padded fields are out.
    const char savedFill = os.fill();
    os << std::setfill('0') << std::setw(2) << hours << ':' << std::setw(2) << minutes
       << ':' << std::setw(2) << seconds;
    os.fill(savedFill);
    return os;
}

std::vector<int> spiral(int n) {
n++;
int k = ceil((sqrt(n) - 1) / 2);
Expand Down Expand Up @@ -76,7 +98,7 @@ int main(int argc, char* argv[])
const size_t end = atoi(argv[2]); // start + 15;


std::vector<int> bedrock(len * len, 0);
std::vector<unsigned char> bedrock(len * len, 0);
std::vector<int> offset = { 0, 0 };
std::tuple<size_t, int, int> best = { 0, 0, 0 };

Expand All @@ -89,7 +111,7 @@ int main(int argc, char* argv[])

cl::Buffer a_buf(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(int64_t) * A_OW_112.size(), A_OW_112.data(), &err);
cl::Buffer b_buf(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(int64_t) * B_OW_112.size(), B_OW_112.data(), &err);
cl::Buffer bedrock_buf(context, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, sizeof(int) * bedrock.size(), &err);
cl::Buffer bedrock_buf(context, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, sizeof(unsigned char) * bedrock.size(), &err);

kernel_bedrock.setArg(0, a_buf);
kernel_bedrock.setArg(1, b_buf);
Expand All @@ -109,6 +131,8 @@ int main(int argc, char* argv[])
cl::Buffer freq_buf(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, len * len * sizeof(cl_int), freq.data());
cl::Buffer final_buf(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, numWorkGroups * sizeof(int), final.data());
cl::Buffer finalIds_buf(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, numWorkGroups * sizeof(int), finalIds.data());


//1000 (*1000*1000) takes 7500ms before new shiny kernel
//1000 (*1000*1000) takes 2500ms with new shinyish kernel
//60 (*4096*4096) takes 2000ms with new shinyish kernel
Expand All @@ -118,7 +142,8 @@ int main(int argc, char* argv[])
// ^ at this point after scaling up to 150 iters, we can do about 10^9 blocks per second

for (int i = start; i < end; i++) {
std::cout << i << ' ';
auto t1 = Clock::now();

offset = { spiral(i)[0]*len , spiral(i)[1]*len};
//std::cout << offset[0] << ' ' << offset[1] << std::endl;

Expand All @@ -127,7 +152,7 @@ int main(int argc, char* argv[])
kernel_bedrock.setArg(2, off_buf);

err = queue.enqueueNDRangeKernel(kernel_bedrock, cl::NullRange, cl::NDRange(len, len));
//err = queue.enqueueReadBuffer(bedrock_buf, CL_FALSE, 0, sizeof(int) * bedrock.size(), bedrock.data());
//err = queue.enqueueReadBuffer(bedrock_buf, CL_FALSE, 0, sizeof(unsigned char) * bedrock.size(), bedrock.data());

cl::finish();

Expand Down Expand Up @@ -185,7 +210,15 @@ int main(int argc, char* argv[])
int recordX = finalIds[recordi] / len;
int recordZ = finalIds[recordi] % len;

std::cout << ' ' << record << " @ (" << recordX + offset[0] << ", " << recordZ + offset[1] << ')' << std::endl;
auto t2 = Clock::now();

std::cout << i << ' ';
std::cout << ' ' << record << " @ (" << recordX + offset[0] << ", " << recordZ + offset[1] << ')' << " " << std::endl;

std::chrono::microseconds ms = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
int per_sec = (float)(1000000) / ms.count();

std::cout << per_sec << "tiles/s" << " ETA: " << ms * (end - i) << '\r';

std::tuple<size_t, int, int> result = {record, recordX + offset[0], recordZ + offset[1] };

Expand All @@ -194,7 +227,7 @@ int main(int argc, char* argv[])
}
}

std::cout << "Best found: " << std::endl;
std::cout << "Best found: " << " " << std::endl;
std::cout << std::get<0>(best) << " @ (" << std::get<1>(best) << ", " << std::get<2>(best) << ')' << std::endl;


Expand Down
8 changes: 4 additions & 4 deletions clumpFinderOpenCL/ker.cl
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ furnished to do so, subject to the following conditions :
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
*/
__kernel void labelxPreprocess_int_int(global int* label, global int* pix, global int* flags, int maxpass, int bgc, int iw, int ih) {
__kernel void labelxPreprocess_int_int(global int* label, global unsigned char* pix, global int* flags, int maxpass, int bgc, int iw, int ih) {
const int x = get_global_id(0), y = get_global_id(1);
const int p0 = y * iw + x;

Expand All @@ -37,7 +37,7 @@ __kernel void labelxPreprocess_int_int(global int* label, global int* pix, globa
label[p0] = p0;
}

__kernel void label4xMain_int_int(global int* label, global int* pix, global int* flags, int pass, int iw, int ih) {
__kernel void label4xMain_int_int(global int* label, global unsigned char* pix, global int* flags, int pass, int iw, int ih) {
const int x = get_global_id(0), y = get_global_id(1);
if (x >= iw || y >= ih) return;
const int p0 = y * iw + x;
Expand Down Expand Up @@ -191,15 +191,15 @@ inline int precompChunkIndCalcNormal(int x, int y, int z, int nether)
}


inline int getBedrock(int x, int y, int z, global const long* a, global const long* b)
/*
 * Returns 1 or 0 for the position (x, y, z):
 *   - y == 0          -> always 1
 *   - y < 0 or y > 4  -> always 0
 *   - otherwise       -> 1 exactly when rand5(...) >= y, else 0
 * rand5 is fed rawSeedFromChunk(x >> 4, z >> 4) plus the entries of a and b at
 * the index computed by precompChunkIndCalcNormal from (x & 15, y - 1, z & 15).
 */
inline unsigned char getBedrock(int x, int y, int z, global const long* a, global const long* b)
{
    if (y == 0) {
        return 1;
    } else if (y < 0 || y > 4) {
        return 0;
    }

    /* Only y in 1..4 reaches this point. */
    const int slot = precompChunkIndCalcNormal(x & 15, y - 1, z & 15, 0);
    if (rand5(rawSeedFromChunk(x >> 4, z >> 4), a[slot], b[slot]) >= y) {
        return 1;
    }
    return 0;
}

kernel void getBedrockTile(global const long* a, global const long* b, global const int* offset, global int* outData)
kernel void getBedrockTile(global const long* a, global const long* b, global const int* offset, global unsigned char* outData)
{
int x = get_global_id(0);
int z = get_global_id(1);
Expand Down

0 comments on commit d6bb98a

Please sign in to comment.