cu RAM optimization & ETA countdown

0xTiger · Aug 28, 2020 · d6bb98a · d6bb98a
1 parent 15112a5
commit d6bb98a
Show file tree

Hide file tree

Showing 5 changed files with 90 additions and 25 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,10 @@
 ##
 ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
 
+#Myfiles
+clumpFinderCUDA/recordFile.txt
+clumpFinderOpenCL/recordFile.txt
+
 # User-specific files
 *.rsuser
 *.suo

diff --git a/clumpFinderCUDA/clumpFinderCUDA.cu b/clumpFinderCUDA/clumpFinderCUDA.cu
@@ -6,7 +6,11 @@
 #include "PrecomputedRandAdvance.h"
 #include <fstream>
 #include <math.h>
+#include <chrono>
+#include <iomanip>
+#include <tuple>
 
+typedef std::chrono::high_resolution_clock Clock;
 
 #define THREADSPERBLOCK_X 16
 #define THREADSPERBLOCK_Y 16
@@ -240,7 +244,22 @@ __global__ void reduction(int* inData, int* outData, int* outIdData) {
 	}*/
 }
 
+// Formats a duration as zero-padded hh:mm:ss; the sub-second remainder is dropped.
+std::ostream& operator<<(std::ostream& os, const std::chrono::microseconds& v) {
+	// 64-bit arithmetic: one hour is 3.6e9 us, which overflows a 32-bit int
 
+	const long long total_s = v.count() / (1000LL * 1000LL);
+	const long long h = total_s / 3600;
+	const long long m = (total_s / 60) % 60;
+	const long long s = total_s % 60;
+
+	// the fill character is sticky, so restore the caller's setting afterwards
+	const char old_fill = os.fill('0');
+	os << std::setw(2) << h << ':' << std::setw(2) << m
+		<< ':' << std::setw(2) << s;
+	os.fill(old_fill);
+	return os;
+}
 
 
 std::vector<int> spiral(int n) {
@@ -290,13 +309,10 @@ int main(int argc, char* argv[])
 	const size_t start = atoi(argv[1]); // 0;
 	const size_t end = atoi(argv[2]); // start + 15;
 
-	std::vector<int> bedrock(len * len, 0);
 	std::vector<int> offset = { 0, 0};
 
 	std::vector<int> final((len * len) / 256, 0);
 	std::vector<int> finalIds((len * len) / 256, 0);
-	std::vector<int> labels(len * len, 0);
-	std::vector<int> freq(len * len, 0);
 
 
 	cudaError_t err;
@@ -312,22 +328,22 @@ int main(int argc, char* argv[])
 	err = cudaMalloc(&a_d, sizeof(int64_t) * A_OW_112.size());
 	err = cudaMalloc(&b_d, sizeof(int64_t) * B_OW_112.size());
 	err = cudaMalloc(&off_d, sizeof(int) * offset.size());
-	err = cudaMalloc(&bedrock_d, sizeof(int) * bedrock.size());
-	err = cudaMalloc(&labels_d, sizeof(int) * labels.size());
+	err = cudaMalloc(&bedrock_d, sizeof(int) * len * len);
+	err = cudaMalloc(&labels_d, sizeof(int) * len * len);
 	err = cudaMalloc(&final_d, sizeof(int) * final.size());
 	err = cudaMalloc(&finalIds_d, sizeof(int) * finalIds.size());
-	err = cudaMalloc(&freq_d, sizeof(int) * freq.size());
+	err = cudaMalloc(&freq_d, sizeof(int) * len * len);
+
 
 	err = cudaMemcpy(a_d, A_OW_112.data(), sizeof(int64_t) * A_OW_112.size(), cudaMemcpyHostToDevice);
 	err = cudaMemcpy(b_d, B_OW_112.data(), sizeof(int64_t) * B_OW_112.size(), cudaMemcpyHostToDevice);
-	
-	err = cudaMemcpy(bedrock_d, bedrock.data(), sizeof(int) * bedrock.size(), cudaMemcpyHostToDevice);
-	
+
+	err = cudaMemset(bedrock_d, 0, sizeof(int) * len * len);
+
 	err = cudaMemcpy(final_d, final.data(), sizeof(int) * final.size(), cudaMemcpyHostToDevice);
 	err = cudaMemcpy(finalIds_d, finalIds.data(), sizeof(int) * finalIds.size(), cudaMemcpyHostToDevice);
 
 
-
 	std::tuple<size_t, int, int> best = { 0, 0, 0 };
 
 	//1000 (*1000*1000) takes 7500ms before new shiny kernel
@@ -347,9 +363,11 @@ int main(int argc, char* argv[])
 	dim3 DimBlock2(THREADSPERBLOCK_2);
 
 	for (int i = start; i < end; i++) {
-		std::cout << i << ' ';
+		auto t1 = Clock::now();
+
 		offset = { spiral(i)[0] * len , spiral(i)[1] * len };
 		//std::cout << offset[0] << ' ' << offset[1] << std::endl;
+
 
 		err = cudaMemcpy(off_d, offset.data(), sizeof(int) * offset.size(), cudaMemcpyHostToDevice);
 		getBedrockTile << <DimGrid, DimBlock >> > (a_d, b_d, off_d, bedrock_d);
@@ -358,7 +376,7 @@ int main(int argc, char* argv[])
 
 
 		//begin labeling clumps
-		err = cudaMemset(labels_d, 0, sizeof(int) * labels.size());
+		err = cudaMemset(labels_d, 0, sizeof(int) * len * len);
 
 		Init << <DimGrid, DimBlock >> > (bedrock_d, labels_d);
 		Analyze << <DimGrid, DimBlock >> > (labels_d);
@@ -369,7 +387,7 @@ int main(int argc, char* argv[])
 		//err = cudaMemcpy(bedrock.data(), bedrock_d, sizeof(int) * bedrock.size(), cudaMemcpyDeviceToHost);
 		//err = cudaMemcpy(labels.data(), labels_d, sizeof(int) * labels.size(), cudaMemcpyDeviceToHost);
 
-		err = cudaMemset(freq_d, 0, sizeof(int) * freq.size());
+		err = cudaMemset(freq_d, 0, sizeof(int) * len * len);
 
 		getFrequency << <DimGrid2, DimBlock2 >> > (labels_d, freq_d);
 
@@ -390,7 +408,15 @@ int main(int argc, char* argv[])
 		int recordX = finalIds[recordi] / len;
 		int recordZ = finalIds[recordi] % len;
 
-		std::cout << ' ' << record << " @ (" << recordX + offset[0] << ", " << recordZ + offset[1] << ')' << std::endl;
+		auto t2 = Clock::now();
+
+		std::cout << i << ' ';
+		std::cout << ' ' << record << " @ (" << recordX + offset[0] << ", " << recordZ + offset[1] << ')' << "                             " << std::endl;
+
+		std::chrono::microseconds ms = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
+		int per_sec = (float)(1000000) / ms.count();
+
+		std::cout << per_sec << "tiles/s"  << " ETA: " << ms * (end - i) << '\r';
 
 		std::tuple<size_t, int, int> result = { record, recordX + offset[0], recordZ + offset[1] };
 
@@ -399,7 +425,7 @@ int main(int argc, char* argv[])
 		}
 	}
 
-	std::cout << "Best found: " << std::endl;
+	std::cout << "Best found: " << "                             " << std::endl;
 	std::cout << std::get<0>(best) << " @ (" << std::get<1>(best) << ", " << std::get<2>(best) << ')' << std::endl;
 
 	cudaFree(a_d);

diff --git a/clumpFinderCUDA/clumpFinderCUDA.vcxproj b/clumpFinderCUDA/clumpFinderCUDA.vcxproj
@@ -105,6 +105,7 @@
       <SDLCheck>true</SDLCheck>
       <ConformanceMode>true</ConformanceMode>
       <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <LanguageStandard>Default</LanguageStandard>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
@@ -136,6 +137,7 @@
       <SDLCheck>true</SDLCheck>
       <ConformanceMode>true</ConformanceMode>
       <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <LanguageStandard>Default</LanguageStandard>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>

diff --git a/clumpFinderOpenCL/clumpFinderOpenCL.cpp b/clumpFinderOpenCL/clumpFinderOpenCL.cpp
@@ -4,9 +4,31 @@
 #include <vector>
 #include <math.h>
 #include "PrecomputedRandAdvance.h"
+#include <chrono>
+#include <tuple>
+#include <iomanip>
+
+typedef std::chrono::high_resolution_clock Clock;
 
 #define MAXPASS 10
 
+// Formats a duration as zero-padded hh:mm:ss; the sub-second remainder is dropped.
+std::ostream& operator<<(std::ostream& os, const std::chrono::microseconds& v) {
+	// 64-bit arithmetic: one hour is 3.6e9 us, which overflows a 32-bit int
+
+	const long long total_s = v.count() / (1000LL * 1000LL);
+	const long long h = total_s / 3600;
+	const long long m = (total_s / 60) % 60;
+	const long long s = total_s % 60;
+
+	// the fill character is sticky, so restore the caller's setting afterwards
+	const char old_fill = os.fill('0');
+	os << std::setw(2) << h << ':' << std::setw(2) << m
+		<< ':' << std::setw(2) << s;
+	os.fill(old_fill);
+	return os;
+}
+
 std::vector<int> spiral(int n) {
 	n++;
 	int k = ceil((sqrt(n) - 1) / 2);
@@ -76,7 +98,7 @@ int main(int argc, char* argv[])
 	const size_t end = atoi(argv[2]); // start + 15;
 
 
-	std::vector<int> bedrock(len * len, 0);
+	std::vector<unsigned char> bedrock(len * len, 0);
 	std::vector<int> offset = { 0, 0 };
 	std::tuple<size_t, int, int> best = { 0, 0, 0 };
 
@@ -89,7 +111,7 @@ int main(int argc, char* argv[])
 
 	cl::Buffer a_buf(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(int64_t) * A_OW_112.size(), A_OW_112.data(), &err);
 	cl::Buffer b_buf(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(int64_t) * B_OW_112.size(), B_OW_112.data(), &err);
-	cl::Buffer bedrock_buf(context, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, sizeof(int) * bedrock.size(), &err);
+	cl::Buffer bedrock_buf(context, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, sizeof(unsigned char) * bedrock.size(), &err);
 
 	kernel_bedrock.setArg(0, a_buf);
 	kernel_bedrock.setArg(1, b_buf);
@@ -109,6 +131,8 @@ int main(int argc, char* argv[])
 	cl::Buffer freq_buf(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, len * len * sizeof(cl_int), freq.data());
 	cl::Buffer final_buf(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, numWorkGroups * sizeof(int), final.data());
 	cl::Buffer finalIds_buf(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, numWorkGroups * sizeof(int), finalIds.data());
+
+
 	//1000 (*1000*1000) takes 7500ms before new shiny kernel
 	//1000 (*1000*1000) takes 2500ms with new shinyish kernel
 	//60 (*4096*4096) takes 2000ms with new shinyish kernel
@@ -118,7 +142,8 @@ int main(int argc, char* argv[])
 	// ^ at this point after scaling up to 150 iters, we can do about 10^9 blocks per second
 
 	for (int i = start; i < end; i++) {
-		std::cout << i << ' ';
+		auto t1 = Clock::now();
+
 		offset = { spiral(i)[0]*len , spiral(i)[1]*len};
 		//std::cout << offset[0] << ' ' << offset[1] << std::endl;
 
@@ -127,7 +152,7 @@ int main(int argc, char* argv[])
 		kernel_bedrock.setArg(2, off_buf);
 
 		err = queue.enqueueNDRangeKernel(kernel_bedrock, cl::NullRange, cl::NDRange(len, len));
-		//err = queue.enqueueReadBuffer(bedrock_buf, CL_FALSE, 0, sizeof(int) * bedrock.size(), bedrock.data());
+		//err = queue.enqueueReadBuffer(bedrock_buf, CL_FALSE, 0, sizeof(unsigned char) * bedrock.size(), bedrock.data());
 
 		cl::finish();
 
@@ -185,7 +210,15 @@ int main(int argc, char* argv[])
 		int recordX = finalIds[recordi] / len;
 		int recordZ = finalIds[recordi] % len;
 
-		std::cout << ' ' << record <<  " @ (" << recordX + offset[0] << ", " << recordZ + offset[1] << ')' << std::endl;
+		auto t2 = Clock::now();
+
+		std::cout << i << ' ';
+		std::cout << ' ' << record << " @ (" << recordX + offset[0] << ", " << recordZ + offset[1] << ')' << "                             " << std::endl;
+
+		std::chrono::microseconds ms = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
+		int per_sec = (float)(1000000) / ms.count();
+
+		std::cout << per_sec << "tiles/s" << " ETA: " << ms * (end - i) << '\r';
 
 		std::tuple<size_t, int, int> result = {record, recordX + offset[0], recordZ + offset[1] };
 
@@ -194,7 +227,7 @@ int main(int argc, char* argv[])
 		}
 	}
 
-	std::cout << "Best found: " << std::endl;
+	std::cout << "Best found: " << "                             " << std::endl;
 	std::cout << std::get<0>(best) << " @ (" << std::get<1>(best) << ", " << std::get<2>(best) << ')' << std::endl;
 
 

diff --git a/clumpFinderOpenCL/ker.cl b/clumpFinderOpenCL/ker.cl
@@ -21,7 +21,7 @@ furnished to do so, subject to the following conditions :
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 */
-__kernel void labelxPreprocess_int_int(global int* label, global int* pix, global int* flags, int maxpass, int bgc, int iw, int ih) {
+__kernel void labelxPreprocess_int_int(global int* label, global unsigned char* pix, global int* flags, int maxpass, int bgc, int iw, int ih) {
 	const int x = get_global_id(0), y = get_global_id(1);
 	const int p0 = y * iw + x;
 
@@ -37,7 +37,7 @@ __kernel void labelxPreprocess_int_int(global int* label, global int* pix, globa
 	label[p0] = p0;
 }
 
-__kernel void label4xMain_int_int(global int* label, global int* pix, global int* flags, int pass, int iw, int ih) {
+__kernel void label4xMain_int_int(global int* label, global unsigned char* pix, global int* flags, int pass, int iw, int ih) {
 	const int x = get_global_id(0), y = get_global_id(1);
 	if (x >= iw || y >= ih) return;
 	const int p0 = y * iw + x;
@@ -191,15 +191,15 @@ inline int precompChunkIndCalcNormal(int x, int y, int z, int nether)
 }
 
 
-inline int getBedrock(int x, int y, int z, global const long* a, global const long* b)
+inline unsigned char getBedrock(int x, int y, int z, global const long* a, global const long* b)
 {
 	if (y == 0) return 1;
 	if (y < 0 || y > 4) return 0;
 	int precomp_ind = precompChunkIndCalcNormal(x & 15, y - 1, z & 15, 0);
 	return (rand5(rawSeedFromChunk(x >> 4, z >> 4), a[precomp_ind], b[precomp_ind]) >= y)? 1 : 0;
 }
 
-kernel void getBedrockTile(global const long* a, global const long* b, global const int* offset, global int* outData)
+kernel void getBedrockTile(global const long* a, global const long* b, global const int* offset, global unsigned char* outData)
 {
 	int x = get_global_id(0);
 	int z = get_global_id(1);