-
Notifications
You must be signed in to change notification settings - Fork 2
/
opt1.cu
executable file
·102 lines (89 loc) · 2.83 KB
/
opt1.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/*
 * Returns the current wall-clock time from gettimeofday() as a single
 * microsecond count (seconds * 1e6 + microseconds).
 *
 * The seconds field is widened to long long BEFORE the multiplication so the
 * product cannot overflow on platforms where time_t / long is 32 bits wide.
 */
long long get_time() {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return ((long long)tv.tv_sec * 1000000LL) + (long long)tv.tv_usec;
}
/*
 * One explicit update of a 7-point stencil over an nx * ny * nz grid stored
 * with x fastest, then y, then z (index = i + j*nx + k*nx*ny).
 *
 * Launch layout: 2D grid of 2D blocks. Each thread owns one (i, j) column and
 * walks it from k = 0 to k = nz-1, keeping the cell below, the centre cell and
 * the cell above in registers so each tIn value along z is loaded only once.
 * Threads whose (i, j) falls outside the nx * ny plane do nothing, so the grid
 * may over-cover the domain. No shared memory is used.
 *
 * Boundaries: at every face the missing neighbour is replaced by the cell
 * itself (W/E/N/S indices are clamped; the bottom and top layers reuse the
 * centre value for the layer below / above).
 *
 * Parameters:
 *   p      per-cell source term, added as sdc * p[c] (presumably dissipated
 *          power; confirm against the caller)
 *   tIn    input field, read only
 *   tOut   output field, must not alias tIn
 *   sdc    scale factor applied to p
 *   nx,ny,nz  grid extents
 *   ce,cw,cn,cs,ct,cb  weights of the east/west/north/south/top/bottom
 *          neighbours
 *   cc     weight of the centre cell
 *
 * Indices are plain int, so nx * ny * nz must fit in an int.
 */
__global__ void hotspotOpt1(float *p, float* tIn, float *tOut, float sdc,
        int nx, int ny, int nz,
        float ce, float cw,
        float cn, float cs,
        float ct, float cb,
        float cc)
{
    const float amb_temp = 80.0f;

    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    const int j = blockDim.y * blockIdx.y + threadIdx.y;

    // Guard against grids that over-cover the plane and against empty columns;
    // without it the loads and stores below would run out of bounds.
    if (i >= nx || j >= ny || nz < 1) {
        return;
    }

    const int xy = nx * ny;   // stride between consecutive z layers

    int c = i + j * nx;
    // In-plane neighbours, clamped to the cell itself on the domain edge.
    int W = (i == 0)      ? c : c - 1;
    int E = (i == nx - 1) ? c : c + 1;
    int N = (j == 0)      ? c : c - nx;
    int S = (j == ny - 1) ? c : c + nx;

    // Bottom layer has no layer below it: reuse the centre value.
    float below  = tIn[c];
    float centre = below;

    for (int k = 0; k < nz; ++k) {
        // Top layer has no layer above it: reuse the centre value instead of
        // reading past the end of the array.
        const float above = (k < nz - 1) ? tIn[c + xy] : centre;

        // Term order is kept exactly as in the original kernel so the
        // floating-point result is unchanged.
        tOut[c] = cc * centre + cw * tIn[W] + ce * tIn[E] + cs * tIn[S]
            + cn * tIn[N] + cb * below + ct * above + sdc * p[c] + ct * amb_temp;

        // Slide the register window one layer up.
        below  = centre;
        centre = above;

        c += xy;
        W += xy;
        E += xy;
        N += xy;
        S += xy;
    }
}
/*
 * Reports a failed CUDA runtime call on stderr.
 * Returns true when `err` is cudaSuccess, false otherwise.
 */
static bool hotspotOpt1Check(cudaError_t err, const char *what)
{
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "hotspot_opt1: %s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

/*
 * Runs `numiter` steps of the hotspotOpt1 kernel on the GPU and copies the
 * final field back to the host.
 *
 * Parameters:
 *   p        host array of nx*ny*nz floats, uploaded once and passed to the
 *            kernel as its source term
 *   tIn      host array of nx*ny*nz floats, the initial field
 *   tOut     host array of nx*ny*nz floats, receives the field after the last
 *            step (a copy of tIn when numiter <= 0)
 *   nx,ny,nz grid extents; all must be positive and nz must be at least 2
 *   Cap, Rx, Ry, Rz, dt
 *            combined into the stencil weights: dt/Cap divided by Rx, Ry and
 *            Rz respectively (presumably thermal capacitance, per-axis
 *            resistances and time step; confirm against the caller)
 *   numiter  number of kernel launches
 *
 * The elapsed wall-clock time of the launch loop is printed to stdout.
 * Any CUDA failure or invalid extent is reported on stderr; in that case tOut
 * is left untouched. Device buffers are always released before returning.
 *
 * The launch uses 64x4 thread blocks and nx/64 by ny/4 blocks, so only
 * extents that are multiples of 64 and 4 are fully covered.
 */
void hotspot_opt1(float *p, float *tIn, float *tOut,
        int nx, int ny, int nz,
        float Cap,
        float Rx, float Ry, float Rz,
        float dt, int numiter)
{
    // A column needs at least two layers here: a single-layer grid is rejected
    // rather than relying on the kernel to cope with it.
    if (nx <= 0 || ny <= 0 || nz < 2) {
        fprintf(stderr, "hotspot_opt1: invalid grid %d x %d x %d (need nx > 0, ny > 0, nz >= 2)\n",
                nx, ny, nz);
        return;
    }

    const int blockX = 64;
    const int blockY = 4;
    if (nx % blockX != 0 || ny % blockY != 0) {
        fprintf(stderr, "hotspot_opt1: warning: nx=%d / ny=%d are not multiples of %d / %d; "
                "cells outside the covered region are not updated\n",
                nx, ny, blockX, blockY);
    }

    float ce, cw, cn, cs, ct, cb, cc;

    float stepDivCap = dt / Cap;
    ce = cw = stepDivCap / Rx;
    cn = cs = stepDivCap / Ry;
    ct = cb = stepDivCap / Rz;

    // Kept in double precision exactly as before so the weight is bit-identical.
    cc = 1.0 - (2.0*ce + 2.0*cn + 3.0*ct);

    size_t s = sizeof(float) * nx * ny * nz;

    // NULL-initialised so the unconditional cudaFree calls below are safe even
    // when an allocation fails part-way through.
    float *tIn_d = NULL, *tOut_d = NULL, *p_d = NULL;

    bool ok = hotspotOpt1Check(cudaMalloc((void**)&p_d, s), "cudaMalloc(p)")
           && hotspotOpt1Check(cudaMalloc((void**)&tIn_d, s), "cudaMalloc(tIn)")
           && hotspotOpt1Check(cudaMalloc((void**)&tOut_d, s), "cudaMalloc(tOut)")
           && hotspotOpt1Check(cudaMemcpy(tIn_d, tIn, s, cudaMemcpyHostToDevice),
                               "cudaMemcpy(tIn -> device)")
           && hotspotOpt1Check(cudaMemcpy(p_d, p, s, cudaMemcpyHostToDevice),
                               "cudaMemcpy(p -> device)");

    if (ok) {
        // The cache configuration is only a preference; if it is rejected,
        // clear the recorded error so it is not mistaken for a launch failure.
        if (cudaFuncSetCacheConfig(hotspotOpt1, cudaFuncCachePreferL1) != cudaSuccess) {
            (void)cudaGetLastError();
        }

        dim3 block_dim(blockX, blockY, 1);
        dim3 grid_dim(nx / blockX, ny / blockY, 1);

        long long start = get_time();
        for (int i = 0; ok && i < numiter; ++i) {
            hotspotOpt1<<<grid_dim, block_dim>>>
                (p_d, tIn_d, tOut_d, stepDivCap, nx, ny, nz, ce, cw, cn, cs, ct, cb, cc);
            // Launch-configuration errors are only visible through cudaGetLastError.
            ok = hotspotOpt1Check(cudaGetLastError(), "hotspotOpt1 launch");

            // Ping-pong: the field just written becomes the next step's input.
            float *t = tIn_d;
            tIn_d = tOut_d;
            tOut_d = t;
        }
        // Always wait for the device so the timing below covers the whole loop
        // and asynchronous execution errors are surfaced.
        ok = hotspotOpt1Check(cudaDeviceSynchronize(), "cudaDeviceSynchronize") && ok;
        long long stop = get_time();

        float time = (float)((stop - start)/(1000.0 * 1000.0));
        printf("Time: %.3f (s)\n",time);

        if (ok) {
            // Because of the swap at the end of every iteration, the most
            // recent result lives in tIn_d, not tOut_d. Copying tOut_d would
            // return the previous step (or the untouched input / uninitialised
            // memory for numiter <= 1).
            hotspotOpt1Check(cudaMemcpy(tOut, tIn_d, s, cudaMemcpyDeviceToHost),
                             "cudaMemcpy(result -> host)");
        }
    }

    cudaFree(p_d);
    cudaFree(tIn_d);
    cudaFree(tOut_d);
    return;
}