Skip to content

Commit d9d49bc

Browse files
authored
Update 00_vector_add_v1.cu
1 parent 57e2c02 commit d9d49bc

File tree

1 file changed

+84
-29
lines changed

1 file changed

+84
-29
lines changed

05 Writing your First Kernels/02 Kernels/00_vector_add_v1.cu

Lines changed: 84 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -3,82 +3,133 @@
33
#include <time.h>
44
#include <cuda_runtime.h>
55

6-
#define N 10000000 // Vector size = 10 million
6+
/*
7+
* Vector size constant - 10 million elements
8+
* Choose a large enough size to demonstrate GPU parallelism benefits
9+
* But not too large to exceed GPU memory
10+
*/
11+
#define N 10000000
12+
13+
/*
14+
* CUDA thread block size
15+
* 256 is a common choice because:
16+
* - It's a multiple of 32 (warp size)
17+
* - Provides good occupancy on most GPUs
18+
* - Balances resource usage and parallelism
19+
*/
720
#define BLOCK_SIZE 256
821

9-
// Example:
10-
// A = [1, 2, 3, 4, 5]
11-
// B = [6, 7, 8, 9, 10]
12-
// C = A + B = [7, 9, 11, 13, 15]
13-
14-
// CPU vector addition
22+
/*
 * Reference (sequential) vector addition on the host.
 * Computes c[i] = a[i] + b[i] for every element, one at a time.
 * Parameters:
 *   a, b - input vectors (host memory)
 *   c    - output vector (host memory)
 *   n    - number of elements
 * Runs in O(n); used both for benchmarking and as the ground truth
 * when validating the GPU result.
 */
void vector_add_cpu(float *a, float *b, float *c, int n) {
    // Single sequential pass over all n elements.
    int idx = 0;
    while (idx < n) {
        c[idx] = a[idx] + b[idx];
        ++idx;
    }
}
2036

21-
// CUDA kernel for vector addition
37+
/*
 * CUDA kernel: element-wise vector addition, c[i] = a[i] + b[i].
 *
 * Parameters:
 *   a, b - input vectors in device global memory
 *   c    - output vector in device global memory
 *   n    - number of elements
 *
 * Thread organization:
 *   - 1-D grid of 1-D blocks; the host launches with
 *     num_blocks = ceil(N / BLOCK_SIZE).
 *   - A grid-stride loop is used so correctness does not depend on the
 *     grid exactly covering n elements: any grid size works, including
 *     a single block (handy for debugging) or an n larger than the
 *     maximum grid. With the current launch each thread still handles
 *     at most one element, so results are unchanged.
 *   - Indexing is done in size_t to avoid 32-bit overflow of
 *     blockIdx.x * blockDim.x on very large grids.
 */
__global__ void vector_add_gpu(float *a, float *b, float *c, int n) {
    // Total number of threads in the grid = stride between iterations.
    size_t stride = (size_t)gridDim.x * blockDim.x;

    // Each thread starts at its global ID and strides across the array;
    // the loop condition doubles as the bounds check.
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         i < (size_t)n;
         i += stride) {
        c[i] = a[i] + b[i];
    }
}
2859

29-
// Initialize vector with random values
60+
/*
 * Fill vec[0..n-1] with pseudo-random floats in [0, 1].
 * Parameters:
 *   vec - destination buffer (host memory)
 *   n   - number of elements to write
 * Note: relies on rand(), so the sequence depends on the current
 * srand() seed and the function is not thread-safe.
 */
void init_vector(float *vec, int n) {
    float *end = vec + n;
    // Pointer walk over the buffer; each element gets rand()/RAND_MAX,
    // which lies in [0, 1].
    for (float *p = vec; p != end; ++p) {
        *p = (float)rand() / RAND_MAX;
    }
}
3572

36-
// Function to measure execution time
73+
/*
 * Wall-clock timestamp in seconds with nanosecond resolution.
 * Returns:
 *   Seconds since an arbitrary fixed point, as a double.
 * Uses CLOCK_MONOTONIC, which is unaffected by system clock
 * adjustments, making differences between two calls a reliable
 * elapsed-time measurement.
 */
double get_time() {
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);
    double seconds = (double)now.tv_sec;
    double nanos = (double)now.tv_nsec;
    return seconds + nanos * 1e-9;
}
4284

4385
/*
 * CUDA error-checking helper: prints file/line and the error string,
 * then exits. Kernel launches return no status directly, so
 * cudaGetLastError() is checked separately after each launch.
 */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Benchmark driver:
 *   1. Allocate and initialize host/device buffers.
 *   2. Warm up both implementations.
 *   3. Time CPU and GPU vector addition (average of 20 runs each).
 *   4. Verify the GPU result against the CPU result.
 *   5. Release all resources.
 * Returns 0 on success, EXIT_FAILURE on allocation or CUDA errors.
 */
int main() {
    // Host (h_ prefix) and device (d_ prefix) buffers
    float *h_a, *h_b, *h_c_cpu, *h_c_gpu;
    float *d_a, *d_b, *d_c;
    size_t size = N * sizeof(float);

    // Allocate pageable host memory. (cudaHostAlloc would give pinned
    // memory with faster async transfers; not needed for this demo.)
    h_a = (float*)malloc(size);
    h_b = (float*)malloc(size);
    h_c_cpu = (float*)malloc(size);
    h_c_gpu = (float*)malloc(size);
    if (h_a == NULL || h_b == NULL || h_c_cpu == NULL || h_c_gpu == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        return EXIT_FAILURE;
    }

    // Seed the RNG and fill the input vectors with random data
    srand(time(NULL));
    init_vector(h_a, N);
    init_vector(h_b, N);

    // Allocate device (GPU) linear memory
    CUDA_CHECK(cudaMalloc(&d_a, size));
    CUDA_CHECK(cudaMalloc(&d_b, size));
    CUDA_CHECK(cudaMalloc(&d_c, size));

    // Copy inputs host -> device; cudaMemcpy is synchronous (blocking)
    CUDA_CHECK(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice));

    /*
     * Ceil-divide so the grid covers all N elements even when N is not
     * a multiple of BLOCK_SIZE, e.g. N = 1000, BLOCK_SIZE = 256:
     * num_blocks = (1000 + 255) / 256 = 4 -> 1024 threads total.
     */
    int num_blocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;

    // Warm-up runs: pay one-time CUDA init costs and let GPU clocks settle
    printf("Performing warm-up runs...\n");
    for (int i = 0; i < 3; i++) {
        vector_add_cpu(h_a, h_b, h_c_cpu, N);
        vector_add_gpu<<<num_blocks, BLOCK_SIZE>>>(d_a, d_b, d_c, N);
        CUDA_CHECK(cudaGetLastError());       // catches bad launch configurations
        CUDA_CHECK(cudaDeviceSynchronize());  // wait for the GPU to finish
    }

    // Benchmark CPU implementation (average of 20 runs)
    printf("Benchmarking CPU implementation...\n");
    double cpu_total_time = 0.0;
    for (int i = 0; i < 20; i++) {
        double start_time = get_time();
        vector_add_cpu(h_a, h_b, h_c_cpu, N);
        double end_time = get_time();
        cpu_total_time += end_time - start_time;
    }
    double cpu_avg_time = cpu_total_time / 20.0;

    // Benchmark GPU implementation (average of 20 runs)
    printf("Benchmarking GPU implementation...\n");
    double gpu_total_time = 0.0;
    for (int i = 0; i < 20; i++) {
        double start_time = get_time();
        vector_add_gpu<<<num_blocks, BLOCK_SIZE>>>(d_a, d_b, d_c, N);
        CUDA_CHECK(cudaGetLastError());
        // Kernel launches are asynchronous: synchronize before reading
        // the clock so the measurement includes the full kernel run.
        CUDA_CHECK(cudaDeviceSynchronize());
        double end_time = get_time();
        gpu_total_time += end_time - start_time;
    }
    double gpu_avg_time = gpu_total_time / 20.0;

    // Report timings. Note the speedup excludes host<->device copies.
    printf("CPU average time: %f milliseconds\n", cpu_avg_time*1000);
    printf("GPU average time: %f milliseconds\n", gpu_avg_time*1000);
    printf("Speedup: %fx\n", cpu_avg_time / gpu_avg_time);

    // Verify: copy the GPU result back and compare against the CPU result
    CUDA_CHECK(cudaMemcpy(h_c_gpu, d_c, size, cudaMemcpyDeviceToHost));
    bool correct = true;
    for (int i = 0; i < N; i++) {
        // Tolerate tiny floating-point differences between CPU and GPU
        if (fabs(h_c_cpu[i] - h_c_gpu[i]) > 1e-5) {
            correct = false;
            break;
        }
    }
    printf("Results are %s\n", correct ? "correct" : "incorrect");

    // Clean up: host memory first, then device memory
    free(h_a);
    free(h_b);
    free(h_c_cpu);
    free(h_c_gpu);
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    return 0;
}

0 commit comments

Comments
 (0)