#include <time.h>
#include <cuda_runtime.h>

- #define N 10000000 // Vector size = 10 million
+ /*
+  * Vector size constant - 10 million elements
+  * Choose a size large enough to demonstrate GPU parallelism benefits,
+  * but not so large that it exceeds GPU memory
+  */
+ #define N 10000000
+
+ /*
+  * CUDA thread block size
+  * 256 is a common choice because:
+  * - It's a multiple of 32 (the warp size)
+  * - It provides good occupancy on most GPUs
+  * - It balances resource usage and parallelism
+  */
#define BLOCK_SIZE 256

- // Example:
- // A = [1, 2, 3, 4, 5]
- // B = [6, 7, 8, 9, 10]
- // C = A + B = [7, 9, 11, 13, 15]
-
- // CPU vector addition
+ /*
+  * CPU implementation of vector addition
+  * Performs sequential addition of two vectors
+  * Parameters:
+  *   a, b - input vectors
+  *   c    - output vector
+  *   n    - vector size
+  * Time complexity: O(n)
+  */
void vector_add_cpu(float *a, float *b, float *c, int n) {
    for (int i = 0; i < n; i++) {
-         c[i] = a[i] + b[i];
+         c[i] = a[i] + b[i];  // Sequential addition
    }
}

- // CUDA kernel for vector addition
+ /*
+  * CUDA kernel for parallel vector addition
+  * Each thread processes one element of the vectors
+  * Parameters:
+  *   a, b - input vectors in device memory
+  *   c    - output vector in device memory
+  *   n    - vector size
+  *
+  * Thread organization:
+  * - Multiple thread blocks, each with BLOCK_SIZE threads
+  * - Each thread handles one array element
+  * - Global thread ID = blockIdx.x * blockDim.x + threadIdx.x
+  */
__global__ void vector_add_gpu(float *a, float *b, float *c, int n) {
+     // Calculate the global thread ID
    int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+     // Boundary check: the grid may launch more threads than there are elements
    if (i < n) {
-         c[i] = a[i] + b[i];
+         c[i] = a[i] + b[i];  // Parallel addition
    }
}

- // Initialize vector with random values
+ /*
+  * Initialize a vector with random float values between 0 and 1
+  * Parameters:
+  *   vec - vector to initialize
+  *   n   - vector size
+  * Note: uses rand(), which is not thread-safe
+  */
void init_vector(float *vec, int n) {
    for (int i = 0; i < n; i++) {
        vec[i] = (float)rand() / RAND_MAX;
    }
}

- // Function to measure execution time
+ /*
+  * High-precision timer function
+  * Returns:
+  *   Time in seconds (from an arbitrary fixed point) with nanosecond resolution
+  * Uses CLOCK_MONOTONIC to avoid issues with system clock changes
+  */
double get_time() {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec + ts.tv_nsec * 1e-9;
}

int main() {
-     float *h_a, *h_b, *h_c_cpu, *h_c_gpu;
-     float *d_a, *d_b, *d_c;
+     // Host (CPU) and device (GPU) pointers
+     float *h_a, *h_b, *h_c_cpu, *h_c_gpu;  // h_ prefix for host memory
+     float *d_a, *d_b, *d_c;                // d_ prefix for device memory
    size_t size = N * sizeof(float);

-     // Allocate host memory
+     // Allocate host (CPU) memory
+     // Using malloc() for pageable memory
+     // Consider cudaHostAlloc() for pinned memory in production
    h_a = (float *)malloc(size);
    h_b = (float *)malloc(size);
    h_c_cpu = (float *)malloc(size);
    h_c_gpu = (float *)malloc(size);

-     // Initialize vectors
+     // Initialize the random number generator and the input vectors
    srand(time(NULL));
    init_vector(h_a, N);
    init_vector(h_b, N);

-     // Allocate device memory
+     // Allocate device (GPU) memory
+     // cudaMalloc() allocates linear memory on the device
    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_c, size);

-     // Copy data to device
+     // Copy input data from host to device
+     // cudaMemcpy() is synchronous (blocking)
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

-     // Define grid and block dimensions
+     /*
+      * Calculate grid dimensions
+      * The formula rounds up so there are enough blocks to cover all N elements.
+      * For example, if N = 1000 and BLOCK_SIZE = 256:
+      *   num_blocks = (1000 + 256 - 1) / 256 = 4
+      *   This creates 4 blocks × 256 threads = 1024 threads total
+      */
    int num_blocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
-     // N = 1024, BLOCK_SIZE = 256, num_blocks = 4
-     // (N + BLOCK_SIZE - 1) / BLOCK_SIZE = ( (1025 + 256 - 1) / 256 ) = 1280 / 256 = 4 rounded

-     // Warm-up runs
+     // Perform warm-up runs to stabilize GPU clock speeds
    printf("Performing warm-up runs...\n");
    for (int i = 0; i < 3; i++) {
        vector_add_cpu(h_a, h_b, h_c_cpu, N);
        vector_add_gpu<<<num_blocks, BLOCK_SIZE>>>(d_a, d_b, d_c, N);
-         cudaDeviceSynchronize();
+         cudaDeviceSynchronize();  // Wait for the GPU to finish
    }

-     // Benchmark CPU implementation
+     // Benchmark the CPU implementation (average of 20 runs)
    printf("Benchmarking CPU implementation...\n");
    double cpu_total_time = 0.0;
    for (int i = 0; i < 20; i++) {
@@ -89,39 +140,43 @@ int main() {
    }
    double cpu_avg_time = cpu_total_time / 20.0;

-     // Benchmark GPU implementation
+     // Benchmark the GPU implementation (average of 20 runs)
    printf("Benchmarking GPU implementation...\n");
    double gpu_total_time = 0.0;
    for (int i = 0; i < 20; i++) {
        double start_time = get_time();
        vector_add_gpu<<<num_blocks, BLOCK_SIZE>>>(d_a, d_b, d_c, N);
-         cudaDeviceSynchronize();
+         cudaDeviceSynchronize();  // Ensure the GPU has finished before stopping the timer
        double end_time = get_time();
        gpu_total_time += end_time - start_time;
    }
    double gpu_avg_time = gpu_total_time / 20.0;

-     // Print results
+     // Display benchmark results
    printf("CPU average time: %f milliseconds\n", cpu_avg_time * 1000);
    printf("GPU average time: %f milliseconds\n", gpu_avg_time * 1000);
    printf("Speedup: %fx\n", cpu_avg_time / gpu_avg_time);

-     // Verify results (optional)
+     // Verify results by comparing the CPU and GPU outputs
+     // Copy the GPU result back to the host for comparison
    cudaMemcpy(h_c_gpu, d_c, size, cudaMemcpyDeviceToHost);
    bool correct = true;
    for (int i = 0; i < N; i++) {
+         // Allow small floating-point differences (epsilon = 1e-5)
        if (fabs(h_c_cpu[i] - h_c_gpu[i]) > 1e-5) {
            correct = false;
            break;
        }
    }
    printf("Results are %s\n", correct ? "correct" : "incorrect");

-     // Free memory
+     // Clean up: free all allocated memory
+     // Host memory
    free(h_a);
    free(h_b);
    free(h_c_cpu);
    free(h_c_gpu);
+     // Device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
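
The allocation comments added in this commit point to cudaHostAlloc() as a pinned-memory alternative to malloc(), but the diff itself keeps pageable memory. As a minimal, hypothetical sketch (not part of the commit; the buffer name and hard-coded size are only illustrative), the pinned variant could look like:

#include <cuda_runtime.h>
#include <stdio.h>

int main(void) {
    size_t size = 10000000 * sizeof(float);  // same size as the benchmark's vectors
    float *h_a = NULL;

    // cudaHostAlloc() returns page-locked (pinned) host memory, which typically
    // speeds up host<->device copies and allows asynchronous transfers
    if (cudaHostAlloc((void **)&h_a, size, cudaHostAllocDefault) != cudaSuccess) {
        fprintf(stderr, "cudaHostAlloc failed\n");
        return 1;
    }

    // ... initialize h_a and copy it to the device as in the benchmark ...

    cudaFreeHost(h_a);  // pinned memory is released with cudaFreeHost(), not free()
    return 0;
}

Pinned memory is a limited resource, so the usual trade-off is to pin only the buffers that are actually transferred.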
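For the N actually compiled into this file (10,000,000), the same round-up formula described in the new comment block gives 39,063 blocks and therefore 10,000,128 threads, 128 more than there are elements, which is exactly why the kernel keeps the if (i < n) guard. A small standalone check of that arithmetic (an illustrative snippet, not part of the commit):

#include <stdio.h>

#define N 10000000
#define BLOCK_SIZE 256

int main(void) {
    // Same rounding-up formula as in the diff
    int num_blocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;      // 39063
    long total_threads = (long)num_blocks * BLOCK_SIZE;      // 10000128
    printf("blocks = %d, threads launched = %ld, surplus threads = %ld\n",
           num_blocks, total_threads, total_threads - (long)N);  // surplus = 128
    return 0;
}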