# Parallel Programming Lab
**Name:** Assem Saied ElQersh  
**ID:** 120210321

## Check CUDA Installation

In [None]:
# System checks: confirm a CUDA compiler and a GPU are available before building.
# Print the version of the nvcc compiler driver.
!nvcc --version
# Show which nvcc binary is found on PATH.
!which nvcc
# Show the NVIDIA driver version and the GPUs visible to it.
!nvidia-smi

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
/usr/local/cuda/bin/nvcc
Fri May 16 19:00:03 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   33C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |

## Problem 1: Vector Addition (2^24 elements)

### CUDA Implementation

In [None]:
%%writefile vector_add.cu
#include <iostream>
#include <cuda_runtime.h>

// Element-wise sum of two float arrays: c[k] = a[k] + b[k] for k in [0, n).
// Each thread handles at most one element, chosen by its flat 1D global
// index; threads whose index is n or greater do nothing, so the launch may
// contain more threads than there are elements.
__global__ void vectorAdd(const float *a, const float *b, float *c, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= n) {
        return;  // tail thread past the end of the data
    }
    c[gid] = a[gid] + b[gid];
}

// Evaluates a CUDA runtime call; on failure prints the error string to stderr
// and makes the enclosing function return 1. Only usable inside a function
// that returns int (here: main). Resources acquired before the failure are
// left for process teardown to reclaim.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__     \
                      << ": " << cudaGetErrorString(err_) << "\n";           \
            return 1;                                                        \
        }                                                                    \
    } while (0)

// Benchmarks vectorAdd on N = 2^24 random floats and prints the elapsed time.
// The timed region covers both host-to-device copies, the kernel and the
// device-to-host copy. Returns 0 on success, 1 if any CUDA call fails or the
// GPU result does not match a host-computed reference.
int main() {
    const int N = 1 << 24;
    const size_t size = N * sizeof(float);

    // Allocate pinned host memory
    float *h_a = nullptr, *h_b = nullptr, *h_c = nullptr;
    CUDA_CHECK(cudaMallocHost(&h_a, size));
    CUDA_CHECK(cudaMallocHost(&h_b, size));
    CUDA_CHECK(cudaMallocHost(&h_c, size));

    // Initialize vectors with values in [0, 1]
    for (int i = 0; i < N; ++i) {
        h_a[i] = static_cast<float>(rand()) / RAND_MAX;
        h_b[i] = static_cast<float>(rand()) / RAND_MAX;
    }

    // Device buffers
    float *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;
    CUDA_CHECK(cudaMalloc(&d_a, size));
    CUDA_CHECK(cudaMalloc(&d_b, size));
    CUDA_CHECK(cudaMalloc(&d_c, size));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaEventRecord(start));
    CUDA_CHECK(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice));

    const int blockSize = 256;
    const int numBlocks = (N + blockSize - 1) / blockSize;  // ceil-div
    vectorAdd<<<numBlocks, blockSize>>>(d_a, d_b, d_c, N);
    // A kernel launch returns no status; a launch that could not start
    // (bad configuration, no usable device code) is only visible here.
    CUDA_CHECK(cudaGetLastError());

    // Issued after the kernel on the same (default) stream; a fault raised
    // while the kernel executes is reported by this call.
    CUDA_CHECK(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float ms = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
    std::cout << "CUDA Vector Add Time: " << ms << " ms\n";

    // Verify against a host reference so a kernel that silently did nothing
    // cannot pass as a valid measurement.
    bool ok = true;
    for (int i = 0; i < N; ++i) {
        const float expected = h_a[i] + h_b[i];
        float diff = h_c[i] - expected;
        if (diff < 0.0f) diff = -diff;
        if (!(diff <= 1e-5f)) {  // negated form also rejects NaN
            std::cerr << "Mismatch at index " << i << ": got " << h_c[i]
                      << ", expected " << expected << "\n";
            ok = false;
            break;
        }
    }

    // Cleanup
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    CUDA_CHECK(cudaFreeHost(h_a));
    CUDA_CHECK(cudaFreeHost(h_b));
    CUDA_CHECK(cudaFreeHost(h_c));
    return ok ? 0 : 1;
}

Overwriting vector_add.cu


In [None]:
# Build for the Tesla T4 (compute capability 7.5) explicitly. Without -arch,
# nvcc targets its default architecture and the driver has to JIT-compile the
# embedded PTX at run time; the toolkit here (12.5) is newer than the CUDA
# version the driver reports (12.4), a combination in which that JIT step can
# fail and the kernel never runs.
!nvcc -arch=sm_75 -o vector_add vector_add.cu
!./vector_add

CUDA Vector Add Time: 24.9311 ms


### Problem 1: C++ Implementation

In [None]:
%%writefile vector_add.cpp
#include <iostream>
#include <vector>
#include <chrono>

// CPU baseline: adds two vectors of N = 2^24 random floats and prints the time
// taken by the addition loop alone (allocation and initialization are not
// timed). Always returns 0.
int main() {
    const int N = 1 << 24;
    std::vector<float> a(N), b(N), c(N);

    // Initialize vectors with values in [0, 1]
    for (int i = 0; i < N; ++i) {
        a[i] = static_cast<float>(rand()) / RAND_MAX;
        b[i] = static_cast<float>(rand()) / RAND_MAX;
    }

    // steady_clock is monotonic, which is what an interval measurement needs.
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < N; ++i)
        c[i] = a[i] + b[i];
    const auto end = std::chrono::steady_clock::now();

    // Consume every element of the result (outside the timed region) so the
    // optimizer cannot treat the timed loop as dead code.
    double checksum = 0.0;
    for (float v : c)
        checksum += v;
    volatile double sink = checksum;
    (void)sink;

    // Report fractional milliseconds so the figure has the same resolution as
    // the CUDA measurement instead of being truncated to a whole millisecond.
    const std::chrono::duration<double, std::milli> elapsed = end - start;
    std::cout << "C++ Vector Add Time: " << elapsed.count() << " ms\n";
    return 0;
}

Overwriting vector_add.cpp


In [None]:
# Compile the CPU baseline with g++ at optimization level -O3, then run it.
!g++ -O3 -o vector_add_cpp vector_add.cpp
!./vector_add_cpp

C++ Vector Add Time: 19 ms


## Problem 2: 4D Vector Normalization (2^22 elements)

### CUDA Implementation

In [None]:
%%writefile normalize.cu
#include <iostream>
#include <cuda_runtime.h>

// Scales each 4-component vector in `input` by the reciprocal square root of
// (its squared length + 1e-8) and writes the result to the same position in
// `output`. Each thread handles at most one vector, chosen by its flat 1D
// global index; threads whose index is n or greater do nothing.
__global__ void normalize_kernel(float4 *input, float4 *output, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= n) {
        return;  // tail thread past the end of the data
    }

    const float4 v = input[gid];
    const float len_sq = v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w;
    // The added 1e-8f keeps the rsqrtf argument positive for an all-zero vector.
    const float scale = rsqrtf(len_sq + 1e-8f);
    output[gid] = make_float4(v.x*scale, v.y*scale, v.z*scale, v.w*scale);
}

// Evaluates a CUDA runtime call; on failure prints the error string to stderr
// and makes the enclosing function return 1. Only usable inside a function
// that returns int (here: main). Resources acquired before the failure are
// left for process teardown to reclaim.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__     \
                      << ": " << cudaGetErrorString(err_) << "\n";           \
            return 1;                                                        \
        }                                                                    \
    } while (0)

// Benchmarks normalize_kernel on N = 2^22 random float4 vectors and prints the
// elapsed time. The timed region covers the host-to-device copy, the kernel
// and the device-to-host copy. Returns 0 on success, 1 if any CUDA call fails
// or the GPU output fails the magnitude check below.
int main() {
    const int N = 1 << 22;
    const size_t bytes = N * sizeof(float4);

    // Pinned host buffers
    float4 *h_input = nullptr, *h_output = nullptr;
    CUDA_CHECK(cudaMallocHost(&h_input, bytes));
    CUDA_CHECK(cudaMallocHost(&h_output, bytes));

    // Initialize vectors with components in [0, 1]
    for (int i = 0; i < N; ++i) {
        h_input[i] = make_float4(
            static_cast<float>(rand())/RAND_MAX,
            static_cast<float>(rand())/RAND_MAX,
            static_cast<float>(rand())/RAND_MAX,
            static_cast<float>(rand())/RAND_MAX
        );
    }

    // Device buffers
    float4 *d_input = nullptr, *d_output = nullptr;
    CUDA_CHECK(cudaMalloc(&d_input, bytes));
    CUDA_CHECK(cudaMalloc(&d_output, bytes));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaEventRecord(start));
    CUDA_CHECK(cudaMemcpy(d_input, h_input, bytes, cudaMemcpyHostToDevice));
    normalize_kernel<<<(N+255)/256, 256>>>(d_input, d_output, N);
    // A kernel launch returns no status; a launch that could not start
    // (bad configuration, no usable device code) is only visible here.
    CUDA_CHECK(cudaGetLastError());
    // Issued after the kernel on the same (default) stream; a fault raised
    // while the kernel executes is reported by this call.
    CUDA_CHECK(cudaMemcpy(h_output, d_output, bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float ms = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
    std::cout << "CUDA Normalization Time: " << ms << " ms\n";

    // Magnitude check without calling sqrt on the host: since
    // out = in * rsqrt(|in|^2 + eps), it must hold that
    // |out|^2 * (|in|^2 + eps) == |in|^2 up to rounding. This validates the
    // scale factor only, not each component's direction.
    bool ok = true;
    for (int i = 0; i < N; ++i) {
        const float4 in = h_input[i];
        const float4 out = h_output[i];
        const float in_sq = in.x*in.x + in.y*in.y + in.z*in.z + in.w*in.w;
        const float out_sq = out.x*out.x + out.y*out.y + out.z*out.z + out.w*out.w;
        float diff = out_sq * (in_sq + 1e-8f) - in_sq;
        if (diff < 0.0f) diff = -diff;
        if (!(diff <= 1e-4f * (in_sq + 1e-8f))) {  // negated form also rejects NaN
            std::cerr << "Normalization check failed at index " << i << "\n";
            ok = false;
            break;
        }
    }

    // Cleanup
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_output));
    CUDA_CHECK(cudaFreeHost(h_input));
    CUDA_CHECK(cudaFreeHost(h_output));
    return ok ? 0 : 1;
}

Overwriting normalize.cu


In [None]:
# Build for the Tesla T4 (compute capability 7.5) explicitly. Without -arch,
# nvcc targets its default architecture and the driver has to JIT-compile the
# embedded PTX at run time; the toolkit here (12.5) is newer than the CUDA
# version the driver reports (12.4), a combination in which that JIT step can
# fail and the kernel never runs.
!nvcc -arch=sm_75 -o normalize normalize.cu
!./normalize

CUDA Normalization Time: 20.6321 ms


### Problem 2: C++ Implementation

In [None]:
%%writefile normalize.cpp
#include <iostream>
#include <vector>
#include <chrono>
#include <cmath>

// Plain aggregate of four floats used as the element type of the CPU
// baseline; it can be brace-initialized in x, y, z, w order.
struct float4 {
    float x;
    float y;
    float z;
    float w;
};

int main() {
    const int N = 1 << 22;
    std::vector<float4> input(N), output(N);

    // Initialize vectors
    for (int i = 0; i < N; ++i) {
        input[i] = {
            static_cast<float>(rand())/RAND_MAX,
            static_cast<float>(rand())/RAND_MAX,
            static_cast<float>(rand())/RAND_MAX,
            static_cast<float>(rand())/RAND_MAX
        };
    }

    auto start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < N; ++i) {
        float sum = input[i].x*input[i].x +
                   input[i].y*input[i].y +
                   input[i].z*input[i].z +
                   input[i].w*input[i].w;
        float inv_norm = 1.0f / sqrt(sum + 1e-8f);
        output[i] = {
            input[i].x * inv_norm,
            input[i].y * inv_norm,
            input[i].z * inv_norm,
            input[i].w * inv_norm
        };
    }
    auto end = std::chrono::high_resolution_clock::now();

    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
    std::cout << "C++ Normalization Time: " << duration << " ms\n";
    return 0;
}

Overwriting normalize.cpp


In [None]:
# Compile the CPU baseline with g++ at optimization level -O3, then run it.
!g++ -O3 -o normalize_cpp normalize.cpp
!./normalize_cpp

C++ Normalization Time: 22 ms


## Performance Comparison and Analysis

After running the code above, fill in the execution times and analyze the results:

| Problem | CUDA Time (ms) | C++ Time (ms) | Speedup Factor (C++ time / CUDA time) |
|---------|----------------|---------------|----------------|
| Vector Addition | 24.9311 | 19 | 0.76x |
| Vector Normalization | 20.6321 | 22 | 1.07x |

### Analysis:

1. **Vector Addition**:
   - The C++ implementation (19ms) outperforms the CUDA implementation (24.93ms) by approximately 24%
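   - This is primarily due to the overhead of memory transfers between CPU and GPU: the CUDA timing covers both host-to-device copies, the kernel, and the device-to-host copy, whereas the C++ timing covers only the addition loop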
   - The simple nature of vector addition (one addition per element) doesn't provide enough computational intensity to overcome the memory transfer overhead
   - The CUDA implementation uses pinned memory (cudaMallocHost) which helps with transfer speeds but still incurs overhead

2. **Vector Normalization**:
   - The CUDA implementation (20.63ms) slightly outperforms the C++ version (22ms) by about 7%
   - This is because normalization is more computationally intensive, involving:
     - Multiple multiplications per element
     - Square root calculation (using fast rsqrtf in CUDA)
     - A division per element in the C++ version (the CUDA kernel avoids it by using rsqrtf and multiplying)
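   - The higher computational intensity helps offset the memory transfer overhead; this benchmark also transfers less data (64 MiB in each direction, 128 MiB in total) than vector addition (two 64 MiB inputs plus a 64 MiB output, 192 MiB in total)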

3. **Performance Factors**:
   - Memory Transfer Overhead:
     - Each timed CUDA run in this lab includes the host-to-device and device-to-host transfers as well as the kernel itself
     - For operations that do little work per element, this overhead can dominate the total time
   - Computational Intensity:
     - Operations with higher computational intensity (like normalization) benefit more from GPU parallelization
     - Simple operations (like addition) may not benefit enough to overcome transfer overhead
   - Memory Access Patterns:
     - Both implementations use contiguous memory access
     - CUDA benefits from coalesced memory access in the kernel
   - Hardware Utilization:
     - The Tesla T4 GPU used in testing has good compute capabilities
     - The CPU implementation benefits from modern CPU optimizations and cache utilization

4. **Conclusion**:
   - CUDA's performance advantage depends heavily on the computational intensity of the operation
   - For simple operations like vector addition, CPU implementations can be faster due to lower overhead
   - For more complex operations like normalization, CUDA can provide better performance
   - The break-even point for GPU acceleration depends on:
     - Problem size
     - Computational intensity
     - Memory transfer requirements
   - For real-world applications, the decision to use CUDA should consider:
     - The nature of the computation
     - Data size
     - Required precision
     - Development complexity