<a href="https://colab.research.google.com/github/Bhavika-30/Parallel-Distributed-Assignment/blob/main/Assignment5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1.1

In [3]:
%%writefile 1.1.cu
#include <stdio.h>
#include <cuda_runtime.h>

#define N 1048576
__device__ float d_A[N];
__device__ float d_B[N];
__device__ float d_C[N];

__global__ void vectorAdd(int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        d_C[i] = d_A[i] + d_B[i];
    }
}

__global__ void initializeArrays(int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        d_A[i] = i;
        d_B[i] = 2 * i;
    }
}

int main(void)
{
    cudaError_t err = cudaSuccess;

    printf("Vector addition of %d elements\n", N);

    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);

    initializeArrays<<<blocksPerGrid, threadsPerBlock>>>(N);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch initialize kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    cudaDeviceSynchronize();

    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(N);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    cudaDeviceSynchronize();

    float h_C[10]; // Just verify first 10 elements to avoid memory issues
    err = cudaMemcpyFromSymbol(h_C, d_C, 10 * sizeof(float));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy result array from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < 10; i++)
    {
        float expected = i + 2*i;
        if (fabs(h_C[i] - expected) > 1e-5)
        {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n");

    err = cudaDeviceReset();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("Done\n");
    return 0;
}

Overwriting 1.1.cu


In [4]:
!nvcc 1.1.cu -o 1.1 -arch=sm_75
!./1.1

Vector addition of 1048576 elements
CUDA kernel launch with 4096 blocks of 256 threads
Test PASSED
Done


In [5]:
!nvprof ./1.1

Vector addition of 1048576 elements
CUDA kernel launch with 4096 blocks of 256 threads
==3028== NVPROF is profiling process 3028, command: ./1.1
Test PASSED
Done
==3028== Profiling application: ./1.1
==3028== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   57.57%  54.142us         1  54.142us  54.142us  54.142us  vectorAdd(int)
                   39.95%  37.567us         1  37.567us  37.567us  37.567us  initializeArrays(int)
                    2.48%  2.3360us         1  2.3360us  2.3360us  2.3360us  [CUDA memcpy DtoH]
      API calls:   75.77%  115.20ms         2  57.598ms  10.337us  115.19ms  cudaLaunchKernel
                   23.34%  35.489ms         1  35.489ms  35.489ms  35.489ms  cudaDeviceReset
                    0.65%  994.26us         1  994.26us  994.26us  994.26us  cudaMemcpyFromSymbol
                    0.15%  229.02us       114  2.0080us     193ns  98.298us  cuDeviceGetAttribute
                    0

## 1.2

In [7]:
%%writefile 1.2.cu
#include <stdio.h>
#include <cuda_runtime.h>
#define N 1048576
__device__ float d_A[N];
__device__ float d_B[N];
__device__ float d_C[N];

__global__ void vectorAdd(int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        d_C[i] = d_A[i] + d_B[i];
    }
}

__global__ void initializeArrays(int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        d_A[i] = i;
        d_B[i] = 2 * i;
    }
}

int main(void)
{
    cudaError_t err = cudaSuccess;

    printf("Vector addition of %d elements\n", N);

    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);

    initializeArrays<<<blocksPerGrid, threadsPerBlock>>>(N);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch initialize kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    cudaDeviceSynchronize();

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, NULL);

    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(N);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    cudaEventRecord(stop, NULL);

    cudaEventSynchronize(stop);

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("Kernel execution time: %f ms\n", milliseconds);

    float h_C[10]; // Just verify first 10 elements
    err = cudaMemcpyFromSymbol(h_C, d_C, 10 * sizeof(float));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy result array from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < 10; i++)
    {
        float expected = i + 2*i;
        if (fabs(h_C[i] - expected) > 1e-5)
        {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n");

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    err = cudaDeviceReset();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("Done\n");
    return 0;
}

Writing 1.2.cu


In [8]:
!nvcc 1.2.cu -o 1.2 -arch=sm_75
!./1.2

Vector addition of 1048576 elements
CUDA kernel launch with 4096 blocks of 256 threads
Kernel execution time: 0.090144 ms
Test PASSED
Done


In [9]:
!nvprof ./1.2

Vector addition of 1048576 elements
CUDA kernel launch with 4096 blocks of 256 threads
==4510== NVPROF is profiling process 4510, command: ./1.2
Kernel execution time: 0.068352 ms
Test PASSED
Done
==4510== Profiling application: ./1.2
==4510== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   57.42%  54.495us         1  54.495us  54.495us  54.495us  vectorAdd(int)
                   40.12%  38.079us         1  38.079us  38.079us  38.079us  initializeArrays(int)
                    2.46%  2.3360us         1  2.3360us  2.3360us  2.3360us  [CUDA memcpy DtoH]
      API calls:   74.75%  115.02ms         2  57.509ms  10.729us  115.01ms  cudaLaunchKernel
                   25.00%  38.471ms         1  38.471ms  38.471ms  38.471ms  cudaDeviceReset
                    0.13%  200.21us       114  1.7560us     183ns  77.345us  cuDeviceGetAttribute
                    0.04%  56.185us         1  56.185us  56.185us  56.185us  cudaEve

## 1.3

In [12]:
%%writefile 1.3.cu
#include <stdio.h>
#include <cuda_runtime.h>
#define N 1048576
__device__ float d_A[N];
__device__ float d_B[N];
__device__ float d_C[N];

__global__ void vectorAdd(int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        d_C[i] = d_A[i] + d_B[i];
    }
}

__global__ void initializeArrays(int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        d_A[i] = i;
        d_B[i] = 2 * i;
    }
}

int main(void)
{
    cudaError_t err = cudaSuccess;

    printf("Vector addition of %d elements\n", N);

    cudaDeviceProp deviceProp;
    err = cudaGetDeviceProperties(&deviceProp, 0);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to get device properties (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    float memClockRate = deviceProp.memoryClockRate * 1000;
    float memBusWidth = deviceProp.memoryBusWidth;
    float theoreticalBW = (memClockRate * memBusWidth * 2) / 8.0f;
    theoreticalBW = theoreticalBW / 1.0e9f; // Convert to GB/s

    printf("Device: %s\n", deviceProp.name);
    printf("Memory Clock Rate (KHz): %d\n", deviceProp.memoryClockRate);
    printf("Memory Bus Width (bits): %d\n", deviceProp.memoryBusWidth);
    printf("Theoretical Bandwidth (GB/s): %f\n", theoreticalBW);

    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);

    initializeArrays<<<blocksPerGrid, threadsPerBlock>>>(N);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch initialize kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    cudaDeviceSynchronize();

    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(N);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    cudaDeviceSynchronize();

    float h_C[10]; // Just verify first 10 elements
    err = cudaMemcpyFromSymbol(h_C, d_C, 10 * sizeof(float));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy result array from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < 10; i++)
    {
        float expected = i + 2*i;
        if (fabs(h_C[i] - expected) > 1e-5)
        {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n");

    err = cudaDeviceReset();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("Done\n");
    return 0;
}

Overwriting 1.3.cu


In [13]:
!nvcc 1.3.cu -o 1.3 -arch=sm_75
!./1.3

Vector addition of 1048576 elements
Device: Tesla T4
Memory Clock Rate (KHz): 5001000
Memory Bus Width (bits): 256
Theoretical Bandwidth (GB/s): 45.186092
CUDA kernel launch with 4096 blocks of 256 threads
Test PASSED
Done


In [14]:
!nvprof ./1.3

Vector addition of 1048576 elements
==6261== NVPROF is profiling process 6261, command: ./1.3
Device: Tesla T4
Memory Clock Rate (KHz): 5001000
Memory Bus Width (bits): 256
Theoretical Bandwidth (GB/s): 45.186092
CUDA kernel launch with 4096 blocks of 256 threads
Test PASSED
Done
==6261== Profiling application: ./1.3
==6261== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   57.51%  55.007us         1  55.007us  55.007us  55.007us  vectorAdd(int)
                   39.75%  38.015us         1  38.015us  38.015us  38.015us  initializeArrays(int)
                    2.74%  2.6240us         1  2.6240us  2.6240us  2.6240us  [CUDA memcpy DtoH]
      API calls:   78.58%  100.73ms         2  50.365ms  6.2520us  100.72ms  cudaLaunchKernel
                   21.13%  27.086ms         1  27.086ms  27.086ms  27.086ms  cudaDeviceReset
                    0.11%  136.49us       114  1.1970us     106ns  55.292us  cuDeviceGetAttribute


## 1.4

In [21]:
%%writefile 1.4.cu
#include <stdio.h>
#include <cuda_runtime.h>
#define N 1048576
__device__ float d_A[N];
__device__ float d_B[N];
__device__ float d_C[N];

__global__ void vectorAdd(int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        d_C[i] = d_A[i] + d_B[i];
    }
}

__global__ void initializeArrays(int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        d_A[i] = i;
        d_B[i] = 2 * i;
    }
}

int main(void)
{
    cudaError_t err = cudaSuccess;

    printf("Vector addition of %d elements\n", N);

    cudaDeviceProp deviceProp;
    err = cudaGetDeviceProperties(&deviceProp, 0);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to get device properties (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    float memClockRate = deviceProp.memoryClockRate * 1000;
    float memBusWidth = deviceProp.memoryBusWidth;
    float theoreticalBW = (memClockRate * memBusWidth * 2) / 8.0f;
    theoreticalBW = theoreticalBW / 1.0e9f; // Convert to GB/s

    printf("Device: %s\n", deviceProp.name);
    printf("Memory Clock Rate (KHz): %d\n", deviceProp.memoryClockRate);
    printf("Memory Bus Width (bits): %d\n", deviceProp.memoryBusWidth);
    printf("Theoretical Bandwidth (GB/s): %f\n", theoreticalBW);

    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);

    initializeArrays<<<blocksPerGrid, threadsPerBlock>>>(N);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch initialize kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    cudaDeviceSynchronize();

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, NULL);

    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(N);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    cudaEventRecord(stop, NULL);

    cudaEventSynchronize(stop);

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("Kernel execution time: %f ms\n", milliseconds);

    size_t bytesRead = 2 * sizeof(float) * N;
    size_t bytesWritten = sizeof(float) * N;
    float totalBytes = (float)(bytesRead + bytesWritten);
    float seconds = milliseconds / 1000.0f;
    float measuredBW = totalBytes / seconds / 1.0e9f;

    printf("Data read: %.2f MB\n", bytesRead / (1024.0f * 1024.0f));
    printf("Data written: %.2f MB\n", bytesWritten / (1024.0f * 1024.0f));
    printf("Measured Bandwidth (GB/s): %.2f\n", measuredBW);
    printf("Bandwidth utilization: %.2f%%\n", (measuredBW / theoreticalBW) * 100.0f);

    float h_C[10]; // Just verify first 10 elements
    err = cudaMemcpyFromSymbol(h_C, d_C, 10 * sizeof(float));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy result array from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < 10; i++)
    {
        float expected = i + 2*i;
        if (fabs(h_C[i] - expected) > 1e-5)
        {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n");

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    err = cudaDeviceReset();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("Done\n");
    return 0;
}

Overwriting 1.4.cu


In [22]:
!nvcc 1.4.cu -o 1.4 -arch=sm_75
!./1.4

Vector addition of 1048576 elements
Device: Tesla T4
Memory Clock Rate (KHz): 5001000
Memory Bus Width (bits): 256
Theoretical Bandwidth (GB/s): 45.186092
CUDA kernel launch with 4096 blocks of 256 threads
Kernel execution time: 0.067040 ms
Data read: 8.00 MB
Data written: 4.00 MB
Measured Bandwidth (GB/s): 187.69
Bandwidth utilization: 415.38%
Test PASSED
Done


In [23]:
!nvprof ./1.4

Vector addition of 1048576 elements
==8146== NVPROF is profiling process 8146, command: ./1.4
Device: Tesla T4
Memory Clock Rate (KHz): 5001000
Memory Bus Width (bits): 256
Theoretical Bandwidth (GB/s): 45.186092
CUDA kernel launch with 4096 blocks of 256 threads
Kernel execution time: 0.064608 ms
Data read: 8.00 MB
Data written: 4.00 MB
Measured Bandwidth (GB/s): 194.76
Bandwidth utilization: 431.01%
Test PASSED
Done
==8146== Profiling application: ./1.4
==8146== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   57.50%  54.463us         1  54.463us  54.463us  54.463us  vectorAdd(int)
                   40.03%  37.919us         1  37.919us  37.919us  37.919us  initializeArrays(int)
                    2.47%  2.3350us         1  2.3350us  2.3350us  2.3350us  [CUDA memcpy DtoH]
      API calls:   75.73%  83.909ms         2  41.955ms  7.1890us  83.902ms  cudaLaunchKernel
                   23.90%  26.483ms         1  26.