# 1. Write a CUDA C/C++ program to perform element-wise addition of two vectors.
C[i]=A[i]+B[i]

Given: Vector size: N = 1024

In [1]:
%%writefile vector_addition.cu
#include<stdio.h>
#define N 1024

__global__ void vectoradd(int *a,int*b,int*c){
  int i=blockIdx.x*blockDim.x+threadIdx.x;
  c[i]=a[i]+b[i];
}

int main(){
  int a[N],b[N],c[N];
  int *d_a,*d_b,*d_c;
  for (int i=0;i<N;i++){
    a[i]=i;
    b[i]=2*i;
  }
  int size=N*sizeof(int);
  cudaMalloc((void**)&d_a,size);
  cudaMalloc((void**)&d_b,size);
  cudaMalloc((void**)&d_c,size);

  cudaMemcpy(d_a,a,size,cudaMemcpyHostToDevice);
  cudaMemcpy(d_b,b,size,cudaMemcpyHostToDevice);
  vectoradd<<<N/256,256>>>(d_a,d_b,d_c);

  cudaMemcpy(c,d_c,size,cudaMemcpyDeviceToHost);
  printf("First 10 results:\n");
  for (int i=0;i<10;i++){
    printf("%d\n",c[i]);
  }
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);

  return 0;
}

Overwriting vector_addition.cu


In [2]:
! nvcc -arch=sm_75 vector_addition.cu -o add

In [3]:
! ./add

First 10 results:
0
3
6
9
12
15
18
21
24
27


# 2. Perform the same vector additon as in Q1 using Thrust library only

In [4]:
%%writefile add.cu

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <iostream>
#define N 1024

int main(){

  thrust::host_vector<float> h_a(N);
  thrust::host_vector<float> h_b(N);
  for (int i=0;i<N;i++){
    h_a[i]=i;
    h_b[i]=2*i;
  }
  thrust::device_vector<int>a=h_a;
  thrust::device_vector<int>b=h_b;
  thrust::device_vector<int>c(N);
  thrust::transform(a.begin(),a.end(),b.begin(),c.begin(),thrust::plus<int>());
  thrust::host_vector<int>h_c=c;
  printf("First 10 results:\n");
  for (int i=0;i<10;i++){
    printf("%d\n",h_c[i]);
  }

}

Overwriting add.cu


In [5]:
!nvcc -arch=sm_75 add.cu -o add

In [6]:
! ./add

First 10 results:
0
3
6
9
12
15
18
21
24
27


# 3. Compute the dot product of two vectors of size, N =1024: Result=∑A[i]×B[i] using Thrust and compare its performance with that on CPU.

In [7]:
%%writefile inner_product.cu

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/inner_product.h>
#include <iostream>
#include <chrono>
#define N 1024

int main(){
  thrust::host_vector<int> h_a(N);
  thrust::host_vector<int> h_b(N);
  for (int i=0;i<N;i++){
    h_a[i]=i;
    h_b[i]=2*i;
  }

  auto cpu_start = std::chrono::high_resolution_clock::now();
  int cpu_result = 0;
  for(int i=0;i<N;i++){
      cpu_result += h_a[i] * h_b[i];
  }
  auto cpu_end = std::chrono::high_resolution_clock::now();

  auto start = std::chrono::high_resolution_clock::now();
  thrust::device_vector<int>a=h_a;
  thrust::device_vector<int>b=h_b;
  int result = thrust::inner_product(a.begin(),a.end(),b.begin(),0);
  auto end = std::chrono::high_resolution_clock::now();

  std::cout << "CPU Dot Product = " << cpu_result <<std::endl;
  std::cout << "CPU Time = "
            << std::chrono::duration<double, std::milli>(cpu_end - cpu_start).count()
            << " ms\n";

  std::cout << "\nGPU Dot Product = " << result << std::endl;
  std::cout << "GPU Time: " << std::chrono::duration<double, std::milli>(end-start).count() <<" ms\n";
  return 0;
}

Overwriting inner_product.cu


 for int result -> The computation happens on GPU, but the final scalar result is copied back and stored in host memory.

In [8]:
!nvcc -arch=sm_75 inner_product.cu -o inner_prod

In [9]:
! ./inner_prod

CPU Dot Product = 714779648
CPU Time = 0.127337 ms

GPU Dot Product = 714779648
GPU Time: 283.062 ms


# 4. Write a CUDA kernel for matrix multiplication: C=A×B where Matrix size is 16 X 16. Explain why matrix multiplication needs more computation than addition (as in Q1).

In [10]:
%%writefile matrix_multiplication.cu
#include <stdio.h>
#include <cuda_runtime.h>

#define N 16

__global__ void matrixMul(float *A, float *B, float *C)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;

    for (int k=0;k<N;k++)
    {
        sum+=A[row*N+k]*B[k*N+col];
    }

    C[row*N+col]=sum;
}

int main()
{
    float A[N][N], B[N][N], C[N][N];

    float *d_A, *d_B, *d_C;

    for(int i=0;i<N;i++)
    {
        for(int j=0;j<N;j++)
        {
            A[i][j]=i+j;
            B[i][j]=1;
        }
    }

    cudaMalloc((void**)&d_A, N*N*sizeof(float));
    cudaMalloc((void**)&d_B, N*N*sizeof(float));
    cudaMalloc((void**)&d_C, N*N*sizeof(float));

    cudaMemcpy(d_A, A, N*N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, N*N*sizeof(float), cudaMemcpyHostToDevice);

    dim3 threadsPerBlock(16,16);
    dim3 blocksPerGrid(1,1);

    matrixMul<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C);

    cudaMemcpy(C, d_C, N*N*sizeof(float), cudaMemcpyDeviceToHost);

    printf("Result Matrix C:\n");
    for(int i=0;i<N;i++)
    {
        for(int j=0;j<N;j++)
        {
            printf("%6.1f ",C[i][j]);
        }
        printf("\n");
    }

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return 0;
}


Overwriting matrix_multiplication.cu


In [11]:
!nvcc -arch=sm_75 matrix_multiplication.cu -o matrix_multiplication

In [12]:
! ./matrix_multiplication

Result Matrix C:
 120.0  120.0  120.0  120.0  120.0  120.0  120.0  120.0  120.0  120.0  120.0  120.0  120.0  120.0  120.0  120.0 
 136.0  136.0  136.0  136.0  136.0  136.0  136.0  136.0  136.0  136.0  136.0  136.0  136.0  136.0  136.0  136.0 
 152.0  152.0  152.0  152.0  152.0  152.0  152.0  152.0  152.0  152.0  152.0  152.0  152.0  152.0  152.0  152.0 
 168.0  168.0  168.0  168.0  168.0  168.0  168.0  168.0  168.0  168.0  168.0  168.0  168.0  168.0  168.0  168.0 
 184.0  184.0  184.0  184.0  184.0  184.0  184.0  184.0  184.0  184.0  184.0  184.0  184.0  184.0  184.0  184.0 
 200.0  200.0  200.0  200.0  200.0  200.0  200.0  200.0  200.0  200.0  200.0  200.0  200.0  200.0  200.0  200.0 
 216.0  216.0  216.0  216.0  216.0  216.0  216.0  216.0  216.0  216.0  216.0  216.0  216.0  216.0  216.0  216.0 
 232.0  232.0  232.0  232.0  232.0  232.0  232.0  232.0  232.0  232.0  232.0  232.0  232.0  232.0  232.0  232.0 
 248.0  248.0  248.0  248.0  248.0  248.0  248.0  248.0  248.0  248.0  248.0  2

__Case 1: Matrix Addition (16 × 16)__

Formula: C[i][j]=A[i][j]+B[i][j]

For each element: Only 1 addition

Total operations: 16×16=256 additions

Time complexity:O(N^2)

__Case 2: Matrix Multiplication (16 × 16)__

Formula: C[i][j]= k=0∑15 A[i][k]×B[k][j]

For each element: 16 multiplications and 15 additions

Total elements: 16×16=256

Total operations: 256×16=4096 multiplications

Time complexity:O(N^3)

# 5. For vector addition of size 5,000,000, implement and compare:

• CPU sequential C/C++ program

• CUDA kernel implementation

• Thrust implementation

• RAPIDS implementation

Measure execution time and compare complexity for each approach and
present results in a table. Plot comparison graph.

In [13]:
%%writefile cpu_vector_add.cu

#include <iostream>
#include <vector>
#include <chrono>

#define N 5000000

int main() {

    std::vector<float> A(N, 1.0f);
    std::vector<float> B(N, 2.0f);
    std::vector<float> C(N);

    auto start = std::chrono::high_resolution_clock::now();

    for(int i = 0; i < N; i++)
        C[i] = A[i] + B[i];

    auto end = std::chrono::high_resolution_clock::now();

    std::cout << "CPU Time: "
              << std::chrono::duration<double, std::milli>(end-start).count()
              << " ms\n";

    return 0;
}


Overwriting cpu_vector_add.cu


In [14]:
!nvcc -arch=sm_75 cpu_vector_add.cu -o cpu_vector_add

In [15]:
! ./cpu_vector_add

CPU Time: 37.7338 ms


In [16]:
%%writefile cuda_vector_add.cu
#include<stdio.h>
#include <cuda_runtime.h>
#include <chrono>
#include <iostream>

#define N 5000000

__global__ void vectoradd(int *a,int*b,int*c){
  int i=blockIdx.x*blockDim.x+threadIdx.x;
  if(i < N)
    c[i]=a[i]+b[i];
}

int main(){

  int *a = new int[N];
  int *b = new int[N];
  int *c = new int[N];

  int *d_a,*d_b,*d_c;

  for (int i=0;i<N;i++){
    a[i]=i;
    b[i]=2*i;
  }

  int size=N*sizeof(int);

  cudaMalloc((void**)&d_a,size);
  cudaMalloc((void**)&d_b,size);
  cudaMalloc((void**)&d_c,size);

  int blockSize = 256;
  int gridSize = (N + blockSize - 1) / blockSize;

  auto start = std::chrono::high_resolution_clock::now();

  cudaMemcpy(d_a,a,size,cudaMemcpyHostToDevice);
  cudaMemcpy(d_b,b,size,cudaMemcpyHostToDevice);

  vectoradd<<<gridSize,blockSize>>>(d_a,d_b,d_c);

  cudaDeviceSynchronize();

  cudaMemcpy(c,d_c,size,cudaMemcpyDeviceToHost);

  auto end = std::chrono::high_resolution_clock::now();

  printf("First 10 results:\n");
  for (int i=0;i<10;i++){
    printf("%d\n",c[i]);
  }

  std::cout << "CUDA Time: "
            << std::chrono::duration<double, std::milli>(end-start).count()
            << " ms\n";

  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);

  delete[] a;
  delete[] b;
  delete[] c;

  return 0;
}


Overwriting cuda_vector_add.cu


In [17]:
!nvcc -arch=sm_75 cuda_vector_add.cu -o cuda_vector_add

In [18]:
! ./cuda_vector_add

First 10 results:
0
3
6
9
12
15
18
21
24
27
CUDA Time: 24.9006 ms


In [19]:
%%writefile add.cu

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <chrono>
#include <iostream>
#define N 5000000

int main(){

  thrust::host_vector<float> h_a(N);
  thrust::host_vector<float> h_b(N);
  for (int i=0;i<N;i++){
    h_a[i]=i;
    h_b[i]=2*i;
  }

  {
    thrust::device_vector<float> warm_a = h_a;
    thrust::device_vector<float> warm_b = h_b;
    thrust::device_vector<float> warm_c(N);

    thrust::transform(warm_a.begin(),
                        warm_a.end(),
                        warm_b.begin(),
                        warm_c.begin(),
                        thrust::plus<float>());

    cudaDeviceSynchronize();   // Ensure GPU finished
  }

  auto start = std::chrono::high_resolution_clock::now();
  thrust::device_vector<int>a=h_a;
  thrust::device_vector<int>b=h_b;
  thrust::device_vector<int>c(N);
  thrust::transform(a.begin(),a.end(),b.begin(),c.begin(),thrust::plus<int>());
  thrust::host_vector<int>h_c=c;
  auto end = std::chrono::high_resolution_clock::now();

  printf("First 10 results:\n");
  for (int i=0;i<10;i++){
    printf("%d\n",h_c[i]);
  }

  std::cout << "Thrust Time:"
              << std::chrono::duration<double, std::milli>(end-start).count()
              << " ms\n";

}

Overwriting add.cu


In [20]:
!nvcc -arch=sm_75 add.cu -o add

In [21]:
! ./add

First 10 results:
0
3
6
9
12
15
18
21
24
27
Thrust Time:676.17 ms


In [22]:
# rapids_vector_add.py
import cupy as cp
import time

N = 5_000_000

A = cp.ones(N, dtype=cp.float32)
B = cp.full(N, 2.0, dtype=cp.float32)

_ = A + B
cp.cuda.Stream.null.synchronize()

start = time.time()
C = A + B
cp.cuda.Stream.null.synchronize()
end = time.time()

print("RAPIDS (CuPy) Time:", (end - start) * 1000, "ms")


RAPIDS (CuPy) Time: 0.48828125 ms


# 6. Write a CUDA C++ program using the Thrust library to compute the sum of all elements in a vector stored on the GPU. The vector is of size 10 and it should be initialized with values 1,…..10

In [23]:
%%writefile thrust_sum.cu

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <iostream>

int main()
{
    const int N = 10;
    thrust::device_vector<int> d_vec(N);
    for(int i=0;i<N;i++)
    {
        d_vec[i]=i+1;
    }
    int sum = thrust::reduce(d_vec.begin(),
                             d_vec.end(),
                             0,
                             thrust::plus<int>());

    std::cout << "Sum of elements (1 to 10) = " << sum << std::endl;

    return 0;
}


Overwriting thrust_sum.cu


In [24]:
!nvcc -arch=sm_75 thrust_sum.cu -o thrust_sum

In [25]:
! ./thrust_sum

Sum of elements (1 to 10) = 55


# 7. Write a CUDA C++ program using Thrust to sort (ascending) a vector of integers on the GPU. Consider vector size 8 with following values: 7, 2, 9, 1, 5, 3, 8, 4. Print the vector before and afer sorting

In [26]:
%%writefile thrust_sort.cu

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <iostream>

int main()
{
    const int N = 8;
    thrust::host_vector<int> h_vec(N);

    int values[N] = {7, 2, 9, 1, 5, 3, 8, 4};

    for(int i = 0; i < N; i++)
        h_vec[i] = values[i];

    std::cout << "Before Sorting:\n";
    for(int i = 0; i < N; i++)
        std::cout << h_vec[i] << " ";
    std::cout << std::endl;

    thrust::device_vector<int> d_vec = h_vec;
    thrust::sort(d_vec.begin(), d_vec.end());
    h_vec = d_vec;

    std::cout << "After Sorting (Ascending):\n";
    for(int i = 0; i < N; i++)
        std::cout << h_vec[i] << " ";
    std::cout << std::endl;

    return 0;
}


Overwriting thrust_sort.cu


In [27]:
!nvcc -arch=sm_75 thrust_sort.cu -o thrust_sort

In [28]:
! ./thrust_sort

Before Sorting:
7 2 9 1 5 3 8 4 
After Sorting (Ascending):
1 2 3 4 5 7 8 9 
