In [None]:
%%writefile vector_add.cu
#include <iostream>
#include <cuda_runtime.h>  
// Element-wise vector addition: C[i] = A[i] + B[i] for every i in [0, N).
//
// Launch layout: 1D grid of 1D blocks. Only the x dimension is used for
// indexing. Each thread starts at its global x index and advances by the total
// number of launched threads, so any 1D launch configuration processes all N
// elements; the grid does not have to cover N.
//
// Parameters:
//   A, B  input arrays, read at indices [0, N); must be dereferenceable from
//         device code.
//   C     output array, written at indices [0, N).
//   N     number of elements to process; N <= 0 performs no work.
//
// No shared memory is used and no block-level synchronization is required.
__global__ void vectorAdd(const float* A, const float* B, float* C, int N) {
    // Index arithmetic is done in signed 64-bit so that
    // blockIdx.x * blockDim.x + threadIdx.x cannot wrap around for large
    // launches, and a negative N simply fails the loop condition.
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    const long long first = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (long long i = first; i < N; i += stride) {
        C[i] = A[i] + B[i];
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error during " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Adds two 10-element host vectors on the GPU and prints "a + b = c" per element.
//
// Returns 0 on success. If any CUDA runtime call or the kernel launch fails,
// the error is reported on stderr, the results are NOT printed (C would hold
// uninitialized data), device buffers are released, and 1 is returned.
int main() {
    const int N = 10;
    const size_t bytes = N * sizeof(float);
    float A[N], B[N], C[N];

    // Initializing input arrays
    for (int i = 0; i < N; i++) {
        A[i] = static_cast<float>(i);
        B[i] = static_cast<float>(i * 2);
    }

    // Start from null so the cudaFree calls below are safe even when an
    // allocation never happened.
    float* d_a = nullptr;
    float* d_b = nullptr;
    float* d_c = nullptr;

    // Short-circuit evaluation stops at the first failing step.
    bool ok = cudaOk(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)")
           && cudaOk(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)")
           && cudaOk(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c)")
           && cudaOk(cudaMemcpy(d_a, A, bytes, cudaMemcpyHostToDevice), "copy A to device")
           && cudaOk(cudaMemcpy(d_b, B, bytes, cudaMemcpyHostToDevice), "copy B to device");

    if (ok) {
        const int blocksize = 256;
        const int gridsize = (N + blocksize - 1) / blocksize;  // ceil(N / blocksize)
        vectorAdd<<<gridsize, blocksize>>>(d_a, d_b, d_c, N);

        // A kernel launch returns nothing: configuration errors are reported by
        // cudaGetLastError(), execution errors by the following synchronization.
        ok = cudaOk(cudaGetLastError(), "vectorAdd launch")
          && cudaOk(cudaDeviceSynchronize(), "vectorAdd execution")
          && cudaOk(cudaMemcpy(C, d_c, bytes, cudaMemcpyDeviceToHost), "copy C to host");
    }

    // Printing the result (only when C actually holds the kernel output)
    if (ok) {
        for (int i = 0; i < N; i++) {
            std::cout << A[i] << " + " << B[i] << " = " << C[i] << std::endl;
        }
    }

    // cudaFree on a null pointer performs no operation, so this is valid on
    // both the success and the failure path.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return ok ? 0 : 1;
}

Writing vector_add.cu


In [None]:
# Compile for compute capability 7.5 (sm_75), the architecture selected by the -gencode flag
!nvcc vector_add.cu -o vector_add -gencode arch=compute_75,code=sm_75
# Run the executable
!./vector_add

0 + 0 = 0
1 + 2 = 3
2 + 4 = 6
3 + 6 = 9
4 + 8 = 12
5 + 10 = 15
6 + 12 = 18
7 + 14 = 21
8 + 16 = 24
9 + 18 = 27
