<a href="https://colab.research.google.com/github/BaggyBro/CUDA_learn/blob/main/cuda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvidia-smi

Sun Mar 23 07:14:23 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   38C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
%%writefile vector_add.cu
#include <iostream>
#include <cuda_runtime.h>

// Element-wise integer vector addition: c[k] = a[k] + b[k].
//
// Each thread handles at most one element, selected by its flat 1D global
// index (blockIdx.x * blockDim.x + threadIdx.x). Threads whose index is not
// below N return without touching memory, so the launch may contain more
// threads than there are elements.
//
//   a, b : input arrays, read only at indices below N
//   c    : output array, written only at indices below N
//   N    : number of elements to add
__global__ void vectorAdd(int *a, int *b, int *c, int N) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N) {
        return;  // surplus thread past the end of the data
    }
    c[idx] = a[idx] + b[idx];
}

// Terminates the program if a CUDA runtime call did not succeed.
//
//   result : status code returned by the call being checked
//   msg    : short label printed ahead of the CUDA error description
//
// When result is cudaSuccess the function simply returns. Otherwise it writes
// "CUDA Error: <msg> - <description>" to stderr and exits with status 1.
void checkCuda(cudaError_t result, const char *msg) {
    if (result == cudaSuccess) {
        return;
    }

    const char *reason = cudaGetErrorString(result);
    std::cerr << "CUDA Error: " << msg << " - " << reason << std::endl;
    exit(1);
}

// Host driver: adds two 10-element integer vectors on the GPU and prints each
// result as "a + b = c", one line per element.
//
// Every CUDA runtime call, including the final cudaFree calls, is routed
// through checkCuda, which terminates the process with status 1 on the first
// failure. Returns 0 when all steps succeed.
int main() {
    const int N = 10;
    const size_t bytes = N * sizeof(int);  // size of each host/device buffer

    // Host inputs: a = 0, 1, 2, ...   b = 0, 10, 20, ...
    int a[N], b[N], c[N];
    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = i * 10;
    }

    int *d_a, *d_b, *d_c;
    checkCuda(cudaMalloc(&d_a, bytes), "Allocating d_a");
    checkCuda(cudaMalloc(&d_b, bytes), "Allocating d_b");
    checkCuda(cudaMalloc(&d_c, bytes), "Allocating d_c");

    checkCuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "Copying a");
    checkCuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "Copying b");

    // Ceiling division so the grid holds at least N threads; the kernel's
    // bounds check makes the surplus threads do nothing.
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, N);

    // A launch returns no status itself: query the launch error first, then
    // synchronize so a failure during execution is reported here rather than
    // being attributed to the copy below.
    checkCuda(cudaGetLastError(), "Kernel launch");
    checkCuda(cudaDeviceSynchronize(), "Kernel execution");

    checkCuda(cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost), "Copying c");

    for (int i = 0; i < N; i++)
        std::cout << a[i] << " + " << b[i] << " = " << c[i] << std::endl;

    // cudaFree reports errors too; check them instead of discarding them so a
    // failure at teardown does not end in a silent "return 0".
    checkCuda(cudaFree(d_a), "Freeing d_a");
    checkCuda(cudaFree(d_b), "Freeing d_b");
    checkCuda(cudaFree(d_c), "Freeing d_c");
    return 0;
}


Overwriting vector_add.cu


In [None]:
!nvcc -arch=sm_75 -rdc=true -lcudadevrt -o vector_add vector_add.cu

In [None]:
!./vector_add

0 + 0 = 0
1 + 10 = 11
2 + 20 = 22
3 + 30 = 33
4 + 40 = 44
5 + 50 = 55
6 + 60 = 66
7 + 70 = 77
8 + 80 = 88
9 + 90 = 99
