In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [None]:
%%writefile cuda_program.cu
#include <iostream>
#include <cuda_runtime.h>

// Kernel for vector addition: writes C[i] = A[i] + B[i] for every i in [0, n).
//
// Launch layout: 1D grid of 1D blocks; only the .x dimensions are used.
// The kernel walks the data with a stride of (gridDim.x * blockDim.x), so it
// produces the full result for ANY grid size, including a grid that is smaller
// than n (e.g. <<<1, 1>>> for debugging). No shared memory is used.
//
// Parameters:
//   A, B  input arrays, each readable by the device for at least n ints
//   C     output array, writable by the device for at least n ints
//   n     number of elements; values <= 0 make the kernel a no-op
__global__ void addVectors(int* A, int* B, int* C, int n) {
    if (n <= 0) {
        return;
    }

    // Index math is done in size_t: blockIdx.x * blockDim.x is a 32-bit
    // unsigned product, and narrowing it to int can wrap negative for threads
    // at the tail of a very large grid, which would defeat an `i < n` guard.
    const size_t count = static_cast<size_t>(n);
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;

    for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < count;
         i += stride) {
        C[i] = A[i] + B[i];
    }
}

// Demo: adds two 1,000,000-element integer vectors on the GPU and prints the
// first 10 results to stdout.
//
// Every CUDA runtime call is checked. On the first failure a diagnostic is
// written to stderr, the remaining stages are skipped, and whatever was
// allocated so far is released before returning.
void vectorAddition() {
    const int n = 1000000;
    // Byte count kept in size_t (the type the CUDA APIs take) instead of int.
    const size_t size = static_cast<size_t>(n) * sizeof(int);
    int *A = nullptr, *B = nullptr, *C = nullptr;
    int *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;

    // Reports a failed CUDA call; returns true when the call succeeded.
    auto check = [](cudaError_t status, const char* what) -> bool {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << "CUDA error in vectorAddition (" << what << "): "
                  << cudaGetErrorString(status) << "\n";
        return false;
    };

    // Allocate pinned host memory for better data transfer performance,
    // plus the matching device buffers. && short-circuits on first failure.
    bool ok = check(cudaMallocHost(&A, size), "cudaMallocHost A")
           && check(cudaMallocHost(&B, size), "cudaMallocHost B")
           && check(cudaMallocHost(&C, size), "cudaMallocHost C")
           && check(cudaMalloc(&d_A, size), "cudaMalloc d_A")
           && check(cudaMalloc(&d_B, size), "cudaMalloc d_B")
           && check(cudaMalloc(&d_C, size), "cudaMalloc d_C");

    if (ok) {
        for (int i = 0; i < n; i++) {
            A[i] = i;
            B[i] = i * 2;
        }

        ok = check(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice), "copy A to device")
          && check(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice), "copy B to device");
    }

    if (ok) {
        const int blockSize = 256;
        const int numBlocks = (n + blockSize - 1) / blockSize;  // ceil-div
        addVectors<<<numBlocks, blockSize>>>(d_A, d_B, d_C, n);

        // A launch does not return a status: configuration errors show up in
        // cudaGetLastError(), execution faults at the next synchronization.
        ok = check(cudaGetLastError(), "addVectors launch")
          && check(cudaDeviceSynchronize(), "addVectors execution");
    }

    if (ok) {
        ok = check(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost), "copy C to host");
    }

    if (ok) {
        std::cout << "Vector Addition Result (First 10 values):\n";
        for (int i = 0; i < 10; i++) std::cout << C[i] << " ";
        std::cout << "\n";
    }

    // Release only what was actually allocated; runs on success and failure.
    if (d_A) check(cudaFree(d_A), "cudaFree d_A");
    if (d_B) check(cudaFree(d_B), "cudaFree d_B");
    if (d_C) check(cudaFree(d_C), "cudaFree d_C");
    if (A) check(cudaFreeHost(A), "cudaFreeHost A");
    if (B) check(cudaFreeHost(B), "cudaFreeHost B");
    if (C) check(cudaFreeHost(C), "cudaFreeHost C");
}

// Kernel for square matrix multiplication: C = A * B, all N x N, row-major.
//
// Launch layout: 2D grid of 2D blocks. The x dimension selects the output
// column and the y dimension the output row; each in-range thread computes
// exactly one element of C. Threads that fall outside the N x N output exit
// immediately. No shared memory is used.
__global__ void matmul(int* A, int* B, int* C, int N) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard clause: the grid may overshoot the matrix on either axis.
    if (row >= N || col >= N) {
        return;
    }

    // Start of this thread's row of A; it is read left to right below.
    const int* aRow = A + row * N;

    // Dot product of row `row` of A with column `col` of B.
    int acc = 0;
    int k = 0;
    while (k < N) {
        acc += aRow[k] * B[k * N + col];
        ++k;
    }

    C[row * N + col] = acc;
}

// Demo: multiplies two 512 x 512 integer matrices on the GPU and prints the
// top-left 5 x 5 corner of the product to stdout.
//
// Every CUDA runtime call is checked. On the first failure a diagnostic is
// written to stderr, the remaining stages are skipped, and whatever was
// allocated so far is released before returning.
void matrixMultiplication() {
    const int N = 512;
    // Byte count kept in size_t (the type the CUDA APIs take) instead of int.
    const size_t size = static_cast<size_t>(N) * N * sizeof(int);
    int *A = nullptr, *B = nullptr, *C = nullptr;
    int *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;

    // Reports a failed CUDA call; returns true when the call succeeded.
    auto check = [](cudaError_t status, const char* what) -> bool {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << "CUDA error in matrixMultiplication (" << what << "): "
                  << cudaGetErrorString(status) << "\n";
        return false;
    };

    // Pinned host buffers plus matching device buffers.
    // && short-circuits on the first failure.
    bool ok = check(cudaMallocHost(&A, size), "cudaMallocHost A")
           && check(cudaMallocHost(&B, size), "cudaMallocHost B")
           && check(cudaMallocHost(&C, size), "cudaMallocHost C")
           && check(cudaMalloc(&d_A, size), "cudaMalloc d_A")
           && check(cudaMalloc(&d_B, size), "cudaMalloc d_B")
           && check(cudaMalloc(&d_C, size), "cudaMalloc d_C");

    if (ok) {
        // Initialize matrices with smaller values to avoid overflow
        // (entries are in 1..10, so each dot product is at most 512 * 100).
        for (int i = 0; i < N * N; i++) {
            A[i] = (i % 10) + 1;
            B[i] = (i % 10) + 1;
        }

        ok = check(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice), "copy A to device")
          && check(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice), "copy B to device");
    }

    if (ok) {
        const dim3 blockSize(16, 16);
        // Ceil-div derived from blockSize so the two cannot drift apart.
        const dim3 gridSize((N + blockSize.x - 1) / blockSize.x,
                            (N + blockSize.y - 1) / blockSize.y);
        matmul<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);

        // A launch does not return a status: configuration errors show up in
        // cudaGetLastError(), execution faults at the next synchronization.
        ok = check(cudaGetLastError(), "matmul launch")
          && check(cudaDeviceSynchronize(), "matmul execution");
    }

    if (ok) {
        ok = check(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost), "copy C to host");
    }

    if (ok) {
        std::cout << "Matrix Multiplication Result (First 5x5 values):\n";
        for (int i = 0; i < 5; i++) {
            for (int j = 0; j < 5; j++) {
                std::cout << C[i * N + j] << " ";
            }
            std::cout << "\n";
        }
    }

    // Release only what was actually allocated; runs on success and failure.
    if (d_A) check(cudaFree(d_A), "cudaFree d_A");
    if (d_B) check(cudaFree(d_B), "cudaFree d_B");
    if (d_C) check(cudaFree(d_C), "cudaFree d_C");
    if (A) check(cudaFreeHost(A), "cudaFreeHost A");
    if (B) check(cudaFreeHost(B), "cudaFreeHost B");
    if (C) check(cudaFreeHost(C), "cudaFreeHost C");
}

// Program entry point: runs the vector-addition demo, then the
// matrix-multiplication demo, and exits with status 0.
int main() {
    using Demo = void (*)();

    // Demos are executed in the order listed here.
    const Demo demos[] = {vectorAddition, matrixMultiplication};
    for (Demo run : demos) {
        run();
    }

    return 0;
}


Writing cuda_program.cu


In [None]:
!nvcc -arch=sm_75 cuda_program.cu -o cuda_program

In [None]:
!./cuda_program


Vector Addition Result (First 10 values):
0 3 6 9 12 15 18 21 24 27 
Matrix Multiplication Result (First 5x5 values):
16072 18880 14038 16846 13024 
13020 15832 14054 16866 16108 
14048 16864 13050 15866 13072 
14056 16876 16126 18946 14116 
13044 15868 13082 15906 14140 


In [None]:
!nvcc --version
!nvidia-smi


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
Tue Apr  1 13:58:04 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   42C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                       