In [1]:

# Print the version of the nvcc compiler driver available in this runtime.
!nvcc --version
# Install the NVCCPlugin package straight from its GitHub repository.
!pip install git+https://github.com/afnan47/cuda.git
# Load the nvcc_plugin IPython extension provided by that package.
%load_ext nvcc_plugin


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting git+https://github.com/afnan47/cuda.git
  Cloning https://github.com/afnan47/cuda.git to /tmp/pip-req-build-4any113t
  Running command git clone --filter=blob:none --quiet https://github.com/afnan47/cuda.git /tmp/pip-req-build-4any113t
  Resolved https://github.com/afnan47/cuda.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4289 sha256=7ca17e541c4a029b45cd484b77537c1e5f23c131deb79f78e5f877617bba9385
  Stored in directory: /tmp/pip-ephem-wheel-cache-3p7teppe/wheels/aa/f3/44/e10c1d226ec561d971fcd4b0463f6bff08602afa928a3e

In [9]:
%%writefile add.cu
#include <iostream>
#include <vector>
#include <cuda_runtime.h>

// CUDA kernel to add two large vectors element-wise: c[i] = a[i] + b[i]
// for every i in [0, size).
//
// Launch layout: a 1D grid of 1D blocks. Any grid size is valid, because each
// thread walks the vectors in a grid-stride loop (step = gridDim.x * blockDim.x);
// when the grid already covers `size` elements, every thread handles at most
// one element. No shared memory is used.
//
// Parameters:
//   a, b  - input vectors, each readable for at least `size` ints from device code
//   c     - output vector, writable for at least `size` ints from device code
//   size  - number of elements to process; nothing is done when size <= 0
//
// Every processed element is traced with a device-side printf, so the kernel
// is intended for small demonstration inputs.
__global__ void vectorAdd(const int *a, const int *b, int *c, int size) {
    // Index math is done in 64 bits: in 32 bits, blockIdx.x * blockDim.x + threadIdx.x
    // can exceed INT_MAX in the last block when `size` is close to INT_MAX, and the
    // wrapped (negative) index would wrongly pass the bounds check.
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    const long long first = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (long long idx = first; idx < size; idx += stride) {
        const int i = static_cast<int>(idx);  // safe: idx < size <= INT_MAX
        printf("Thread %d is computing c[%d] = a[%d] + b[%d]\n", threadIdx.x, i, i, i);
        c[i] = a[i] + b[i];
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Fills `values` with values.size() integers read from stdin.
// Returns false if the input ends early or contains a non-integer token.
static bool readVector(std::vector<int> &values) {
    for (int &value : values) {
        if (!(std::cin >> value)) {
            return false;
        }
    }
    return true;
}

// Computes host_c[i] = host_a[i] + host_b[i] on the GPU via the vectorAdd kernel.
// All three vectors must have the same, non-zero length (a zero-length launch
// would be an invalid configuration). Device buffers are released on every path.
// Returns true on success; on failure a diagnostic has been written to stderr.
static bool addOnDevice(const std::vector<int> &host_a, const std::vector<int> &host_b,
                        std::vector<int> &host_c) {
    const int size = static_cast<int>(host_a.size());
    const size_t bytes = host_a.size() * sizeof(int);

    int *device_a = nullptr;
    int *device_b = nullptr;
    int *device_c = nullptr;

    // && short-circuits, so the first failing step stops the chain and the
    // remaining pointers stay null (cudaFree on a null pointer is a no-op).
    bool ok = cudaSucceeded(cudaMalloc(&device_a, bytes), "cudaMalloc(device_a)") &&
              cudaSucceeded(cudaMalloc(&device_b, bytes), "cudaMalloc(device_b)") &&
              cudaSucceeded(cudaMalloc(&device_c, bytes), "cudaMalloc(device_c)") &&
              cudaSucceeded(cudaMemcpy(device_a, host_a.data(), bytes, cudaMemcpyHostToDevice),
                            "cudaMemcpy(host_a -> device_a)") &&
              cudaSucceeded(cudaMemcpy(device_b, host_b.data(), bytes, cudaMemcpyHostToDevice),
                            "cudaMemcpy(host_b -> device_b)");

    if (ok) {
        const int blockSize = 256;
        // Ceiling division written so it cannot overflow int for large sizes
        // (size > 0 is guaranteed by the caller).
        const int numBlocks = (size - 1) / blockSize + 1;

        vectorAdd<<<numBlocks, blockSize>>>(device_a, device_b, device_c, size);

        // A kernel launch returns nothing: configuration errors are reported by
        // cudaGetLastError(), and the blocking copy below waits for the kernel
        // and reports any error raised while it ran.
        ok = cudaSucceeded(cudaGetLastError(), "vectorAdd launch") &&
             cudaSucceeded(cudaMemcpy(host_c.data(), device_c, bytes, cudaMemcpyDeviceToHost),
                           "cudaMemcpy(device_c -> host_c)");
    }

    cudaFree(device_a);
    cudaFree(device_b);
    cudaFree(device_c);
    return ok;
}

// Reads a vector length and two integer vectors from stdin, adds them on the
// GPU and prints each element-wise sum.
// Returns 0 on success and 1 on invalid input or on any CUDA failure.
int main() {
    int size = 0;
    std::cout << "Enter the size of the vectors: ";
    // Reject unparsable or negative sizes before they reach the vector constructors.
    if (!(std::cin >> size) || size < 0) {
        std::cerr << "Invalid vector size: expected a non-negative integer." << std::endl;
        return 1;
    }

    // Allocate memory for vectors on host
    std::vector<int> host_a(size);
    std::vector<int> host_b(size);
    std::vector<int> host_c(size);

    // Input elements of vectors
    std::cout << "Enter elements of vector A: ";
    if (!readVector(host_a)) {
        std::cerr << "Invalid input for vector A: expected " << size << " integers." << std::endl;
        return 1;
    }

    std::cout << "Enter elements of vector B: ";
    if (!readVector(host_b)) {
        std::cerr << "Invalid input for vector B: expected " << size << " integers." << std::endl;
        return 1;
    }

    // With no elements there is nothing to compute, so the GPU is skipped.
    if (size > 0 && !addOnDevice(host_a, host_b, host_c)) {
        return 1;
    }

    // Print result
    std::cout << "Result vector:" << std::endl;
    for (int i = 0; i < size; ++i) {
        std::cout << host_a[i] << " + " << host_b[i] << " = " << host_c[i] << std::endl;
    }

    return 0;
}


Overwriting add.cu


In [10]:
# Compile add.cu with nvcc into an executable named "add".
!nvcc add.cu -o add
# Run the compiled program; it prompts for the vector size and elements on stdin.
!./add

Enter the size of the vectors: 4
Enter elements of vector A: 12 2 3 4
Enter elements of vector B: 44 3 23 55
Thread 0 is computing c[0] = a[0] + b[0]
Thread 1 is computing c[1] = a[1] + b[1]
Thread 2 is computing c[2] = a[2] + b[2]
Thread 3 is computing c[3] = a[3] + b[3]
Result vector:
12 + 44 = 56
2 + 3 = 5
3 + 23 = 26
4 + 55 = 59
