In [1]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-dnjd8cba
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-dnjd8cba
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 5741c522547756ac4bb7a16df32106a15efb8a57
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Building wheels for collected packages: nvcc4jupyter
  Building wheel for nvcc4jupyter (pyproject.toml) ... done
  Created wheel for nvcc4jupyter: filename=nvcc4jupyter-1.2.1-py3-none-any.whl size=10741 sha256=1c43b610d84440f376c57bb0b3d20f87e2433fdb6f06eb98d5fd7c81c461a971
  Stored in directory: /tmp/pip-ephem-wheel-cache-n3ieb3zs/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully bu

In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [3]:
pip install nvcc4jupyter



In [4]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmp7vd6avqz".


In [5]:
%%cuda

#include <stdio.h>
#include <stdlib.h>

// Wraps a CUDA runtime call so that a failure is reported together with the
// source file and line of the call site.
#define HANDLE_ERROR( err ) ( HandleError( err, __FILE__, __LINE__ ) )

// Terminates the program if a CUDA runtime call did not succeed.
//
//   err  - status code returned by the CUDA runtime call
//   file - source file of the call site (supplied by HANDLE_ERROR)
//   line - source line of the call site (supplied by HANDLE_ERROR)
//
// On success this returns without doing anything. On failure it prints the
// runtime's description of the error plus the call site to stdout and exits
// with EXIT_FAILURE.
static void HandleError( cudaError_t err, const char *file, int line )
{
    // Successful call: nothing to report.
    if (err == cudaSuccess)
        return;

    const char *description = cudaGetErrorString( err );
    printf( "%s in %s at line %d\n", description, file, line );
    exit( EXIT_FAILURE );
}

const int N = 1000000; // Number of elements in each host/device vector (1 million)

// Element-wise vector addition: dev_c[i] = dev_a[i] + dev_b[i] for 0 <= i < n.
//
//   dev_a, dev_b - input vectors in device memory, at least n elements each
//   dev_c        - output vector in device memory, at least n elements
//   n            - number of elements to process
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its global index
// and advances by the total number of launched threads, so every element is
// processed even when the grid holds fewer than n threads. No shared memory
// is used.
//
// The index is kept in 64 bits: the 32-bit product blockIdx.x * blockDim.x,
// once narrowed to int, can turn negative for n close to INT_MAX with a
// rounded-up grid, which would pass the bounds check and index out of range.
__global__ void Vector_Addition_CUDA(const int *dev_a, const int *dev_b, int *dev_c, int n)
{
    const long long stride = (long long)gridDim.x * blockDim.x;
    const long long first  = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    for (long long idx = first; idx < n; idx += stride)
        dev_c[idx] = dev_a[idx] + dev_b[idx];
}

// Host driver for the vector-addition demo.
//
// Fills two N-element host vectors (a[i] = i, b[i] = i * i), copies them to the
// device, runs Vector_Addition_CUDA once while timing the kernel with CUDA
// events, copies the result back, prints every "a + b = c" line followed by the
// kernel time, and releases all host and device resources.
//
// Returns 0 on success and EXIT_FAILURE if a host allocation fails. Any failing
// CUDA runtime call terminates the program through HANDLE_ERROR.
int main(void)
{
    const size_t bytes = (size_t)N * sizeof(int);

    // Allocate memory for host arrays
    int *Host_a = (int*)malloc(bytes);
    int *Host_b = (int*)malloc(bytes);
    int *Host_c = (int*)malloc(bytes);
    if (Host_a == NULL || Host_b == NULL || Host_c == NULL)
    {
        printf("Failed to allocate host memory for %d elements\n", N);
        // free(NULL) is a no-op, so releasing all three is safe.
        free(Host_a);
        free(Host_b);
        free(Host_c);
        return EXIT_FAILURE;
    }

    // Initialize host arrays
    for (int i = 0; i < N; i++)
    {
        Host_a[i] = i;
        // i * i no longer fits in an int once i > 46340, and signed overflow is
        // undefined behaviour. Multiply as unsigned so the wrap-around is well
        // defined; the stored values are the low 32 bits of the square.
        Host_b[i] = (int)((unsigned int)i * (unsigned int)i);
    }

    // Allocate memory on device
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    HANDLE_ERROR(cudaMalloc((void **)&dev_a, bytes));
    HANDLE_ERROR(cudaMalloc((void **)&dev_b, bytes));
    HANDLE_ERROR(cudaMalloc((void **)&dev_c, bytes));

    // Copy host arrays to device
    HANDLE_ERROR(cudaMemcpy(dev_a, Host_a, bytes, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(dev_b, Host_b, bytes, cudaMemcpyHostToDevice));

    // Calculate grid size (ceiling division so the tail elements are covered)
    const int threads_per_block = 256;
    const int num_blocks = (N + threads_per_block - 1) / threads_per_block;

    // Start timer
    cudaEvent_t start, stop;
    float cuda_elapsed_time_ms = 0.0f;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));
    HANDLE_ERROR(cudaEventRecord(start, 0));

    // Launch kernel
    Vector_Addition_CUDA<<<num_blocks, threads_per_block>>>(dev_a, dev_b, dev_c, N);
    // A launch does not return a status; an invalid configuration is only
    // visible through cudaGetLastError().
    HANDLE_ERROR(cudaGetLastError());

    // Stop timer. Waiting on the stop event also surfaces errors raised while
    // the kernel was executing.
    HANDLE_ERROR(cudaEventRecord(stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(stop));
    HANDLE_ERROR(cudaEventElapsedTime(&cuda_elapsed_time_ms, start, stop));

    // The events are not needed once the elapsed time has been read.
    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));

    // Copy result back to host
    HANDLE_ERROR(cudaMemcpy(Host_c, dev_c, bytes, cudaMemcpyDeviceToHost));

    // Print result
    for (int i = 0; i < N; i++)
        printf("%d + %d = %d\n", Host_a[i], Host_b[i], Host_c[i]);

    // Print elapsed time
    printf("Time elapsed on CUDA Vector addition for %d size input : %f ms.\n\n", N, cuda_elapsed_time_ms);

    // Free device memory
    HANDLE_ERROR(cudaFree(dev_a));
    HANDLE_ERROR(cudaFree(dev_b));
    HANDLE_ERROR(cudaFree(dev_c));

    // Free host memory
    free(Host_a);
    free(Host_b);
    free(Host_c);

    return 0;
}


Streaming output truncated to the last 5000 lines.
995003 + -2106475367 = -2105480364
995004 + -2104485360 = -2103490356
995005 + -2102495351 = -2101500346
995006 + -2100505340 = -2099510334
995007 + -2098515327 = -2097520320
995008 + -2096525312 = -2095530304
995009 + -2094535295 = -2093540286
995010 + -2092545276 = -2091550266
995011 + -2090555255 = -2089560244
995012 + -2088565232 = -2087570220
995013 + -2086575207 = -2085580194
995014 + -2084585180 = -2083590166
995015 + -2082595151 = -2081600136
995016 + -2080605120 = -2079610104
995017 + -2078615087 = -2077620070
995018 + -2076625052 = -2075630034
995019 + -2074635015 = -2073639996
995020 + -2072644976 = -2071649956
995021 + -2070654935 = -2069659914
995022 + -2068664892 = -2067669870
995023 + -2066674847 = -2065679824
995024 + -2064684800 = -2063689776
995025 + -2062694751 = -2061699726
995026 + -2060704700 = -2059709674
995027 + -2058714647 = -2057719620
995028 + -2056724592 = -2055729564
995029 + -2054734535 = -2