In [1]:
!nvcc --version


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


Program 1: Execute the following program and check the properties of your GPGPU.

In [2]:
# Step 1: Write the CUDA code
%%writefile cuda_device_info.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Prints a diagnostic to stderr when a CUDA runtime call did not succeed.
// `what` names the call for the message.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Queries the number of CUDA devices and prints a fixed set of properties
// (name, compute capability, memory sizes, launch limits, clock rate) for
// each one.
// Returns 0 on success or when no device is present, 1 when a CUDA runtime
// call fails.
int main()
{
    // Initialised so a failed query can never leave an indeterminate count.
    int deviceCount = 0;
    const cudaError_t countStatus = cudaGetDeviceCount(&deviceCount);
    printf("Checking device count...\n"); // Debug statement
    if (countStatus == cudaErrorNoDevice)
    {
        // The runtime may report an absent device as an error rather than a
        // zero count; route it to the same "no device" message below.
        deviceCount = 0;
    }
    else if (!cudaSucceeded(countStatus, "cudaGetDeviceCount"))
    {
        return 1;
    }
    if (deviceCount == 0)
    {
        printf("There is no device supporting CUDA\n");
        return 0; // Exit the program
    }
    printf("Device Count: %d\n", deviceCount); // Debug statement
    for (int dev = 0; dev < deviceCount; ++dev)
    {
        cudaDeviceProp deviceProp;
        if (!cudaSucceeded(cudaGetDeviceProperties(&deviceProp, dev), "cudaGetDeviceProperties"))
        {
            // deviceProp is not valid after a failed query; do not print it.
            return 1;
        }
        if (dev == 0)
        {
            // Summary line, printed once before the first device's details.
            if (deviceProp.major < 1)
            {
                printf("There is no device supporting CUDA.\n");
            }
            else if (deviceCount == 1)
            {
                printf("There is 1 device supporting CUDA\n");
            }
            else
            {
                printf("There are %d devices supporting CUDA\n", deviceCount);
            }
        }
        printf("  Device %d: \"%s\"\n", dev, deviceProp.name);
        printf("  Major revision number:                         %d\n", deviceProp.major);
        printf("  Minor revision number:                         %d\n", deviceProp.minor);
        // The memory-size fields are size_t, so they are printed with %zu.
        printf("  Total amount of global memory:                 %zu bytes\n", deviceProp.totalGlobalMem);
        printf("  Total amount of constant memory:               %zu bytes\n", deviceProp.totalConstMem);
        printf("  Total amount of shared memory per block:       %zu bytes\n", deviceProp.sharedMemPerBlock);
        printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf("  Warp size:                                     %d\n", deviceProp.warpSize);
        printf("  Multiprocessor count:                          %d\n", deviceProp.multiProcessorCount);
        printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
        printf("  Maximum memory pitch:                          %zu bytes\n", deviceProp.memPitch);
        printf("  Texture alignment:                             %zu bytes\n", deviceProp.textureAlignment);
        printf("  Clock rate:                                    %d kilohertz\n", deviceProp.clockRate);
    }
    return 0;
}


Writing cuda_device_info.cu


In [3]:
# Step 2: Compile the code
!nvcc cuda_device_info.cu -o cuda_device_info

In [4]:
# Step 3: Run the executable
!./cuda_device_info

Checking device count...
Device Count: 1
There is 1 device supporting CUDA
  Device 0: "Tesla T4"
  Major revision number:                         7
  Minor revision number:                         5
  Total amount of global memory:                 15835660288 bytes
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Multiprocessor count:                          40
  Maximum number of threads per block:           1024
  Maximum sizes of each dimension of a block:    1024 x 1024 x 64
  Maximum sizes of each dimension of a grid:     2147483647 x 65535 x 65535
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Clock rate:                                    1590000 kilohertz


Program 2: Write a program in which each thread prints its thread ID along with "Hello World". Launch the kernel with one block and multiple threads.

In [5]:
%%writefile cuda_device_info.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Kernel: every launched thread prints one greeting line containing the
// x-index of its block and its own x-index within that block.
__global__ void helloWorldKernel() {
    printf("Hello World from block %d, thread %d\n",
           static_cast<int>(blockIdx.x),
           static_cast<int>(threadIdx.x));
}

// Prints a diagnostic to stderr when a CUDA runtime call did not succeed.
// `what` names the call for the message.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Launches helloWorldKernel and waits for it to complete.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main() {
    // Launch the kernel with 5 blocks and 10 threads per block
    // NOTE(review): the exercise heading asks for a single block; this launch
    // uses 5 blocks - confirm which configuration is intended.
    helloWorldKernel<<<5, 10>>>(); // 5 blocks, 10 threads per block
    // A kernel launch returns no status; an invalid launch is reported here.
    if (!cudaSucceeded(cudaGetLastError(), "helloWorldKernel launch")) {
        return 1;
    }
    // Wait for the kernel to finish; execution errors are reported by this call.
    if (!cudaSucceeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) {
        return 1;
    }
    return 0;
}

Overwriting cuda_device_info.cu


In [6]:
!nvcc cuda_device_info.cu -o cuda_device_info


In [7]:
# Step 3: Run the executable
!./cuda_device_info

Hello World from block 4, thread 0
Hello World from block 4, thread 1
Hello World from block 4, thread 2
Hello World from block 4, thread 3
Hello World from block 4, thread 4
Hello World from block 4, thread 5
Hello World from block 4, thread 6
Hello World from block 4, thread 7
Hello World from block 4, thread 8
Hello World from block 4, thread 9
Hello World from block 1, thread 0
Hello World from block 1, thread 1
Hello World from block 1, thread 2
Hello World from block 1, thread 3
Hello World from block 1, thread 4
Hello World from block 1, thread 5
Hello World from block 1, thread 6
Hello World from block 1, thread 7
Hello World from block 1, thread 8
Hello World from block 1, thread 9
Hello World from block 3, thread 0
Hello World from block 3, thread 1
Hello World from block 3, thread 2
Hello World from block 3, thread 3
Hello World from block 3, thread 4
Hello World from block 3, thread 5
Hello World from block 3, thread 6
Hello World from block 3, thread 7
Hello World from blo

Program 3: Write a program in which each thread prints its thread ID along with "Hello World". Launch the kernel with multiple blocks and multiple threads.

In [17]:
# Step 1: Write the CUDA code
%%writefile cuda_device_info.cu
#include <stdio.h>
#include <cuda_runtime.h>

#include <stdio.h>

// Kernel: every thread prints a greeting with its index across the whole
// 1D launch (block index * block size + index within the block) and the
// x-index of the block it belongs to.
__global__ void helloWorldKernel() {
    const unsigned int blockStart = blockIdx.x * blockDim.x;
    const int globalIndex = blockStart + threadIdx.x;
    printf("Hello World from thread %d ( block %d)\n", globalIndex, blockIdx.x);
}

// Prints a diagnostic to stderr when a CUDA runtime call did not succeed.
// `what` names the call for the message.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Launches helloWorldKernel on a 1D grid of `numBlocks` blocks with
// `threadsPerBlock` threads each, then waits for it to complete.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main() {
    const int numBlocks = 2; // Change this to the desired number of blocks
    const int threadsPerBlock = 5; // Change this to the desired number of threads per block
    // Launch the kernel with multiple blocks and multiple threads
    helloWorldKernel<<<numBlocks, threadsPerBlock>>>();
    // A kernel launch returns no status; an invalid launch is reported here.
    if (!cudaSucceeded(cudaGetLastError(), "helloWorldKernel launch")) {
        return 1;
    }
    // Wait for the kernel to finish; execution errors are reported by this call.
    if (!cudaSucceeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) {
        return 1;
    }
    return 0;
}

Overwriting cuda_device_info.cu


In [18]:
# Step 2: Compile the code
!nvcc cuda_device_info.cu -o cuda_device_info

In [19]:
# Step 3: Run the executable
!./cuda_device_info

Hello World from thread 5 ( block 1)
Hello World from thread 6 ( block 1)
Hello World from thread 7 ( block 1)
Hello World from thread 8 ( block 1)
Hello World from thread 9 ( block 1)
Hello World from thread 0 ( block 0)
Hello World from thread 1 ( block 0)
Hello World from thread 2 ( block 0)
Hello World from thread 3 ( block 0)
Hello World from thread 4 ( block 0)


Program 4: Write a program in which each thread prints its thread ID along with "Hello World". Launch the kernel with 2D blocks and 2D threads.

In [14]:
# Step 1: Write the CUDA code
%%writefile cuda_device_info.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Kernel: every thread prints a greeting with its (x, y) index inside its
// block followed by the (x, y) index of that block within the grid.
__global__ void helloWorldKernel() {
    const int tx = threadIdx.x, ty = threadIdx.y;
    const int bx = blockIdx.x, by = blockIdx.y;
    printf("Hello World from thread (%d, %d) for block (%d,%d)\n", tx, ty, bx, by);
}

// Prints a diagnostic to stderr when a CUDA runtime call did not succeed.
// `what` names the call for the message.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Launches helloWorldKernel on a 2x2 grid of 2x2-thread blocks, then waits
// for it to complete.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main() {
    dim3 threadsPerBlock(2, 2); // Change this to the desired size of the block
    dim3 numBlocks(2, 2); // Change this to the desired number of blocks
    // Launch the kernel with 2D blocks and 2D threads
    helloWorldKernel<<<numBlocks, threadsPerBlock>>>();
    // A kernel launch returns no status; an invalid launch is reported here.
    if (!cudaSucceeded(cudaGetLastError(), "helloWorldKernel launch")) {
        return 1;
    }
    // Wait for the kernel to finish; execution errors are reported by this call.
    if (!cudaSucceeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) {
        return 1;
    }
    return 0;
}

Overwriting cuda_device_info.cu


In [15]:
# Step 2: Compile the code
!nvcc cuda_device_info.cu -o cuda_device_info

In [16]:
# Step 3: Run the executable
!./cuda_device_info

Hello World from thread (0, 0) for block (1,0)
Hello World from thread (1, 0) for block (1,0)
Hello World from thread (0, 1) for block (1,0)
Hello World from thread (1, 1) for block (1,0)
Hello World from thread (0, 0) for block (1,1)
Hello World from thread (1, 0) for block (1,1)
Hello World from thread (0, 1) for block (1,1)
Hello World from thread (1, 1) for block (1,1)
Hello World from thread (0, 0) for block (0,1)
Hello World from thread (1, 0) for block (0,1)
Hello World from thread (0, 1) for block (0,1)
Hello World from thread (1, 1) for block (0,1)
Hello World from thread (0, 0) for block (0,0)
Hello World from thread (1, 0) for block (0,0)
Hello World from thread (0, 1) for block (0,0)
Hello World from thread (1, 1) for block (0,0)
