In [None]:
!nvidia-smi

Tue Nov  5 09:34:45 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
%%writefile device_info.cu
#include <stdlib.h>
#include <stdio.h>

// Print a summary of every CUDA device visible to the runtime.
//
// Exit status: 0 on success (including when no device is present), 1 when a
// CUDA runtime query fails.
int main()
{
        // Initialised so the value is well defined even if the query fails.
        int deviceCount = 0;
        cudaError_t status = cudaGetDeviceCount(&deviceCount);
        if (status == cudaErrorNoDevice)
        {
                // "No device" is reported as an error code; treat it as an empty list.
                deviceCount = 0;
        }
        else if (status != cudaSuccess)
        {
                fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(status));
                return 1;
        }

        if (deviceCount == 0)
        {
                printf("There is no device supporting CUDA\n");
                return 0;
        }

        int exitCode = 0;
        for (int dev = 0; dev < deviceCount; ++dev)
        {
                cudaDeviceProp deviceProp;
                status = cudaGetDeviceProperties(&deviceProp, dev);
                if (status != cudaSuccess)
                {
                        // Skip this device rather than printing indeterminate fields.
                        fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n", dev, cudaGetErrorString(status));
                        exitCode = 1;
                        continue;
                }

                // The device-count banner is printed once, alongside the first device.
                if (dev == 0)
                {
                        if (deviceProp.major < 1)
                        {
                                printf("There is no device supporting CUDA.\n");
                        }
                        else if (deviceCount == 1)
                        {
                                printf("There is 1 device supporting CUDA\n");
                        }
                        else
                        {
                                printf("There are %d devices supporting CUDA\n", deviceCount);
                        }
                }

                printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
                printf("  Major revision number:                         %d\n", deviceProp.major);
                printf("  Minor revision number:                         %d\n", deviceProp.minor);
                // The memory-size fields are size_t, hence %zu rather than %ld.
                printf("  Total amount of global memory:                 %zu bytes\n", deviceProp.totalGlobalMem);
                printf("  Total amount of constant memory:               %zu bytes\n", deviceProp.totalConstMem);
                printf("  Total amount of shared memory per block:       %zu bytes\n", deviceProp.sharedMemPerBlock);
                printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
                printf("  Warp size:                                     %d\n", deviceProp.warpSize);
                printf("  Multiprocessor count:                          %d\n", deviceProp.multiProcessorCount);

                printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
                printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
                printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
                printf("  Maximum memory pitch:                          %zu bytes\n", deviceProp.memPitch);
                printf("  Texture alignment:                             %zu bytes\n", deviceProp.textureAlignment);
                printf("  Clock rate:                                    %d kilohertz\n", deviceProp.clockRate);
        }

        return exitCode;
}

Writing device_info.cu


In [None]:
! nvcc device_info.cu -o device_info

In [None]:
ls

[0m[01;32mdevice_info[0m*  device_info.cu  [01;34msample_data[0m/


In [None]:
!./device_info

There is 1 device supporting CUDA

Device 0: "Tesla T4"
  Major revision number:                         7
  Minor revision number:                         5
  Total amount of global memory:                 15835660288 bytes
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Multiprocessor count:                          40
  Maximum number of threads per block:           1024
  Maximum sizes of each dimension of a block:    1024 x 1024 x 64
  Maximum sizes of each dimension of a grid:     2147483647 x 65535 x 65535
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Clock rate:                                    1590000 kilohertz


In [None]:
%%writefile HW.cu
#include <stdio.h>

// Kernel: every GPU thread that executes it prints one greeting line through
// device-side printf, so the number of lines printed equals the number of
// threads in the launch.
__global__ void helloFromGPU()
{
    printf("Hello, World from GPU!\n");
}

// Print a greeting from the host, then launch helloFromGPU so each GPU thread
// prints its own greeting. Returns 0 on success, 1 if the launch or the
// kernel execution reports a CUDA error.
int main()
{
    // Print "Hello, World!" from the CPU
    printf("Hello, World from CPU!\n");

    // Launch 4 blocks of 4 threads each: 16 threads, hence 16 GPU greetings.
    helloFromGPU<<<4, 4>>>();

    // A kernel launch returns nothing; launch failures are picked up here.
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess)
    {
        fprintf(stderr, "helloFromGPU launch failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Wait for the kernel to finish so that all device-side printf output is
    // flushed before the program exits; also surfaces execution errors.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess)
    {
        fprintf(stderr, "helloFromGPU execution failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    return 0;
}


Writing HW.cu


In [None]:
! nvcc HW.cu -o HW

In [None]:
!./HW

Hello, World from CPU!
Hello, World from GPU!
Hello, World from GPU!
Hello, World from GPU!
Hello, World from GPU!
Hello, World from GPU!
Hello, World from GPU!
Hello, World from GPU!
Hello, World from GPU!
Hello, World from GPU!
Hello, World from GPU!
Hello, World from GPU!
Hello, World from GPU!
Hello, World from GPU!
Hello, World from GPU!
Hello, World from GPU!
Hello, World from GPU!


In [None]:
%%writefile grid_block_check.cu
#include <stdio.h>

// CUDA runtime
#include <cuda_runtime.h>

// Kernel: each thread prints its linearised block id, its linearised global
// thread id, and its 3-D thread/block coordinates.
//
// Linearisation is x-fastest, then y, then z, for both blocks within the grid
// and threads within a block.
__global__ void hello()
{
    // Block position flattened over the grid (x varies fastest).
    const int blockLinear =
        (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;

    // Number of threads contained in one block.
    const int blockVolume = blockDim.x * blockDim.y * blockDim.z;

    // Thread position flattened inside its own block (x varies fastest).
    const unsigned int threadInBlock =
        (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;

    // Global id: all threads of the preceding blocks plus the local offset.
    const int threadLinear = blockLinear * blockVolume + threadInBlock;

    printf("Global BID : %d| Global TID = %d |I am thread (%d, %d, %d) of block (%d, %d, %d) in the grid\n",
           blockLinear, threadLinear,
           threadIdx.x, threadIdx.y, threadIdx.z,
           blockIdx.x, blockIdx.y, blockIdx.z);
}


// Host helper: print the x/y/z extents of a launch configuration, first the
// grid (in blocks) and then the block (in threads).
void printDims(dim3 gridDim, dim3 blockDim)
{
    const dim3 &grid = gridDim;
    const dim3 &block = blockDim;

    printf("Grid Dimensions : {%d, %d, %d} blocks. \n", grid.x, grid.y, grid.z);
    printf("Block Dimensions : {%d, %d, %d} threads.\n", block.x, block.y, block.z);
}

// Launch the hello kernel on a 2x2 grid of 2x2 blocks (16 threads) after
// printing the launch configuration. Returns 0 on success, 1 if the launch or
// the kernel execution reports a CUDA error.
int main(int argc, char **argv) {
    // No command-line options are consumed.
    (void)argc;
    (void)argv;

    // Named so they do not shadow the CUDA built-ins gridDim / blockDim.
    dim3 gridShape(2, 2);    // 2 blocks in x and in y; z defaults to 1
    dim3 blockShape(2, 2);   // 4 threads per block: 2 in x, 2 in y; z defaults to 1

    printDims(gridShape, blockShape);

    printf("From each thread:\n");
    hello<<<gridShape, blockShape>>>();

    // A kernel launch returns nothing; launch failures are picked up here.
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "hello launch failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Needed so the printfs issued inside the kernel are flushed before exit;
    // also surfaces errors raised while the kernel was running.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        fprintf(stderr, "hello execution failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    return 0;
}

Writing grid_block_check.cu


In [None]:
! nvcc grid_block_check.cu -o grid_block_check

In [None]:
!./grid_block_check

Grid Dimensions : {2, 2, 1} blocks. 
Block Dimensions : {2, 2, 1} threads.
From each thread:
Global BID : 2| Global TID = 8 |I am thread (0, 0, 0) of block (0, 1, 0) in the grid
Global BID : 2| Global TID = 9 |I am thread (1, 0, 0) of block (0, 1, 0) in the grid
Global BID : 2| Global TID = 10 |I am thread (0, 1, 0) of block (0, 1, 0) in the grid
Global BID : 2| Global TID = 11 |I am thread (1, 1, 0) of block (0, 1, 0) in the grid
Global BID : 0| Global TID = 0 |I am thread (0, 0, 0) of block (0, 0, 0) in the grid
Global BID : 0| Global TID = 1 |I am thread (1, 0, 0) of block (0, 0, 0) in the grid
Global BID : 0| Global TID = 2 |I am thread (0, 1, 0) of block (0, 0, 0) in the grid
Global BID : 0| Global TID = 3 |I am thread (1, 1, 0) of block (0, 0, 0) in the grid
Global BID : 3| Global TID = 12 |I am thread (0, 0, 0) of block (1, 1, 0) in the grid
Global BID : 3| Global TID = 13 |I am thread (1, 0, 0) of block (1, 1, 0) in the grid
Global BID : 3| Global TID = 14 |I am thread (0, 1, 0

In [None]:
%%writefile sm_id.cu
#include <stdio.h>
#include <stdint.h>

// Return the value of the PTX special register %smid for the calling thread
// (the identifier of the multiprocessor it is currently running on, per the
// PTX ISA documentation). The register is read with inline PTX because CUDA
// C++ exposes no built-in variable for it.
static __device__ __inline__ uint32_t __mysmid()
{
  uint32_t value;
  asm volatile("mov.u32 %0, %%smid;" : "=r"(value));
  return value;
}

// Return the value of the PTX special register %warpid for the calling thread
// (the warp's identifier within its block, per the PTX ISA documentation),
// read with inline PTX.
static __device__ __inline__ uint32_t __mywarpid()
{
  uint32_t value;
  asm volatile("mov.u32 %0, %%warpid;" : "=r"(value));
  return value;
}

// Return the value of the PTX special register %laneid for the calling thread
// (the thread's lane within its warp, per the PTX ISA documentation), read
// with inline PTX.
static __device__ __inline__ uint32_t __mylaneid()
{
  uint32_t value;
  asm volatile("mov.u32 %0, %%laneid;" : "=r"(value));
  return value;
}


// Kernel: each thread prints its flat x-index in the grid together with the
// SM id, warp id and lane id obtained from the inline-PTX helpers.
// Only the x dimension is used, so this is meant for 1-D launches.
__global__ void mykernel()
{
  const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;

  const uint32_t sm = __mysmid();
  const uint32_t warp = __mywarpid();
  const uint32_t lane = __mylaneid();

  printf("I am thread %d | my SM ID is %d | my warp ID is %d | and my warp lane is %d\n", globalIdx, sm, warp, lane);
}

// Launch mykernel with 41 blocks of 2 threads so every thread reports where
// it is running. Returns 0 on success, 1 if the launch or the kernel
// execution reports a CUDA error.
int main(){

  // NOTE(review): 41 blocks is presumably one more than the 40
  // multiprocessors reported for the T4 used here, so that at least one SM
  // hosts two blocks - confirm the intent.
  mykernel<<<41,2>>>();

  // A kernel launch returns nothing; launch failures are picked up here.
  cudaError_t status = cudaGetLastError();
  if (status != cudaSuccess) {
    fprintf(stderr, "mykernel launch failed: %s\n", cudaGetErrorString(status));
    return 1;
  }

  // Wait for completion so the device-side printf output is flushed, and
  // surface any error raised while the kernel was running.
  status = cudaDeviceSynchronize();
  if (status != cudaSuccess) {
    fprintf(stderr, "mykernel execution failed: %s\n", cudaGetErrorString(status));
    return 1;
  }
  return 0;
}

Writing sm_id.cu


In [None]:
! nvcc sm_id.cu -o sm_id

In [None]:
!./sm_id

I am thread 34 | my SM ID is 34 | my warp ID is 0 | and my warp lane is 0
I am thread 35 | my SM ID is 34 | my warp ID is 0 | and my warp lane is 1
I am thread 74 | my SM ID is 35 | my warp ID is 0 | and my warp lane is 0
I am thread 75 | my SM ID is 35 | my warp ID is 0 | and my warp lane is 1
I am thread 24 | my SM ID is 24 | my warp ID is 0 | and my warp lane is 0
I am thread 25 | my SM ID is 24 | my warp ID is 0 | and my warp lane is 1
I am thread 64 | my SM ID is 25 | my warp ID is 0 | and my warp lane is 0
I am thread 65 | my SM ID is 25 | my warp ID is 0 | and my warp lane is 1
I am thread 38 | my SM ID is 38 | my warp ID is 0 | and my warp lane is 0
I am thread 39 | my SM ID is 38 | my warp ID is 0 | and my warp lane is 1
I am thread 78 | my SM ID is 39 | my warp ID is 0 | and my warp lane is 0
I am thread 79 | my SM ID is 39 | my warp ID is 0 | and my warp lane is 1
I am thread 14 | my SM ID is 14 | my warp ID is 0 | and my warp lane is 0
I am thread 15 | my SM ID is 14 | my w

In [None]:
%%writefile matrix_mul.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define MATRIX_SIZE 25000

// Straightforward matrix product C = A * B for square width x width matrices
// stored row-major. One thread produces one element of C and reads its
// operands directly through the A and B pointers (no shared-memory staging).
//
//   A, B  : input matrices, width * width floats each
//   C     : output matrix, width * width floats
//   width : number of rows (and columns) of each matrix
__global__ void matrixMultiply(float *A, float *B, float *C, int width) {
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;

    // The grid may overhang the matrix edge; those threads have no work.
    if (row >= width || col >= width) {
        return;
    }

    // Start of this thread's row of A.
    const float *rowOfA = A + row * width;

    // Dot product of row `row` of A with column `col` of B.
    float acc = 0.0f;
    int k = 0;
    while (k < width) {
        acc += rowOfA[k] * B[k * width + col];
        ++k;
    }

    C[row * width + col] = acc;
}

// Report a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Multiply two MATRIX_SIZE x MATRIX_SIZE matrices (all 1.0 and all 2.0) on the
// GPU with the matrixMultiply kernel and print one element of the result.
// Returns 0 on success, 1 if any allocation, copy or the kernel fails.
int main() {
    const int width = MATRIX_SIZE;
    // Widen before multiplying so the element and byte counts are computed
    // in size_t rather than int.
    const size_t count = (size_t)width * (size_t)width;
    const size_t size = count * sizeof(float);

    // Host matrices and result
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);

    // Device matrices and result; NULL so cleanup is safe on every path.
    float *d_A = NULL;
    float *d_B = NULL;
    float *d_C = NULL;

    bool ok = (h_A != NULL) && (h_B != NULL) && (h_C != NULL);
    if (!ok) {
        fprintf(stderr, "Host allocation of 3 x %zu bytes failed\n", size);
    } else {
        // Initialize matrices A and B
        for (size_t i = 0; i < count; ++i) {
            h_A[i] = 1.0; // Replace with your initialization
            h_B[i] = 2.0; // Replace with your initialization
        }
    }

    // Allocate device buffers and upload the inputs; the chain stops at the
    // first failure.
    ok = ok
        && cudaOk(cudaMalloc(&d_A, size), "cudaMalloc(d_A)")
        && cudaOk(cudaMalloc(&d_B, size), "cudaMalloc(d_B)")
        && cudaOk(cudaMalloc(&d_C, size), "cudaMalloc(d_C)")
        && cudaOk(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "cudaMemcpy(A to device)")
        && cudaOk(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "cudaMemcpy(B to device)");

    // Define grid and block dimensions; the grid is rounded up so it covers
    // the whole matrix when width is not a multiple of the block edge.
    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((width + threadsPerBlock.x - 1) / threadsPerBlock.x, (width + threadsPerBlock.y - 1) / threadsPerBlock.y);

    if (ok) {
        // Launch kernel for matrix multiplication
        matrixMultiply<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, width);
        printf("Grid dimensions: (%d, %d, %d)\n", blocksPerGrid.x, blocksPerGrid.y, blocksPerGrid.z);
        printf("Threads dimensions : (%d, %d, %d)\n",threadsPerBlock.x,threadsPerBlock.y,threadsPerBlock.z);

        // Launch errors come from cudaGetLastError(); errors raised while the
        // kernel runs are reported by the copy below, which waits for it.
        ok = cudaOk(cudaGetLastError(), "matrixMultiply launch")
            && cudaOk(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "cudaMemcpy(C to host)");
    }

    if (ok) {
        // Verify results (print some elements if needed)
        printf("Sample result: C[0][0] = %f\n", h_C[0]);
    }

    // Free memory (free(NULL) and cudaFree(NULL) are both no-ops)
    free(h_A);
    free(h_B);
    free(h_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return ok ? 0 : 1;
}

Writing matrix_mul.cu


In [None]:
! nvcc matrix_mul.cu -o matrix_mul

In [None]:
!./matrix_mul

Grid dimensions: (1563, 1563, 1)
Threads dimensions : (16, 16, 1)
Sample result: C[0][0] = 50000.000000


In [None]:
%%writefile matrix_mul_shared.cu
 #include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define MATRIX_SIZE 25000
#define TILE_SIZE 16 // Assuming a square tile size for simplicity

// Tiled matrix product C = A * B for square width x width matrices stored
// row-major, staging TILE_SIZE x TILE_SIZE tiles of A and B in shared memory.
//
//   A, B  : input matrices, width * width floats each
//   C     : output matrix, width * width floats
//   width : number of rows (and columns); need not be a multiple of TILE_SIZE
//
// Must be launched with blockDim = (TILE_SIZE, TILE_SIZE): the tiles are
// indexed by threadIdx and each thread loads exactly one element per tile.
__global__ void matrixMultiply(float *A, float *B, float *C, int width) {
    // Allocate shared memory for tiles of matrices A and B
    __shared__ float tileA[TILE_SIZE][TILE_SIZE];
    __shared__ float tileB[TILE_SIZE][TILE_SIZE];

    // Calculate global row and column indices
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    float sum = 0.0f;

    // Round up so the trailing partial tile is included when width is not a
    // multiple of TILE_SIZE; truncating here drops the last terms of the sum.
    int numTiles = (width + TILE_SIZE - 1) / TILE_SIZE;

    // Iterate over tiles
    for (int tileIdx = 0; tileIdx < numTiles; ++tileIdx) {
        // Column of A / row of B that this thread stages for the current tile.
        int aCol = tileIdx * TILE_SIZE + threadIdx.x;
        int bRow = tileIdx * TILE_SIZE + threadIdx.y;

        // Load tiles into shared memory. Positions outside the matrix are
        // filled with zero: they add nothing to the sum and avoid reading
        // past the end of A or B.
        tileA[threadIdx.y][threadIdx.x] =
            (row < width && aCol < width) ? A[(size_t)row * width + aCol] : 0.0f;
        tileB[threadIdx.y][threadIdx.x] =
            (bRow < width && col < width) ? B[(size_t)bRow * width + col] : 0.0f;

        // Synchronize threads to ensure all data is loaded
        __syncthreads();

        // Compute partial sum for the tile
        for (int k = 0; k < TILE_SIZE; ++k) {
            sum += tileA[threadIdx.y][k] * tileB[k][threadIdx.x];
        }

        // Synchronize threads before loading the next tile
        __syncthreads();
    }

    // Write result to matrix C. Out-of-range threads still take part in the
    // loads and barriers above, but must not store.
    if (row < width && col < width) {
        C[(size_t)row * width + col] = sum;
    }
}

// Report a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Multiply two MATRIX_SIZE x MATRIX_SIZE matrices (all 1.0 and all 2.0) on the
// GPU with the shared-memory matrixMultiply kernel and print one element of
// the result. Returns 0 on success, 1 if any allocation, copy or the kernel
// fails.
int main() {
    const int width = MATRIX_SIZE;
    // Widen before multiplying so the element and byte counts are computed
    // in size_t rather than int.
    const size_t count = (size_t)width * (size_t)width;
    const size_t size = count * sizeof(float);

    // Host matrices and result
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);

    // Device matrices and result; NULL so cleanup is safe on every path.
    float *d_A = NULL;
    float *d_B = NULL;
    float *d_C = NULL;

    bool ok = (h_A != NULL) && (h_B != NULL) && (h_C != NULL);
    if (!ok) {
        fprintf(stderr, "Host allocation of 3 x %zu bytes failed\n", size);
    } else {
        // Initialize matrices A and B
        for (size_t i = 0; i < count; ++i) {
            h_A[i] = 1.0; // Replace with your initialization
            h_B[i] = 2.0; // Replace with your initialization
        }
    }

    // Allocate device buffers and upload the inputs; the chain stops at the
    // first failure.
    ok = ok
        && cudaOk(cudaMalloc(&d_A, size), "cudaMalloc(d_A)")
        && cudaOk(cudaMalloc(&d_B, size), "cudaMalloc(d_B)")
        && cudaOk(cudaMalloc(&d_C, size), "cudaMalloc(d_C)")
        && cudaOk(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "cudaMemcpy(A to device)")
        && cudaOk(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "cudaMemcpy(B to device)");

    // Define grid and block dimensions. The block edge equals TILE_SIZE, and
    // the grid is rounded up so it covers the whole matrix when width is not
    // a multiple of TILE_SIZE.
    dim3 threadsPerBlock(TILE_SIZE, TILE_SIZE);
    dim3 blocksPerGrid((width + threadsPerBlock.x - 1) / threadsPerBlock.x, (width + threadsPerBlock.y - 1) / threadsPerBlock.y);

    if (ok) {
        // Launch kernel for matrix multiplication
        matrixMultiply<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, width);
        printf("Grid dimensions: (%d, %d, %d)\n", blocksPerGrid.x, blocksPerGrid.y, blocksPerGrid.z);
        printf("Threads dimensions : (%d, %d, %d)\n",threadsPerBlock.x,threadsPerBlock.y,threadsPerBlock.z);

        // Launch errors come from cudaGetLastError(); errors raised while the
        // kernel runs are reported by the copy below, which waits for it.
        ok = cudaOk(cudaGetLastError(), "matrixMultiply launch")
            && cudaOk(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "cudaMemcpy(C to host)");
    }

    if (ok) {
        // Verify results (print some elements if needed)
        printf("Sample result: C[0][0] = %f\n", h_C[0]);
    }

    // Free memory (free(NULL) and cudaFree(NULL) are both no-ops)
    free(h_A);
    free(h_B);
    free(h_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return ok ? 0 : 1;
}

Writing matrix_mul_shared.cu


In [None]:
! nvcc matrix_mul_shared.cu -o matrix_mul_shared

In [None]:
! time ./matrix_mul_shared

Grid dimensions: (1563, 1563, 1)
Threads dimensions : (16, 16, 1)
Sample result: C[0][0] = 49984.000000

real	0m52.134s
user	0m47.594s
sys	0m4.222s
