<a href="https://colab.research.google.com/github/1bharadvaja/ML-Code/blob/master/CUDA_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [11]:
!pip install --quiet "nvcc4jupyter==1.1.0"


In [3]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpdfyy0gr7".


In [14]:
%%writefile hello.cu
#include <stdio.h>
#include <cuda_runtime.h>

// CHECK(call): evaluates a CUDA runtime call once. On any status other than
// cudaSuccess it prints the error string together with the file and line of
// the call site to stderr and makes the *enclosing function* return 1, so it
// may only be used inside functions that return an int-compatible type.
#define CHECK(call) do {                                           \
  const cudaError_t status__ = (call);                             \
  if (status__ == cudaSuccess) break;                              \
  fprintf(stderr, "CUDA error %s at %s:%d\n",                      \
          cudaGetErrorString(status__), __FILE__, __LINE__);       \
  return 1;                                                        \
} while (0)

// Kernel: every launched thread prints one line containing its block index
// and its thread index within that block (x dimension only).
__global__ void hello() {
  const unsigned int block = blockIdx.x;
  const unsigned int thread = threadIdx.x;
  printf("Hello from block %u, thread %u\n", block, thread);
}

// Host entry point: verifies that a CUDA device is visible, launches the
// `hello` kernel on a 2x2 (blocks x threads) configuration and waits for the
// device-side output. Returns 0 on success, 1 on any failure.
int main() {
  int deviceTotal = 0;
  CHECK(cudaGetDeviceCount(&deviceTotal));
  if (deviceTotal == 0) {
    fprintf(stderr, "No CUDA device visible.\n");
    return 1;
  }

  // Enlarging the device printf FIFO is not required for four lines of
  // output; it is kept as a habit for kernels that print more.
  CHECK(cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 8 * 1024 * 1024));

  const dim3 grid(2);
  const dim3 block(2);
  hello<<<grid, block>>>();

  CHECK(cudaGetLastError());        // reports an invalid launch configuration
  CHECK(cudaDeviceSynchronize());   // waits for the kernel; flushes device printf
  CHECK(cudaDeviceReset());         // tears the context down before exiting
  return 0;
}


Writing hello.cu


In [15]:
!nvcc -arch=sm_75 hello.cu -o hello
!./hello


Hello from block 0, thread 0
Hello from block 0, thread 1
Hello from block 1, thread 0
Hello from block 1, thread 1


In [1]:
%%writefile softmax.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <math.h>

// Number of float elements in the input/output vectors.
const int N = 1024 * 32;

// Launch configuration used by main(): blocksPerGrid blocks of
// threadsPerBlock threads each (256 * 128 == N at these values).
const int threadsPerBlock = 256;
const int blocksPerGrid = 128;

// The softmax kernel reduces its shared-memory array by repeatedly halving
// the stride, which only visits every slot when the size is a power of two.
static_assert(threadsPerBlock > 0 &&
              (threadsPerBlock & (threadsPerBlock - 1)) == 0,
              "threadsPerBlock must be a power of two");




/*
 * softmax: for every i in [0, N) writes
 *     res[i] = exp(vec[i] - max(vec)) / sum_j exp(vec[j] - max(vec)).
 *
 * Launch requirements:
 *   - 1-D grid of 1-D blocks; any gridDim.x works.
 *   - blockDim.x must equal threadsPerBlock (a power of two): the static
 *     shared array and both tree reductions are sized by that constant.
 *   - vec and res must each point to at least N floats of device memory.
 *
 * Strategy: shared memory is per block, so there is no cheap way to combine
 * partial results across blocks inside one launch. Instead every block scans
 * the whole input and derives the same global max and normaliser on its own;
 * the blocks then split the N output writes between them.
 */
__global__ void softmax(float *vec, float *res) {
  // One slot per thread of the block, reused for the max and the sum reduction.
  __shared__ float cache[threadsPerBlock];

  float local_max = -INFINITY;
  float local_norm = 0.0f;

  const int tid = threadIdx.x + blockIdx.x * blockDim.x;  // global thread index
  const int cacheIndex = threadIdx.x;                     // slot in this block's cache

  // Online softmax over this thread's share of the input: a running max and a
  // running sum of exp(x - running max), rescaled whenever the max grows.
  // The threads of one block together cover every element of vec.
  for (int i = cacheIndex; i < N; i += blockDim.x) {
    const float x = vec[i];

    if (x > local_max) {
      local_norm *= expf(local_max - x);
      local_max = x;
    }
    local_norm += expf(x - local_max);
  }

  // Tree-reduce the per-thread maxima; the result ends up in cache[0].
  cache[cacheIndex] = local_max;
  __syncthreads();
  for (int stride = threadsPerBlock / 2; stride != 0; stride /= 2) {
    if (cacheIndex < stride) {
      cache[cacheIndex] = fmaxf(cache[cacheIndex], cache[cacheIndex + stride]);
    }
    __syncthreads();
  }
  const float global_max = cache[0];

  // Every thread must have read cache[0] before any thread reuses the array.
  __syncthreads();

  // Re-express this thread's partial sum relative to the global max, then
  // tree-reduce the partial sums the same way.
  local_norm *= expf(local_max - global_max);
  cache[cacheIndex] = local_norm;
  __syncthreads();
  for (int stride = threadsPerBlock / 2; stride != 0; stride /= 2) {
    if (cacheIndex < stride) {
      cache[cacheIndex] += cache[cacheIndex + stride];
    }
    __syncthreads();
  }
  const float global_norm = cache[0];

  // Each thread of the grid writes the outputs at its global index plus
  // multiples of the total thread count, so all N outputs are produced for
  // any grid size.
  const int totalThreads = gridDim.x * blockDim.x;
  for (int i = tid; i < N; i += totalThreads) {
    res[i] = expf(vec[i] - global_max) / global_norm;
  }
}

#ifndef CUDA_CHECK
// CUDA_CHECK(call): evaluates a CUDA runtime call once; on any status other
// than cudaSuccess prints the error string with file/line to stderr and
// terminates the process with EXIT_FAILURE.
#define CUDA_CHECK(call) do {                                      \
  cudaError_t err__ = (call);                                      \
  if (err__ != cudaSuccess) {                                      \
    fprintf(stderr, "CUDA error %s at %s:%d\n",                    \
            cudaGetErrorString(err__), __FILE__, __LINE__);        \
    exit(EXIT_FAILURE);                                            \
  }                                                                \
} while (0)
#endif

// Host driver: fills an N-element vector with pseudo-random values, runs the
// softmax kernel once while timing it with CUDA events, copies the result
// back and prints the elapsed time plus the sum of the outputs (which should
// be close to 1 for a correct softmax). Returns 0 on success, 1 if a host
// allocation fails; CUDA failures terminate via CUDA_CHECK.
int main() {
  const dim3 block_size(threadsPerBlock);
  const dim3 grid_size(blocksPerGrid);
  const size_t bytes = N * sizeof(float);

  // malloc returns void*, which C++ does not convert implicitly.
  float *vec = (float *)malloc(bytes);
  float *res = (float *)malloc(bytes);
  if (vec == NULL || res == NULL) {
    fprintf(stderr, "Host allocation of %zu bytes failed.\n", bytes);
    free(vec);
    free(res);
    return 1;
  }

  // Scale to [0, 1]: raw rand() values are so large and so far apart that
  // exp(x - max) underflows to 0 for almost every element.
  for (int i = 0; i < N; i++) {
    vec[i] = (float)rand() / (float)RAND_MAX;
  }

  float *dev_vec = NULL;
  float *dev_res = NULL;
  CUDA_CHECK(cudaMalloc((void **)&dev_vec, bytes));
  CUDA_CHECK(cudaMalloc((void **)&dev_res, bytes));

  CUDA_CHECK(cudaMemcpy(dev_vec, vec, bytes, cudaMemcpyHostToDevice));

  cudaEvent_t start, stop;
  CUDA_CHECK(cudaEventCreate(&start));
  CUDA_CHECK(cudaEventCreate(&stop));
  float ms = 0.f;

  CUDA_CHECK(cudaEventRecord(start));

  softmax<<<grid_size, block_size>>>(dev_vec, dev_res);
  CUDA_CHECK(cudaGetLastError());  // a bad launch configuration is reported here

  CUDA_CHECK(cudaEventRecord(stop));
  CUDA_CHECK(cudaEventSynchronize(stop));
  CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
  printf(">> Kernel execution time: %f ms\n", ms);

  // Blocking copy; errors raised while the kernel ran also surface here.
  CUDA_CHECK(cudaMemcpy(res, dev_res, bytes, cudaMemcpyDeviceToHost));

  // Accumulate in double so the check itself does not add rounding error.
  double total = 0.0;
  for (int i = 0; i < N; i++) {
    total += res[i];
  }
  printf(">> Sum of softmax outputs: %f (expected ~1.0)\n", total);

  CUDA_CHECK(cudaEventDestroy(start));
  CUDA_CHECK(cudaEventDestroy(stop));
  CUDA_CHECK(cudaFree(dev_vec));
  CUDA_CHECK(cudaFree(dev_res));
  free(vec);
  free(res);

  return 0;
}



Writing softmax.cu
