<a href="https://colab.research.google.com/github/A2R-Lab/rsLQR-Spring23/blob/main/Bugs_in_CUDA_101.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Make sure your Colab notebook runs on a GPU! Go to the runtime settings and select GPU as the hardware accelerator.
2. When you move any object from RAM to shared memory, don't forget to move it back to RAM after calling the device function.
3. You can use the skeleton below as a starting point for CUDA implementations.

In [None]:
  // Debug dump of every per-timestep matrix/vector and of the F_* factor
  // blocks. Only the thread for which thread_id and block_id are both 0 prints,
  // so the output is emitted once.
  // NOTE(review): the guard is `!DEBUG`, i.e. the dump runs when DEBUG is
  // falsy — confirm this is intended and not an inverted condition.
  if(!DEBUG){
    if(thread_id==0 && block_id ==0) {

      printf("PRINTING ALL DATA AFTER INIT\n");

      // Per-timestep data. The loop counter is unsigned, so it is printed
      // with %u (a %d specifier would mismatch the argument type).
      for(unsigned i = 0; i < nhorizon; i++) {
        // Q_i starts at offset i*cost_step in s_Q_R; R_i follows states_sq
        // elements later in the same slot.
        printf("\nQ%u: \n", i);
        printMatrix(s_Q_R+(i*cost_step),nstates,nstates);

        printf("\nR%u: \n", i);
        printMatrix(s_Q_R+(i*cost_step+states_sq),ninputs,ninputs);

        // q_i and r_i are stored back to back: nstates entries of q_i,
        // then ninputs entries of r_i.
        printf("\nq%u: \n", i);
        printMatrix(s_q_r+(i*(ninputs+nstates)),1,nstates);

        printf("\nr%u: \n", i);
        printMatrix(s_q_r+(i*(ninputs+nstates)+nstates),1,ninputs);

        // A_i starts at offset i*dyn_step in s_A_B; B_i follows states_sq
        // elements later in the same slot.
        printf("\nA%u: \n", i);
        printMatrix(s_A_B+(i*dyn_step),nstates,nstates);

        printf("\nB%u: \n", i);
        printMatrix(s_A_B+(i*dyn_step+states_sq),nstates,ninputs);

        printf("\nd%u: \n", i);
        printMatrix(s_d+i*nstates,1,nstates);
      }

      // F_lambda / F_state / F_input blocks: nhorizon consecutive entries
      // per level, `depth` levels in total.
      for(uint32_t ind = 0; ind < nhorizon * depth ;  ind++) {
        if(ind%nhorizon==0){
          // Start of a new level. The cast pins the argument type to match
          // %u regardless of how nhorizon is declared.
          printf("\nLEVEL %u\n", (unsigned)(ind/nhorizon));
        }

        printf("\nF_lambda[%u]\n", ind);
        printMatrix(s_F_lambda+(ind*states_sq),nstates,nstates);

        printf("\nF_state%u: \n", ind);
        printMatrix(s_F_state+(ind*states_sq),nstates,nstates);

        printf("\nF_input%u: \n", ind);
        printMatrix(s_F_input+ind*inp_states, nstates,ninputs);
      }
    }
  }

In [None]:
%%cuda -n run.cu

#include <stdio.h>
#include <iostream>
#include <cmath>

//DEVICE FUNCTIONS

/**
 * Placeholder device routine of the skeleton.
 *
 * @tparam T    element type of the buffer
 * @param  s_A  buffer to operate on (the kernel in this file passes its
 *              __shared__ staging array)
 *
 * Must be called by every thread of the block: it finishes with a
 * block-wide barrier.
 */
template <typename T>
__device__ void funct_cu(T *s_A)
{
    // do something

    // Block-wide barrier before handing control back to the caller.
    __syncthreads();
}


//GLOBAL/Kernel function

/**
 * Kernel skeleton: stages an N x N matrix from d_A into shared memory, runs
 * the device routine on the staged copy, then writes the result back to d_A.
 *
 * @tparam T    element type of the matrix
 * @tparam N    matrix dimension; defaults to 3 (9 elements, the size the
 *              original shared buffer was hard-coded to), so existing
 *              `funct_Kernel<float>` launches keep working
 * @param  d_A  pointer to at least N*N elements, read and then overwritten
 *
 * Launch layout: indexing uses threadIdx.x / blockDim.x only, so every block
 * would process the same N*N elements — launch with a single 1D block.
 * Shared memory: N*N elements of T, statically allocated.
 */
template <typename T, unsigned N = 3>
__global__ 
void funct_Kernel(T *d_A) {
    // Element count is a compile-time constant so it can size the shared array.
    constexpr unsigned numel = N * N;

    //declare shared memory
    __shared__ T s_A[numel];

    // move RAM memory to shared (block-stride loop: works for any blockDim.x)
    for(unsigned i = threadIdx.x; i < numel; i += blockDim.x){s_A[i] = d_A[i];}

    // All elements must be staged before any thread reads what another
    // thread loaded.
    __syncthreads();

    // call device function
    funct_cu<T>(s_A);

    // Make every thread's shared-memory writes visible before copying out;
    // this keeps the kernel correct even if the device routine does not end
    // with a barrier of its own.
    __syncthreads();

    // move shared to RAM
    for(unsigned i = threadIdx.x; i < numel; i += blockDim.x){d_A[i] = s_A[i];}
}


//main/host function

/**
 * Host driver of the skeleton: uploads a 3x3 matrix, runs funct_Kernel on it
 * with one block of 256 threads, and downloads the result back into A.
 *
 * Every CUDA call is checked; on failure a message is written to stderr.
 * The result is left in A and is not printed by this skeleton.
 *
 * @return 0 on success, 1 if any CUDA call or the kernel failed.
 */
__host__
int main() {
    // Reports a failed CUDA call on stderr; returns true when `status` is an error.
    auto failed = [](cudaError_t status, const char *what) {
        if (status == cudaSuccess) { return false; }
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        return true;
    };

    // Input matrix on the host.
    // N must be a compile-time constant: an array whose bound is a runtime
    // variable cannot take a brace initializer in standard C++.
    constexpr int N = 3;
    float A[N*N] = {6,15,55,15,55,225,55,225,979};
    const size_t bytes = sizeof(A);

    // Allocate memory on the GPU for the matrix (used as input and output)
    float* d_A = nullptr;
    if (failed(cudaMalloc((void**)&d_A, bytes), "cudaMalloc")) {
        return 1;
    }

    // Copy the input matrix from the host to the GPU memory
    bool ok = !failed(cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice),
                      "host-to-device copy");

    if (ok) {
        //Launch the CUDA kernel with appropriate block and grid dimensions
        int blockSize = 256;
        int gridSize = 1;
        funct_Kernel<float><<<gridSize, blockSize>>>(d_A);

        // A launch returns no status: configuration errors show up through
        // cudaGetLastError(), faults inside the kernel at the next sync.
        ok = !failed(cudaGetLastError(), "kernel launch")
          && !failed(cudaDeviceSynchronize(), "kernel execution")
          // Copy the result back from the GPU memory to the host
          && !failed(cudaMemcpy(A, d_A, bytes, cudaMemcpyDeviceToHost),
                     "device-to-host copy");
    }

    // Free the allocated GPU memory on every path taken after a successful cudaMalloc
    if (failed(cudaFree(d_A), "cudaFree")) {
        ok = false;
    }

    return ok ? 0 : 1;
}