In [None]:
%%writefile dot.cu

#include <iostream>         // For standard input/output operations
#include <cuda_runtime.h>   // For CUDA runtime API
#include <device_launch_parameters.h> // For defining CUDA kernel launch parameters
#include <stdlib.h>


#define num_threads 256
// Counts the blocks that have published their partial sum during the current
// launch. The block that finishes last resets it to 0 so the kernel can be
// launched again. Because the counter is shared, launches of dotProduct must
// not overlap each other (e.g. on different streams).
__device__ unsigned int dotProductBlocksDone = 0;

/*
 * Computes the dot product of A and B (N floats each) in a single launch.
 *
 * Each block multiplies its slice element-wise, reduces the products in
 * shared memory and stores the block total in partial_sums[blockIdx.x].
 * The block that finishes last then adds up all block totals and writes the
 * final value to *result.
 *
 * Parameters:
 *   A, B          input vectors, N elements each
 *   partial_sums  scratch buffer with at least gridDim.x elements
 *   result        receives the dot product (written once, by one thread)
 *   N             number of elements in A and B
 *
 * Launch requirements: 1D grid covering N elements, blockDim.x == num_threads
 * (the shared buffer and the reduction loop are sized by num_threads).
 */
__global__ void dotProduct(float *A, float *B, float* partial_sums, float *result, int N) {

  __shared__ float shemm[num_threads];
  __shared__ bool isLastBlock;

  int tid = threadIdx.x;
  int gid = blockIdx.x*blockDim.x + tid;

  // Threads past the end of the input contribute 0 so the reduction below
  // never reads an uninitialized shared-memory slot.
  shemm[tid] = (gid < N) ? A[gid] * B[gid] : 0.0f;

  __syncthreads();

  // Tree reduction in shared memory: halve the active range every step.
  for (int s=num_threads/2; s>0; s>>=1){
    if (tid < s){
      shemm[tid] +=shemm[tid+s];
    }
    __syncthreads();
  }

  if (tid == 0) {
    partial_sums[blockIdx.x] = shemm[0];

    // Make the partial sum visible device-wide BEFORE announcing that this
    // block is done; the fence orders the store ahead of the atomic below.
    __threadfence();

    // The block that observes gridDim.x - 1 earlier arrivals is the last one,
    // so every other partial sum has already been published.
    unsigned int finishedBefore = atomicAdd(&dotProductBlocksDone, 1u);
    isLastBlock = (finishedBefore == gridDim.x - 1);
  }

  // Broadcast isLastBlock from thread 0 to the rest of the block.
  __syncthreads();

  if (isLastBlock && tid == 0) {
    // Read through a volatile pointer so the values written by other blocks
    // are fetched from memory rather than served from a stale cached copy.
    const volatile float *sums = partial_sums;

    float finalsum = 0.0f;
    for (unsigned int i = 0; i < gridDim.x; i++) {
      finalsum += sums[i];
    }
    *result = finalsum;

    // Re-arm the counter for the next launch.
    dotProductBlocksDone = 0;
  }
}

// Reports a failed CUDA runtime call on stderr and terminates the program.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error (" << what << "): "
                  << cudaGetErrorString(status) << std::endl;
        exit(EXIT_FAILURE);
    }
}

// Host driver: fills two vectors with constants, runs the dotProduct kernel
// on them and prints the value the kernel stored in `result`.
int main() {
    int N = 1<<16;
    size_t size =N * sizeof(float);

    // Allocate the input vectors on the host
    float *h_A = (float*)malloc(size);
    float *h_B = (float*)malloc(size);
    if (h_A == NULL || h_B == NULL) {
        std::cerr << "Host allocation failed" << std::endl;
        free(h_A);
        free(h_B);
        return EXIT_FAILURE;
    }
    float finalresult = 0.0f;

    // Fill A with 1 and B with 2, so the exact dot product is 2 * N
    for (int i = 0; i < N; ++i) {
        h_A[i] = 1.0f;
        h_B[i] = 2.0f;
    }

    // Allocate memory on the device
    float *d_A = NULL, *d_B = NULL, *partial_sums = NULL, *result = NULL;
    checkCuda(cudaMalloc((void**)&d_A, size), "cudaMalloc d_A");
    checkCuda(cudaMalloc((void**)&d_B, size), "cudaMalloc d_B");
    checkCuda(cudaMalloc((void**)&result, sizeof(float)), "cudaMalloc result");

    // Copy vectors A and B to device memory
    checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "copy B to device");

    // Round up so a partial final block is still launched when N is not a
    // multiple of num_threads.
    int numBlocks = (N + num_threads - 1) / num_threads;

    // One partial-sum slot per block
    checkCuda(cudaMalloc((void**)&partial_sums, numBlocks * sizeof(float)),
              "cudaMalloc partial_sums");

    // Launch the kernel
    dotProduct<<<numBlocks, num_threads>>>(d_A, d_B, partial_sums, result,N);

    // A bad launch configuration is only reported through cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cout << "CUDA Kernel Launch Error: " << cudaGetErrorString(err) << std::endl;
        return EXIT_FAILURE;
    }

    // Errors raised while the kernel runs surface at the next synchronization.
    checkCuda(cudaDeviceSynchronize(), "dotProduct execution");

    checkCuda(cudaMemcpy(&finalresult, result, sizeof(float), cudaMemcpyDeviceToHost),
              "copy result to host");
    std::cout << finalresult;

    // Free memory
    free(h_A);
    free(h_B);

    checkCuda(cudaFree(d_A), "cudaFree d_A");
    checkCuda(cudaFree(d_B), "cudaFree d_B");
    checkCuda(cudaFree(result), "cudaFree result");
    checkCuda(cudaFree(partial_sums), "cudaFree partial_sums");
    return 0;
}


Overwriting dot.cu


In [None]:
!nvcc -gencode=arch=compute_75,code=sm_75 -o dot dot.cu

In [None]:
%%shell
nvprof ./dot

==1334== NVPROF is profiling process 1334, command: ./dot
131072==1334== Profiling application: ./dot
==1334== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   72.90%  47.520us         2  23.760us  23.744us  23.776us  [CUDA memcpy HtoD]
                   23.91%  15.584us         1  15.584us  15.584us  15.584us  dotProduct(float*, float*, float*, float*, int)
                    3.19%  2.0800us         1  2.0800us  2.0800us  2.0800us  [CUDA memcpy DtoH]
      API calls:   99.67%  175.74ms         4  43.935ms  2.4930us  175.73ms  cudaMalloc
                    0.10%  182.52us         3  60.839us  18.438us  85.338us  cudaMemcpy
                    0.07%  128.88us       114  1.1300us     104ns  52.223us  cuDeviceGetAttribute
                    0.07%  118.88us         4  29.719us  3.1580us  100.38us  cudaFree
                    0.07%  117.10us         1  117.10us  117.10us  117.10us  cudaLaunchKernel
                  



In [None]:
!nvcc --version


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [5]:
%%writefile dot.cu

#include <iostream>         // For standard input/output operations
#include <cuda_runtime.h>   // For CUDA runtime API
#include <device_launch_parameters.h> // For defining CUDA kernel launch parameters
#include <stdlib.h>


#define num_threads 256

/*
 * First stage of the dot product: each block multiplies its slice of A and B
 * element-wise, reduces the products in shared memory and writes the block
 * total to block_sums[blockIdx.x].
 *
 * Parameters:
 *   A, B        input vectors, N elements each
 *   block_sums  output buffer with at least gridDim.x elements
 *   N           number of elements in A and B
 *
 * Launch requirements: 1D grid covering N elements, blockDim.x == num_threads
 * (the shared buffer and the reduction loop are sized by num_threads).
 */
__global__ void dotProduct(float *A, float *B, float* block_sums, int N) {

  __shared__ float shemm[num_threads];

  int tid = threadIdx.x;
  int gid = blockIdx.x*blockDim.x + tid;

  // Threads past the end of the input contribute 0. Without this the tail
  // block would reduce over shared-memory slots that were never written.
  shemm[tid] = (gid < N) ? A[gid] * B[gid] : 0.0f;

  __syncthreads();

  // Tree reduction in shared memory: halve the active range every step.
  for (int s=num_threads/2; s>0; s>>=1){
    if (tid < s){
      shemm[tid] +=shemm[tid+s];
    }
    // Every thread reaches the barrier, including the idle ones.
    __syncthreads();
  }

  // Thread 0 holds the block total after the last step.
  if (tid ==0) {
    block_sums[blockIdx.x] = shemm[0];
  }

}


/*
 * Adds up the N floats in blockSums and stores the total in *result.
 *
 * Every thread first accumulates the elements at positions
 * threadIdx.x, threadIdx.x + num_threads, ... ; the per-thread totals are
 * then combined with a shared-memory tree reduction.
 *
 * The scratch buffer and the reduction are sized by num_threads, so the
 * kernel expects blockDim.x == num_threads. blockIdx is never consulted:
 * each launched block performs the same sum and writes the same location.
 */
__global__ void addValues(float* blockSums, float* result, int N){

  __shared__ float scratch[num_threads];

  const int slot = threadIdx.x;

  // Per-thread strided accumulation over the input.
  float acc = 0;
  int pos = slot;
  while (pos < N) {
    acc += blockSums[pos];
    pos += num_threads;
  }
  scratch[slot] = acc;
  __syncthreads();

  // Pairwise reduction: fold the upper half onto the lower half each round.
  int half = num_threads / 2;
  while (half > 0) {
    if (slot < half) {
      scratch[slot] = scratch[slot] + scratch[slot + half];
    }
    __syncthreads();
    half >>= 1;
  }

  // Only thread 0 publishes the total.
  if (slot != 0) {
    return;
  }
  *result = scratch[0];
}

// Reports a failed CUDA runtime call on stderr and terminates the program.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error (" << what << "): "
                  << cudaGetErrorString(status) << std::endl;
        exit(EXIT_FAILURE);
    }
}

// Host driver: fills two vectors with constants, computes their dot product
// on the GPU in two stages (per-block sums, then a single-block sum of those)
// and prints the result.
int main() {
    int N = 1<<16;
    size_t size =N * sizeof(float);

    // Allocate the input vectors on the host
    float *h_A = (float*)malloc(size);
    float *h_B = (float*)malloc(size);
    if (h_A == NULL || h_B == NULL) {
        std::cerr << "Host allocation failed" << std::endl;
        free(h_A);
        free(h_B);
        return EXIT_FAILURE;
    }
    float finalresult = 0.0f;

    // Fill A with 1 and B with 2, so the exact dot product is 2 * N
    for (int i = 0; i < N; ++i) {
        h_A[i] = 1.0f;
        h_B[i] = 2.0f;
    }

    // Round up so a partial final block is still launched.
    int numBlocks= (N + num_threads -1)/ (num_threads);

    // Allocate all device buffers up front
    float *d_A = NULL, *d_B = NULL, *block_sums = NULL, *result = NULL;
    checkCuda(cudaMalloc((void**)&d_A, size), "cudaMalloc d_A");
    checkCuda(cudaMalloc((void**)&d_B, size), "cudaMalloc d_B");
    checkCuda(cudaMalloc((void**)&block_sums, numBlocks * sizeof(float)),
              "cudaMalloc block_sums");
    checkCuda(cudaMalloc((void**)&result, sizeof(float)), "cudaMalloc result");

    // Copy vectors A and B to device memory
    checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "copy B to device");

    // Stage 1: one partial sum per block
    dotProduct<<<numBlocks, num_threads>>>(d_A, d_B, block_sums,N);

    // A bad launch configuration is only reported through cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cout << "CUDA Kernel Launch Error: " << cudaGetErrorString(err) << std::endl;
        return EXIT_FAILURE;
    }

    // Stage 2: a single block adds up the partial sums. Both kernels are
    // issued to the same (default) stream, so this one starts only after
    // stage 1 has finished; no host-side synchronization is needed in between.
    addValues<<<1, num_threads>>>(block_sums, result, numBlocks);
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cout << "CUDA Kernel Launch Error: " << cudaGetErrorString(err) << std::endl;
        return EXIT_FAILURE;
    }

    // Errors raised while the kernels run surface at the next synchronization.
    checkCuda(cudaDeviceSynchronize(), "kernel execution");

    checkCuda(cudaMemcpy(&finalresult, result, sizeof(float), cudaMemcpyDeviceToHost),
              "copy result to host");
    std::cout << finalresult;

    // Free memory
    free(h_A);
    free(h_B);

    checkCuda(cudaFree(d_A), "cudaFree d_A");
    checkCuda(cudaFree(d_B), "cudaFree d_B");
    checkCuda(cudaFree(result), "cudaFree result");
    checkCuda(cudaFree(block_sums), "cudaFree block_sums");
    return 0;
}


Overwriting dot.cu


In [6]:
!nvcc -gencode=arch=compute_75,code=sm_75 -o dot dot.cu

In [7]:
%%shell
nvprof ./dot

==668== NVPROF is profiling process 668, command: ./dot
131072==668== Profiling application: ./dot
==668== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   73.86%  48.095us         2  24.047us  24.000us  24.095us  [CUDA memcpy HtoD]
                   15.77%  10.272us         1  10.272us  10.272us  10.272us  dotProduct(float*, float*, float*, int)
                    7.13%  4.6400us         1  4.6400us  4.6400us  4.6400us  addValues(float*, float*, int)
                    3.24%  2.1120us         1  2.1120us  2.1120us  2.1120us  [CUDA memcpy DtoH]
      API calls:   98.06%  127.74ms         4  31.936ms  4.3680us  127.72ms  cudaMalloc
                    0.83%  1.0818ms         3  360.62us  19.014us  961.06us  cudaMemcpy
                    0.66%  864.64us         1  864.64us  864.64us  864.64us  cuDeviceGetPCIBusId
                    0.21%  269.89us         2  134.94us  6.4380us  263.45us  cudaLaunchKernel
         

