# cuBLAS example with C++ kernel (no CUDA mode)

This notebook is based on the NVIDIA CUDA sample `simpleCUBLAS`.

In [None]:
#pragma cling(add_include_path "/usr/local/cuda/include")
#pragma cling(add_library_path "/usr/local/lib")

#include <iostream>
#include <random>

#include <cuda_runtime.h>
#include <cublas_v2.h>

#pragma cling(load "cuda.so")
#pragma cling(load "libcudart.so")
#pragma cling(load "libcublas.so")

## Check functions

In [None]:
// Reports a failed CUDA runtime call on stderr: first the numeric error code,
// then the message returned by cudaGetErrorString. Successful calls print
// nothing. The function only logs; it does not abort or throw.
inline void cuCheck(cudaError_t code) {
  if (code == cudaSuccess) {
    return;
  }
  std::cerr << "Error code: " << code << std::endl;
  std::cerr << cudaGetErrorString(code) << std::endl;
}

In [None]:
// Overload for cuBLAS calls: prints the numeric status code on stderr when the
// status is anything other than CUBLAS_STATUS_SUCCESS. Only logs; execution
// continues afterwards.
inline void cuCheck(cublasStatus_t code) {
  if (code == CUBLAS_STATUS_SUCCESS) {
    return;
  }
  std::cerr << "CUBLAS Error code: " << code << std::endl;
}

## Initialize variables

In [None]:
// Edge length of the square matrices; each matrix holds dim * dim floats
int dim = 1024;

// Host-side matrix buffers (allocated in a later cell)
float *h_A;
float *h_B;
float *h_C;
// Device-side matrix buffers, null until cudaMalloc fills them in
float *d_A = nullptr;
float *d_B = nullptr;
float *d_C = nullptr;

// Scalars of the GEMM operation C = alpha * A * B + beta * C
float alpha = 1.0f;
float beta = 0.0f;

// cuBLAS library context (created in a later cell)
cublasHandle_t handle;

// Mersenne Twister seeded from std::random_device, drawing values from [0, dim)
std::random_device dev;
std::mt19937 rng(dev());
std::uniform_real_distribution<> dist(0.f, static_cast<float>(dim));

In [None]:
// allocate host memory
h_A = new float[dim * dim];
h_B = new float[dim * dim];
h_C = new float[dim * dim];

// allocate device memory
cuCheck(cudaMalloc((void **)&d_A, dim * dim * sizeof(d_A[0])));
cuCheck(cudaMalloc((void **)&d_B, dim * dim * sizeof(d_A[0])));
cuCheck(cudaMalloc((void **)&d_C, dim * dim * sizeof(d_A[0])));

// initial matrices with random numbers 
for(int i = 0; i < dim*dim; ++i){
    h_A[i] = dist(rng);
    h_B[i] = dist(rng);
    h_C[i] = dist(rng);
}

cuCheck(cublasCreate(&handle));

## Copy Memory and run cuBLAS

In [None]:
// Upload the three host matrices to the device; each matrix is transferred as
// one flat vector of dim * dim floats with unit stride on both sides
cuCheck(cublasSetVector(dim * dim, sizeof(float), h_A, 1, d_A, 1));
cuCheck(cublasSetVector(dim * dim, sizeof(float), h_B, 1, d_B, 1));
cuCheck(cublasSetVector(dim * dim, sizeof(float), h_C, 1, d_C, 1));

In [None]:
// Single-precision GEMM on the device: d_C = alpha * d_A * d_B + beta * d_C.
// Neither operand is transposed; all matrices are dim x dim with leading
// dimension dim.
cuCheck(cublasSgemm(handle,
                    CUBLAS_OP_N, CUBLAS_OP_N,
                    dim, dim, dim,
                    &alpha,
                    d_A, dim,
                    d_B, dim,
                    &beta,
                    d_C, dim));

In [None]:
// Download the GEMM result from the device, overwriting the host copy of C
cuCheck(cublasGetVector(dim * dim, sizeof(float), d_C, 1, h_C, 1));

## Verify result

In [None]:
// Host buffer for the CPU reference result. It is value-initialized (all
// zeros) because the reference GEMM scales the existing contents of this
// buffer by beta; leaving it uninitialized would make that an indeterminate
// read and could inject NaN/Inf into the reference.
float *h_C_ref = new float[dim*dim]();
// Accumulators for the verification step: norm of the difference, norm of the
// reference, and a scratch variable for the per-element difference
float error_norm = 0.f;
float ref_norm = 0.f;
float diff = 0.f;

In [None]:
// Host reference implementation of single-precision GEMM on square matrices:
//
//     C = alpha * A * B + beta * C
//
// All matrices are n x n and stored column-major: element (row i, column j)
// lives at index j * n + i.
//
// Parameters:
//   n     - number of rows/columns of A, B and C (n <= 0 performs no work)
//   alpha - scale factor applied to the product A * B
//   A, B  - input matrices, n * n elements each
//   beta  - scale factor applied to the previous contents of C
//   C     - in/out matrix, n * n elements; overwritten with the result
//
// When beta is exactly zero the previous contents of C are not read at all.
// Otherwise a NaN or Inf left in an uninitialized C buffer would survive the
// multiplication by zero and corrupt the reference result.
void simple_sgemm(int n, float alpha, const float *A, const float *B,
                         float beta, float *C)
{
    const bool use_previous_c = (beta != 0.0f);

    for (int i = 0; i < n; ++i)
    {
        for (int j = 0; j < n; ++j)
        {
            // Dot product of row i of A with column j of B
            float prod = 0;

            for (int k = 0; k < n; ++k)
            {
                prod += A[k * n + i] * B[j * n + k];
            }

            float result = alpha * prod;

            if (use_previous_c)
            {
                result += beta * C[j * n + i];
            }

            C[j * n + i] = result;
        }
    }
}


In [None]:
// Compute the CPU reference result into h_C_ref from the same host inputs
// (h_A, h_B) and the same alpha/beta that were passed to cublasSgemm
simple_sgemm(dim, alpha, h_A, h_B, beta, h_C_ref);

In [None]:
// Compare the cuBLAS result (h_C) against the CPU reference (h_C_ref) using
// the relative error  ||h_C_ref - h_C|| / ||h_C_ref||  (Euclidean norms over
// all dim * dim elements).

// Reset the accumulators so that re-running this cell starts from zero
// instead of adding to the norms left over from a previous run
error_norm = 0.f;
ref_norm = 0.f;

for (int i = 0; i < dim*dim; ++i)
{
    diff = h_C_ref[i] - h_C[i];
    error_norm += diff * diff;
    ref_norm += h_C_ref[i] * h_C_ref[i];
}

error_norm = (float)sqrt((double)error_norm);
ref_norm = (float)sqrt((double)ref_norm);

if (fabs(ref_norm) < 1e-7)
{
    // A (near-)zero reference norm makes the relative error meaningless;
    // report it and fail explicitly instead of dividing by zero
    std::cerr << "reference norm is 0" << std::endl;
    std::cout << "cuBLAS test failed" << std::endl;
}
else if (error_norm / ref_norm < 1e-6f)
{
    std::cout << "cuBLAS test passed" << std::endl;
}
else
{
    std::cout << "cuBLAS test failed" << std::endl;
}

## Clean up

In [None]:
free(h_A);
free(h_B);
free(h_C);
free(h_C_ref);

cuCheck(cudaFree(d_A));
cuCheck(cudaFree(d_B));
cuCheck(cudaFree(d_C));

cublasDestroy(handle);