<a href="https://colab.research.google.com/github/Aditya-11/High_Speed_Solver/blob/main/cuda_mult_bench.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

CUDA multiplication benchmark

In [None]:
# Print the version banner of the nvcc compiler driver.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


In [None]:
# Show which nvcc executable is resolved from PATH.
!which nvcc

/usr/local/cuda/bin/nvcc


In [None]:
%%script bash
# Step 1: compile kernel.cu to an object file with nvcc (-c = compile only).
# Step 2: build the executable `program` from the C++ source with g++, using
#         the CUDA headers and libraries under /usr/local/cuda.
# NOTE(review): the g++ command does not list the object produced by step 1,
# and the libraries precede the source file on the command line - confirm
# whether the object should be linked in and the -l flags moved to the end.
nvcc -c kernel.cu
g++ -o program -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcuda -lcudart vector_additon.cpp

gcc: error: kernel.cu: No such file or directory
gcc: fatal error: no input files
compilation terminated.
g++: error: vector_additon.cpp: No such file or directory


In [None]:
# Remove the cuda package and everything matching nvidia* / libnvidia-*,
# including configuration files (--purge).
!apt-get --purge remove cuda nvidia* libnvidia-*
# Purge every remaining package whose `dpkg -l` line matches "cuda-".
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -n1 dpkg --purge
!apt-get remove cuda-*
# Drop dependencies that are no longer needed, then refresh the package index.
!apt autoremove
!apt-get update

Reading package lists... Done
Building dependency tree       
Reading state information... Done
Note, selecting 'nvidia-kernel-common-418-server' for glob 'nvidia*'
Note, selecting 'nvidia-325-updates' for glob 'nvidia*'
Note, selecting 'nvidia-346-updates' for glob 'nvidia*'
Note, selecting 'nvidia-driver-binary' for glob 'nvidia*'
Note, selecting 'nvidia-331-dev' for glob 'nvidia*'
Note, selecting 'nvidia-304-updates-dev' for glob 'nvidia*'
Note, selecting 'nvidia-compute-utils-418-server' for glob 'nvidia*'
Note, selecting 'nvidia-384-dev' for glob 'nvidia*'
Note, selecting 'nvidia-libopencl1-346-updates' for glob 'nvidia*'
Note, selecting 'nvidia-driver-440-server' for glob 'nvidia*'
Note, selecting 'nvidia-340-updates-uvm' for glob 'nvidia*'
Note, selecting 'nvidia-dkms-450-server' for glob 'nvidia*'
Note, selecting 'nvidia-kernel-common' for glob 'nvidia*'
Note, selecting 'nvidia-kernel-source-440-server' for glob 'nvidia*'
Note, selecting 'nvidia-331-updates-uvm' for glob 'nvidi

In [None]:
# Download the CUDA 9.2 local-repository package (Ubuntu 16.04 build) and save it as a .deb.
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
# Register the local repository and its signing key with apt.
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
# Refresh the package index and install the cuda-9.2 package.
!apt-get update
!apt-get install cuda-9.2

In [None]:
# Install the nvcc4jupyter plugin straight from GitHub.
# HTTPS is used because GitHub no longer serves the unauthenticated git:// protocol.
# NOTE(review): newer releases of the plugin may expose a different extension
# name than `nvcc_plugin` - confirm against the version that gets installed.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

In [None]:
# Load the nvcc_plugin IPython extension.
# NOTE(review): presumably this is what registers the %%cu cell magic used by the following cells - verify.
%load_ext nvcc_plugin

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
// Device kernel: stores the sum of the two integers pointed to by `a` and `b`
// into the integer pointed to by `c`. Every thread that runs it performs the
// same single addition.
__global__ void add(int *a, int *b, int *c) {
    const int lhs = *a;
    const int rhs = *b;
    *c = lhs + rhs;
}
// Adds 3 and 5 on the GPU with a single-thread launch of add() and prints the
// result. Returns 0 on success and 1 if any CUDA call fails.
int main() {
    // Host copies of the operands and of the result.
    int a = 3;
    int b = 5;
    int c = 0;
    // Device copies; NULL-initialised so the cleanup at the end is safe even
    // when an allocation fails part-way through.
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    const int size = sizeof(int);

    // `step` names the stage being executed so a failure can be reported
    // precisely; every stage runs only if all previous ones succeeded.
    const char *step = "allocating device memory";
    cudaError_t err = cudaMalloc((void **)&d_a, size);
    if (err == cudaSuccess) err = cudaMalloc((void **)&d_b, size);
    if (err == cudaSuccess) err = cudaMalloc((void **)&d_c, size);

    if (err == cudaSuccess) {
        step = "copying to Device";
        err = cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        err = cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);
    }

    if (err == cudaSuccess) {
        step = "launching kernel";
        add<<<1,1>>>(d_a, d_b, d_c);
        // A launch does not return a status; fetch it explicitly.
        err = cudaGetLastError();
    }

    if (err == cudaSuccess) {
        step = "copying to Host";
        err = cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);
    }

    if (err != cudaSuccess) {
        printf("CUDA error %s: %s\n", step, cudaGetErrorString(err));
    } else {
        printf("result is %d\n", c);
    }

    // Cleanup
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return (err == cudaSuccess) ? 0 : 1;
}

In [None]:
%%cu

// Adding two vectors using parallelism

#include <math.h>
#include <time.h>
#include <iostream>
#include <stdexcept>

// Number of elements in each vector, and the size of one vector in bytes.
static const int n_el = 512;
static const size_t size = n_el * sizeof(float);

// declare the kernel function
// Element-wise vector addition kernel: C[i] = A[i] + B[i] for i in [0, n_el).
// Each thread handles the element whose index equals its global 1-D thread
// index; threads whose index is n_el or larger do nothing.
__global__ void kernel_sum(const float* A, const float* B, float* C, int n_el)
{
  // global index of this thread across the whole grid
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;

  // surplus threads in the last block have no element to process
  if (idx >= n_el) {
    return;
  }

  C[idx] = A[idx] + B[idx];
}


// function which invokes the internal function 
// Host-side launcher for kernel_sum: computes C = A + B element-wise.
//   A, B  : pointers handed to the kernel as inputs (n_el floats each)
//   C     : pointer handed to the kernel as output (n_el floats)
//   n_el  : number of elements; nothing is launched when it is not positive
// Fewer than 512 elements run in one block of n_el threads; otherwise blocks
// of 512 threads are used, with enough blocks to cover every element.
void sum(const float* A, const float* B, float* C, int n_el) {
  // A block of zero threads is an invalid launch configuration.
  if (n_el <= 0) return;

  const int maxThreadsPerBlock = 512;
  const int threadsPerBlock = (n_el < maxThreadsPerBlock) ? n_el : maxThreadsPerBlock;
  // Integer ceiling division, written so that it cannot overflow for n_el > 0.
  const int blocksPerGrid = (n_el - 1) / threadsPerBlock + 1;

  kernel_sum<<<blocksPerGrid,threadsPerBlock>>>(A, B, C, n_el);

  // A launch returns no status of its own. Peek (rather than get) so the error
  // state stays visible to whatever the caller checks afterwards.
  const cudaError_t status = cudaPeekAtLastError();
  if (status != cudaSuccess) {
    std::cerr << "CUDA error after kernel_sum launch: " << cudaGetErrorString(status) << "\n";
  }
}

// Fills two host vectors with sin(i) and cos(i), adds them on the GPU through
// sum(), prints every element next to the host-computed value and finally the
// Euclidean norm of the difference. The exit code is the status returned by
// cudaDeviceSynchronize(), or 1 if the host buffers cannot be allocated.
int main()  {
  // allocate the input vectors h_A, h_B and the output vector h_C in host (CPU) memory
  float* h_A = (float*)malloc(size);
  float* h_B = (float*)malloc(size);
  float* h_C = (float*)malloc(size);
  if (h_A == NULL || h_B == NULL || h_C == NULL) {
    std::cerr << "host allocation failed\n";
    free(h_A);
    free(h_B);
    free(h_C);
    return 1;
  }

  // device (GPU) vectors
  float *d_A,*d_B,*d_C;

  // initialize input vectors
  for (int i=0; i<n_el; i++){
    h_A[i]=sin(i);
    h_B[i]=cos(i);
  }

  // allocate device vectors in the device (GPU) memory
  cudaMalloc(&d_A, size);
  cudaMalloc(&d_B, size);
  cudaMalloc(&d_C, size);

  // copy input vectors from the host (CPU) memory to the device (GPU) memory
  cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

  // call kernel function
  sum(d_A, d_B, d_C, n_el);

  // copy the output (results) vector from the device (GPU) memory to the host (CPU) memory
  cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
  // free device memory
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);

  // compute the cumulative error
  double err=0;
  for (int i=0; i<n_el; i++) {
    double diff=double((h_A[i]+h_B[i])-h_C[i]);
    err+=diff*diff;
    // print results for manual checking.
    std::cout << "A+B: " << h_A[i]+h_B[i] << "\t" << "C: " << h_C[i] << "\n";
  }
  err=sqrt(err);
  std::cout << "err: " << err << "\n";

  // free host memory; the buffers came from malloc, so they must be released
  // with free (delete[] on malloc'ed memory is undefined behaviour)
  free(h_A);
  free(h_B);
  free(h_C);

  return cudaDeviceSynchronize();
}


In [None]:
%%cu

#include<cuda.h>
#include<stdio.h>
#include<stdlib.h>
#include <cuda_runtime.h> 
#include <chrono>
#include <iostream>

// Events recorded immediately before and after the kernel launch in
// MatrixMultiplicationVector, and the time between them in milliseconds
// (printed by main).
cudaEvent_t start, stop;
float elapsed_time_ms;

/*
__global__ void MatrixMulKernel(float *Md, float *Nd, float *Pd, int Width)
{
        //2D Thread ID
        int tx = threadIdx.x + blockIdx.x*blockDim.x;
        int ty = threadIdx.y + blockIdx.y*blockDim.y;

        //Pvalue stores the Pd element that is computed by the thread
        float Pvalue = 0;

        for(int k = 0; k < Width ; ++k){
        
                float Mdelement = Md[ty*Width + k];
                float Ndelement = Nd[k *Width+ tx];
                Pvalue += (Mdelement*Ndelement);
        }

    Pd[ty*Width + tx] = Pvalue;
}
*/ 
// Matrix-vector product kernel: Pd = Md * Nd.
//   Md    : Width x Width matrix, element (r, k) read from Md[r*Width + k]
//   Nd    : vector of Width elements
//   Pd    : output vector of Width elements
//   Width : matrix dimension
// The thread with global 1-D index x computes Pd[x] (the dot product of row x
// with Nd); threads with x >= Width do nothing.
__global__ void MatrixVectorMulKernel(float *Md, float *Nd, float *Pd, int Width)
{
        //1D Thread ID, which is also the row handled by this thread
        int x = threadIdx.x + blockIdx.x*blockDim.x;

        if (x < Width)
        {
                // Row offset computed in size_t: the int product x*Width
                // overflows once Width exceeds 46340.
                const float *row = Md + (size_t)x * Width;

                //Pvalue accumulates the Pd element computed by this thread
                float Pvalue = 0;
                for (int k = 0; k < Width; ++k) {
                        Pvalue += row[k] * Nd[k];
                }
                Pd[x] = Pvalue;
        }
}

// Computes P = M * N on the GPU and stores the kernel time in elapsed_time_ms.
//   M     : host pointer to Width*Width floats (the matrix)
//   N     : host pointer to Width floats (the input vector)
//   P     : host pointer receiving Width floats (the result)
//   Width : matrix dimension, must be positive
// Device buffers and the global `start`/`stop` events are created and
// released inside this call. If a CUDA call fails, a message is written to
// stderr and P may not hold a valid result.
void MatrixMultiplicationVector (float * M , float * N , float * P , int Width) {

        if (Width <= 0) {
                fprintf(stderr, "MatrixMultiplicationVector: Width must be positive\n");
                return;
        }

        // Byte counts are kept in size_t: Width*Width*sizeof(float) no longer
        // fits in an int once Width reaches 23171.
        const size_t size = (size_t)Width * Width * sizeof(float);
        const size_t size_vec = (size_t)Width * sizeof(float);
        float *Md = NULL, *Nd = NULL, *Pd = NULL;

        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        // Each step runs only if every previous one succeeded.
        cudaError_t err = cudaMalloc((void**)&Md, size);
        if (err == cudaSuccess) err = cudaMemcpy(Md, M, size, cudaMemcpyHostToDevice);

        if (err == cudaSuccess) err = cudaMalloc((void**)&Nd, size_vec);
        if (err == cudaSuccess) err = cudaMemcpy(Nd, N, size_vec, cudaMemcpyHostToDevice);

        if (err == cudaSuccess) err = cudaMalloc((void**)&Pd, size_vec);

        // One thread per output element, 512 threads per block, rounded up.
        const int dimBlock = 512;
        const int dimGrid = (Width - 1) / dimBlock + 1;

        std::cout << dimBlock << " " << dimGrid << std::endl;

        if (err == cudaSuccess) {
                cudaEventRecord(start, 0);

                MatrixVectorMulKernel<<<dimGrid,dimBlock>>>(Md,Nd,Pd,Width);

                cudaEventRecord(stop, 0);
                // Waiting on `stop` reports faults raised while the kernel ran;
                // cudaGetLastError reports a rejected launch configuration.
                err = cudaEventSynchronize(stop);
                if (err == cudaSuccess) err = cudaGetLastError();
                if (err == cudaSuccess) err = cudaEventElapsedTime(&elapsed_time_ms, start, stop);
        }

        // The result is a vector, so only size_vec bytes are copied back.
        // Copying `size` bytes overruns both Pd and P and makes the copy fail.
        if (err == cudaSuccess) err = cudaMemcpy(P, Pd, size_vec, cudaMemcpyDeviceToHost);

        if (err != cudaSuccess) {
                fprintf(stderr, "CUDA error in MatrixMultiplicationVector: %s\n", cudaGetErrorString(err));
        }

        cudaFree(Md);
        cudaFree(Nd);
        cudaFree(Pd);

        // Release the events so repeated calls do not leak them.
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
}

/*
void MatrixMultiplication(float *M, float *N, float *P, int Width) 
{
        int size = Width*Width*sizeof(float);
        float *Md, *Nd, *Pd;
	    int k = 512;
	    int l = 512;

	    cudaEventCreate(&start);
	    cudaEventCreate(&stop);

        cudaMalloc((void**)&Md, size);
        cudaMemcpy(Md,M,size,cudaMemcpyHostToDevice);
        cudaMalloc((void**)&Nd, size);
        cudaMemcpy(Nd,N,size,cudaMemcpyHostToDevice);

        cudaMalloc((void**)&Pd,size);
	
        dim3 dimBlock((k-1)/Width+1,(l-1)/Width+1);
        dim3 dimGrid(Width,Width);

	    cudaEventRecord(start, 0);		

 
        MatrixMulKernel<<<dimGrid,dimBlock>>>(Md,Nd,Pd,Width);

	    cudaEventRecord(stop, 0);     		
	    cudaEventSynchronize(stop);
	    cudaEventElapsedTime(&elapsed_time_ms, start, stop );
        cudaMemcpy(P,Pd,size,cudaMemcpyDeviceToHost);

        cudaFree(Md);
        cudaFree(Nd);
        cudaFree(Pd);
}

*/

// Benchmark driver: builds a 10000 x 10000 matrix and a 10000-element vector
// filled with ones, multiplies them through MatrixMultiplicationVector, and
// prints the wall-clock time of the call, three result entries and the
// kernel time held in elapsed_time_ms. Returns 1 if host memory cannot be
// allocated, 0 otherwise.
int main(void)
{
        const int Width = 10000;

        // Element counts of the matrix and of each vector.
        const size_t size = (size_t)Width * Width;
        const size_t size_vec = (size_t)Width;

        float* M = (float*)malloc(size*sizeof(float));
        float* N = (float*)malloc(size_vec*sizeof(float));
        // Zero-initialised so the printout below is well defined even when the
        // GPU computation fails and P is never written.
        float* P = (float*)calloc(size_vec, sizeof(float));

        if (M == NULL || N == NULL || P == NULL) {
                fprintf(stderr, "host allocation failed\n");
                free(M);
                free(N);
                free(P);
                return 1;
        }

        for (size_t i = 0; i < size; i++) {
                M[i] = 1;
        }

        for (size_t i = 0; i < size_vec; i++) {
                N[i] = 1;
        }

        // steady_clock is monotonic, which is what interval timing needs.
        using timer = std::chrono::steady_clock;
        using sec = std::chrono::duration<double>;

        const auto before = timer::now();

        // MatrixMultiplication(M, N, P, Width);

        MatrixMultiplicationVector(M , N , P , Width);

        const sec duration = timer::now() - before;

        std::cout <<  "time to multiply : " << duration.count() << std::endl << std::endl;

        printf("%f %f %f \t \n", P[1] , P[2], P[100]);
        printf("Computation time of GPU: %f ms.\n This is a change", elapsed_time_ms);  // exe. time

        free (M);
        free (N);
        free (P);

        return 0;
}

512 20
time to multiply : 0.224517

0.000000 0.000000 0.000000 	 
Computation time of GPU: 12.681216 ms.
 This is a change


In [None]:
%%cu

#include<cuda.h>
#include<stdio.h>
#include<stdlib.h>
#include <cuda_runtime.h> 
#include <chrono>
#include <iostream>

// Events recorded immediately before and after the kernel launch in
// MatrixMultiplicationVector, and the time between them in milliseconds
// (printed by main).
cudaEvent_t start, stop;
float elapsed_time_ms;

/*
__global__ void MatrixMulKernel(float *Md, float *Nd, float *Pd, int Width)
{
        //2D Thread ID
        int tx = threadIdx.x + blockIdx.x*blockDim.x;
        int ty = threadIdx.y + blockIdx.y*blockDim.y;

        //Pvalue stores the Pd element that is computed by the thread
        float Pvalue = 0;

        for(int k = 0; k < Width ; ++k){
        
                float Mdelement = Md[ty*Width + k];
                float Ndelement = Nd[k *Width+ tx];
                Pvalue += (Mdelement*Ndelement);
        }

    Pd[ty*Width + tx] = Pvalue;
}
*/ 
// Matrix-vector product kernel: Pd = Md * Nd.
//   Md    : Width x Width matrix, element (r, k) read from Md[r*Width + k]
//   Nd    : vector of Width elements
//   Pd    : output vector of Width elements
//   Width : matrix dimension
// The thread with global 1-D index x computes Pd[x] (the dot product of row x
// with Nd); threads with x >= Width do nothing.
__global__ void MatrixVectorMulKernel(float *Md, float *Nd, float *Pd, int Width)
{
        //1D Thread ID, which is also the row handled by this thread
        int x = threadIdx.x + blockIdx.x*blockDim.x;

        if (x < Width)
        {
                // Row offset computed in size_t: the int product x*Width
                // overflows once Width exceeds 46340.
                const float *row = Md + (size_t)x * Width;

                //Pvalue accumulates the Pd element computed by this thread
                float Pvalue = 0;
                for (int k = 0; k < Width; ++k) {
                        Pvalue += row[k] * Nd[k];
                }
                Pd[x] = Pvalue;
        }
}

// Computes P = M * N on the GPU and stores the kernel time in elapsed_time_ms.
//   M     : host pointer to Width*Width floats (the matrix)
//   N     : host pointer to Width floats (the input vector)
//   P     : host pointer receiving Width floats (the result)
//   Width : matrix dimension, must be positive
// Device buffers and the global `start`/`stop` events are created and
// released inside this call. If a CUDA call fails, a message is written to
// stderr and P may not hold a valid result.
void MatrixMultiplicationVector (float * M , float * N , float * P , int Width) {

        if (Width <= 0) {
                fprintf(stderr, "MatrixMultiplicationVector: Width must be positive\n");
                return;
        }

        // Byte counts are kept in size_t: Width*Width*sizeof(float) no longer
        // fits in an int once Width reaches 23171.
        const size_t size = (size_t)Width * Width * sizeof(float);
        const size_t size_vec = (size_t)Width * sizeof(float);
        float *Md = NULL, *Nd = NULL, *Pd = NULL;

        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        // Each step runs only if every previous one succeeded.
        cudaError_t err = cudaMalloc((void**)&Md, size);
        if (err == cudaSuccess) err = cudaMemcpy(Md, M, size, cudaMemcpyHostToDevice);

        if (err == cudaSuccess) err = cudaMalloc((void**)&Nd, size_vec);
        if (err == cudaSuccess) err = cudaMemcpy(Nd, N, size_vec, cudaMemcpyHostToDevice);

        if (err == cudaSuccess) err = cudaMalloc((void**)&Pd, size_vec);

        // One thread per output element, 512 threads per block, rounded up.
        const int dimBlock = 512;
        const int dimGrid = (Width - 1) / dimBlock + 1;

        std::cout << dimBlock << " " << dimGrid << std::endl;

        if (err == cudaSuccess) {
                cudaEventRecord(start, 0);

                MatrixVectorMulKernel<<<dimGrid,dimBlock>>>(Md,Nd,Pd,Width);

                cudaEventRecord(stop, 0);
                // Waiting on `stop` reports faults raised while the kernel ran;
                // cudaGetLastError reports a rejected launch configuration.
                err = cudaEventSynchronize(stop);
                if (err == cudaSuccess) err = cudaGetLastError();
                if (err == cudaSuccess) err = cudaEventElapsedTime(&elapsed_time_ms, start, stop);
        }

        // The result is a vector, so only size_vec bytes are copied back.
        // Copying `size` bytes overruns both Pd and P and makes the copy fail.
        if (err == cudaSuccess) err = cudaMemcpy(P, Pd, size_vec, cudaMemcpyDeviceToHost);

        if (err != cudaSuccess) {
                fprintf(stderr, "CUDA error in MatrixMultiplicationVector: %s\n", cudaGetErrorString(err));
        }

        cudaFree(Md);
        cudaFree(Nd);
        cudaFree(Pd);

        // Release the events so repeated calls do not leak them.
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
}

/*
void MatrixMultiplication(float *M, float *N, float *P, int Width) 
{
        int size = Width*Width*sizeof(float);
        float *Md, *Nd, *Pd;
	    int k = 512;
	    int l = 512;

	    cudaEventCreate(&start);
	    cudaEventCreate(&stop);

        cudaMalloc((void**)&Md, size);
        cudaMemcpy(Md,M,size,cudaMemcpyHostToDevice);
        cudaMalloc((void**)&Nd, size);
        cudaMemcpy(Nd,N,size,cudaMemcpyHostToDevice);

        cudaMalloc((void**)&Pd,size);
	
        dim3 dimBlock((k-1)/Width+1,(l-1)/Width+1);
        dim3 dimGrid(Width,Width);
 
        dimBllock(1,1)
        dimGrid(10000 , 10000)


	    cudaEventRecord(start, 0);		

 
        MatrixMulKernel<<<dimGrid,dimBlock>>>(Md,Nd,Pd,Width);

	    cudaEventRecord(stop, 0);     		
	    cudaEventSynchronize(stop);
	    cudaEventElapsedTime(&elapsed_time_ms, start, stop );
        cudaMemcpy(P,Pd,size,cudaMemcpyDeviceToHost);

        cudaFree(Md);
        cudaFree(Nd);
        cudaFree(Pd);
}

*/

// Benchmark driver: builds a 10000 x 10000 matrix and a 10000-element vector
// filled with ones, multiplies them through MatrixMultiplicationVector, and
// prints the wall-clock time of the call, three result entries and the
// kernel time held in elapsed_time_ms. Returns 1 if host memory cannot be
// allocated, 0 otherwise.
int main(void)
{
        const int Width = 10000;

        // Element counts of the matrix and of each vector.
        const size_t size = (size_t)Width * Width;
        const size_t size_vec = (size_t)Width;

        float* M = (float*)malloc(size*sizeof(float));
        float* N = (float*)malloc(size_vec*sizeof(float));
        // Zero-initialised so the printout below is well defined even when the
        // GPU computation fails and P is never written.
        float* P = (float*)calloc(size_vec, sizeof(float));

        if (M == NULL || N == NULL || P == NULL) {
                fprintf(stderr, "host allocation failed\n");
                free(M);
                free(N);
                free(P);
                return 1;
        }

        for (size_t i = 0; i < size; i++) {
                M[i] = 1;
        }

        for (size_t i = 0; i < size_vec; i++) {
                N[i] = 1;
        }

        // steady_clock is monotonic, which is what interval timing needs.
        using timer = std::chrono::steady_clock;
        using sec = std::chrono::duration<double>;

        const auto before = timer::now();

        // MatrixMultiplication(M, N, P, Width);

        MatrixMultiplicationVector(M , N , P , Width);

        const sec duration = timer::now() - before;

        std::cout <<  "time to multiply : " << duration.count() << std::endl << std::endl;

        printf("%f %f %f \t \n", P[1] , P[2], P[100]);
        printf("Computation time of GPU: %f ms.\n This is a change", elapsed_time_ms);  // exe. time

        free (M);
        free (N);
        free (P);

        return 0;
}

In [None]:
%%cu

#include<cuda.h>
#include<stdio.h>
#include<stdlib.h>
#include <cuda_runtime.h> 
#include <chrono>
#include <iostream>

// Events recorded immediately before and after the kernel launch in
// MatrixMultiplicationVector, and the time between them in milliseconds
// (printed by main).
cudaEvent_t start, stop;
float elapsed_time_ms;

// Matrix-vector product kernel: Pd = Md * Nd.
//   Md    : Width x Width matrix, element (r, k) read from Md[r*Width + k]
//   Nd    : vector of Width elements
//   Pd    : output vector of Width elements
//   Width : matrix dimension
// The thread with global 1-D index x computes Pd[x] (the dot product of row x
// with Nd); threads with x >= Width do nothing.
__global__ void MatrixVectorMulKernel(float *Md, float *Nd, float *Pd, int Width)
{
        //1D Thread ID, which is also the row handled by this thread
        int x = threadIdx.x + blockIdx.x*blockDim.x;

        if (x < Width)
        {
            // Row offset computed in size_t: the int product x*Width
            // overflows once Width exceeds 46340.
            const float *row = Md + (size_t)x * Width;

            //Pvalue accumulates the Pd element computed by this thread
            float Pvalue = 0;
            for (int k = 0; k < Width; ++k) {
                Pvalue += row[k] * Nd[k];
            }
            Pd[x] = Pvalue;
            // printf ("x : %d \t || val : %f \n" ,x,Pd[x]) ;
        }
}

// Computes P = M * N on the GPU and stores the kernel time in elapsed_time_ms.
//   M     : host pointer to Width*Width floats (the matrix)
//   N     : host pointer to Width floats (the input vector)
//   P     : host pointer receiving Width floats (the result)
//   Width : matrix dimension, must be positive
// Device buffers and the global `start`/`stop` events are created and
// released inside this call. If a CUDA call fails, a message is written to
// stderr and P may not hold a valid result.
void MatrixMultiplicationVector (float * M , float * N , float * P , int Width) {
        if (Width <= 0) {
                fprintf(stderr, "MatrixMultiplicationVector: Width must be positive\n");
                return;
        }

        // Byte counts are kept in size_t: Width*Width*sizeof(float) no longer
        // fits in an int once Width reaches 23171.
        const size_t size = (size_t)Width * Width * sizeof(float);
        const size_t size_vec = (size_t)Width * sizeof(float);
        float *Md = NULL, *Nd = NULL, *Pd = NULL;

        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        // Each step runs only if every previous one succeeded.
        cudaError_t err = cudaMalloc((void**)&Md, size);
        if (err == cudaSuccess) err = cudaMemcpy(Md, M, size, cudaMemcpyHostToDevice);

        if (err == cudaSuccess) err = cudaMalloc((void**)&Nd, size_vec);
        if (err == cudaSuccess) err = cudaMemcpy(Nd, N, size_vec, cudaMemcpyHostToDevice);

        if (err == cudaSuccess) err = cudaMalloc((void**)&Pd, size_vec);

        // One thread per output element, 512 threads per block, rounded up.
        const int dimBlock = 512;
        const int dimGrid = (Width - 1) / dimBlock + 1;

        std::cout << dimBlock << " " << dimGrid << std::endl;

        if (err == cudaSuccess) {
                cudaEventRecord(start, 0);

                MatrixVectorMulKernel<<<dimGrid,dimBlock>>>(Md,Nd,Pd,Width);

                cudaEventRecord(stop, 0);
                // Waiting on `stop` reports faults raised while the kernel ran;
                // cudaGetLastError reports a rejected launch configuration.
                err = cudaEventSynchronize(stop);
                if (err == cudaSuccess) err = cudaGetLastError();
                if (err == cudaSuccess) err = cudaEventElapsedTime(&elapsed_time_ms, start, stop);
        }

        // The result is a vector, so only size_vec bytes are copied back.
        if (err == cudaSuccess) err = cudaMemcpy(P, Pd, size_vec, cudaMemcpyDeviceToHost);

        if (err != cudaSuccess) {
                fprintf(stderr, "CUDA error in MatrixMultiplicationVector: %s\n", cudaGetErrorString(err));
        }

        cudaFree(Md);
        cudaFree(Nd);
        cudaFree(Pd);

        // Release the events so repeated calls do not leak them.
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
}

// Benchmark driver: builds a 16000 x 16000 matrix and a 16000-element vector
// filled with ones, multiplies them through MatrixMultiplicationVector, and
// prints the wall-clock time of the call, three result entries and the
// kernel time held in elapsed_time_ms. Returns 1 if host memory cannot be
// allocated, 0 otherwise.
int main(void)
{
        const int Width = 16000;

        // Element counts of the matrix and of each vector.
        const size_t size = (size_t)Width * Width;
        const size_t size_vec = (size_t)Width;

        float* M = (float*)malloc(size*sizeof(float));
        float* N = (float*)malloc(size_vec*sizeof(float));
        // Zero-initialised so the printout below is well defined even when the
        // GPU computation fails and P is never written.
        float* P = (float*)calloc(size_vec, sizeof(float));

        if (M == NULL || N == NULL || P == NULL) {
                fprintf(stderr, "host allocation failed\n");
                free(M);
                free(N);
                free(P);
                return 1;
        }

        for (size_t i = 0; i < size; i++) {
                M[i] = 1;
        }

        for (size_t i = 0; i < size_vec; i++) {
                N[i] = 1;
        }

        // steady_clock is monotonic, which is what interval timing needs.
        using timer = std::chrono::steady_clock;
        using sec = std::chrono::duration<double>;

        const auto before = timer::now();

        MatrixMultiplicationVector(M , N , P , Width);

        const sec duration = timer::now() - before;

        std::cout <<  "time to multiply : " << duration.count() << std::endl << std::endl;

        printf("%f %f %f \t \n", P[1] , P[2], P[100]);
        printf("Computation time of GPU: %f ms.\n This is a change", elapsed_time_ms);  // exe. time

        free (M);
        free (N);
        free (P);

        return 0;
}


UsageError: Cell magic `%%cu` not found.


In [None]:
%%cuda --name my_curand.cu 

#include <math.h>
#include <time.h>
#include <iostream>
#include <stdexcept>
#include <stdlib.h>
#include <chrono>

#include <iostream>
#include <cstdlib> 
#include <ctime> 
#include <cuda.h>
#include <curand.h>
#include <cublas_v2.h>
#include <chrono>

// Computes C = A * B with cuBLAS SGEMM (alpha = 1, beta = 0, no transposes).
//   A : m x k matrix, leading dimension m
//   B : k x n matrix, leading dimension k
//   C : m x n result, leading dimension m
// The pointers are passed straight to cuBLAS, which uses column-major storage.
// A cuBLAS handle is created and destroyed on every call. Failures are
// reported on std::cerr; if the handle cannot be created no multiplication
// is attempted.
void gpu_blas_mmul(const float *A, const float *B, float *C, const int m, const int k, const int n) {
    const int lda = m, ldb = k, ldc = m;
    const float alf = 1;
    const float bet = 0;

    cublasHandle_t handle;
    cublasStatus_t status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS) {
        std::cerr << "cublasCreate failed with status " << static_cast<int>(status) << std::endl;
        return;
    }

    // Do the actual multiplication
    status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alf, A, lda, B, ldb, &bet, C, ldc);
    if (status != CUBLAS_STATUS_SUCCESS) {
        std::cerr << "cublasSgemm failed with status " << static_cast<int>(status) << std::endl;
    }

    cublasDestroy(handle);
}


// Benchmark driver for gpu_blas_mmul: multiplies a 10000 x 10000 matrix of
// ones by a 10000 x 1 matrix of ones, prints the wall-clock time of the call
// and the first three entries of the result. Returns 0 on success and 1 if a
// host allocation or a CUDA call fails.
int main() {
    // nr_rows_A = nr_cols_A = nr_rows_B = nr_cols_B = nr_rows_C = nr_cols_C = 10000;
    const int nr_rows_A = 10000, nr_cols_A = 10000;
    const int nr_rows_B = 10000, nr_cols_B = 1;
    const int nr_rows_C = 10000, nr_cols_C = 1;

    // Element and byte counts, computed once in size_t.
    const size_t elems_A = (size_t)nr_rows_A * nr_cols_A;
    const size_t elems_B = (size_t)nr_rows_B * nr_cols_B;
    const size_t elems_C = (size_t)nr_rows_C * nr_cols_C;
    const size_t bytes_A = elems_A * sizeof(float);
    const size_t bytes_B = elems_B * sizeof(float);
    const size_t bytes_C = elems_C * sizeof(float);

    float *h_A = (float *)malloc(bytes_A);
    float *h_B = (float *)malloc(bytes_B);
    float *h_C = (float *)malloc(bytes_C);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        std::cerr << "host allocation failed" << std::endl;
        free(h_A);
        free(h_B);
        free(h_C);
        return 1;
    }

    // Fill the inputs with ones. The bound for A is rows * cols, so the loop
    // stays correct for non-square shapes.
    for (size_t i = 0; i < elems_A; i++) {
        h_A[i] = 1;
    }
    for (size_t i = 0; i < elems_B; i++) {
        h_B[i] = 1;
    }

    // NULL-initialised so the cudaFree calls below are safe after a failure.
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;

    // Each step runs only if every previous one succeeded.
    cudaError_t err = cudaMalloc(&d_A, bytes_A);
    if (err == cudaSuccess) err = cudaMalloc(&d_B, bytes_B);
    if (err == cudaSuccess) err = cudaMalloc(&d_C, bytes_C);

    if (err == cudaSuccess) err = cudaMemcpy(d_A, h_A, bytes_A, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(d_B, h_B, bytes_B, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        // steady_clock is monotonic, which is what interval timing needs.
        using timer = std::chrono::steady_clock;
        using sec = std::chrono::duration<double>;

        const auto before = timer::now();

        gpu_blas_mmul(d_A, d_B, d_C, nr_rows_A, nr_cols_A, nr_cols_B);

        const sec duration = timer::now() - before;

        std::cout <<  "time to multiply : " << duration.count() << std::endl << std::endl;

        err = cudaMemcpy(h_C, d_C, bytes_C, cudaMemcpyDeviceToHost);
    }

    if (err == cudaSuccess) {
        std::cout << h_C[0] << " " << h_C[1] << " " << h_C[2] << std::endl ;
    } else {
        std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
    }

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    free(h_A);
    free(h_B);
    free(h_C);

    return (err == cudaSuccess) ? 0 : 1;
}



'File written in /content/src/my_curand.cu'

In [None]:
!nvcc -o /content/src/my_curand /content/src/my_curand.cu -lcurand -lcublas

In [None]:
!/content/src/my_curand

time to multiply : 0.228571

10000 10000 10000
