<a href="https://colab.research.google.com/github/Bhanudutta/WrapLap/blob/master/CUDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#Initial CUDA setup for Colab. Running this cell may take some time (around 6 minutes).
#Uninstall any previous versions of CUDA completely.
!apt-get --purge remove cuda nvidia* libnvidia-*
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -n1 dpkg --purge
!apt-get remove cuda-*
!apt autoremove
!apt-get update
#Install CUDA version 9.2.
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
!apt-get update
!apt-get install cuda-9.2
#Check the nvcc version.
!nvcc --version
#Install and load the nvcc4jupyter plugin used by the CUDA cells below.
#GitHub no longer serves the unauthenticated git:// protocol, so install over https.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

Reading package lists... 0%Reading package lists... 0%Reading package lists... 0%Reading package lists... 7%Reading package lists... 7%Reading package lists... 7%Reading package lists... 7%Reading package lists... 66%Reading package lists... 66%Reading package lists... 67%Reading package lists... 67%Reading package lists... 74%Reading package lists... 74%Reading package lists... 74%Reading package lists... 74%Reading package lists... 77%Reading package lists... 83%Reading package lists... 83%Reading package lists... 83%Reading package lists... 83%Reading package lists... 83%Reading package lists... 83%Reading package lists... 83%Reading package lists... 83%Reading package lists... 87%Reading package lists... 87%Reading package lists... 87%Reading package lists... 87%Reading package lists... 93%Reading package lists... 93%Reading package lists... 93%Reading package lists... 93%Reading package lists... 94%Reading package 

In [0]:
#@title Cuda linear search example
#test run
#searching
%%cu
#include<iostream>
#include<ctime>

void random_ints(int*, int);

/*
 * Linear-search kernel: each thread inspects one element of `a` and, when it
 * equals `k`, lowers `*first` to that element's index.
 *
 *   a     - device array to search
 *   k     - value to look for
 *   first - device int that ends up holding the smallest matching index; the
 *           caller must initialise it to a value larger than any valid index
 *
 * The kernel has no length parameter and therefore no bounds check: launch it
 * with a 1D grid whose total thread count equals the number of elements in `a`.
 */
__global__ void find(int *a,int k,int *first)
{
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if(a[index] == k)
    {
        // Several threads can match at the same time. A plain
        // "compare, then store" lets a larger index overwrite a smaller one,
        // so the minimum is taken atomically instead.
        atomicMin(first, index);
    }
}
#define N (2048*2048)
// A block may hold at most 1024 threads; a larger value makes the launch fail
// with an invalid-configuration error and the kernel never runs.
#define THREADS_PER_BLOCK 1024

// find() has no bounds check, so the grid must cover the array exactly.
#if (N % THREADS_PER_BLOCK) != 0
#error "N must be a multiple of THREADS_PER_BLOCK"
#endif

// Evaluates a CUDA runtime call; on failure prints the error text to stderr
// and returns 1 from the enclosing function (intended for use inside main).
#define CUDA_TRY(call)                                                     \
    do {                                                                   \
        cudaError_t status_ = (call);                                      \
        if (status_ != cudaSuccess) {                                      \
            std::cerr << "CUDA error: " << cudaGetErrorString(status_)     \
                      << " in " << #call << "\n";                          \
            return 1;                                                      \
        }                                                                  \
    } while (0)

/*
 * Fills a host array of N ints, copies it to the device, searches it for a
 * fixed value with the find kernel and prints the elapsed time together with
 * the index that was found. Returns 0 on success, 1 if any allocation, copy
 * or launch fails.
 */
int main(void) 
{
    int *a,f=N;          // f starts at N, i.e. "not found"
    int *d_a,*d_f;
    size_t size = N * sizeof(int);

    a = (int *)malloc(size);
    if(a == NULL)
    {
        std::cerr << "Host allocation failed\n";
        return 1;
    }
    random_ints(a, N);

    CUDA_TRY(cudaMalloc((void **)&d_a, size));
    CUDA_TRY(cudaMalloc((void **)&d_f,sizeof(int)));

    CUDA_TRY(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
    CUDA_TRY(cudaMemcpy(d_f, &f, sizeof(int), cudaMemcpyHostToDevice));

    int k=4100000;
    const clock_t begin_time = clock();
    find<<<N/THREADS_PER_BLOCK,THREADS_PER_BLOCK>>>(d_a,k,d_f);
    // A launch reports configuration errors only through cudaGetLastError().
    CUDA_TRY(cudaGetLastError());
    // The blocking copy waits for the kernel, so the measured time covers
    // kernel execution plus the copy of the result.
    CUDA_TRY(cudaMemcpy(&f, d_f, sizeof(int), cudaMemcpyDeviceToHost));
    std::cout <<"Time "<< float( clock () - begin_time )/CLOCKS_PER_SEC<<"\n";
    std::cout<<"First occurance is "<<f;

    cudaFree(d_a); 
    cudaFree(d_f);
    free(a);

    return 0;
}

// Fills a[0..n-1] so that every slot holds its own index (0, 1, ..., n-1).
// Despite the name the values are not random; nothing is written when n <= 0.
void random_ints(int* a, int n)
{
    int pos = 0;
    while (pos < n)
    {
        a[pos] = pos;
        ++pos;
    }
}

In [0]:
%%writefile hparams.h
// Launch parameters shared by the CUDA wrapper headers.
#ifndef HPARAMS_H
#define HPARAMS_H

// Threads per block used when launching kernels. Define it before including
// this header (or pass -DTHREADS_PER_BLOCK=<n>) to override the default.
#ifndef THREADS_PER_BLOCK
#define THREADS_PER_BLOCK 1024
#endif

#endif // HPARAMS_H

Overwriting hparams.h


In [0]:
%%writefile funcs.h
#include "hparams.h"
#ifndef CUDA_FUNCS_H
#define CUDA_FUNCS_H
// Guarded so that including this header more than once in a translation unit
// does not redefine the templates below.

namespace CUDA
{

    // Pointer to a binary function taking two T values and returning a T.
    template<typename T>
    using bi_func = T (*) (T, T);


    //basic binary functions device codes

    // Returns a + b.
    template<typename T>
    __device__ T add(T a,T b)
    {
        return a+b;
    }

    // Returns a - b.
    template<typename T>
    __device__ T sub(T a,T b)
    {
        return a-b;
    }

    // Returns a * b.
    template<typename T>
    __device__ T mul(T a,T b)
    {
        return a*b;
    }

    // Returns a / b.
    template<typename T>
    __device__ T div(T a,T b)
    {
        return a/b;
    }

    // Device-resident pointers to the functions above. Host code copies one
    // of them out with cudaMemcpyFromSymbol and passes the value to a kernel
    // as its `fun` argument.
    template<typename T>
    __device__ bi_func<T> c_add = add<T>;
    template<typename T>
    __device__ bi_func<T> c_sub = sub<T>;
    template<typename T>
    __device__ bi_func<T> c_mul = mul<T>;
    template<typename T>
    __device__ bi_func<T> c_div = div<T>;


    //binary operation kernels

    // Scalar kernel: stores fun(*a, *b) into *c. Every launched thread does
    // the same work, so a <<<1,1>>> launch is sufficient.
    template<typename T>
    __global__ void bin_op(T *a,T *b,T *c,bi_func<T> fun)
    {
        *c = (*fun)(*a,*b);
    }

    // Element-wise kernel: thread i stores fun(a[i], b[i]) into c[i], where
    // i is the flat 1D global thread index.
    //   n - optional element count. When n >= 0, threads with i >= n do
    //       nothing, so the grid may be larger than the data. The default
    //       (-1) disables the check; the launch must then cover exactly the
    //       elements to be processed.
    template<typename T>
    __global__ void v_bin_op(T *a,T *b,T *c,bi_func<T> fun,int n = -1)
    {
        int index = threadIdx.x + blockIdx.x * blockDim.x;
        if(n < 0 || index < n)
            c[index] = (*fun)(a[index],b[index]);
    }
}

#endif // CUDA_FUNCS_H

Overwriting funcs.h


In [0]:
%%writefile cudavar.h
#include "hparams.h"
namespace CUDA
{
    /*
     * Host-side handle for one value of type T stored in device memory.
     *
     * Each Var owns its own device allocation: copying a Var duplicates the
     * value into a fresh allocation and moving a Var transfers the
     * allocation. The destructor releases the allocation while `autofree`
     * is true.
     *
     * funcs.h (bi_func, c_add, bin_op) must be included before this header.
     */
    template<typename T>
    class Var
    {
        protected:
        T *t;            // device pointer; nullptr after free() or after being moved from
        bool autofree;   // when true the destructor releases t
        private:
        // Allocates device storage for one T and marks it as owned.
        void Init()
        {
            t = nullptr;
            cudaMalloc((void **)&t, sizeof(T));
            autofree = true;
        }
        // Copies the host value x into the device storage.
        void assign(T x)
        {
            cudaMemcpy(t, &x, sizeof(T), cudaMemcpyHostToDevice);
        }
        // Copies the device value back to the host and returns it.
        T get()
        {
            T x;
            cudaMemcpy(&x,t, sizeof(T), cudaMemcpyDeviceToHost);
            return x;
        }
        public:
        // Allocates device storage; the stored value is not initialised.
        Var()
        {
            Init();
        }
        // Allocates device storage and stores x in it.
        Var(T x)
        {
            Init();
            assign(x);
        }
        // Deep copy. Sharing v's pointer would make both objects free the
        // same allocation; in particular a by-value argument would release
        // the caller's memory when the call returns.
        Var(const Var &v)
        {
            Init();
            cudaMemcpy(t, v.t, sizeof(T), cudaMemcpyDeviceToDevice);
        }
        // Takes over v's allocation and leaves v empty.
        Var(Var &&v) : t(v.t), autofree(v.autofree)
        {
            v.t = nullptr;
        }
        // Copies v's value into this object's own storage.
        Var& operator=(const Var &v)
        {
            if(this != &v)
            {
                if(t == nullptr)
                    Init();   // storage was released or moved away earlier
                cudaMemcpy(t, v.t, sizeof(T), cudaMemcpyDeviceToDevice);
            }
            return *this;
        }
        // Releases the current storage (if owned) and takes over v's.
        Var& operator=(Var &&v)
        {
            if(this != &v)
            {
                if(autofree)
                    free();
                t = v.t;
                autofree = v.autofree;
                v.t = nullptr;
            }
            return *this;
        }
        // Stores the host value x on the device.
        void operator=(T x)
        {
            assign(x);
        }
        // Reads the device value back to the host.
        operator T()
        {
            return get();
        }
        // Releases the device storage. The pointer is cleared so that a
        // later free() or the destructor does not free it a second time.
        void free()
        {
            cudaFree(t);
            t = nullptr;
        }
        ~Var()
        {
            if(autofree)
                free();
        }
        // Returns a new Var holding *this + v, computed on the device by a
        // single-thread bin_op launch.
        Var operator+(const Var &v)
        {
            Var x;
            bi_func<T> h;
            // Fetch the device address of add<T>; a host-side function
            // pointer is not callable from a kernel.
            cudaMemcpyFromSymbol(&h,c_add<T>,sizeof(bi_func<T>));
            bin_op<T><<<1,1>>>(this->t,v.t,x.t,h);
            return x;
        }
    };
}

Overwriting cudavar.h


In [0]:
%%writefile cudavector.h
#include "hparams.h"
namespace CUDA
{
    /*
     * Host-side handle for an array of `size` elements of type T stored in
     * device memory.
     *
     * Each Vector owns its own device allocation: copying duplicates the
     * data into a fresh allocation and moving transfers the allocation. The
     * destructor releases the allocation while `autofree` is true.
     *
     * funcs.h (bi_func, c_add, v_bin_op) must be included before this header.
     */
    template<typename T>
    class Vector
    {
        protected:
        T *t;            // device pointer; nullptr after free() or after being moved from
        int size;        // number of elements
        bool autofree;   // when true the destructor releases t
        private:
        // Allocates device storage for s elements and marks it as owned.
        void Init(int s)
        {
            size = s;
            t = nullptr;
            cudaMalloc((void **)&t, sizeof(T)*size);
            autofree = true;
        }
        // Copies `size` elements from the host array x to the device.
        void assign(T *x)
        {
            cudaMemcpy(t, x, sizeof(T)*size, cudaMemcpyHostToDevice);
        }
        // Returns a newly allocated host copy of the data; the caller must
        // release it with delete[].
        T* get()
        {
            T *x;
            x = new T[size];
            cudaMemcpy(x,t, sizeof(T)*size, cudaMemcpyDeviceToHost);
            return x;
        }
        // Reads element i back to the host; i is not range-checked.
        T geti(int i)
        {
            T x;
            cudaMemcpy(&x,t+i, sizeof(T), cudaMemcpyDeviceToHost);
            return x;
        }
        // Exchanges the complete state of two vectors.
        void swap(Vector &other)
        {
            T *ptr = t;          t = other.t;               other.t = ptr;
            int n = size;        size = other.size;         other.size = n;
            bool own = autofree; autofree = other.autofree; other.autofree = own;
        }

        public:
        // Allocates s elements; their values are not initialised.
        Vector(int s)
        {
            Init(s);
        }
        // Allocates s elements and fills them from the host array x.
        Vector(T *x,int s)
        {
            Init(s);
            assign(x);
        }
        // Deep copy. Sharing v's pointer would make both objects free the
        // same allocation; in particular a by-value argument would release
        // the caller's memory when the call returns.
        Vector(const Vector &v)
        {
            Init(v.size);
            cudaMemcpy(t, v.t, sizeof(T)*size, cudaMemcpyDeviceToDevice);
        }
        // Takes over v's allocation and leaves v empty.
        Vector(Vector &&v) : t(v.t), size(v.size), autofree(v.autofree)
        {
            v.t = nullptr;
            v.size = 0;
        }
        // Replaces this vector with a deep copy of v (the size may change).
        Vector& operator=(const Vector &v)
        {
            if(this != &v)
            {
                Vector copy(v);
                swap(copy);   // the old storage is released when `copy` dies
            }
            return *this;
        }
        // Takes over v's allocation; v receives the old one and releases it
        // when it is destroyed.
        Vector& operator=(Vector &&v)
        {
            if(this != &v)
                swap(v);
            return *this;
        }
        // Reads element i back to the host; i is not range-checked.
        T operator[](int i)
        {
            return geti(i);
        }
        // Stores x into element i; i is not range-checked.
        void puti(int i,T x)
        {
            cudaMemcpy(t+i, &x, sizeof(T), cudaMemcpyHostToDevice);
        }
        // Overwrites all `size` elements from the host array x.
        void operator=(T *x)
        {
            assign(x);
        }
        // Returns a newly allocated host copy of the data; the caller must
        // release it with delete[].
        operator T*()
        {
            return get();
        }
        // Releases the device storage. The pointer is cleared so that a
        // later free() or the destructor does not free it a second time.
        void free()
        {
            cudaFree(t);
            t = nullptr;
        }
        ~Vector()
        {
            if(autofree)
                free();
        }

        // Returns a new Vector with x[i] = (*this)[i] + v[i] for every i in
        // [0, size). v must hold at least `size` elements.
        Vector operator+(const Vector &v)
        {
            Vector x(size);
            int K = size/THREADS_PER_BLOCK;    // number of full blocks
            int LN = size%THREADS_PER_BLOCK;   // elements left for the tail

            bi_func<T> h;
            // Fetch the device address of add<T>; a host-side function
            // pointer is not callable from a kernel.
            cudaMemcpyFromSymbol(&h,c_add<T>,sizeof(bi_func<T>));
            // A grid or block dimension of zero is an invalid launch
            // configuration, so each launch is issued only when it has work.
            if(K > 0)
                v_bin_op<T><<<K,THREADS_PER_BLOCK>>>(this->t,v.t,x.t,h);
            if(LN > 0)
            {
                // The tail kernel indexes from 0, so the pointers are moved
                // past the elements already handled by the full blocks.
                int done = K*THREADS_PER_BLOCK;
                v_bin_op<T><<<1,LN>>>(this->t+done,v.t+done,x.t+done,h);
            }
            return x;
        }
    };
}

Overwriting cudavector.h


In [0]:
!pwd

/content/drive/My Drive/Git/WrapLap


In [0]:
%%writefile test.cu
#include<iostream>
#include "funcs.h"
#include "cudavar.h"
#include "cudavector.h"
#include <iostream>
// Writes the sequence 0, 1, ..., n-1 into a (element i receives the value i).
// The values are deterministic, not random; n <= 0 leaves the array untouched.
void random_ints(int* a, int n)
{
    for (int idx = 0; idx < n; idx++)
    {
        *(a + idx) = idx;
    }
}

#define N 1024

// Smoke test for CUDA::Vector: adds two one-element device vectors that both
// hold 20 and prints the single element of the result.
int main()
{
    CUDA::Vector<int> lhs(1);
    CUDA::Vector<int> rhs(1);
    CUDA::Vector<int> total(1);

    lhs.puti(0,20);
    rhs.puti(0,20);

    total = lhs+rhs;

    const int first = total[0];
    std::cout<<first;
    return 0;
}

Overwriting test.cu


In [0]:
!nvcc -o a.out test.cu
!./a.out

40

In [0]:
%%cu
#include<iostream>

// Pointer to a binary function that takes two T values and returns a T.
template<typename T>
using func_t = T (*) (T, T);

// Device function: returns the sum of x and y.
template <typename T> 
__device__ T add_func (T x, T y)
{
    const T sum = x + y;
    return sum;
}

// Device function: returns the product of x and y.
template <typename T> 
__device__ T mul_func (T x, T y)
{
    const T product = x * y;
    return product;
}

// Device-resident function pointers, one per operation.
// Host code reads these symbols with cudaMemcpyFromSymbol to obtain a pointer
// value that it can pass to the kernel as its `op` argument.
template <typename T> 
__device__ func_t<T> p_add_func = add_func<T>;
template <typename T> 
__device__ func_t<T> p_mul_func = mul_func<T>;


// Applies the binary device function `op` to the values behind d_x and d_y
// and stores the outcome in *result. Every launched thread performs the same
// work, so a <<<1,1>>> launch is sufficient.
template <typename T> 
__global__ void kernel(func_t<T> op, T * d_x, T * d_y, T * result)
{
    const T lhs = *d_x;
    const T rhs = *d_y;
    *result = op(lhs, rhs);
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess and false otherwise.
static bool cuda_ok(cudaError_t status, const char * what)
{
    if (status == cudaSuccess)
        return true;
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

/*
 * Copies x and y to the device, applies add_func and mul_func to them
 * through function pointers passed to the kernel, and prints both results.
 * If any CUDA call fails, the error is reported and the remaining steps are
 * skipped; the device buffers are released in every case.
 */
template <typename T> 
void test(T x, T y)
{
    func_t<T> h_add_func;
    func_t<T> h_mul_func;

    T * d_x = nullptr, * d_y = nullptr, * d_result = nullptr;
    T result;
    T * h_result = &result;

    // && short-circuits, so the first failure stops the setup sequence.
    bool ok = cuda_ok(cudaMalloc(&d_x, sizeof(T)), "cudaMalloc(d_x)")
           && cuda_ok(cudaMalloc(&d_y, sizeof(T)), "cudaMalloc(d_y)")
           && cuda_ok(cudaMalloc(&d_result, sizeof(T)), "cudaMalloc(d_result)")
           && cuda_ok(cudaMemcpy(d_x, &x, sizeof(T), cudaMemcpyHostToDevice), "cudaMemcpy(d_x)")
           && cuda_ok(cudaMemcpy(d_y, &y, sizeof(T), cudaMemcpyHostToDevice), "cudaMemcpy(d_y)")
           // Copy device function pointer to host side
           && cuda_ok(cudaMemcpyFromSymbol(&h_add_func, p_add_func<T>, sizeof(func_t<T>)),
                      "cudaMemcpyFromSymbol(p_add_func)")
           && cuda_ok(cudaMemcpyFromSymbol(&h_mul_func, p_mul_func<T>, sizeof(func_t<T>)),
                      "cudaMemcpyFromSymbol(p_mul_func)");

    if (ok)
    {
        kernel<T><<<1,1>>>(h_add_func, d_x, d_y, d_result);
        // Launch errors surface via cudaGetLastError, execution errors at the sync.
        ok = cuda_ok(cudaGetLastError(), "kernel launch (add)")
          && cuda_ok(cudaDeviceSynchronize(), "kernel execution (add)")
          && cuda_ok(cudaMemcpy(h_result, d_result, sizeof(T), cudaMemcpyDeviceToHost),
                     "cudaMemcpy(result)");
        if (ok)
            std::cout << "Sum: " << result << std::endl;
    }

    if (ok)
    {
        kernel<T><<<1,1>>>(h_mul_func, d_x, d_y, d_result);
        ok = cuda_ok(cudaGetLastError(), "kernel launch (mul)")
          && cuda_ok(cudaDeviceSynchronize(), "kernel execution (mul)")
          && cuda_ok(cudaMemcpy(h_result, d_result, sizeof(T), cudaMemcpyDeviceToHost),
                     "cudaMemcpy(result)");
        if (ok)
            std::cout << "Product: " << result << std::endl;
    }

    // Release the device buffers; cudaFree accepts a null pointer, so this is
    // safe even when an allocation above failed.
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_result);
}

// Runs the function-pointer demo once per element type (int, float, double)
// with the same pair of inputs; the inputs are converted to the element type
// at the call.
int main()
{
    const double first = 2.05;
    const double second = 10.00;

    std::cout << "Test int for type int ..." << std::endl;
    test<int>(first, second);

    std::cout << "Test float for type float ..." << std::endl;
    test<float>(first, second);

    std::cout << "Test double for type double ..." << std::endl;
    test<double>(first, second);

    return 0;
}

Test int for type int ...
Sum: 12
Product: 20
Test float for type float ...
Sum: 12.05
Product: 20.5
Test double for type double ...
Sum: 12.05
Product: 20.5

