<a href="https://colab.research.google.com/github/Andres8bit/parallel-computing/blob/main/1D_Convolutions_cu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


In [None]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-e7pvrs9m
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-e7pvrs9m
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4308 sha256=f1956ebee792ced57629ac50eac4be17329d6ebb22c27b75b699ce8239a20023
  Stored in directory: /tmp/pip-ephem-wheel-cache-jau8re1n/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [None]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [None]:
# The following cells implement 1D parallel convolution algorithms.
# A convolution is an array operation in which each output element is a weighted sum
# of neighboring elements of the input array. Here the weights are provided as a 1D mask.
# Each implementation uses a different layer of the memory hierarchy in order to speed up
# computation with fast on-chip cache memory. Although each thread makes its
# calculations independently, every thread uses the same mask. The last two cells
# therefore take advantage of cached memory by storing the mask in a __constant__
# array and staging a tile of the input in a __shared__ array,
# while the first cell reads everything from global memory.

In [None]:

%%cu 
#include <iostream> 
#include <math.h>
#include <chrono>
#include <sys/time.h>
#include <time.h>

// Number of elements in the convolution mask; main() passes it to the kernel as mask_width.
#define MAX_WIDTH 5 
// Number of elements in the input and output arrays.
#define WIDTH  7

// Prints the first `width` elements of `p`, each cast to int, followed by a newline.
void print_array(float* p, int width);
// 1D convolution of `input` with `mask`; one thread computes one element of `out`.
__global__ void conv_1D_kernel(float *input, float *mask, float *out, int mask_width, int width);


// Host driver for the basic 1D convolution.
//
// Copies a WIDTH-element input and a MAX_WIDTH-element mask to the device,
// runs conv_1D_kernel with one thread per output element, copies the result
// back and prints input, mask and output.
//
// Returns 0 on success and 1 if any CUDA call fails.
int main() 
{
    float host_in[WIDTH] = {1,2,3,4,5,6,7};
    float host_mask[MAX_WIDTH] = {3,4,5,4,3};
    float host_out[WIDTH];
    float *device_in = NULL, *device_mask = NULL, *device_out = NULL;

    printf("input: ");
    print_array(host_in, WIDTH);
    printf("\nmask: ");
    print_array(host_mask, MAX_WIDTH);

    const size_t device_size = WIDTH * sizeof (float);
    const size_t mask_size = MAX_WIDTH * sizeof (float);

    // Each step only runs if every previous CUDA call succeeded, so the first
    // failure is the one that gets reported.
    cudaError_t status = cudaMalloc((void **) &device_in, device_size);
    if (status == cudaSuccess)
        status = cudaMalloc((void **) &device_mask, mask_size);
    if (status == cudaSuccess)
        status = cudaMalloc((void **) &device_out, device_size);

    if (status == cudaSuccess)
        status = cudaMemcpy(device_in, host_in, device_size, cudaMemcpyHostToDevice);
    if (status == cudaSuccess)
        status = cudaMemcpy(device_mask, host_mask, mask_size, cudaMemcpyHostToDevice);

    if (status == cudaSuccess) {
        // Launch just enough blocks to cover WIDTH elements (ceiling division).
        // Surplus threads would index past the end of the device buffers.
        dim3 dimBlock(WIDTH);
        dim3 dimGrid((WIDTH + dimBlock.x - 1) / dimBlock.x);

        conv_1D_kernel<<<dimGrid, dimBlock>>> (device_in, device_mask, device_out
                                                 ,MAX_WIDTH,WIDTH);
        // A kernel launch returns nothing; configuration errors surface here.
        status = cudaGetLastError();
    }

    // The blocking copy also synchronizes with the kernel and reports any
    // error raised while it was executing.
    if (status == cudaSuccess)
        status = cudaMemcpy(host_out, device_out, device_size, cudaMemcpyDeviceToHost);

    if (status == cudaSuccess) {
        printf("\noutput: ");
        print_array(host_out, WIDTH);
    } else {
        std::cerr << "CUDA error: " << cudaGetErrorString(status) << std::endl;
    }

    // cudaFree accepts a null pointer, so this is safe after a partial setup.
    cudaFree(device_in);
    cudaFree(device_out);
    cudaFree(device_mask);

    return status == cudaSuccess ? 0 : 1;
} 

// Basic 1D convolution kernel: one thread computes one output element.
//
//   out[i] = sum over j in [0, mask_width) of input[i - mask_width/2 + j] * mask[j]
//
// Input elements that fall outside [0, width) are treated as 0.
//
// Parameters:
//   input      - device pointer to `width` input elements
//   mask       - device pointer to `mask_width` weights
//   out        - device pointer to `width` output elements
//   mask_width - number of mask elements
//   width      - number of input/output elements
//
// Launch layout: 1D grid of 1D blocks. Any configuration with at least
// `width` threads in total is valid; surplus threads exit without touching
// memory.
__global__ void conv_1D_kernel(float *input,float *mask, float *out, int mask_width, int width)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the array own no output element; writing
    // out[i] from them would be an out-of-bounds store.
    if (i >= width)
        return;

    float temp = 0.0f;
    int start = i - (mask_width/2);

    for(int j = 0; j < mask_width; j++) {
        int k = start + j;
        // Skip "ghost" elements beyond either end of the input.
        if(k >= 0 && k < width)
            temp += input[k] * mask[j];
    }

    out[i] = temp;
}

// Writes the first `width` values of `p` to stdout on a single line.
// Each value is converted to int and followed by a tab; the line is
// terminated with a newline.
void print_array(float* p, int width){
    int idx = 0;
    while (idx < width) {
        const int as_int = (int)p[idx];
        printf("%d\t", as_int);
        ++idx;
    }

    printf("\n");
}

input: 1	2	3	4	5	6	7	

mask: 3	4	5	4	3	

output: 22	38	57	76	95	90	74	



Tiled 1D convolution kernel which uses constant-memory caching for the mask and shared memory for the input tile.

In [None]:
%%cu 
#include <iostream> 
#include <math.h>
#include <chrono>
#include <sys/time.h>
#include <time.h>

// Capacity of the constant-memory mask buffer; main() also passes it as mask_width.
#define MAX_MASK_WIDTH 10 
// Number of input/output elements processed by main().
#define WIDTH  16
// Together with MAX_MASK_WIDTH, sizes the kernel's __shared__ tile
// (TILE_SIZE + MAX_MASK_WIDTH - 1 floats).
#define TILE_SIZE 8

// Convolution mask held in __constant__ memory; main() fills it with cudaMemcpyToSymbol.
__constant__ float MASK[MAX_MASK_WIDTH];

// Tiled 1D convolution kernel; reads its weights from MASK.
__global__ void tiled_conv_1D_kernel(float *input, float* out,int mask_width,int width);
// Prints the first `width` elements of `p`, each cast to int, followed by a newline.
void print_array(float* p, int width);
  
// Host driver for the tiled 1D convolution.
//
// Uploads a WIDTH-element input to the device and the mask to the
// __constant__ array MASK, runs tiled_conv_1D_kernel with one thread per
// output element, copies the result back and prints input, mask and output.
//
// Returns 0 on success and 1 if any CUDA call fails.
int main()
{
  float host_in[WIDTH] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
  float host_out[WIDTH];
  float host_mask[MAX_MASK_WIDTH] = {3,4,5,4,3,2,4,5,6,7};
  float *device_in = NULL, *device_out = NULL;

  const size_t array_size = WIDTH * sizeof(float);
  const size_t mask_size = MAX_MASK_WIDTH * sizeof(float);

  // Each step only runs if every previous CUDA call succeeded, so the first
  // failure is the one that gets reported.
  cudaError_t status = cudaMalloc((void **) &device_in, array_size);
  if (status == cudaSuccess)
    status = cudaMalloc((void **) &device_out, array_size);

  if (status == cudaSuccess)
    status = cudaMemcpy(device_in, host_in, array_size, cudaMemcpyHostToDevice);
  // The mask lives in the __constant__ symbol MASK, so no separate device
  // buffer is allocated for it.
  if (status == cudaSuccess)
    status = cudaMemcpyToSymbol(MASK, host_mask, mask_size);

  if (status == cudaSuccess) {
    // 4 threads per block (the kernel's shared tile is sized for at most
    // TILE_SIZE threads per block); the grid is the ceiling of
    // WIDTH / block size so every element gets exactly one thread.
    dim3 dim_block(4);
    dim3 dim_grid((WIDTH + dim_block.x - 1) / dim_block.x);

    tiled_conv_1D_kernel<<<dim_grid,dim_block>>>(device_in,device_out,MAX_MASK_WIDTH,WIDTH);
    // A kernel launch returns nothing; configuration errors surface here.
    status = cudaGetLastError();
  }

  // The blocking copy also synchronizes with the kernel and reports any
  // error raised while it was executing.
  if (status == cudaSuccess)
    status = cudaMemcpy(host_out, device_out, array_size, cudaMemcpyDeviceToHost);

  printf("\ninput:");
  print_array(host_in,WIDTH);

  printf("\nmask:");
  print_array(host_mask,MAX_MASK_WIDTH);

  if (status == cudaSuccess) {
    printf("\noutput");
    print_array(host_out,WIDTH);
  } else {
    std::cerr << "CUDA error: " << cudaGetErrorString(status) << std::endl;
  }

  // cudaFree accepts a null pointer, so this is safe after a partial setup.
  cudaFree(device_in);
  cudaFree(device_out);

  return status == cudaSuccess ? 0 : 1;
}

// Prints `width` elements of `p` as tab-separated integers (each float is
// cast to int), then ends the line.
void print_array(float* p, int width){
    for (int remaining = width; remaining > 0; --remaining) {
        printf("%d\t", (int)*p);
        ++p;
    }

    printf("\n");
}

// Tiled 1D convolution kernel.
//
//   out[i] = sum over j in [0, mask_width) of input[i - mask_width/2 + j] * MASK[j]
//
// Input elements outside [0, width) are treated as 0.
//
// Each block first stages the input elements it needs into shared memory:
// its own blockDim.x elements plus mask_width/2 halo elements on the left
// and mask_width - 1 - mask_width/2 on the right. Every thread then computes
// one output element from the shared tile and the constant-memory MASK.
//
// Parameters:
//   input      - device pointer to `width` input elements
//   out        - device pointer to `width` output elements
//   mask_width - number of valid weights in MASK; must not exceed MAX_MASK_WIDTH
//   width      - number of input/output elements
//
// Launch layout: 1D grid of 1D blocks with at least `width` threads in total.
// Shared memory: (TILE_SIZE + MAX_MASK_WIDTH - 1) floats, statically allocated.
// If blockDim.x + mask_width - 1 exceeds that capacity, the block skips the
// tile and reads directly from global memory, so results stay correct.
__global__ void tiled_conv_1D_kernel(float *input,float* out,int mask_width,int width)
{
  __shared__ float  tile[TILE_SIZE + MAX_MASK_WIDTH - 1];

  // Work in signed ints: threadIdx/blockIdx/blockDim are unsigned, and the
  // left-halo arithmetic below must be able to go negative.
  const int block_threads = (int)blockDim.x;
  const int local = (int)threadIdx.x;
  const int block_start = (int)blockIdx.x * block_threads;
  const int i = block_start + local;

  const int n = mask_width/2;                            // left halo length
  const int tile_len = block_threads + mask_width - 1;   // centre + both halos
  const int tile_start = block_start - n;                // global index of tile[0]

  // Uniform across the block: depends only on blockDim.x and mask_width.
  const bool use_tile = tile_len <= TILE_SIZE + MAX_MASK_WIDTH - 1;

  if (use_tile) {
    // Cooperative strided load: also correct when the halos are wider than
    // the block, because each thread may load more than one element.
    for (int t = local; t < tile_len; t += block_threads) {
      const int g = tile_start + t;
      tile[t] = (g >= 0 && g < width) ? input[g] : 0.0f;
    }
  }
  // Reached by every thread of the block (no thread has returned yet).
  __syncthreads();

  // Threads past the end of the array helped with the load but own no output.
  if (i >= width)
    return;

  float temp = 0.0f;
  for(int j = 0; j < mask_width; j++){
    float value;
    if (use_tile) {
      // tile[local + j] holds input[i - n + j] (or 0 for ghost elements).
      value = tile[local + j];
    } else {
      const int g = i - n + j;
      value = (g >= 0 && g < width) ? input[g] : 0.0f;
    }
    temp += value * MASK[j];
  }

  out[i] =  temp;
}



input:0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	

mask:3	4	5	4	3	2	4	5	6	7	

output28	59	92	126	78	137	190	232	170	253	310	348	150	154	122	74	



A simpler version of a tiled cached 1D convolution

In [None]:
%%cu
#include <iostream>

// Capacity of the constant-memory mask buffer; main() also passes it as mask_width.
#define MAX_MASK_WIDTH 10 
// Number of input/output elements processed by main().
#define WIDTH  16
// Number of floats in the kernel's __shared__ tile.
#define TILE_SIZE 8

// Convolution mask held in __constant__ memory; main() fills it with cudaMemcpyToSymbol.
__constant__ float MASK[MAX_MASK_WIDTH];
// 1D convolution kernel that reads its weights from MASK.
__global__ void tiled_cache_conv1D_kernel(float* input,float* output, int mask_width, int width);

// Prints the first `width` elements of `p`, each cast to int, followed by a newline.
void print_array(float* p, int width);
  
// Host driver for the simplified tiled 1D convolution.
//
// Uploads a WIDTH-element input to the device and the mask to the
// __constant__ array MASK, runs tiled_cache_conv1D_kernel with one thread
// per output element, copies the result back and prints input, mask and
// output.
//
// Returns 0 on success and 1 if any CUDA call fails.
int main()
{
  float host_in[WIDTH] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
  float host_out[WIDTH];
  float host_mask[MAX_MASK_WIDTH] = {3,4,5,4,3,2,4,5,6,7};
  float *device_in = NULL, *device_out = NULL;

  const size_t array_size = WIDTH * sizeof(float);
  const size_t mask_size = MAX_MASK_WIDTH * sizeof(float);

  // Each step only runs if every previous CUDA call succeeded, so the first
  // failure is the one that gets reported.
  cudaError_t status = cudaMalloc((void **) &device_in, array_size);
  if (status == cudaSuccess)
    status = cudaMalloc((void **) &device_out, array_size);

  if (status == cudaSuccess)
    status = cudaMemcpy(device_in, host_in, array_size, cudaMemcpyHostToDevice);
  // The mask lives in the __constant__ symbol MASK, so no separate device
  // buffer is allocated for it.
  if (status == cudaSuccess)
    status = cudaMemcpyToSymbol(MASK, host_mask, mask_size);

  if (status == cudaSuccess) {
    // 4 threads per block (the kernel's shared tile holds TILE_SIZE
    // elements); the grid is the ceiling of WIDTH / block size so every
    // element gets exactly one thread.
    dim3 dim_block(4);
    dim3 dim_grid((WIDTH + dim_block.x - 1) / dim_block.x);

    tiled_cache_conv1D_kernel<<<dim_grid,dim_block>>>(device_in,device_out,MAX_MASK_WIDTH,WIDTH);
    // A kernel launch returns nothing; configuration errors surface here.
    status = cudaGetLastError();
  }

  // The blocking copy also synchronizes with the kernel and reports any
  // error raised while it was executing.
  if (status == cudaSuccess)
    status = cudaMemcpy(host_out, device_out, array_size, cudaMemcpyDeviceToHost);

  printf("\ninput:");
  print_array(host_in,WIDTH);

  printf("\nmask:");
  print_array(host_mask,MAX_MASK_WIDTH);

  if (status == cudaSuccess) {
    printf("\noutput");
    print_array(host_out,WIDTH);
  } else {
    std::cerr << "CUDA error: " << cudaGetErrorString(status) << std::endl;
  }

  // cudaFree accepts a null pointer, so this is safe after a partial setup.
  cudaFree(device_in);
  cudaFree(device_out);

  return status == cudaSuccess ? 0 : 1;
}

// Dumps the leading `width` entries of `p` to stdout: every entry is
// truncated to int and followed by a tab, and a newline closes the row.
void print_array(float* p, int width){
    int pos = 0;
    for (;;) {
        if (!(pos < width))
            break;
        printf("%d\t", (int)p[pos]);
        pos += 1;
    }

    printf("\n");
}
// Simplified tiled 1D convolution kernel.
//
//   output[i] = sum over j in [0, mask_width) of input[i - mask_width/2 + j] * MASK[j]
//
// Input elements outside [0, width) are skipped (treated as 0).
//
// Each block stages only its own elements in shared memory. Halo elements
// that belong to neighbouring blocks are read straight from global memory.
//
// Parameters:
//   input      - device pointer to `width` input elements
//   output     - device pointer to `width` output elements
//   mask_width - number of valid weights in MASK; must not exceed MAX_MASK_WIDTH
//   width      - number of input/output elements
//
// Launch layout: 1D grid of 1D blocks with at least `width` threads in total.
// Shared memory: TILE_SIZE floats, statically allocated. If blockDim.x is
// larger than TILE_SIZE, only the first TILE_SIZE elements of the block are
// staged and the rest are read from global memory.
__global__ void tiled_cache_conv1D_kernel(float* input,float* output, int mask_width, int width)
{
  __shared__ float tile[TILE_SIZE];

  const int block_threads = (int)blockDim.x;
  const int local = (int)threadIdx.x;
  const int cur_start = (int)blockIdx.x * block_threads;   // first element owned by this block
  const int i = cur_start + local;

  // Number of this block's elements that fit in the shared tile.
  const int cached = block_threads < TILE_SIZE ? block_threads : TILE_SIZE;

  // Stage this thread's own element; the tile must be written before any
  // thread reads it in the loop below.
  if (local < cached && i < width)
    tile[local] = input[i];
  // Reached by every thread of the block (no thread has returned yet).
  __syncthreads();

  // Threads past the end of the array own no output element.
  if (i >= width)
    return;

  const int start = i - (mask_width/2);
  float temp = 0.0f;

  for (int j = 0; j < mask_width; j++){
    const int index = start + j;
    // Skip ghost elements beyond either end of the input.
    if (index < 0 || index >= width)
      continue;

    // Position of `index` relative to this block's first element.
    const int offset = index - cur_start;
    if (offset >= 0 && offset < cached){
      temp += tile[offset] * MASK[j];      // element staged in shared memory
    }else{
      temp += input[index] * MASK[j];      // halo element owned by another block
    }
  }

  output[i] = temp;
}


input:0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	

mask:3	4	5	4	3	2	4	5	6	7	

output28	59	92	126	78	137	190	232	179	253	310	348	171	154	122	74	

