# Parallel Processing - Example

Shift the contents of an array to the left by one element using a CUDA kernel.

In [8]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-7rrsso69
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-7rrsso69
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... done


In [9]:
%load_ext nvcc_plugin

The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [16]:
%%cu

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <cuda.h>

// Shifts the first *n elements of `a` one position to the left, in place:
// a[i] receives the old a[i+1], and the last element a[*n - 1] is set to 0.
//
// Parameters:
//   a - device pointer to an array of at least *n ints
//   n - device pointer to the element count
//
// Launch requirements: 1D launch with a SINGLE block of at least *n threads.
// The read/write ordering relies on __syncthreads(), which only synchronizes
// threads of the same block; with several blocks a thread could overwrite an
// element before the neighbouring block has read it.
__global__ void kernel(int* a, int *n){

    // global index of this thread
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    const int size = *n;

    // Phase 1: every thread loads its right-hand neighbour. The last valid
    // thread has no neighbour inside the array, so it keeps 0 instead of
    // reading past the end of the allocation.
    int temp = 0;
    if(i < size && i + 1 < size){
        temp = a[i+1];
    }

    // Barrier between the loads and the stores. It is deliberately outside
    // any `if` so that every thread of the block reaches it, including
    // threads with i >= size.
    __syncthreads();

    // Phase 2: all loads are complete, so overwriting a[i] is now safe.
    if(i < size){
        a[i] = temp;
    }
}

// Aborts the program with a readable message when a CUDA runtime call fails.
// `what` names the operation being checked and is only used in the message.
static void checkCuda(cudaError_t status, const char *what){
    if(status != cudaSuccess){
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Builds the array 0..n-1 on the host, shifts it left by one element on the
// device with `kernel`, and prints the array before and after the shift.
// Returns 0 on success; exits with EXIT_FAILURE if an allocation or any CUDA
// call fails.
int main(){
    int *h_arr = NULL; // host array
    int *d_arr = NULL; // device array
    int n = 5;         // size of array
    int *d_n = NULL;   // size of array (send to device)
    const size_t bytes = n * sizeof(int);

    // allocate host memory for the array
    h_arr = (int*)malloc(bytes);
    if(h_arr == NULL){
        fprintf(stderr, "failed to allocate host array\n");
        return EXIT_FAILURE;
    }

    // initialize the array
    for(int i = 0; i < n; i++){
        h_arr[i] = i;
    }

    // print the elements of the array
    printf("Array Elements: \n");
    for(int i = 0; i < n; i++){
        printf("%d ", h_arr[i]);
    }
    printf("\n");

    // One block with one thread per element. Because the launch uses a single
    // block, n must not exceed the device's per-block thread limit; an invalid
    // configuration is reported by the cudaGetLastError() check below.
    dim3 grid_size(1,1);
    dim3 block_size(n);

    // allocate device memory for the array and its size variable
    checkCuda(cudaMalloc((void**)&d_arr, bytes), "cudaMalloc(d_arr)");
    checkCuda(cudaMalloc((void**)&d_n, sizeof(int)), "cudaMalloc(d_n)");

    // copy the host variable onto the device variables
    checkCuda(cudaMemcpy(d_arr, h_arr, bytes, cudaMemcpyHostToDevice), "copy array to device");
    checkCuda(cudaMemcpy(d_n, &n, sizeof(int), cudaMemcpyHostToDevice), "copy size to device");

    // run the kernel function
    kernel<<<grid_size, block_size>>>(d_arr, d_n);
    // a launch does not return a status: fetch launch errors explicitly, then
    // wait for the kernel so that execution errors are reported here as well
    checkCuda(cudaGetLastError(), "kernel launch");
    checkCuda(cudaDeviceSynchronize(), "kernel execution");

    // save the new device array by replacing the host array
    checkCuda(cudaMemcpy(h_arr, d_arr, bytes, cudaMemcpyDeviceToHost), "copy array to host");

    // print the contents of the new array
    printf("Array Elements After Kernel Function: \n");
    for(int i = 0; i < n; i++){
        printf("%d ", h_arr[i]);
    }
    printf("\n");

    // free host memory
    free(h_arr);

    // free device memory
    checkCuda(cudaFree(d_arr), "cudaFree(d_arr)");
    checkCuda(cudaFree(d_n), "cudaFree(d_n)");

    return 0;
}

Array Elements: 
0 1 2 3 4 
Array Elements After Kernel Function: 
1 2 3 4 0 

