In [None]:
# Remove any CUDA / NVIDIA packages already present on the machine.
# -y answers apt's confirmation prompts automatically so the cell can run unattended.
# The package patterns are quoted so they reach apt-get as written instead of being
# expanded by the shell against files in the current directory.
!apt-get -y --purge remove cuda 'nvidia*' 'libnvidia-*'
# -r: do not invoke dpkg at all when no cuda-* package is listed.
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -r -n1 dpkg --purge
!apt-get -y remove 'cuda-*'
!apt -y autoremove
!apt-get update

In [None]:
# Download the CUDA 9.2 local repository package, register it and its signing key
# with apt, then install the toolkit.
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
!apt-get update
# -y: accept the install prompt automatically so the cell can run unattended.
!apt-get -y install cuda-9.2

In [None]:
# Print the version of the nvcc compiler that is found on PATH.
!nvcc -V

In [None]:
# Install the nvcc4jupyter plugin straight from its GitHub repository.
# The repository is fetched over HTTPS because GitHub no longer serves the
# unauthenticated git:// protocol.
# NOTE(review): newer revisions of the plugin may expose a different extension
# name than nvcc_plugin -- confirm against the installed version.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

In [None]:
# Load the nvcc_plugin IPython extension (presumably what provides the %%cu
# cell magic used below -- confirm).
%load_ext nvcc_plugin

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

/* Number of elements in the array that main sorts. */
#define ARR_LEN 12

/*
 * Sort n integers in parallel by computing each element's rank.
 *
 * Thread `id` counts how many elements of arr have to come before arr[id]
 * and stores arr[id] at that position in result, so result ends up in
 * ascending order.
 *
 *   arr    - input values, n ints readable by the device
 *   result - output buffer, n ints; must not overlap arr because other
 *            threads are still reading arr while results are written
 *   n      - number of elements
 *
 * Launch with at least n threads in total (1D indexing); surplus threads
 * return without touching memory.
 */

__global__ void selectionSort(int *arr, int *result, int n) {
  int id = blockIdx.x * blockDim.x + threadIdx.x;

  // Valid indices are 0..n-1, so the thread with id == n must bail out too;
  // otherwise it would read arr[n] and could write result[n].
  if(id >= n)
    return;
  
  int pos = 0;
  for(int i=0; i<n; i++)
    // Equal values are ordered by index, which gives duplicates distinct
    // positions instead of making them collide on the same slot.
    if(arr[id] > arr[i] || (arr[id] == arr[i] && id > i))
      pos++;
  
  result[pos] = arr[id];
}

/*
 * Sort n ints from host array arr into host array result using the GPU.
 *
 * Allocates the device buffers, copies the input over, runs selectionSort
 * with one thread per element in a single block, and copies the output back.
 * Returns the first CUDA error encountered, or cudaSuccess. The device
 * buffers are released on every path.
 */
static cudaError_t sortOnDevice(const int *arr, int *result, int n) {
  // Nothing to sort; also avoids launching a kernel with zero threads.
  if(n <= 0)
    return cudaSuccess;

  // device copies of array arr and result
  int *d_arr = NULL, *d_result = NULL;
  size_t size = (size_t)n * sizeof(int);

  // Allocate space for device copy of arr and result
  cudaError_t err = cudaMalloc((void **)&d_arr, size);
  if(err == cudaSuccess)
    err = cudaMalloc((void **)&d_result, size);

  // Copy inputs to device
  if(err == cudaSuccess)
    err = cudaMemcpy(d_arr, arr, size, cudaMemcpyHostToDevice);

  // Launch selection sort kernel on GPU
  if(err == cudaSuccess) {
    selectionSort<<<1, n>>>(d_arr, d_result, n);
    // A launch returns no status itself; a rejected launch is reported here.
    err = cudaGetLastError();
  }

  // Copy result to result array. This copy blocks until the kernel has
  // finished, so errors raised while the kernel ran are returned by it.
  if(err == cudaSuccess)
    err = cudaMemcpy(result, d_result, size, cudaMemcpyDeviceToHost);

  // Cleanup. cudaFree(NULL) does nothing, so this is safe after a failed
  // allocation as well.
  cudaFree(d_arr);
  cudaFree(d_result);

  return err;
}

/*
 * Print a fixed ARR_LEN-element array, sort it on the GPU and print the
 * sorted result. Exits with a non-zero status, without printing a result,
 * if any CUDA call fails.
 */
int main() {
  // host copies of variables arr, result
  int arr[ARR_LEN] = {1, 7, 8, 2, 3, 6, 9, 5, 4 , 12, 11, 10};
  int result[ARR_LEN];

  printf("Original Array: ");
  for(int i = 0; i < ARR_LEN; i++)
    printf("%d ", arr[i]);
  printf("\n");

  cudaError_t err = sortOnDevice(arr, result, ARR_LEN);

  if(err != cudaSuccess) {
    // result was never filled in, so report the failure instead of printing it.
    printf("CUDA error while sorting on the device: %s\n", cudaGetErrorString(err));
    return EXIT_FAILURE;
  }

  printf("Sorted Array:   ");
  for(int i = 0; i < ARR_LEN; i++)
    printf("%d ", result[i]);
  printf("\n");

  return 0;
}


Original Array: 1 7 8 2 3 6 9 5 4 12 11 10 
Sorted Array:   1 2 3 4 5 6 7 8 9 10 11 12 

