In [None]:
# Remove any CUDA toolkit / NVIDIA packages already present on the machine.
# -y answers apt's confirmation prompt, which a notebook cell cannot be relied
# on to answer interactively. The package patterns are quoted so the shell
# passes them to apt unchanged instead of expanding them against file names in
# the current directory.
!apt-get --purge remove -y cuda 'nvidia*' 'libnvidia-*'
# xargs -r: do not invoke dpkg at all when no cuda- package is listed.
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -r -n1 dpkg --purge
!apt-get remove -y 'cuda-*'
!apt autoremove -y
!apt-get update

In [None]:
# Download NVIDIA's local-repository package for CUDA 9.2 (Ubuntu 16.04 build)
# and save it with a .deb suffix, since the download URL itself has none.
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
# Register the local repository and trust its signing key.
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
!apt-get update
# -y: accept the install without the confirmation prompt, which would otherwise
# stop the cell before anything is installed.
!apt-get install -y cuda-9.2

In [None]:
# Print the version of the nvcc compiler found on PATH, to confirm which CUDA
# toolkit the following cells will actually compile with.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Wed_Jul_22_19:09:09_PDT_2020
Cuda compilation tools, release 11.0, V11.0.221
Build cuda_11.0_bu.TC445_37.28845127_0


In [None]:
# Install the nvcc4jupyter plugin straight from GitHub.
# Uses https:// because GitHub no longer serves the unauthenticated git://
# transport, so the git+git:// form fails to clone.
# NOTE(review): this installs the repository's current default branch; the
# cells below expect the `nvcc_plugin` extension name and the %%cu magic —
# confirm the installed version still provides them, or pin a revision.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-q98vmpu_
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-q98vmpu_
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4307 sha256=dc4a9c98d2fc0ae1075cacb63d25268457f5704ed010a00b8c90fad8939c3e04
  Stored in directory: /tmp/pip-ephem-wheel-cache-iux5mwu3/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin


In [None]:
# Load the nvcc_plugin IPython extension installed above (presumably the
# provider of the %%cu cell magic used by the next cell — confirm).
%load_ext nvcc_plugin

The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <device_functions.h>

#define ARR_LEN 12
#define swap(A,B) {int temp=A; A=B; B=temp;}

/*
 * Sort arr[0 .. n-1] in ascending order, in place, using parallel odd-even
 * transposition sort.
 *
 * arr  device pointer to the n integers to sort
 * n    number of elements in arr
 *
 * Launch with exactly ONE block: pairs are assigned from threadIdx.x alone and
 * the phases are separated with __syncthreads(), which only synchronizes the
 * threads of a single block.
 *
 * Any block size is accepted. Each thread walks the pairs of the current
 * phase with a stride of blockDim.x pairs, so the array is fully sorted even
 * when the block has fewer threads than there are pairs.
 */

__global__ void oddEvenSort(int *arr, int n) {
  // Every outer iteration performs one even and one odd phase, so
  // (n + 1) / 2 iterations run at least the n phases the algorithm needs.
  int len = (n + 1) / 2;

  // Distance, in elements, between two consecutive pairs owned by one thread.
  int stride = 2 * blockDim.x;

  for (int i = 0; i < len; i++) {
    // phase 0: pairs (0,1), (2,3), ...   phase 1: pairs (1,2), (3,4), ...
    for (int phase = 0; phase < 2; phase++) {
      // Pairs within one phase are disjoint, so threads never touch the
      // same element between two barriers.
      for (int left = 2 * threadIdx.x + phase; left < n - 1; left += stride) {
        if (arr[left] > arr[left + 1]) {
          int tmp = arr[left];
          arr[left] = arr[left + 1];
          arr[left + 1] = tmp;
        }
      }

      // Reached unconditionally by every thread of the block: the next phase
      // must observe all swaps made in this one.
      __syncthreads();
    }
  }
}

/*
 * Host driver: prints a fixed ARR_LEN-element array, sorts it on the GPU with
 * oddEvenSort, and prints the sorted result.
 *
 * Returns 0 on success and 1 if any CUDA call fails. On failure an error
 * message is printed and the (then meaningless) result array is not printed.
 */
int main() {
  // Host copies: input array and the buffer receiving the sorted output.
  int arr[ARR_LEN] = {1, 7, 8, 2, 3, 6, 9, 5, 4 , 12, 11, 10};
  int result[ARR_LEN];

  printf("Original Array: ");
  for(int i = 0; i < ARR_LEN; i++)
    printf("%d ", arr[i]);
  printf("\n");

  // Device copy of arr.
  int *d_arr = NULL;
  size_t size = ARR_LEN * sizeof(int);

  // Allocate space for the device copy of arr.
  cudaError_t err = cudaMalloc((void **)&d_arr, size);
  if(err != cudaSuccess) {
    printf("CUDA error allocating device memory: %s\n", cudaGetErrorString(err));
    return 1;
  }

  // Copy inputs to device.
  err = cudaMemcpy(d_arr, arr, size, cudaMemcpyHostToDevice);
  if(err != cudaSuccess) {
    printf("CUDA error copying to Device: %s\n", cudaGetErrorString(err));
    cudaFree(d_arr);
    return 1;
  }

  // Launch the odd-even sort kernel on the GPU: one block, one thread per element.
  oddEvenSort<<<1, ARR_LEN>>>(d_arr, ARR_LEN);

  // A kernel launch returns nothing itself; launch failures (for example an
  // invalid configuration) are only visible through cudaGetLastError().
  err = cudaGetLastError();
  if(err != cudaSuccess) {
    printf("CUDA error launching kernel: %s\n", cudaGetErrorString(err));
    cudaFree(d_arr);
    return 1;
  }

  // Copy the sorted data back. This copy waits for the kernel to finish, so
  // errors raised while the kernel was running are reported here as well.
  err = cudaMemcpy(result, d_arr, size, cudaMemcpyDeviceToHost);
  if(err != cudaSuccess) {
    printf("CUDA error copying to Host: %s\n", cudaGetErrorString(err));
    cudaFree(d_arr);
    return 1;
  }

  printf("Sorted Array:   ");
  for(int i = 0; i < ARR_LEN; i++)
    printf("%d ", result[i]);
  printf("\n");

  // Cleanup
  cudaFree(d_arr);
  return 0;
}

Original Array: 1 7 8 2 3 6 9 5 4 12 11 10 
Sorted Array:   1 2 3 4 5 6 7 8 9 10 11 12 

