In [52]:
# Install CUDA C++ plugin for Colab:
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

The nvcc4jupyter extension is already loaded. To reload it, use:
  %reload_ext nvcc4jupyter


In [53]:
# Detect the selected GPU and its NVIDIA architecture:
import subprocess

# Ask nvidia-smi for "name, compute_cap" as header-less CSV (one line per GPU).
exit_code, gpu_info = subprocess.getstatusoutput(
    "nvidia-smi --query-gpu=name,compute_cap --format=csv,noheader,nounits")
gpu_lines = [line for line in gpu_info.splitlines() if line.strip()]

# A missing binary, a failing driver, or empty output all mean there is no usable GPU.
if exit_code != 0 or "not found" in gpu_info.lower() or not gpu_lines:
    raise RuntimeError("Error: No GPU found. Please select a GPU runtime environment.")

# Use the first GPU reported so multi-GPU hosts do not break the two-field unpack below.
first_gpu = gpu_lines[0]
if ',' not in first_gpu:
    raise RuntimeError(f"Error: Unexpected nvidia-smi output: {first_gpu!r}")

# Split on the LAST comma: the compute capability is the trailing field.
gpu_name, compute_cap = map(str.strip, first_gpu.rsplit(',', 1))
# "7.5" -> "sm_75"; this value is passed to nvcc by the %%cuda cell below.
gpu_arch = f"sm_{compute_cap.replace('.', '')}"

print(f"{'GPU Name':<15}: {gpu_name}")
print(f"{'Architecture':<15}: {gpu_arch}")

GPU Name       : Tesla T4
Architecture   : sm_75


In [54]:
# @title Default title text
%%cuda -c "--gpu-architecture $gpu_arch"

#include <time.h>
#include <stdio.h>
#include <cuda_runtime.h>

/*
 * One compare-exchange pass of an ascending bitonic sort.
 *
 * Indexing is 1D: each thread derives one element index from
 * blockIdx.x / blockDim.x / threadIdx.x. Threads are paired by XOR-ing the
 * index with 2^j, and only the lower-indexed thread of a pair whose both
 * members lie inside [0, n) touches memory, so pairs within a launch are
 * disjoint.
 *
 *   data : device array sorted in place
 *   n    : number of elements in data
 *   i    : 0-based merge stage; bit (i + 1) of the index selects the direction
 *   j    : 0-based sub-step; compare distance is 2^j
 */
__global__ void bitonic_stage(int *data, int n, int i, int j){

    const int tid  = blockIdx.x * blockDim.x + threadIdx.x;
    const int peer = tid ^ (1 << j);   // element this thread is paired with

    // Work is done only by the lower index of a pair that is fully in range.
    const bool owns_pair = (tid < n) && (tid < peer) && (peer < n);
    if (!owns_pair) {
        return;
    }

    // Bit (i + 1) clear -> this sub-sequence is ordered ascending, else descending.
    const bool ascending = (tid & (1 << (i + 1))) == 0;

    const int low_val  = data[tid];
    const int high_val = data[peer];

    // Ascending regions swap when out of order; descending regions swap otherwise.
    const bool out_of_order = low_val > high_val;
    const bool must_swap    = ascending ? out_of_order : !out_of_order;

    if (must_swap) {
        data[tid]  = high_val;
        data[peer] = low_val;
    }
}

/*
 * One compare-exchange pass of a descending bitonic sort.
 *
 * Same pairing scheme as the ascending pass: 1D indexing, partner found by
 * XOR with 2^j, and only the lower-indexed thread of an in-range pair acts.
 * The comparison is mirrored so that the final sequence ends up descending.
 *
 *   data : device array sorted in place
 *   n    : number of elements in data
 *   i    : 0-based merge stage; bit (i + 1) of the index selects the direction
 *   j    : 0-based sub-step; compare distance is 2^j
 */
__global__ void bitonic_stage_desc(int *data, int n, int i, int j){

    const int tid  = blockIdx.x * blockDim.x + threadIdx.x;
    const int peer = tid ^ (1 << j);   // element this thread is paired with

    // Work is done only by the lower index of a pair that is fully in range.
    const bool owns_pair = (tid < n) && (tid < peer) && (peer < n);
    if (!owns_pair) {
        return;
    }

    // Bit (i + 1) clear -> larger value goes first in this sub-sequence.
    const bool larger_first = (tid & (1 << (i + 1))) == 0;

    const int low_val  = data[tid];
    const int high_val = data[peer];

    // Mirror image of the ascending rule: the test is "less than" here.
    const bool lower_is_smaller = low_val < high_val;
    const bool must_swap        = larger_first ? lower_is_smaller : !lower_is_smaller;

    if (must_swap) {
        data[tid]  = high_val;
        data[peer] = low_val;
    }
}


/* Reports a failed CUDA call on stderr; returns 1 on success, 0 on failure. */
static int bitonic_cuda_ok(cudaError_t status, const char *what){
    if (status == cudaSuccess) {
        return 1;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return 0;
}

/*
 * Sorts h_arr in place on the GPU with a bitonic sorting network.
 *
 *   h_arr : host array of n ints; overwritten with the sorted result on success
 *   n     : element count. The stage count is floor(log2(n)), so the network is
 *           only complete when n is a power of two; for other sizes the result
 *           is not guaranteed to be sorted.
 *   asc   : 0 -> ascending, 1 -> descending. Any other value leaves the array
 *           untouched.
 *
 * If any CUDA call fails, the error is printed to stderr and h_arr is left
 * unmodified.
 */
void bitonic_sort_gpu(int *h_arr, int n, int asc){

    // Nothing to do for empty/single-element input or an unrecognised order.
    if (h_arr == NULL || n <= 1) return;
    if (asc != 0 && asc != 1) return;

    const size_t bytes = (size_t)n * sizeof(int);

    int *d_arr = NULL;
    if (!bitonic_cuda_ok(cudaMalloc(&d_arr, bytes), "cudaMalloc")) {
        return;
    }

    int ok = bitonic_cuda_ok(
        cudaMemcpy(d_arr, h_arr, bytes, cudaMemcpyHostToDevice),
        "host-to-device copy");

    const int threads = 256;
    // Ceil-div written without n + threads - 1 so it cannot overflow int.
    const int blocks  = n / threads + (n % threads != 0);

    // steps = log2(n) for n power of 2
    int steps = 0;
    for (int tmp = n; tmp > 1; tmp >>= 1) {
        steps++;
    }

    // stage: 0-based merge stage; dist_log2: compare distance exponent, stage down to 0.
    // Launches go to the same default stream and therefore run in issue order,
    // so no host-side synchronisation is needed between passes.
    for (int stage = 0; ok && stage < steps; stage++) {
        for (int dist_log2 = stage; ok && dist_log2 >= 0; dist_log2--) {
            if (asc == 0) {
                bitonic_stage<<<blocks, threads>>>(d_arr, n, stage, dist_log2);
            } else {
                bitonic_stage_desc<<<blocks, threads>>>(d_arr, n, stage, dist_log2);
            }
            // Launch-configuration errors are only visible through cudaGetLastError().
            ok = bitonic_cuda_ok(cudaGetLastError(), "kernel launch");
        }
    }

    // A single synchronisation surfaces any asynchronous execution error.
    if (ok) {
        ok = bitonic_cuda_ok(cudaDeviceSynchronize(), "kernel execution");
    }
    if (ok) {
        bitonic_cuda_ok(
            cudaMemcpy(h_arr, d_arr, bytes, cudaMemcpyDeviceToHost),
            "device-to-host copy");
    }

    bitonic_cuda_ok(cudaFree(d_arr), "cudaFree");
}

/*
 * Walks forward from index `from` while consecutive elements keep the requested
 * order (non-decreasing when `rising` is non-zero, non-increasing otherwise)
 * and returns the index at which the walk stopped.
 */
static int monotone_run_end(const int arr[], int n, int from, int rising) {
    int pos = from;
    while (pos < n - 1) {
        const int keeps_order = rising ? (arr[pos] <= arr[pos + 1])
                                       : (arr[pos] >= arr[pos + 1]);
        if (!keeps_order) {
            break;
        }
        pos++;
    }
    return pos;
}

/*
 * Returns 1 when arr[0..n) consists of two non-empty monotone runs of opposite
 * direction that together reach the last element (rise-then-fall or
 * fall-then-rise; equal neighbours are allowed in either run). Returns 0
 * otherwise, which includes purely monotonic arrays and arrays with fewer than
 * two steps.
 */
int is_bitonic(int arr[], int n) {

    // Orientation 1: rising run first; orientation 0: falling run first.
    for (int rising_first = 1; rising_first >= 0; rising_first--) {
        const int turn = monotone_run_end(arr, n, 0, rising_first);
        const int stop = monotone_run_end(arr, n, turn, !rising_first);

        const int first_run_nonempty  = turn > 0;
        const int second_run_nonempty = stop > turn;

        if (first_run_nonempty && second_run_nonempty && stop == n - 1) {
            return 1;
        }
    }
    return 0;
}


/* Prints "Unsorted: "/"Sorted:   "-style lines: the label, each value followed by a space, then a newline. */
static void print_int_line(const char *label, const int *values, int n){
    printf("%s", label);
    for (int idx = 0; idx < n; idx++) {
        printf("%d ", values[idx]);
    }
    printf("\n");
}

/*
 * Demo driver: prints the sample array, then sorts it on the GPU only when its
 * size is a power of two and is_bitonic() accepts it. Always returns 0.
 */
int main(){

    // The sort requires a power-of-two size. This sample has 9 elements, so as
    // written it exercises the "not a power of 2" path; drop one element to
    // see the sort run.
    int arr[] = {10, 30, 50, 120, 210, 190, 140, 44,7};
    int n = sizeof(arr) / sizeof(arr[0]);

    // if you would like to make it descending change 0 --> 1
    int order = 0;

    // n & (n - 1) clears the lowest set bit; the result is 0 only for powers of two.
    const int size_is_pow2 = (n > 0) && ((n & (n - 1)) == 0);

    if (size_is_pow2) {
        printf("The size of the Array size is to the power of 2\n");
    } else {
        printf("Array size is not a power of 2\n");
    }

    print_int_line("Unsorted: ", arr, n);
    printf(" Number of element: %d\n", n);

    if (!size_is_pow2) {
        return 0;
    }

    if (!is_bitonic(arr, n)) {
        // Trailing newline added so the message does not run into later output.
        printf("Array is not a bitonic sequence\n");
        return 0;
    }

    bitonic_sort_gpu(arr, n, order);
    print_int_line("Sorted:   ", arr, n);

    return 0;
}

Array size is not a power of 2
Unsorted: 10 30 50 120 210 190 140 44 7 
 Number of element: 9

