In [1]:
# Install CUDA C++ plugin for Colab:
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmp8u45sm0f".


In [2]:
# Detect the selected GPU and its NVIDIA architecture:
import subprocess

# nvidia-smi prints one "name, compute_cap" CSV row per visible GPU.
status, gpu_info = subprocess.getstatusoutput("nvidia-smi --query-gpu=name,compute_cap --format=csv,noheader,nounits")
gpu_rows = [row.strip() for row in gpu_info.splitlines() if row.strip()]

# Treat any failure (missing binary, driver error, unsupported query field, empty output)
# as "no usable GPU" instead of only matching the shell's "not found" message.
if status != 0 or not gpu_rows or ',' not in gpu_rows[0]:
    raise RuntimeError(
        "Error: No GPU found. Please select a GPU runtime environment. "
        f"(nvidia-smi output: {gpu_info.strip() or 'none'})"
    )

# Use the first GPU; split on the last comma so the unpack always yields two fields.
gpu_name, compute_cap = map(str.strip, gpu_rows[0].rsplit(',', 1))
gpu_arch = f"sm_{compute_cap.replace('.', '')}"  # e.g. "7.5" -> "sm_75"

print(f"{'GPU Name':<15}: {gpu_name}")
print(f"{'Architecture':<15}: {gpu_arch}")

GPU Name       : Tesla T4
Architecture   : sm_75


In [3]:
%%cuda -c "--gpu-architecture $gpu_arch"
#include <stdio.h>

#include <stdio.h>
#include <cuda_runtime.h>


// One compare-exchange pass of a bitonic sorting network.
//
//   data       : device array of n ints, modified in place
//   n          : number of elements in data
//   stage_0idx : 0-based stage; bit (stage_0idx + 1) of an index selects the sort direction
//   step_0idx  : 0-based step; paired elements are 2^step_0idx apart
//
// Each thread maps to one array index; only the lower-indexed thread of a pair
// performs the comparison and the possible swap.
__global__ void bitonic_stage(int *data, int n, int stage_0idx, int step_0idx){

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // The partner index differs from idx in exactly one bit (bit step_0idx).
    const int stride = 1 << step_0idx;
    const int mate   = idx ^ stride;

    // Work is done only when idx is in range, is the lower index of the pair,
    // and the partner is in range as well.
    const bool owns_pair = (idx < n) && (idx < mate) && (mate < n);
    if (!owns_pair) return;

    // Bit (stage_0idx + 1) of idx clear -> this pair is ordered ascending, otherwise descending.
    const int  direction_bit  = 1 << (stage_0idx + 1);
    const bool want_ascending = (idx & direction_bit) == 0;

    const int lower_val = data[idx];
    const int upper_val = data[mate];
    const bool lower_is_greater = lower_val > upper_val;

    // Ascending: swap when lower_val > upper_val.
    // Descending: swap when lower_val <= upper_val (equal values are exchanged too, which is harmless).
    if (want_ascending == lower_is_greater) {
        data[idx]  = upper_val;
        data[mate] = lower_val;
    }
}


// Reports a failed CUDA runtime call on stderr; returns 1 on success, 0 on failure.
static int cuda_call_ok(cudaError_t status, const char *what){
    if (status == cudaSuccess) return 1;
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return 0;
}

// Sorts h_arr (n ints, host memory) into ascending order on the GPU using a
// bitonic sorting network built from bitonic_stage launches.
//
//   h_arr : host array, sorted in place on success
//   n     : element count; must be a power of 2 (the network pairs indices by XOR)
//
// Arrays with n <= 1 are returned as-is. If n is not a power of 2, or any CUDA
// call fails, a message is written to stderr and h_arr is left unchanged.
void bitonic_sort_gpu(int *h_arr, int n){

    if (h_arr == NULL || n <= 1) return; // nothing to sort

    if ((n & (n - 1)) != 0) {
        fprintf(stderr, "bitonic_sort_gpu: n (%d) must be a power of 2; array left unchanged\n", n);
        return;
    }

    const size_t bytes = (size_t)n * sizeof(int);

    int *d_arr = NULL;
    if (!cuda_call_ok(cudaMalloc(&d_arr, bytes), "cudaMalloc")) return;

    int ok = cuda_call_ok(cudaMemcpy(d_arr, h_arr, bytes, cudaMemcpyHostToDevice),
                          "host-to-device copy");

    int threads = 256;
    int blocks  = (n + threads - 1) / threads;

    // steps = log2(n) for n power of 2
    int steps = 0;
    for (int tmp = n; tmp > 1; tmp >>= 1) {
        steps++;
    }

    for (int i = 1; ok && i <= steps; i++) { // i: 1-indexed stage (1 to steps)
        for (int j = i; ok && j >= 1; j--) { // j: 1-indexed step within the stage (i down to 1)
            // Convert to 0-indexed parameters for the kernel call.
            // Launches on the same (default) stream run in issue order, so no
            // host-side synchronization is needed between passes.
            bitonic_stage<<<blocks, threads>>>(d_arr, n, i - 1, j - 1);
            ok = cuda_call_ok(cudaGetLastError(), "bitonic_stage launch");
        }
    }

    // A single synchronization surfaces any error raised while the kernels executed.
    if (ok) ok = cuda_call_ok(cudaDeviceSynchronize(), "bitonic_stage execution");

    // Copy back only after a fully successful sort so h_arr never holds partial results.
    if (ok) cuda_call_ok(cudaMemcpy(h_arr, d_arr, bytes, cudaMemcpyDeviceToHost),
                         "device-to-host copy");

    cudaFree(d_arr); // always released, including on the error paths above
}


// Returns 1 when arr[0..n-1] is a bitonic sequence, 0 otherwise.
//
// A sequence is accepted when it is either
//   - non-decreasing and then non-increasing, or
//   - non-increasing and then non-decreasing.
// Either phase may be empty and neighbouring elements may be equal, so sorted
// arrays, constant arrays and arrays of length 0, 1 or 2 are all accepted.
// A negative n is treated as invalid and yields 0.
int is_bitonic(int arr[], int n) {
    if (n < 0) return 0;
    if (n <= 2) return 1; // too short to change direction more than once

    const int last = n - 1;
    int i = 0;

    // Shape 1: rise (or stay level) as far as possible, then fall (or stay level).
    while (i < last && arr[i] <= arr[i + 1]) i++;
    while (i < last && arr[i] >= arr[i + 1]) i++;
    if (i == last) {
        return 1; // the whole array was consumed by the two phases
    }

    // Shape 2: fall (or stay level) as far as possible, then rise (or stay level).
    i = 0;
    while (i < last && arr[i] >= arr[i + 1]) i++;
    while (i < last && arr[i] <= arr[i + 1]) i++;

    return (i == last) ? 1 : 0;
}


// Prints a label followed by every element of values, each followed by a space, then a newline.
static void print_labeled(const char *label, const int *values, int count){
    printf("%s", label);
    for (int pos = 0; pos < count; pos++) {
        printf("%d ", values[pos]);
    }
    printf("\n");
}

// Demo driver: validates the sample array, sorts it on the GPU and prints it
// before and after. Always returns 0.
int main(){

    // Array size MUST be a power of 2
    int arr[] = {10, 30, 50, 60, 70, 90, 120, 210, 190, 140, 75, 65, 55, 45, 25, 5};
    int n = sizeof(arr) / sizeof(arr[0]);

    // A positive power of 2 has exactly one bit set, so n & (n - 1) is zero.
    const int size_is_pow2 = (n > 0) && ((n & (n - 1)) == 0);
    if (!size_is_pow2) {
        printf("Array size is not a power of 2");
        return 0;
    }

    print_labeled("Unsorted: ", arr, n);

    // The demo only sorts inputs that is_bitonic accepts.
    if (!is_bitonic(arr, n)) {
        printf("Array is not a bitonic sequence");
        return 0;
    }

    bitonic_sort_gpu(arr, n);
    print_labeled("Sorted:   ", arr, n);

    return 0;
}


Unsorted: 10 30 50 60 70 90 120 210 190 140 75 65 55 45 25 5 
Sorted:   5 10 25 30 45 50 55 60 65 70 75 90 120 140 190 210 

