In [5]:
# Load the extension that allows us to compile CUDA code in python notebooks
# Documentation is here: https://nvcc4jupyter.readthedocs.io/en/latest/
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc4jupyter

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to c:\users\tinyr\appdata\local\temp\pip-req-build-vx3b75ha
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 28f872a2f99a1b201bcd0db14fdbc5a496b9bfd7
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Source files will be saved in "C:\Users\tinyr\AppData\Local\Temp\tmprkrh1gpo".


  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git 'C:\Users\tinyr\AppData\Local\Temp\pip-req-build-vx3b75ha'

[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
%%cuda_group_save -g "source" -n "data_types.h"
/**
 * A collection of commonly used data types throughout this project.
 */
#pragma once

#include <iostream> // for std::ostream
#include <vector>

namespace csc485b{
namespace a2{

using node_t = int;
using edge_t = int2;

using edge_list_t = std::vector< edge_t >;
using node_list_t = std::vector< node_t >;

} // namespace a2
} // namespace csc485b


In [7]:
%%cuda_group_save -g "source" -n "cuda_common.h"
/**
 * Standard macros that can be useful for error checking.
 * https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__ERROR.html
 */
#pragma once

#include <cuda.h>

#define CUDA_CALL(exp)                                       \
    do {                                                     \
        cudaError res = (exp);                               \
        if(res != cudaSuccess) {                             \
            printf("Error at %s:%d\n %s\n",                  \
                __FILE__,__LINE__, cudaGetErrorString(res)); \
           exit(EXIT_FAILURE);                               \
        }                                                    \
    } while(0)

#define CHECK_ERROR(msg)                                             \
    do {                                                             \
        cudaError_t err = cudaGetLastError();                        \
        if(cudaSuccess != err) {                                     \
            printf("Error (%s) at %s:%d\n %s\n",                     \
                (msg), __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(EXIT_FAILURE);                                      \
        }                                                            \
    } while (0)

In [8]:
%%cuda_group_save -g "source" -n "data_generator.h"
/**
 * Functions for generating random input data with a fixed seed
 */
#pragma once

#include <cassert>  // for assert()
#include <cstddef>  // std::size_t type
#include <random>   // for std::mt19937, std::uniform_int_distribution
#include <vector>

#include "data_types.h"

namespace csc485b {
namespace a2 {

/**
 * Generates and returns a vector of random edges
 * for a graph `G=(V,E)` with `n=|V|=n` and expected `m=|E|`.
 * Referred to as an Erdős-Rényi graph.
 *
 * @see https://networkx.org/documentation/stable/reference/generated/networkx.generators.random_graphs.fast_gnp_random_graph.html#networkx.generators.random_graphs.fast_gnp_random_graph
 */
edge_list_t generate_graph( std::size_t n, std::size_t m )
{
    assert( "At most n(n-1) edges in a simple graph" && m < n * ( n - 1 ) );

    int const probability = ( 100 * m ) / ( n * ( n - 1 ) );

    // for details of random number generation, see:
    // https://en.cppreference.com/w/cpp/numeric/random/uniform_int_distribution
    std::size_t random_seed = 20241008;  // use magic seed
    std::mt19937 rng( random_seed );     // use mersenne twister generator
    std::uniform_int_distribution<> distrib(0, 100);

    edge_list_t random_edges;
    random_edges.reserve( 2 * m );

    for( node_t u = 0; u < n; ++u )
    {
        for( node_t v = u + 1; v < n; ++v )
        {
            auto const dice_roll = distrib( rng );
            if( dice_roll <= probability )
            {
                random_edges.push_back( make_int2( u, v ) );
                random_edges.push_back( make_int2( v, u ) );
            }
        }
    }

    random_edges.resize( random_edges.size() );


    return random_edges;
}

} // namespace a2
} // namespace csc485b


In [169]:
%%cuda_group_save -g "source" -n "dense_graph.h"
/**
 * The file in which you will implement your DenseGraph GPU solutions!
 */

#include <cstddef>  // std::size_t type

#include "cuda_common.h"
#include "data_types.h"

namespace csc485b {
namespace a2      {

/**
 * A DenseGraph is optimised for a graph in which the number of edges
 * is close to n(n-1). It is represented using an adjacency matrix.
 */
struct DenseGraph
{
  std::size_t n; /**< Number of nodes in the graph. */
  node_t * adjacencyMatrix; /** Pointer to an n x n adj. matrix */

  /** Returns number of cells in the adjacency matrix. */
  __device__ __host__ __forceinline__
  std::size_t matrix_size() const { return n * n; }
};


namespace gpu {

/**
 * Constructs a DenseGraph from an input edge list of m edges.
 *
 * @pre The pointers in DenseGraph g have already been allocated.
 */
__global__
void build_graph( DenseGraph g, edge_t const * edge_list, std::size_t m )
{
    // IMPLEMENT ME!
    int const th_id = blockIdx.x * blockDim.x + threadIdx.x;

    if(th_id < m) {
        edge_t e = edge_list[th_id];

        g.adjacencyMatrix[ (e.x * g.n) + e.y ] = 1;
    }

    return;
}

const int tile_dim = 32;

/**
  * Repopulates the adjacency matrix as a new graph that represents
  * the two-hop neighbourhood of input graph g
  */
__global__
void two_hop_reachability( DenseGraph g, DenseGraph hop )
{
    // IMPLEMENT ME!
    // square adjacencyMatrix
    // then remove the diagonal and clamp values back to [0,1]

    int const row = blockIdx.y * blockDim.y + threadIdx.y;
    int const col = blockIdx.x * blockDim.x + threadIdx.x;

    // __shared__ node_t a_tile[tile_dim * tile_dim * 4];
    __shared__ node_t b_tile[tile_dim * tile_dim * 4];    

    int dot_product = 0;

    // Move tile
    for(int i = 0; i < g.n; i += tile_dim) {
        int a = g.adjacencyMatrix[ row * g.n + i + threadIdx.x ];
        int b = g.adjacencyMatrix[ i * g.n + threadIdx.y * g.n + col ];

        // a_tile[threadIdx.y * tile_dim + threadIdx.x] = a;
        b_tile[threadIdx.y * tile_dim + threadIdx.x] = b;

        __syncthreads();

        for(int j = 0; j < tile_dim; j++) {
            // dot_product += a_tile[threadIdx.y * tile_dim + j] * b_tile[j * tile_dim + threadIdx.x];
            dot_product += __shfl_sync(__activemask(), a, (threadIdx.y * tile_dim + j) % 32) * b_tile[j * tile_dim + threadIdx.x];
        }

        __syncthreads();
    }

    if(row == col) {
        hop.adjacencyMatrix[row * hop.n + col] = 0;
    } else {
        hop.adjacencyMatrix[row * hop.n + col] = dot_product != 0;
    }

    return;
}

int run_two_hop( DenseGraph g, DenseGraph hop ) {
    if(g.n < tile_dim) {
        return -1;
    }

    std::size_t const num_blocks = ( g.n + tile_dim - 1 ) / tile_dim;

    dim3 threads(tile_dim, tile_dim);
    dim3 blocks(num_blocks, num_blocks);

    two_hop_reachability<<< blocks, threads >>>( g, hop );

    return 0;
}

void run_build_graph( DenseGraph g, edge_t const * edge_list, std::size_t m ) {
    std::size_t const threads_per_block = 1;
    std::size_t const num_blocks =  ( m + threads_per_block - 1 ) / threads_per_block;

    build_graph<<< num_blocks, threads_per_block >>>( g, edge_list, m);
}

} // namespace gpu
} // namespace a2
} // namespace csc485b

In [170]:
%%cuda_group_save -g "source" -n "sparse_graph.h"
/**
 * The file in which you will implement your SparseGraph GPU solutions!
 */

#include <cstddef>  // std::size_t type

#include "cuda_common.h"
#include "data_types.h"

namespace csc485b {
namespace a2      {

/**
 * A SparseGraph is optimised for a graph in which the number of edges
 * is close to cn, for a small constanct c. It is represented in CSR format.
 */
struct SparseGraph
{
  std::size_t n; /**< Number of nodes in the graph. */
  std::size_t m; /**< Number of edges in the graph. */
  node_t * neighbours_start_at; /** Pointer to an n=|V| offset array */
  node_t * neighbours; /** Pointer to an m=|E| array of edge destinations */
};


namespace gpu {

/**
 * Constructs a SparseGraph from an input edge list of m edges.
 *
 * @pre The pointers in SparseGraph g have already been allocated.
 */
__global__
void bucket_count( SparseGraph g, edge_t const * edge_list, std::size_t m ) {
    int const th_id = blockIdx.x * blockDim.x + threadIdx.x;

    // Bucket
    if(th_id < m) {
        atomicAdd(g.neighbours_start_at + edge_list[th_id].x, 1);
    }
    
}

__global__
void prefix_sum( SparseGraph g, int step ) {
    int const th_id = blockIdx.x * blockDim.x + threadIdx.x;

    // Prefix sum
    if(th_id < g.n and th_id >= step) {
        g.neighbours_start_at[th_id] += g.neighbours_start_at[th_id - step];
    } 

}

__global__
void bucket_sort( SparseGraph g, edge_t const * edge_list, std::size_t m ) {
    int const th_id = blockIdx.x * blockDim.x + threadIdx.x;

    // Sort
    if(th_id < m) {
        edge_t e = edge_list[th_id];
        int pos = atomicAdd(g.neighbours_start_at + e.x, -1) - 1;
        g.neighbours[pos] = e.y;
    } 

}

__global__
void build_graph( SparseGraph g, edge_t const * edge_list, std::size_t m )
{
    // IMPLEMENT ME!

    return;
}

/**
  * Repopulates the adjacency lists as a new graph that represents
  * the two-hop neighbourhood of input graph g
  */
__global__
void two_hop_reachability( SparseGraph s, SparseGraph hop )
{
    // https://leimao.github.io/blog/CSR-Sparse-Matrix-Multiplication/
    // IMPLEMENT ME!
    // algorithm unknown
    int const row = blockIdx.x * blockDim.x + threadIdx.x;
    int const mat_row = blockIdx.y * blockDim.y + threadIdx.y;

    if(row < s.n && mat_row < s.n) {
        int start = s.neighbours_start_at[row];
        int end;
        if(row != s.n - 1) {
            end = s.neighbours_start_at[row+1];
        } else {
            end = s.m;
        }

        int row_size = end - start;
        int count = 0;

        int mat_row_size;
        if(mat_row != s.n - 1) {
            mat_row_size = s.neighbours_start_at[mat_row+1] - s.neighbours_start_at[mat_row];
        } else {
            mat_row_size = s.m - s.neighbours_start_at[mat_row];
        }
        
        int mat_i = 0;
        int row_i = 0;

        int val = 0;

        while(mat_i < mat_row_size && row_i < row_size) {
            if(s.neighbours[s.neighbours_start_at[mat_row] + mat_i] == s.neighbours[s.neighbours_start_at[row] + row_i]) {
                val++;
                mat_i++;
                row_i++;
            } else if(s.neighbours[s.neighbours_start_at[mat_row] + mat_i] < s.neighbours[s.neighbours_start_at[row] + row_i]) {
                row_i++;
            } else {
                mat_i++;
            }
        }

        if(val != 0) {
            int pos = atomicAdd(hop.neighbours_start_at + row, 1);
            hop.neighbours[row * s.n + pos] = mat_row;
        }

        return;
    }
}

__global__
void compress_neighbours( SparseGraph hop, node_t * temp_neighbours, int neighbours_size ) {
    int const th_id = blockIdx.x * blockDim.x + threadIdx.x;

    if(hop.neighbours[th_id] != -1 && th_id < hop.n * hop.n) {
        int node = th_id / hop.n;

        int pos = atomicAdd(hop.neighbours_start_at + node, -1) - 1;
        temp_neighbours[pos] = hop.neighbours[th_id];
    }
}

__global__
void copy_neighbours( SparseGraph hop, node_t * temp_neighbours, int neighbours_size ) {
    int const th_id = blockIdx.x * blockDim.x + threadIdx.x;

    if(th_id < neighbours_size) {
        hop.neighbours[th_id] = temp_neighbours[th_id];
    }
}

int run_two_hop( SparseGraph g, SparseGraph hop ) {
    int threads_per_block = 32;
    int num_blocks =  ( g.n + threads_per_block - 1 ) / threads_per_block;

    dim3 threads(threads_per_block, threads_per_block);
    dim3 blocks(num_blocks, num_blocks);

    two_hop_reachability<<< blocks, threads >>>( g, hop );

    for( int step = 1; step < g.n; step = step << 1 ) {
        prefix_sum<<< num_blocks, threads_per_block >>>( hop, step );
    }
    cudaDeviceSynchronize();

    int neighbours_size = 0;
    cudaMemcpy(&neighbours_size, hop.neighbours_start_at + hop.n - 1, sizeof(int), cudaMemcpyDeviceToHost);

    node_t * temp_neighbours;
    cudaMalloc( (void**)&temp_neighbours, sizeof( a2::node_t ) * neighbours_size );
    cudaMemset( temp_neighbours, 0, sizeof( a2::node_t ) * neighbours_size );

    threads_per_block = 1024;
    num_blocks = ( (g.n * g.n) + threads_per_block - 1 ) / threads_per_block;

    compress_neighbours<<< num_blocks, threads_per_block >>>( hop, temp_neighbours, neighbours_size );
    cudaDeviceSynchronize();

    threads_per_block = 1024;
    num_blocks = ( neighbours_size + threads_per_block - 1 ) / threads_per_block;

    copy_neighbours<<< num_blocks, threads_per_block >>>( hop, temp_neighbours, neighbours_size );
    cudaDeviceSynchronize();

    return neighbours_size;
}

void run_build_graph( SparseGraph g, edge_t const * edge_list, std::size_t m ) {

    std::size_t const threads_per_block = 1024;
    std::size_t const num_blocks =  ( m + threads_per_block - 1 ) / threads_per_block;

    bucket_count<<< num_blocks, threads_per_block >>>( g, edge_list, m);

    for( int step = 1; step < g.n; step = step << 1 ) {
        prefix_sum<<< num_blocks, threads_per_block >>>( g, step );
    }

    bucket_sort<<< num_blocks, threads_per_block >>>( g, edge_list, m );
}

} // namespace gpu
} // namespace a2
} // namespace csc485b

In [195]:
%%cuda_group_save -g "source" -n "main.cu"
/**
 * Driver for the benchmark comparison. Generates random data,
 * runs the CPU baseline, and then runs your code.
 */

#include <chrono>   // for timing
#include <iostream> // std::cout, std::endl
#include <iterator> // std::ostream_iterator
#include <vector>

#include "dense_graph.h"
#include "sparse_graph.h"

#include "data_generator.h"
#include "data_types.h"

/**
 * Runs timing tests on a CUDA graph implementation.
 * Consists of independently constructing the graph and then
 * modifying it to its two-hop neighbourhood.
 */
template < typename DeviceGraph >
int run( DeviceGraph g, DeviceGraph hop, csc485b::a2::edge_t const * d_edges, std::size_t m )
{
    cudaDeviceSynchronize();
    auto const build_start = std::chrono::high_resolution_clock::now();

    // this code doesn't work yet!
    csc485b::a2::gpu::run_build_graph( g, d_edges, m );

    cudaDeviceSynchronize();
    auto const reachability_start = std::chrono::high_resolution_clock::now();

    // neither does this!
    int neighbour_size = csc485b::a2::gpu::run_two_hop( g, hop );

    cudaDeviceSynchronize();
    auto const end = std::chrono::high_resolution_clock::now();

    std::cout << "Build time: "
              << std::chrono::duration_cast<std::chrono::microseconds>(reachability_start - build_start).count()
              << " us"
              << std::endl;

    std::cout << "Reachability time: "
              << std::chrono::duration_cast<std::chrono::microseconds>(end - reachability_start).count()
              << " us"
              << std::endl;

    return neighbour_size;
}

void dg_cpu( csc485b::a2::DenseGraph dg, csc485b::a2::DenseGraph hop_dg, csc485b::a2::edge_t const * d_edges, std::size_t n, std::size_t m) {
    using namespace csc485b;

    for(int i = 0; i < m; i++) {
        a2::edge_t e = d_edges[i];
        dg.adjacencyMatrix[(e.x * dg.n) + e.y] = 1;
    }

    for(int row = 0; row < dg.n; row++) {
        for(int col = 0; col < dg.n; col++) {
            for(int i = 0; i < dg.n; i++) {
                hop_dg.adjacencyMatrix[row * dg.n + col] += dg.adjacencyMatrix[row * dg.n + i] * dg.adjacencyMatrix[i * dg.n + col];
            }
        }
    }

    for(int row = 0; row < dg.n; row++) {
        for(int col = 0; col < dg.n; col++) {
            if(row == col) {
                hop_dg.adjacencyMatrix[row * dg.n + col] = 0;
            } else {
                hop_dg.adjacencyMatrix[row * dg.n + col] = hop_dg.adjacencyMatrix[row * dg.n + col] != 0;
            }
        }
    }

    return;
}

/**
 * Allocates space for a dense graph and then runs the test code on it.
 */
void run_dense( csc485b::a2::edge_t const * d_edges, csc485b::a2::edge_t const * h_edges, std::size_t n, std::size_t m )
{
    using namespace csc485b;

    // Allocate device DenseGraph
    a2::node_t * d_matrix;
    cudaMalloc( (void**)&d_matrix, sizeof( a2::node_t ) * n * n );
    a2::DenseGraph d_dg{ n, d_matrix };

    a2::node_t * d_hop_matrix;
    cudaMalloc( (void**)&d_hop_matrix, sizeof( a2::node_t ) * n * n );
    a2::DenseGraph d_hop_dg{ n, d_hop_matrix };

    // Run GPU solution
    std::cout << "Dense: " << std::endl;
    run( d_dg, d_hop_dg, d_edges, m );

    // Copy data back to host
    std::vector< a2::node_t > host_matrix( d_dg.matrix_size() );
    a2::DenseGraph dg{ n, host_matrix.data() };
    cudaMemcpy( dg.adjacencyMatrix, d_dg.adjacencyMatrix, sizeof( a2::node_t ) * d_dg.matrix_size(), cudaMemcpyDeviceToHost );

    std::vector< a2::node_t > hop_host_matrix( d_hop_dg.matrix_size() );
    a2::DenseGraph hop_dg{ n, hop_host_matrix.data() };
    cudaMemcpy( hop_dg.adjacencyMatrix, d_hop_dg.adjacencyMatrix, sizeof( a2::node_t ) * d_hop_dg.matrix_size(), cudaMemcpyDeviceToHost );

    
    // Run CPU solution
    std::vector< a2::node_t > cpu_matrix( d_dg.matrix_size() );
    a2::DenseGraph cpu_dg{ n, cpu_matrix.data() };

    std::vector< a2::node_t > cpu_hop_matrix( d_hop_dg.matrix_size() );
    a2::DenseGraph cpu_hop_dg{ n, cpu_hop_matrix.data() };

    /**
    dg_cpu( cpu_dg, cpu_hop_dg, h_edges, n, m );

    bool dg_match = true;
    bool hop_match = true;

    // Check against CPU
    for(int i=0; i < dg.matrix_size(); i++) {
        if(dg.adjacencyMatrix[i] != cpu_dg.adjacencyMatrix[i]) {
            dg_match = false;
            std::cout << "Built graph does not match! (" << i << ")" << std::endl;
            break;
        }

        if( i != 0 && i % dg.n == 0) { 
            //std::cout << std::endl;
        }
        //std::cout << dg.adjacencyMatrix[i] << " ";
    }
    std::cout << std::endl;
    std::cout << std::endl;

    for(int i=0; i < hop_dg.matrix_size(); i++) {
        if(hop_dg.adjacencyMatrix[i] != cpu_hop_dg.adjacencyMatrix[i]) {
            hop_match = false;
            std::cout << "Hop graph does not match! (" << i << ")" << std::endl;
            break;
        } 
        
        if( i != 0 && i % hop_dg.n == 0) { 
           //std::cout << std::endl;
        }
        //std::cout << hop_dg.adjacencyMatrix[i] << " ";
    }
    std::cout << std::endl;
    std::cout << std::endl;

    if(dg_match && hop_match) {
        std::cout << "Both graphs match!" << std::endl;
    }
    **/

    // clean up
    cudaFree( d_matrix );
    cudaFree( d_hop_matrix );
}

/**
 * Allocates space for a sparse graph and then runs the test code on it.
 */
void run_sparse( csc485b::a2::edge_t const * d_edges, std::size_t n, std::size_t m )
{
    using namespace csc485b;

    // allocate device SparseGraph
    a2::node_t * d_offsets, * d_neighbours;
    cudaMalloc( (void**)&d_offsets,    sizeof( a2::node_t ) * n );
    cudaMalloc( (void**)&d_neighbours, sizeof( a2::node_t ) * m );
    cudaMemset( d_offsets,    0, sizeof( a2::node_t ) * n );
    cudaMemset( d_neighbours, 0, sizeof( a2::node_t ) * m );
    a2::SparseGraph d_sg{ n, m, d_offsets, d_neighbours };

    a2::node_t * d_hop_offsets, * d_hop_neighbours;
    cudaMalloc( (void**)&d_hop_offsets,    sizeof( a2::node_t ) * n );
    cudaMalloc( (void**)&d_hop_neighbours, sizeof( a2::node_t ) * n * n );
    cudaMemset( d_hop_offsets,    0, sizeof( a2::node_t ) * n );
    cudaMemset( d_hop_neighbours, -1, sizeof( a2::node_t ) * n * n );
    a2::SparseGraph d_hop_sg{ n, n*n, d_hop_offsets, d_hop_neighbours };

    std::cout << "Sparse:" << std::endl;
    int neighbour_size = run( d_sg, d_hop_sg, d_edges, m );

    std::vector< a2::node_t > host_offsets( d_sg.n );
    std::vector< a2::node_t > host_neighbours( d_sg.m );
    a2::SparseGraph sg{ n, m, host_offsets.data(), host_neighbours.data() };
    cudaMemcpy( sg.neighbours_start_at, d_sg.neighbours_start_at, sizeof( a2::node_t ) * n, cudaMemcpyDeviceToHost );
    cudaMemcpy( sg.neighbours, d_sg.neighbours, sizeof( a2::node_t ) * m, cudaMemcpyDeviceToHost );

    std::vector< a2::node_t > hop_host_offsets( d_hop_sg.n );
    std::vector< a2::node_t > hop_host_neighbours( d_hop_sg.m );
    a2::SparseGraph hop_sg{ n, m, hop_host_offsets.data(), hop_host_neighbours.data() };
    cudaMemcpy( hop_sg.neighbours_start_at, d_hop_sg.neighbours_start_at, sizeof( a2::node_t ) * n, cudaMemcpyDeviceToHost );
    cudaMemcpy( hop_sg.neighbours, d_hop_sg.neighbours, sizeof( a2::node_t ) * neighbour_size, cudaMemcpyDeviceToHost );

    /**
    for(int i = 0; i < n; i++) {
        std::cout << sg.neighbours_start_at[i] << " ";
    }
    std::cout << std::endl;

    for(int i = 0; i < m; i++) {
        std::cout << sg.neighbours[i] << " ";
    }
    std::cout << std::endl;

    for(int i = 0; i < n; i++) {
        std::cout << hop_sg.neighbours_start_at[i] << " ";
    }
    std::cout << std::endl;

    for(int i = 0; i < neighbour_size; i++) {
        std::cout << hop_sg.neighbours[i] << " ";
    }
    std::cout << std::endl;
    **/

    // clean up
    cudaFree( d_neighbours );
    cudaFree( d_offsets );
    cudaFree( d_hop_neighbours );
    cudaFree( d_hop_offsets );
}

int main()
{
    using namespace csc485b;

    // Create input
    std::size_t constexpr n = 4096;
    std::size_t constexpr expected_degree = n >> 1;

    // a2::edge_list_t const graph = a2::generate_graph( n, n * expected_degree );
    a2::edge_list_t const graph = a2::generate_graph( n, 2 );
    std::size_t const m = graph.size();

    // lazily echo out input graph
    /**
    for( auto const& e : graph )
    {
        std::cout << "(" << e.x << "," << e.y << ") ";
    }
    std::cout << std::endl;
    **/

    // allocate and memcpy input to device
    a2::edge_t * d_edges;
    cudaMalloc( (void**)&d_edges, sizeof( a2::edge_t ) * m );
    cudaMemcpyAsync( d_edges, graph.data(), sizeof( a2::edge_t ) * m, cudaMemcpyHostToDevice );

    // run your code!
    run_dense ( d_edges, graph.data(), n, m );
    run_sparse( d_edges, n, m );

    return EXIT_SUCCESS;
}

In [196]:
%cuda_group_run --group "source" --compiler-args "-O3 -g -std=c++20 -arch=sm_75"

Dense: 
Build time: 837 us
Reachability time: 179455 us
Sparse:
Build time: 340 us
Reachability time: 8460 us

