In [1]:
# Load the extension that allows us to compile CUDA code in python notebooks
# Documentation is here: https://nvcc4jupyter.readthedocs.io/en/latest/
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc4jupyter

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to c:\users\tinyr\appdata\local\temp\pip-req-build-k_5lp60x
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 28f872a2f99a1b201bcd0db14fdbc5a496b9bfd7
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Source files will be saved in "C:\Users\tinyr\AppData\Local\Temp\tmp3my80itd".


  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git 'C:\Users\tinyr\AppData\Local\Temp\pip-req-build-k_5lp60x'

[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
%%cuda_group_save -g "source" -n "data_types.h"
/**
 * A collection of commonly used data types throughout this project.
 */
#pragma once

#include <iostream> // for std::ostream
#include <vector>

namespace csc485b{
namespace a2{

// Identifier of a single graph node; the generator uses values in [0, n).
using node_t = int;
// An edge stored as a pair of node ids in the `x` and `y` components of
// CUDA's built-in int2 vector type.
using edge_t = int2;

// Host-side (std::vector) sequences of edges and of nodes.
using edge_list_t = std::vector< edge_t >;
using node_list_t = std::vector< node_t >;

} // namespace a2
} // namespace csc485b


In [3]:
%%cuda_group_save -g "source" -n "cuda_common.h"
/**
 * Standard macros that can be useful for error checking.
 * https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__ERROR.html
 */
#pragma once

#include <cuda.h>

/**
 * Evaluates the CUDA runtime call `exp` exactly once. If the result is not
 * cudaSuccess, prints the source location and the CUDA error string and
 * terminates the program with EXIT_FAILURE.
 *
 * The temporary has a deliberately unusual name so that it cannot capture an
 * identifier that appears inside `exp` (e.g. `CUDA_CALL( f( res ) )`).
 */
#define CUDA_CALL(exp)                                                     \
    do {                                                                   \
        cudaError_t const cuda_call_status_ = (exp);                       \
        if(cuda_call_status_ != cudaSuccess) {                             \
            printf("Error at %s:%d\n %s\n",                                \
                __FILE__,__LINE__, cudaGetErrorString(cuda_call_status_)); \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while(0)

/**
 * Reads (and clears) the most recent CUDA runtime error via
 * cudaGetLastError(). If an error is pending, prints `msg`, the source
 * location and the CUDA error string, then terminates the program with
 * EXIT_FAILURE. Typically placed right after a kernel launch.
 *
 * The temporary has a deliberately unusual name so that it cannot capture an
 * identifier that appears inside `msg`.
 */
#define CHECK_ERROR(msg)                                                \
    do {                                                                \
        cudaError_t const check_error_status_ = cudaGetLastError();     \
        if(cudaSuccess != check_error_status_) {                        \
            printf("Error (%s) at %s:%d\n %s\n",                        \
                (msg), __FILE__, __LINE__,                              \
                cudaGetErrorString(check_error_status_));               \
            exit(EXIT_FAILURE);                                         \
        }                                                               \
    } while (0)

In [148]:
%%cuda_group_save -g "source" -n "data_generator.h"
/**
 * Functions for generating random input data with a fixed seed
 */
#pragma once

#include <cassert>  // for assert()
#include <cstddef>  // std::size_t type
#include <random>   // for std::mt19937, std::uniform_int_distribution
#include <vector>

#include "data_types.h"

namespace csc485b {
namespace a2 {

/**
 * Generates and returns a vector of random edges
 * for a graph `G=(V,E)` with `n=|V|` and expected `m=|E|`.
 * Referred to as an Erdős-Rényi graph.
 *
 * Every node pair `u < v` is considered once. When a pair is selected, both
 * directed edges `(u,v)` and `(v,u)` are appended, so the returned list always
 * has an even number of entries. The generator uses a fixed seed, so the
 * output is deterministic for a given `(n, m)`.
 *
 * @param n number of nodes; node ids are `0 .. n-1`
 * @param m expected number of (directed) edges; must be less than `n(n-1)`
 * @return the generated edge list; empty when `n < 2`
 *
 * @see https://networkx.org/documentation/stable/reference/generated/networkx.generators.random_graphs.fast_gnp_random_graph.html#networkx.generators.random_graphs.fast_gnp_random_graph
 */
edge_list_t generate_graph( std::size_t n, std::size_t m )
{
    assert( "At most n(n-1) edges in a simple graph" && m < n * ( n - 1 ) );

    // With fewer than two nodes there is no node pair to connect. Returning
    // early also avoids dividing by n(n-1) == 0 when asserts are compiled out.
    if( n < 2 )
    {
        return edge_list_t{};
    }

    // Edge density as an integer percentage.
    int const probability = static_cast< int >( ( 100 * m ) / ( n * ( n - 1 ) ) );

    // for details of random number generation, see:
    // https://en.cppreference.com/w/cpp/numeric/random/uniform_int_distribution
    std::size_t random_seed = 20241008;  // use magic seed
    std::mt19937 rng( random_seed );     // use mersenne twister generator
    // NOTE(review): [0, 100] has 101 outcomes and the test below uses `<=`, so
    // a pair is selected with chance (probability + 1) / 101 rather than
    // probability / 100. Left unchanged so the generated data stays identical.
    std::uniform_int_distribution<> distrib(0, 100);

    edge_list_t random_edges;
    random_edges.reserve( 2 * m );

    // Compare node ids in their own (signed) type instead of mixing
    // node_t with std::size_t in the loop conditions.
    node_t const node_count = static_cast< node_t >( n );

    for( node_t u = 0; u < node_count; ++u )
    {
        for( node_t v = u + 1; v < node_count; ++v )
        {
            auto const dice_roll = distrib( rng );
            if( dice_roll <= probability )
            {
                random_edges.push_back( make_int2( u, v ) );
                random_edges.push_back( make_int2( v, u ) );
            }
        }
    }

    return random_edges;
}

/**
 * Fills `matrix` with `n * m` pseudo-random non-negative values, each the
 * ratio of two consecutive rand() draws, using the fixed seed 777.
 *
 * @param matrix host buffer with room for at least `n * m` elements
 * @param n      first dimension of the matrix
 * @param m      second dimension of the matrix
 */
void generate_matrix( half * matrix, int n, int m ) {
    srand(777);

    for(int i=0; i<n*m; i++) {
        // Draw numerator and denominator in separate statements: the operands
        // of a single `/` may be evaluated in either order, which would make
        // the generated data depend on the compiler.
        float const numerator = (float)(rand());
        int const raw_denominator = rand();
        // rand() may return 0; use 1 instead so no inf/NaN entries are
        // produced and the rest of the random stream is left untouched.
        float const denominator = (float)(raw_denominator == 0 ? 1 : raw_denominator);

        matrix[i] = numerator / denominator;
    }
}

} // namespace a2
} // namespace csc485b


In [149]:
%%cuda_group_save -g "source" -n "dense_graph.h"
/**
 * The file in which you will implement your DenseGraph GPU solutions!
 */

#include <cstddef>  // std::size_t type

#include "cuda_common.h"
#include "data_types.h"

#include <mma.h>
using namespace nvcuda;

namespace csc485b {
namespace a2      {

/**
 * A DenseGraph is optimised for a graph in which the number of edges
 * is close to n(n-1). It is represented using an adjacency matrix.
 *
 * NOTE(review): none of the code visible in this file reads or writes this
 * struct; the GPU entry points below take raw pointers instead.
 */
struct DenseGraph
{
    // Dimensions of the adjacency matrix.
    int rows;
    int cols;

    // Matrix entries; presumably rows * cols floats in row-major order —
    // TODO confirm layout and ownership (host vs. device) against the users.
    float* matrix;
};


namespace gpu {

// Shape of one WMMA tile operation: an (M x K) tile of A times a (K x N)
// tile of B is accumulated into an (M x N) tile of C.
constexpr int WMMA_M = 16;  // rows of the A and C tiles
constexpr int WMMA_N = 16;  // columns of the B and C tiles
constexpr int WMMA_K = 16;  // shared inner dimension of the A and B tiles

/**
 * Computes `output = input * input` for a row-major `n x n` matrix using
 * 16x16x16 WMMA (tensor core) tile operations. For an adjacency matrix,
 * entry (i, j) of the product is the number of two-hop walks from i to j.
 *
 * Launch layout: one warp (32 threads) per block. Block
 * (blockIdx.x, blockIdx.y) produces the 16x16 output tile whose top-left
 * element is (blockIdx.x * WMMA_M, blockIdx.y * WMMA_N).
 *
 * Preconditions:
 *  - `n` is a multiple of 16 (only whole tiles are processed);
 *  - `input` holds `n * n` half values and `output` has room for `n * n`
 *    floats, both in device memory;
 *  - requires a tensor-core capable device (SM70+).
 *
 * @param input  row-major `n x n` matrix (device pointer)
 * @param output row-major `n x n` result (device pointer)
 * @param n      matrix dimension, also used as the row stride
 * @param m      unused by the kernel
 */
__global__
void two_hop_reachability( half * input, float * output, int n, int m )
{
    // Top-left corner of the output tile owned by this block's warp.
    int const tile_row = blockIdx.x * WMMA_M;
    int const tile_col = blockIdx.y * WMMA_N;

    // Skip tiles that would reach past the matrix. The test depends only on
    // blockIdx, so every thread of the warp takes the same branch and the
    // warp-wide WMMA calls below are never reached by a partial warp.
    if( tile_row + WMMA_M > n || tile_col + WMMA_N > n )
    {
        return;
    }

    wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> a_frag;
    wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> b_frag;
    wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> c_frag;

    wmma::fill_fragment(c_frag, 0.0f);

    // Walk along the shared dimension one whole tile at a time; the bound
    // keeps every load inside the matrix.
    for( int k = 0; k + WMMA_K <= n; k += WMMA_K ) {
        // The leading dimension is the row stride of the full matrix (n),
        // not the tile width: consecutive tile rows are n elements apart.
        wmma::load_matrix_sync(a_frag, input + (tile_row * n + k), n);
        wmma::load_matrix_sync(b_frag, input + (k * n + tile_col), n);

        wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
    }

    wmma::store_matrix_sync(output + (tile_row * n + tile_col), c_frag, n, wmma::mem_row_major);
}

/**
 * Launches two_hop_reachability on an `n x n` matrix with one warp per
 * 16x16 output tile. The launch is asynchronous: the caller must synchronise
 * before reading `output`.
 *
 * @param input  device pointer to the row-major `n x n` half matrix
 * @param output device pointer with room for `n * n` floats
 * @param n      matrix dimension; must be a positive multiple of 16
 * @param m      forwarded to the kernel, which does not use it
 * @return 0 (cudaSuccess) when the kernel was launched; otherwise the
 *         cudaError_t code converted to int (cudaErrorInvalidValue for an
 *         unsupported `n`)
 */
int run_two_hop( half * input, float * output, int n, int m ) {

    // The kernel only handles whole tiles. Any other size would give an
    // empty grid or leave part of the output uncomputed.
    if( n <= 0 || n % WMMA_M != 0 || n % WMMA_N != 0 || n % WMMA_K != 0 ) {
        return static_cast< int >( cudaErrorInvalidValue );
    }

    dim3 const block_dim(32, 1);                      // exactly one warp per block
    dim3 const grid_dim(n / WMMA_M, n / WMMA_N);      // one block per output tile

    two_hop_reachability<<< grid_dim, block_dim >>>( input, output, n, m );

    // A kernel launch returns nothing; configuration errors are only
    // reported through cudaGetLastError().
    return static_cast< int >( cudaGetLastError() );
}

} // namespace gpu
} // namespace a2
} // namespace csc485b

In [150]:
%%cuda_group_save -g "source" -n "main.cu"
/**
 * Driver for the benchmark. Generates a random matrix, runs the GPU
 * two-hop kernel on it, and prints both the input and the result.
 */

#include <chrono>   // for timing
#include <iostream> // std::cout, std::endl
#include <iterator> // std::ostream_iterator
#include <vector>

#include "dense_graph.h"

#include "data_generator.h"
#include "data_types.h"

/**
 * Generates a random n x m half-precision matrix, multiplies it by itself on
 * the GPU, and prints the input followed by the result to stdout. The elapsed
 * time of the GPU step is written to stderr so that stdout is unchanged.
 *
 * @return EXIT_SUCCESS on success, EXIT_FAILURE if the GPU step reports an
 *         error (failed CUDA API calls terminate through CUDA_CALL)
 */
int main()
{
    using namespace csc485b;

    int const n = 16;
    int const m = 16;
    // The GPU code multiplies the matrix by itself and indexes it with a
    // single dimension, so the input has to be square.
    static_assert( n == m, "two-hop multiplication requires a square matrix" );

    std::size_t const input_bytes  = sizeof( half )  * n * m;
    std::size_t const result_bytes = sizeof( float ) * n * m;

    half matrix[n * m];

    a2::generate_matrix(matrix, n, m);

    // allocate and memcpy input to device
    half * d_matrix = nullptr;
    CUDA_CALL( cudaMalloc( (void**)&d_matrix, input_bytes ) );
    CUDA_CALL( cudaMemcpyAsync( d_matrix, matrix, input_bytes, cudaMemcpyHostToDevice ) );

    float * d_result_matrix = nullptr;
    CUDA_CALL( cudaMalloc( (void**)&d_result_matrix, result_bytes ) );

    // Make sure the upload has finished before the timer starts.
    CUDA_CALL( cudaDeviceSynchronize() );
    auto const reachability_start = std::chrono::high_resolution_clock::now();

    int const status = a2::gpu::run_two_hop( d_matrix, d_result_matrix, n, m );

    // The launch is asynchronous: wait for the kernel so the timing is
    // meaningful and errors raised during execution are surfaced here.
    CUDA_CALL( cudaDeviceSynchronize() );
    auto const end = std::chrono::high_resolution_clock::now();

    if( status != 0 ) {
        std::cerr << "run_two_hop failed with status " << status << std::endl;
        cudaFree( d_matrix );
        cudaFree( d_result_matrix );
        return EXIT_FAILURE;
    }

    float result_matrix[n * m];
    CUDA_CALL( cudaMemcpy( result_matrix, d_result_matrix, result_bytes, cudaMemcpyDeviceToHost ) );

    CUDA_CALL( cudaFree( d_matrix ) );
    CUDA_CALL( cudaFree( d_result_matrix ) );

    auto const elapsed_us =
        std::chrono::duration_cast< std::chrono::microseconds >( end - reachability_start ).count();
    std::cerr << "two-hop GPU time: " << elapsed_us << " us" << std::endl;

    // Input matrix, one row per line (row stride is the column count m).
    for(int i=0; i<n; i++) {
        for(int j=0; j<m; j++) {
            std::cout << (float)(matrix[(m * i) + j]) << " ";
        }
        std::cout << std::endl;
    }

    std::cout << std::endl;

    // Result matrix, same layout.
    for(int i=0; i<n; i++) {
        for(int j=0; j<m; j++) {
            std::cout << result_matrix[(m * i) + j] << " ";
        }
        std::cout << std::endl;
    }

    return EXIT_SUCCESS;
}

In [152]:
%cuda_group_run --group "source" --compiler-args "-O3 -g -std=c++20 -arch=sm_75"

0.759277 0.368896 1.80566 0.101562 5.62109 0.744629 2.7832 1.39648 0.263428 1.86719 5.46875 5.70312 3.10156 0.206177 0.700684 1.08203 
0.853027 3.83789 0.586426 12.4688 0.327637 0.529785 6.3125 9.98438 0.660156 0.683594 0.778809 11.9609 15.6953 2.39844 1.63574 0.822266 
0.0877686 0.0650024 0.577637 1.73633 0.887207 1.12012 4.32422 0.913574 0.890137 0.358887 0.111084 0.563965 11.3281 0.891113 1.44043 10.5625 
6.625 2.10156 8.51562 1.84766 2.29883 0.983887 1.35352 0.808594 1.51172 3.44141 0.0393066 0.248657 1.86914 3.35156 10.625 2.05273 
0.0176544 0.339355 0.891602 1.09766 0.191406 0.113586 0.411133 1.17773 0.96582 3.92969 0.134521 1.46973 0.76123 0.340576 0.542969 1.6709 
2.88867 1.21387 0.454834 0.608887 0.859863 1.00781 4.00391 0.989746 0.286865 1.79688 1.03613 0.847168 1.97656 0.992676 1.40039 1.20215 
0.662598 21.2969 1.44141 0.763672 2.02539 0.0375061 1.06641 3.20312 1.04492 0.617676 0.150513 1.47754 2.81055 7.95312 0.355957 0.0563354 
0.364502 0.844727 0.637695 1.58496 9.21094 0.