In [1]:
# One-time environment setup: add the two packages that the code below loads
# with `using` to the active Julia environment.
import Pkg
Pkg.add("KernelAbstractions")
Pkg.add("CUDA")

[32m[1m    Updating[22m[39m registry at `~/.julia/registries/General.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m    Updating[22m[39m `~/.julia/environments/v1.11/Project.toml`
  [90m[63c18a36] [39m[92m+ KernelAbstractions v0.9.39[39m
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.11/Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.11/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.11/Manifest.toml`


In [14]:
using CUDA
using KernelAbstractions

# ------------------------------
# 1️⃣ Streaming MTX loader
# ------------------------------
"""
    load_mtx_as_csr_stream(filename::String)

Stream a Matrix Market coordinate file and return `(rowptr, colind)`, both
`Vector{Int32}`, describing the symmetrized adjacency structure in CSR form.

Blank lines and lines starting with `%` are skipped. The first remaining line is
the size line `nrows ncols nnz`; every following line is an entry whose first
two fields are the 1-based endpoints `u v`. Any further fields (for example an
edge weight) are ignored. Each entry with `u != v` is stored in both directions;
self-loops are stored once.

`rowptr` has one entry per vertex plus one and holds 1-based positions into
`colind`: the neighbours of vertex `v` are `colind[rowptr[v]:(rowptr[v+1]-1)]`.
This is the layout `cc_kernel!` iterates over. Earlier revisions of this
function returned zero-based offsets (`rowptr[1] == 0`), which do not match that
loop; binary caches written from such output should be regenerated.

# Throws
- `ArgumentError` if the size line is missing or malformed, an entry line has
  fewer than two fields, a vertex id is outside the declared dimensions, or the
  symmetrized edge count does not fit in `Int32`.
"""
function load_mtx_as_csr_stream(filename::String)
    open(filename, "r") do io
        header = _mtx_size_line(io)
        header === nothing &&
            throw(ArgumentError("no size line found in Matrix Market file: $filename"))

        dims = split(header)
        length(dims) >= 3 ||
            throw(ArgumentError("malformed size line \"$header\" in $filename"))
        # Parse as Int (not Int32) so that `2 * nnz` below cannot wrap around.
        nrows, ncols, nnz = parse.(Int, dims[1:3])

        # Symmetrization makes column ids act as row ids too, so the vertex
        # count must cover both dimensions (identical for square matrices).
        n = max(nrows, ncols)
        0 <= n <= typemax(Int32) ||
            throw(ArgumentError("vertex count $n does not fit in Int32"))

        edges_u = Int32[]
        edges_v = Int32[]
        # `nnz` is only a capacity hint; push! keeps us safe if the file
        # contains more entries than its size line announces.
        sizehint!(edges_u, 2 * max(nnz, 0))
        sizehint!(edges_v, 2 * max(nnz, 0))

        for raw in eachline(io)
            entry = strip(raw)
            (isempty(entry) || startswith(entry, "%")) && continue

            fields = split(entry)
            length(fields) >= 2 ||
                throw(ArgumentError("malformed entry line \"$entry\" in $filename"))
            # Always take the first two fields: a third field is a value, not
            # a vertex id.
            u = parse(Int32, fields[1])
            v = parse(Int32, fields[2])
            (1 <= u <= n && 1 <= v <= n) ||
                throw(ArgumentError("entry ($u, $v) is outside 1:$n in $filename"))

            push!(edges_u, u)
            push!(edges_v, v)
            if u != v
                # Store the reverse direction so the graph is undirected.
                push!(edges_u, v)
                push!(edges_v, u)
            end
        end

        # The last row pointer equals edge count + 1 and must fit in Int32.
        length(edges_u) < typemax(Int32) ||
            throw(ArgumentError("too many edges for Int32 CSR indices: $(length(edges_u))"))

        return _csr_from_edges(n, edges_u, edges_v)
    end
end

# Return the first line of `io` that is neither blank nor a `%` comment
# (stripped), or `nothing` if the stream ends first.
function _mtx_size_line(io)
    while !eof(io)
        candidate = strip(readline(io))
        if !isempty(candidate) && !startswith(candidate, "%")
            return candidate
        end
    end
    return nothing
end

# Counting-sort the directed edges `src[k] -> dst[k]` into CSR arrays for `n`
# vertices, using 1-based row pointers.
function _csr_from_edges(n, src, dst)
    rowptr = zeros(Int32, n + 1)
    for u in src
        rowptr[u + 1] += one(Int32)      # degree of u, shifted by one slot
    end
    rowptr[1] = one(Int32)               # 1-based start of the first row
    for i in 1:n
        rowptr[i + 1] += rowptr[i]       # prefix sum -> start of row i + 1
    end

    colind = Vector{Int32}(undef, length(src))
    cursor = copy(rowptr)                # next free slot of every row
    for (u, v) in zip(src, dst)
        colind[cursor[u]] = v
        cursor[u] += one(Int32)
    end
    return rowptr, colind
end

# ------------------------------
# 2️⃣ Binary loader / saver
# ------------------------------
"""
    save_csr_binary(rowptr, colind, rowptr_file, colind_file)

Write the raw in-memory bytes of `rowptr` to `rowptr_file` and of `colind` to
`colind_file`, truncating any existing files. No header or element type is
stored. Returns the value of the final `write` call (the byte count for
`colind`).
"""
function save_csr_binary(rowptr, colind, rowptr_file, colind_file)
    open(stream -> write(stream, rowptr), rowptr_file, "w")
    return open(stream -> write(stream, colind), colind_file, "w")
end

"""
    load_csr_binary(rowptr_file, colind_file)

Read two headerless files of raw `Int32` values and return them as
`(rowptr, colind)`, each a plain `Vector{Int32}`.

The values are read directly into `Int32` storage, so the result has the same
concrete type as the arrays returned by `load_mtx_as_csr_stream` rather than a
lazy reinterpreting wrapper around a byte buffer.

# Throws
- `ArgumentError` if a file's size is not a multiple of `sizeof(Int32)`.
"""
function load_csr_binary(rowptr_file, colind_file)
    rowptr = _read_int32_file(rowptr_file)
    colind = _read_int32_file(colind_file)
    return rowptr, colind
end

# Read the whole file at `path` as a Vector{Int32} in native byte order.
function _read_int32_file(path)
    nbytes = filesize(path)
    nbytes % sizeof(Int32) == 0 ||
        throw(ArgumentError("size of $path ($nbytes bytes) is not a multiple of 4"))
    return read!(path, Vector{Int32}(undef, nbytes ÷ sizeof(Int32)))
end

# ------------------------------
# 3️⃣ GPU Connected Components
# ------------------------------
# One label-propagation sweep. The work-item with global index `v` (only when
# `v <= n`) scans positions `rowptr[v]` through `rowptr[v+1] - 1` of `colind`,
# i.e. `rowptr` is used as 1-based positions into `colind`. It takes the
# minimum of its own label and the labels of the vertices listed there, and if
# that minimum is smaller than `label[v]` it stores it and sets `changed[v]`
# to 1.
@kernel function cc_kernel!(rowptr, colind, label, changed, n)
    vertex = @index(Global)
    if vertex <= n
        first_edge = rowptr[vertex]
        last_edge = rowptr[vertex + 1] - 1

        smallest = label[vertex]
        for edge in first_edge:last_edge
            neighbor = colind[edge]
            smallest = min(smallest, label[neighbor])
        end

        if smallest < label[vertex]
            label[vertex] = smallest
            changed[vertex] = 1
        end
    end
end

"""
    connected_components_gpu(rowptr_h, colind_h; check_every=5)

Label the connected components of a CSR graph by repeated min-label
propagation with `cc_kernel!` on a `CUDABackend`.

Every vertex starts with its own index as label. Sweeps are launched until a
checked sweep reports no label change. Convergence is only tested every
`check_every` sweeps so that most launches are not followed by a blocking
synchronization. Prints the number of sweeps and returns the final labels as a
host `Vector{Int32}` with one entry per vertex (`length(rowptr_h) - 1`).

# Arguments
- `rowptr_h`, `colind_h`: host CSR arrays; they are copied to the device as-is
  and indexed by `cc_kernel!`.

# Keywords
- `check_every`: number of sweeps between convergence checks (default `5`,
  the previously hard-coded value).

# Throws
- `ArgumentError` if `check_every < 1`.
"""
function connected_components_gpu(rowptr_h, colind_h; check_every::Integer=5)
    check_every >= 1 ||
        throw(ArgumentError("check_every must be >= 1, got $check_every"))

    n = Int32(length(rowptr_h) - 1)
    if n <= 0
        # Nothing to label: skip device allocation and the zero-sized launch.
        println("Iterations: ", 0)
        return Int32[]
    end

    rowptr  = CuArray(rowptr_h)
    colind  = CuArray(colind_h)
    label   = CuArray(Int32.(1:n))
    changed = CuArray(zeros(Int32, n))

    backend = CUDABackend()
    kernel  = cc_kernel!(backend)

    iter = 0
    while true
        iter += 1
        changed .= 0
        kernel(rowptr, colind, label, changed, n; ndrange=n)

        if iter % check_every == 0
            KernelAbstractions.synchronize(backend)
            # Reduce on the device; only a single Bool travels to the host
            # instead of the whole `changed` array.
            any(!iszero, changed) || break
        end
    end

    KernelAbstractions.synchronize(backend)
    println("Iterations: ", iter)
    return Array(label)
end

# ------------------------------
# 4️⃣ Main
# ------------------------------
# Input graph and the two binary cache files for its CSR arrays.
# NOTE(review): the cache names say "friendster" while the input is "graph.mtx";
# the cache is used whenever both files exist, without checking that it was
# built from `mtx_file` — confirm they belong together.
mtx_file       = "/content/sample_data/graph.mtx"
rowptr_file    = "/content/sample_data/friendster_rowptr.bin"
colind_file    = "/content/sample_data/friendster_colind.bin"

# Load CSR: check if binary exists
# Both cache files must be present; otherwise the MTX file is parsed and the
# cache is (re)written.
if isfile(rowptr_file) && isfile(colind_file)
    println("Loading CSR from binary files...")
    t0 = time()
    rowptr, colind = load_csr_binary(rowptr_file, colind_file)
    t1 = time()
    println("Loaded CSR in ", round(t1-t0, digits=3), " s")
else
    println("Binary CSR not found. Loading MTX and building CSR...")
    t0 = time()
    rowptr, colind = load_mtx_as_csr_stream(mtx_file)
    t1 = time()
    println("Loaded MTX in ", round(t1-t0, digits=3), " s")

    println("Saving CSR binary...")
    save_csr_binary(rowptr, colind, rowptr_file, colind_file)
end

# Run GPU Connected Components
println("Running Connected Components on GPU...")
t2 = time()
labels = connected_components_gpu(rowptr, colind)
t3 = time()
println("CC time: ", round(t3-t2, digits=3), " s")
# Total spans from the start of loading (t0) to the end of the GPU run (t3),
# so on the MTX path it also includes saving the binary cache.
println("Total time: ", round(t3-t0, digits=3), " s")
# The number of distinct labels is the number of components.
println("Connected components: ", length(unique(labels)))
println("First 10 labels: ", labels[1:min(10,end)])



Loading CSR from binary files...
Loaded CSR in 0.241 s
Running Connected Components on GPU...
Iterations: 10
CC time: 0.839 s
Total time: 1.083 s
Connected components: 14860
First 10 labels: Int32[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
