No augmented forward pass found for cuOccupancyMaxPotentialBlockSize #1061

Closed
pxl-th opened this issue Sep 18, 2023 · 18 comments

@pxl-th

pxl-th commented Sep 18, 2023

Hi!
I'm trying to use the fused kernel compute_α_fused to compute alpha-compositing weights, with Enzyme generating the gradient kernel in Reverse mode, instead of the unfused compute_α.

But the compilation fails. Is this an issue with CUDA.jl?

Error:

No augmented forward pass found for cuOccupancyMaxPotentialBlockSize
declare i32 @cuOccupancyMaxPotentialBlockSize(i64, i64, i64, i64, i64, i32) local_unnamed_addr



Stacktrace:
  [1] julia_error(cstr::Cstring, val::Ptr{…}, errtype::Enzyme.API.ErrorType, data::Ptr{…}, data2::Ptr{…}, B::Ptr{…})
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/0SYwj/src/compiler.jl:5768
  [2] EnzymeCreateAugmentedPrimal(logic::Enzyme.Logic, todiff::LLVM.Function, retType::Enzyme.API.CDIFFE_TYPE, constant_args::Vector{…}, TA::Enzyme.TypeAnalysis, returnUsed::Bool, shadowReturnUsed::Bool, typeInfo::Enzyme.FnTypeInfo, uncacheable_args::Vector{…}, forceAnonymousTape::Bool, width::Int64, atomicAdd::Bool)
    @ Enzyme.API ~/.julia/packages/Enzyme/0SYwj/src/api.jl:164
  [3] enzyme!(job::GPUCompiler.CompilerJob{…}, mod::LLVM.Module, primalf::LLVM.Function, TT::Type, mode::Enzyme.API.CDerivativeMode, width::Int64, parallel::Bool, actualRetType::Type, wrap::Bool, modifiedBetween::NTuple{…}, returnPrimal::Bool, jlrules::Vector{…}, expectedTapeType::Type)
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/0SYwj/src/compiler.jl:7541
  [4] codegen(output::Symbol, job::GPUCompiler.CompilerJob{…}; libraries::Bool, deferred_codegen::Bool, optimize::Bool, toplevel::Bool, strip::Bool, validate::Bool, only_entry::Bool, parent_job::Nothing)
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/0SYwj/src/compiler.jl:9119
  [5] codegen
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/0SYwj/src/compiler.jl:8723 [inlined]
  [6] _thunk(job::GPUCompiler.CompilerJob{Enzyme.Compiler.EnzymeTarget, Enzyme.Compiler.EnzymeCompilerParams}, postopt::Bool) (repeats 2 times)
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/0SYwj/src/compiler.jl:9671
  [7] cached_compilation
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/0SYwj/src/compiler.jl:9705 [inlined]
  [8] (::Enzyme.Compiler.var"#475#476"{})(ctx::LLVM.Context)
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/0SYwj/src/compiler.jl:9768
  [9] JuliaContext(f::Enzyme.Compiler.var"#475#476"{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/YO8Uj/src/driver.jl:47
 [10] #s292#474
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/0SYwj/src/compiler.jl:9723 [inlined]
 [11] 
    @ Enzyme.Compiler ./none:0
 [12] (::Core.GeneratedFunctionStub)(::UInt64, ::LineNumberNode, ::Any, ::Vararg{Any})
    @ Core ./boot.jl:600
 [13] runtime_generic_augfwd(activity::Type{…}, width::Val{…}, ModifiedBetween::Val{…}, RT::Val{…}, f::CUDA.CUDAKernels.var"##_#6", df::Nothing, primal_1::Int64, shadow_1_1::Nothing, primal_2::Nothing, shadow_2_1::Nothing, primal_3::KernelAbstractions.Kernel{…}, shadow_3_1::Nothing, primal_4::CuArray{…}, shadow_4_1::CuArray{…}, primal_5::CuArray{…}, shadow_5_1::CuArray{…}, primal_6::CuArray{…}, shadow_6_1::Nothing, primal_7::CuArray{…}, shadow_7_1::Nothing)
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/0SYwj/src/compiler.jl:1361
 [14] Kernel
    @ ~/.julia/packages/CUDA/35NC6/src/CUDAKernels.jl:103 [inlined]
 [15] _compute_α_fused!
    @ ~/code/ZipNerf.jl/src/t.jl:56 [inlined]
 [16] _compute_α_fused!
    @ ~/code/ZipNerf.jl/src/t.jl:0 [inlined]
 [17] diffejulia__compute___fused__2929_inner_1wrap
    @ ~/code/ZipNerf.jl/src/t.jl:0
 [18] macro expansion
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/0SYwj/src/compiler.jl:9619 [inlined]
 [19] enzyme_call
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/0SYwj/src/compiler.jl:9297 [inlined]
 [20] CombinedAdjointThunk
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/0SYwj/src/compiler.jl:9260 [inlined]
 [21] autodiff
    @ Enzyme ~/.julia/packages/Enzyme/0SYwj/src/Enzyme.jl:213 [inlined]
 [22] autodiff
    @ Enzyme ~/.julia/packages/Enzyme/0SYwj/src/Enzyme.jl:236 [inlined]
 [23] autodiff
    @ Enzyme ~/.julia/packages/Enzyme/0SYwj/src/Enzyme.jl:222 [inlined]
 [24] _pullback
    @ Main ~/code/ZipNerf.jl/src/t.jl:88 [inlined]
 [25] ZBack
    @ Zygote ~/.julia/packages/Zygote/4SSHS/src/compiler/chainrules.jl:211 [inlined]
 [26] kw_zpullback
    @ Zygote ~/.julia/packages/Zygote/4SSHS/src/compiler/chainrules.jl:237 [inlined]
 [27] #2
    @ Main ~/code/ZipNerf.jl/src/t.jl:25 [inlined]
 [28] (::Zygote.var"#75#76"{Zygote.Pullback{Tuple{}, Tuple{}}})(Δ::Float32)
    @ Zygote ~/.julia/packages/Zygote/4SSHS/src/compiler/interface.jl:45
 [29] gradient(f::Function, args::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer})
    @ Zygote ~/.julia/packages/Zygote/4SSHS/src/compiler/interface.jl:97
 [30] main()
    @ Main ~/code/ZipNerf.jl/src/t.jl:24
 [31] top-level scope
    @ REPL[2]:1
 [32] top-level scope
    @ ~/.julia/packages/CUDA/35NC6/src/initialization.jl:190
Some type information was truncated. Use `show(err)` to see complete types.

Code:

using Adapt
using ChainRulesCore
using CUDA
using KernelAbstractions
using Zygote
using Enzyme

import KernelAbstractions as KA

function main()
    kab = CUDABackend()

    tdist = adapt(kab, reshape(collect(range(0f0, 1f0, 65)), :, 1))
    directions = adapt(kab, reshape([0f0, 0f0, 1f0], 3, 1))

    σ = adapt(kab, ones(Float32, 64, 1))
    ω = compute_α(σ; tdist, directions)
    ω2 = compute_α_fused(σ; tdist, directions)
    @assert all(ω .≈ ω2)

    g1 = Zygote.gradient(σ) do σ
        sum(compute_α(σ; tdist, directions))
    end
    g2 = Zygote.gradient(σ) do σ
        sum(compute_α_fused(σ; tdist, directions))
    end
    @assert all(g1[1] .≈ g2[1])
    return
end

function compute_α(σ; tdist, directions)
    kab = get_backend(σ)
    N = size(σ, 2)

    tδ = tdist[2:end, :] .- tdist[1:end - 1, :]
    δ = tδ .* sqrt.(sum(directions.^2; dims=1))
    σδ = σ .* δ

    α = 1f0 .- exp.(-σδ)
    T = vcat(
        @ignore_derivatives(KA.ones(kab, Float32, 1, N)),
        exp.(-cumsum(σδ[1:end - 1, :]; dims=1)))
    ω = α .* T
    return ω
end

function compute_α_fused(
    σ::AbstractMatrix{Float32}; tdist::AbstractMatrix{Float32},
    directions::AbstractMatrix{Float32},
)
    ω = KA.allocate(get_backend(σ), Float32, size(σ))
    _compute_α_fused!(ω, σ, tdist, directions)
    return ω
end

_compute_α_fused!(ω, σ, tdist, directions) =
    _compute_α!(get_backend(ω))(ω, σ, tdist, directions; ndrange=size(ω, 2))

@kernel function _compute_α!(
    # Output.
    ω::AbstractMatrix{Float32},
    # Input.
    Σ::AbstractMatrix{Float32},
    tdist::AbstractMatrix{Float32},
    directions::AbstractMatrix{Float32},
)
    @uniform K = size(ω, 1)

    i = @index(Global)
    δ_scale = sqrt(directions[1, i]^2 + directions[2, i]^2 + directions[3, i]^2)

    T::Float32 = 1f0
    for k in 1:K
        σ = Σ[k, i]
        tδ = tdist[k + 1, i] - tdist[k, i]
        δ = tδ * δ_scale

        α = 1f0 - exp(-σ * δ)
        ω[k, i] = α * T
        T *= 1f0 - α
    end
end

function ChainRulesCore.rrule(::typeof(compute_α_fused), σ; tdist, directions)
    ω = compute_α_fused(σ; tdist, directions)
    function _pullback(Δ)
        ∂σ = KA.allocate(get_backend(ω), eltype(ω), size(ω))
        Enzyme.autodiff(Reverse, _compute_α_fused!,
            Duplicated(ω, Δ), Duplicated(σ, ∂σ),
            Const(tdist), Const(directions))
        return NoTangent(), ∂σ
    end
    ω, _pullback
end
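
For context, both compute_α and compute_α_fused evaluate the standard alpha-compositing weights for sample k along a ray (restating the code above):

    δ_k = (t_{k+1} - t_k) · ‖direction‖
    α_k = 1 - exp(-σ_k · δ_k)
    T_k = ∏_{j<k} (1 - α_j) = exp(-∑_{j<k} σ_j · δ_j)
    ω_k = α_k · T_k

The fused kernel keeps the transmittance T as a running product per ray instead of materializing the cumulative sum.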
@vchuravy
Member

I think this is due to the EnzymeRules for KernelAbstractions not supporting reverse mode yet

@pxl-th
Author

pxl-th commented Sep 18, 2023

Oh, I see. I saw the reverse-mode tests in KernelAbstractions and thought it worked.

@wsmoses
Member

wsmoses commented Sep 18, 2023

The KA custom rule is implemented for any backend in forward mode, and only for the CPU backend in reverse.
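
For example, reverse mode through the custom rule on the CPU backend looks roughly like this (a minimal sketch; the kernel square! and wrapper square_caller! are illustrative names, not from this issue):

using KernelAbstractions
using Enzyme
import KernelAbstractions as KA

@kernel function square!(x)
    i = @index(Global)
    x[i] *= x[i]
end

# Host-side wrapper that launches the kernel; the KA custom rule lets Enzyme
# differentiate through this call.
function square_caller!(x, backend)
    square!(backend)(x; ndrange=length(x))
    KA.synchronize(backend)
    return nothing
end

backend = CPU()
x = collect(Float32, 1:8)
dx = ones(Float32, 8)   # adjoint seed for x
Enzyme.autodiff(Reverse, square_caller!, Duplicated(x, dx), Const(backend))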

@vchuravy
Member

I don't actually remember what was needed for reverse GPU support

@wsmoses
Member

wsmoses commented Sep 18, 2023 via email

@pxl-th
Author

pxl-th commented Sep 18, 2023

I think this is due to the EnzymeRules for KernelAbstractions not supporting reverse mode yet

Actually, is this also the case if I want to differentiate just the kernel (no host code involved)?

@wsmoses
Member

wsmoses commented Sep 18, 2023

nope that would be fine

@pxl-th
Author

pxl-th commented Sep 18, 2023

I see there are reverse-mode tests for CUDA.jl:

autodiff_deferred(Reverse, mul_kernel, Const, Duplicated(A, dA))

But when I try the same with KA, it errors:

ERROR: return type is Union{}, giving up.
Stacktrace:
 [1] error(s::String)
   @ Base ./error.jl:35
 [2] autodiff_deferred
   @ Main ~/.julia/packages/Enzyme/0SYwj/src/Enzyme.jl:456 [inlined]
 [3] autodiff_deferred
   @ Main ~/.julia/packages/Enzyme/0SYwj/src/Enzyme.jl:442 [inlined]
 [4] main2()
   @ Main ~/code/t.jl:110
 [5] top-level scope
   @ REPL[3]:1
 [6] top-level scope
   @ ~/.julia/packages/CUDA/35NC6/src/initialization.jl:190
using CUDA
using KernelAbstractions
using Enzyme
import KernelAbstractions as KA

@kernel function ker(x)
    i = @index(Global)
    x[i] *= x[i]
end

function main()
    kab = CUDABackend()
    x = KA.ones(kab, Float32, 16)
    dx = KA.ones(kab, Float32, 16)
    Enzyme.autodiff_deferred(Reverse, ker(kab), Duplicated(x, dx))
    return
end
main()

I'm probably doing things incorrectly, but I haven't found an example with KA that differentiates just a single kernel... :/

@pxl-th
Author

pxl-th commented Sep 18, 2023

Actually, the CUDA.jl test also gives this error:

function mul_kernel(A)
    i = threadIdx().x
    if i <= length(A)
        A[i] *= A[i]
    end
    return nothing
end

function main()
    A = CUDA.ones(64,)
    dA = CUDA.ones(64,)
    autodiff_deferred(Reverse, mul_kernel, Const, Duplicated(A, dA))
    return
end

I'm using CUDA 4.4.1, Enzyme 0.11.7 and Julia 1.10-beta2

@pxl-th
Author

pxl-th commented Sep 19, 2023

So I got confused, but with CUDA.jl, if you wrap the kernel and the autodiff call like this:

function mul_kernel(A)
    i = threadIdx().x
    A[i] *= A[i]
    return nothing
end


function grad(A, dA)
    autodiff_deferred(Reverse, mul_kernel, Duplicated(A, dA))
    return nothing
end

And call @cuda threads=length(A) grad(A, dA), then it works (which is still a bit confusing); a self-contained sketch of this pattern is below.
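
Putting those pieces together, the host side is roughly (a sketch assembled from the snippets above):

using CUDA, Enzyme

A  = CUDA.ones(64)
dA = CUDA.ones(64)                   # adjoint seed for A
@cuda threads=length(A) grad(A, dA)  # mul_kernel and grad as defined above
CUDA.synchronize()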

But with KernelAbstractions I cannot figure out how to do this.
The only example involves host code:
https://github.com/JuliaGPU/KernelAbstractions.jl/blob/3165d35b9b707e73d19e7f8fc9f442bafaf415ac/test/extensions/enzyme.jl#L10

Is there a way to AD just the kernel?

@pxl-th
Author

pxl-th commented Sep 19, 2023

@wsmoses, sorry for spamming, but are there any examples with KA not involving host code (just the kernel)?

@wsmoses
Member

wsmoses commented Sep 23, 2023

You should be able to use autodiff_deferred inside the kernel itself (like your grad case). The KA example you showed is for the nicer custom-rules support, but that's only enabled for forward mode in KA.jl right now.

For reverse mode, you'll have to set it up manually like your mul_kernel above, where the autodiff call is entirely inside the device code.
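
A minimal sketch of that pattern with KA (shown on the CPU backend for brevity; the names apply! and grad_kernel! are illustrative):

using KernelAbstractions
using Enzyme
import KernelAbstractions as KA

@inline function apply!(x, i)
    x[i] *= x[i]
    return nothing
end

@kernel function grad_kernel!(x, dx)
    i = @index(Global)
    # The autodiff call lives entirely inside the device code.
    Enzyme.autodiff_deferred(Reverse, apply!, Const, Duplicated(x, dx), Const(i))
end

backend = CPU()
x  = KA.ones(backend, Float32, 16)
dx = KA.ones(backend, Float32, 16)   # adjoint seed
grad_kernel!(backend)(x, dx; ndrange=length(x))
KA.synchronize(backend)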

@pxl-th
Author

pxl-th commented Sep 25, 2023

autodiff call is inside the device code entirely

Oh, I see! Now it works! A note somewhere in the docs might be useful (unless I missed one).
Thanks for the help!

@pxl-th
Author

pxl-th commented Sep 25, 2023

It works for mul_kernel, but fails with more complex kernels, for example ones using the sin function.

Error:

ERROR: InvalidIRError: compiling MethodInstance for gpu_gker(::KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, ::AMDGPU.Device.ROCDeviceVector{Float32, 1}, ::AMDGPU.Device.ROCDeviceVector{Float32, 1}) resulted in invalid LLVM IR
Reason: unsupported call through a literal pointer (call to )
Stacktrace:
  [1] #sin
    @ ~/.julia/dev/AMDGPU/src/device/gcn/math.jl:32
  [2] ker
    @ ~/code/ZipNerf.jl/t.jl:7
  [3] ker
    @ ~/code/ZipNerf.jl/t.jl:0
  [4] diffejulia_ker_5228_inner_1wrap
    @ ~/code/ZipNerf.jl/t.jl:0
  [5] macro expansion
    @ ~/.julia/packages/Enzyme/VS5jo/src/compiler.jl:9774
  [6] enzyme_call
    @ ~/.julia/packages/Enzyme/VS5jo/src/compiler.jl:9452
  [7] CombinedAdjointThunk
    @ ~/.julia/packages/Enzyme/VS5jo/src/compiler.jl:9415
  [8] autodiff_deferred
    @ ~/.julia/packages/Enzyme/VS5jo/src/Enzyme.jl:372
  [9] autodiff_deferred
    @ ~/.julia/packages/Enzyme/VS5jo/src/Enzyme.jl:459
 [10] autodiff_deferred
    @ ~/.julia/packages/Enzyme/VS5jo/src/Enzyme.jl:442
 [11] macro expansion
    @ ~/code/ZipNerf.jl/t.jl:18
 [12] gpu_gker
    @ ~/.julia/packages/KernelAbstractions/cWlFz/src/macros.jl:90
 [13] gpu_gker
    @ ./none:0
Reason: unsupported call through a literal pointer (call to )
Stacktrace:
 [1] #sin
   @ ~/.julia/dev/AMDGPU/src/device/gcn/math.jl:32
 [2] ker
   @ ~/code/ZipNerf.jl/t.jl:7
 [3] ker
   @ ~/code/ZipNerf.jl/t.jl:0
 [4] diffejulia_ker_5228_inner_1wrap
   @ ~/code/ZipNerf.jl/t.jl:0
...

Code:

using AMDGPU
using KernelAbstractions
using Enzyme
import KernelAbstractions as KA

@inline function ker(x, i)
    x[i] *= sin(x[i])
    return
end

@kernel function fker(x)
    i = @index(Global)
    ker(x, i)
end

@kernel function gker(x, dx)
    i = @index(Global)
    Enzyme.autodiff_deferred(Reverse, ker, Duplicated(x, dx), i)
end

function main()
    kab = ROCBackend()
    x = KA.ones(kab, Float32, 16)
    dx = KA.ones(kab, Float32, 16)

    fker(kab)(x; ndrange=length(x))
    @show x
    gker(kab)(x, dx; ndrange=length(x))
    @show dx
    return
end

@wsmoses
Member

wsmoses commented Sep 25, 2023 via email

@pxl-th
Author

pxl-th commented Sep 25, 2023

Yeah that's the same as #683

Just curious if the fix is coming relatively soon or is it more involved?

@wsmoses
Member

wsmoses commented Oct 8, 2023

It's unfortunately more involved.

@aviatesk do you have cycles to help us with the nested abstract interpreter issues?

cc @ChrisRackauckas

@wsmoses
Member

wsmoses commented Jun 18, 2024

@pxl-th the AMDGPU issues are resolved by #1537

@wsmoses wsmoses closed this as completed Jun 18, 2024