From 5ee69048dea0dc49948da007e281f312cbdfef72 Mon Sep 17 00:00:00 2001 From: Dennis Yatunin Date: Tue, 31 Jan 2023 12:02:07 -0800 Subject: [PATCH] Fix memory issues (remove copyto! inlining and enable scalar Tuple broadcasts) --- .buildkite/pipeline.yml | 44 ------------------- src/DataLayouts/broadcast.jl | 35 ++++++++------- src/Fields/broadcast.jl | 31 ++++++++++--- src/Operators/finitedifference.jl | 7 +-- test/Fields/field_opt.jl | 42 ++++++++++-------- .../finitedifference/opt_examples.jl | 4 +- 6 files changed, 72 insertions(+), 91 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 1bb0f16fe0..e4b65ce2e8 100755 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -46,8 +46,6 @@ steps: key: "cpu_unittests" command: - "julia --color=yes --check-bounds=yes --project=test test/runtests.jl" - agents: - slurm_mem: 64GB - label: ":computer: unit tests expensive" key: "cpu_unittests_expensive" @@ -55,15 +53,11 @@ steps: - "julia --color=yes --check-bounds=yes --project=test test/runtests_expensive.jl" env: OMPI_MCA_rmaps_base_oversubscribe: 1 - agents: - slurm_mem: 64GB - label: ":computer: Field broadcast performance" key: "cpu_field_perf" command: - "julia --color=yes --project=test test/Fields/field_opt.jl" - agents: - slurm_mem: 20GB - label: ":flower_playing_cards: unit tests" key: "gpu_unittests" @@ -72,21 +66,16 @@ steps: - "julia --color=yes --check-bounds=yes --project=test test/runtests.jl CUDA" agents: slurm_gres: "gpu:1" - slurm_mem: 32GB - label: ":computer: test implicit_stencil Float32" key: "cpu_implicit_stencil_float32" command: - "julia -O0 --color=yes --check-bounds=yes --project=test test/Operators/finitedifference/implicit_stencils.jl --float_type Float32" - agents: - slurm_mem: 10GB - label: ":computer: test implicit_stencil Float64" key: "cpu_implicit_stencil_float64" command: - "julia -O0 --color=yes --check-bounds=yes --project=test test/Operators/finitedifference/implicit_stencils.jl --float_type Float64" - agents: - slurm_mem: 10GB - label: ":computer: test implicit_stencil opt" key: "cpu_implicit_stencil_opt" @@ -335,8 +324,6 @@ steps: - "julia --color=yes --project=examples examples/hybrid/box/bubble_3d_rhotheta.jl" artifact_paths: - "examples/hybrid/box/output/bubble_3d_rhotheta/*" - agents: - slurm_mem: 20GB - label: ":computer: Rising Bubble 2D hybrid invariant (ρθ)" key: "cpu_rising_bubble_2d_hybrid_invariant_rhotheta" @@ -358,8 +345,6 @@ steps: - "julia --color=yes --project=examples examples/hybrid/box/bubble_3d_invariant_rhotheta.jl" artifact_paths: - "examples/hybrid/box/output/bubble_3d_invariant_rhotheta/*" - agents: - slurm_mem: 20GB - label: ":computer: Rising Bubble 3D hybrid invariant (ρe)" key: "cpu_rising_bubble_3d_hybrid_invariant_rhoe" @@ -552,8 +537,6 @@ steps: - "julia --color=yes --project=examples examples/hybrid/sphere/deformation_flow.jl" artifact_paths: - "examples/hybrid/sphere/output/deformation_flow/*" - agents: - slurm_mem: 20GB - label: ":computer: 3D sphere Hadley circulation" key: "cpu_3d_hadley_circulation" @@ -561,8 +544,6 @@ steps: - "julia --color=yes --project=examples examples/hybrid/sphere/hadley_circulation.jl" artifact_paths: - "examples/hybrid/sphere/output/hadley_circulation/*" - agents: - slurm_mem: 40GB - label: ":computer: Float 64 3D sphere baroclinic wave (ρe)" key: "cpu_baroclinic_wave_rho_e_float64" @@ -573,8 +554,6 @@ steps: env: TEST_NAME: "sphere/baroclinic_wave_rhoe" FLOAT_TYPE: "Float64" - agents: - slurm_mem: 40GB - label: ":computer: 3D sphere baroclinic wave (ρe)" key: "cpu_baroclinic_wave_rho_e" @@ -584,8 +563,6 @@ steps: - "examples/hybrid/sphere/output/baroclinic_wave_rhoe/Float32/*" env: TEST_NAME: "sphere/baroclinic_wave_rhoe" - agents: - slurm_mem: 10GB - label: ":computer: MPI 3D sphere baroclinic wave (ρe)" key: "cpu_mpi_baroclinic_wave_rho_e" @@ -598,7 +575,6 @@ steps: CLIMACORE_DISTRIBUTED: "MPI" agents: slurm_ntasks: 2 - slurm_mem: 20GB - label: ":computer: 3D sphere baroclinic wave (ρθ)" @@ -609,8 +585,6 @@ steps: - "examples/hybrid/sphere/output/baroclinic_wave_rhotheta/Float32/*" env: TEST_NAME: "sphere/baroclinic_wave_rhotheta" - agents: - slurm_mem: 10GB - label: ":computer: 3D sphere nonhydrostatic gravity wave" key: "cpu_nonhydrostatic_gravity_wave" @@ -632,8 +606,6 @@ steps: - "examples/hybrid/sphere/output/balanced_flow_rhoe/Float32/*" env: TEST_NAME: "sphere/balanced_flow_rhoe" - agents: - slurm_mem: 20GB - label: ":computer: 3D sphere hydrostatically and geostrophically balanced flow (ρθ)" key: "cpu_balanced_flow_rho_theta" @@ -652,8 +624,6 @@ steps: - "examples/hybrid/sphere/output/held_suarez_rhoe/Float32/*" env: TEST_NAME: "sphere/held_suarez_rhoe" - agents: - slurm_mem: 10GB - label: ":computer: Float64 3D sphere dry Held-Suarez (ρθ)" key: "cpu_held_suarez_rho_theta_float64" @@ -664,8 +634,6 @@ steps: env: TEST_NAME: "sphere/held_suarez_rhotheta" FLOAT_TYPE: "Float64" - agents: - slurm_mem: 20GB - label: ":computer: 3D sphere dry Held-Suarez (ρθ)" key: "cpu_held_suarez_rho_theta" @@ -675,8 +643,6 @@ steps: - "examples/hybrid/sphere/output/held_suarez_rhotheta/Float32/*" env: TEST_NAME: "sphere/held_suarez_rhotheta" - agents: - slurm_mem: 20GB - label: ":computer: 3D sphere dry Held-Suarez (ρe_int)" key: "cpu_held_suarez_rho_e_int" @@ -686,8 +652,6 @@ steps: - "examples/hybrid/sphere/output/held_suarez_rhoe_int/Float32/*" env: TEST_NAME: "sphere/held_suarez_rhoe_int" - agents: - slurm_mem: 10GB - group: "Examples hybrid plane" steps: @@ -700,8 +664,6 @@ steps: - "examples/hybrid/plane/output/inertial_gravity_wave/Float32/*" env: TEST_NAME: "plane/inertial_gravity_wave" - agents: - slurm_mem: 20GB - label: ":computer: stretched 2D plane inertial gravity wave" key: "cpu_stretch_inertial_gravity_wave" @@ -712,8 +674,6 @@ steps: env: TEST_NAME: "plane/inertial_gravity_wave" Z_STRETCH: "true" - agents: - slurm_mem: 10GB - group: "Analysis" steps: @@ -724,8 +684,6 @@ steps: - "julia --color=yes --project=perf perf/allocs.jl" artifact_paths: - "perf/allocations_output/*" - agents: - slurm_mem: 40GB - label: ":rocket::computer: Flamegraph profile" key: "cpu_flamegraph" @@ -745,8 +703,6 @@ steps: key: "cpu_invalidations" command: - "julia --color=yes --project=perf perf/invalidations.jl" - agents: - slurm_mem: 20GB - wait - label: ":chart_with_downwards_trend: build history" diff --git a/src/DataLayouts/broadcast.jl b/src/DataLayouts/broadcast.jl index 01e504b220..69d7308bf5 100644 --- a/src/DataLayouts/broadcast.jl +++ b/src/DataLayouts/broadcast.jl @@ -407,10 +407,13 @@ end # broadcasting scalar assignment # Performance optimization for the common identity scalar case: dest .= val -@inline function Base.copyto!( +function Base.copyto!( dest::AbstractData, bc::Base.Broadcast.Broadcasted{Style}, -) where {Style <: Base.Broadcast.AbstractArrayStyle{0}} +) where { + Style <: + Union{Base.Broadcast.AbstractArrayStyle{0}, Base.Broadcast.Style{Tuple}}, +} bc = Base.Broadcast.instantiate( Base.Broadcast.Broadcasted{Style}(bc.f, bc.args, ()), ) @@ -418,7 +421,7 @@ end fill!(dest, bc0) end -@inline function Base.copyto!( +function Base.copyto!( dest::DataF{S}, bc::Union{DataF{S, A}, Base.Broadcast.Broadcasted{DataFStyle{A}}}, ) where {S, A} @@ -426,7 +429,7 @@ end return dest end -@inline function Base.copyto!( +function Base.copyto!( dest::IJFH{S, Nij}, bc::Union{IJFH{S, Nij}, Base.Broadcast.Broadcasted{<:IJFHStyle{Nij}}}, ) where {S, Nij} @@ -439,7 +442,7 @@ end return dest end -@inline function Base.copyto!( +function Base.copyto!( dest::IFH{S, Ni}, bc::Union{IFH{S, Ni}, Base.Broadcast.Broadcasted{<:IFHStyle{Ni}}}, ) where {S, Ni} @@ -453,7 +456,7 @@ end end # inline inner slab(::DataSlab2D) copy -@inline function Base.copyto!( +function Base.copyto!( dest::IJF{S, Nij}, bc::Union{IJF{S, Nij, A}, Base.Broadcast.Broadcasted{IJFStyle{Nij, A}}}, ) where {S, Nij, A} @@ -465,7 +468,7 @@ end end # inline inner slab(::DataSlab1D) copy -@inline function Base.copyto!( +function Base.copyto!( dest::IF{S, Ni}, bc::Base.Broadcast.Broadcasted{IFStyle{Ni, A}}, ) where {S, Ni, A} @@ -477,7 +480,7 @@ end end # inline inner column(::DataColumn) copy -@inline function Base.copyto!( +function Base.copyto!( dest::VF{S}, bc::Union{VF{S, A}, Base.Broadcast.Broadcasted{VFStyle{A}}}, ) where {S, A} @@ -489,7 +492,7 @@ end return dest end -@inline function _serial_copyto!( +function _serial_copyto!( dest::VIFH{S, Ni}, bc::Union{VIFH{S, Ni, A}, Base.Broadcast.Broadcasted{VIFHStyle{Ni, A}}}, ) where {S, Ni, A} @@ -503,7 +506,7 @@ end return dest end -@inline function _threaded_copyto!( +function _threaded_copyto!( dest::VIFH{S, Ni}, bc::Base.Broadcast.Broadcasted{VIFHStyle{Ni, A}}, ) where {S, Ni, A} @@ -522,14 +525,14 @@ end return dest end -@inline function Base.copyto!( +function Base.copyto!( dest::VIFH{S, Ni}, source::VIFH{S, Ni, A}, ) where {S, Ni, A} return _serial_copyto!(dest, source) end -@inline function Base.copyto!( +function Base.copyto!( dest::VIFH{S, Ni}, bc::Base.Broadcast.Broadcasted{VIFHStyle{Ni, A}}, ) where {S, Ni, A} @@ -539,7 +542,7 @@ end return _serial_copyto!(dest, bc) end -@inline function _serial_copyto!( +function _serial_copyto!( dest::VIJFH{S, Nij}, bc::Union{VIJFH{S, Nij, A}, Base.Broadcast.Broadcasted{VIJFHStyle{Nij, A}}}, ) where {S, Nij, A} @@ -553,7 +556,7 @@ end return dest end -@inline function _threaded_copyto!( +function _threaded_copyto!( dest::VIJFH{S, Nij}, bc::Base.Broadcast.Broadcasted{VIJFHStyle{Nij, A}}, ) where {S, Nij, A} @@ -572,14 +575,14 @@ end return dest end -@inline function Base.copyto!( +function Base.copyto!( dest::VIJFH{S, Nij}, source::VIJFH{S, Nij, A}, ) where {S, Nij, A} return _serial_copyto!(dest, source) end -@inline function Base.copyto!( +function Base.copyto!( dest::VIJFH{S, Nij}, bc::Base.Broadcast.Broadcasted{VIJFHStyle{Nij, A}}, ) where {S, Nij, A} diff --git a/src/Fields/broadcast.jl b/src/Fields/broadcast.jl index fc0d247fa2..9ba25dc58d 100644 --- a/src/Fields/broadcast.jl +++ b/src/Fields/broadcast.jl @@ -18,10 +18,15 @@ FieldStyle(x::Base.Broadcast.Unknown) = x Base.Broadcast.BroadcastStyle(::Type{Field{V, S}}) where {V, S} = FieldStyle(DataStyle(V)) +# Broadcasting over scalars (Ref or Tuple) Base.Broadcast.BroadcastStyle( ::Base.Broadcast.AbstractArrayStyle{0}, fs::AbstractFieldStyle, ) = fs +Base.Broadcast.BroadcastStyle( + ::Base.Broadcast.Style{Tuple}, + fs::AbstractFieldStyle, +) = fs Base.Broadcast.BroadcastStyle( ::FieldStyle{DS1}, @@ -42,15 +47,14 @@ _first(data::DataLayouts.VF) = data[] _first(field::Field) = _first_data_layout(field_values(column(field, 1, 1, 1))) _first(space::Spaces.AbstractSpace) = _first_data_layout(field_values(column(space, 1, 1, 1))) -_first(bc::Base.Broadcast.Broadcasted) = _first.(bc.args) # Is this case necessary? -_first(x::Base.RefValue{T}) where {T} = x -unref(x::Ref{T}) where {T} = x.x -unref(T) = T +_first(bc::Base.Broadcast.Broadcasted) = _first(copy(bc)) +_first(x::Ref{T}) where {T} = x.x +_first(x::Tuple{T}) where {T} = x[1] function call_with_first(bc) # Try calling with first applied to all arguments: bc′ = Base.Broadcast.preprocess(nothing, bc) - first_args = map(arg -> unref(_first(arg)), bc′.args) + first_args = map(arg -> _first(arg), bc′.args) bc.f(first_args...) end @@ -183,8 +187,8 @@ end end return space1 end -@inline Base.Broadcast.broadcast_shape(space::AbstractSpace, ::Tuple{}) = space -@inline Base.Broadcast.broadcast_shape(::Tuple{}, space::AbstractSpace) = space +@inline Base.Broadcast.broadcast_shape(space::AbstractSpace, ::Tuple) = space +@inline Base.Broadcast.broadcast_shape(::Tuple, space::AbstractSpace) = space @inline Base.Broadcast.broadcast_shape( pointspace::AbstractPointSpace, @@ -232,6 +236,12 @@ end ) return nothing end +@inline function Base.Broadcast.check_broadcast_shape( + ::AbstractSpace, + ::Tuple{T}, +) where {T} + return nothing +end @inline function Base.Broadcast.check_broadcast_shape( ::AbstractSpace, ::AbstractPointSpace, @@ -380,6 +390,13 @@ function Base.Broadcast.copyto!( copyto!(Fields.field_values(field), bc) return field end +function Base.Broadcast.copyto!( + field::Field, + bc::Base.Broadcast.Broadcasted{Base.Broadcast.Style{Tuple}}, +) + copyto!(Fields.field_values(field), bc) + return field +end function Base.Broadcast.copyto!(field::Field, nt::NamedTuple) copyto!( diff --git a/src/Operators/finitedifference.jl b/src/Operators/finitedifference.jl index 6e8fd39a25..27851f9c1a 100644 --- a/src/Operators/finitedifference.jl +++ b/src/Operators/finitedifference.jl @@ -3109,6 +3109,7 @@ Base.@propagate_inbounds function getidx( end # unwap boxed scalars +@inline getidx(scalar::Tuple{T}, loc::Location, idx, hidx) where {T} = scalar[1] @inline getidx(scalar::Ref, loc::Location, idx, hidx) = scalar[] @inline getidx(field::Fields.PointField, loc::Location, idx, hidx) = field[] @inline getidx(field::Fields.PointField, loc::Location, idx) = field[] @@ -3275,7 +3276,7 @@ function Base.similar( return Field(Eltype, sp) end -@inline function _serial_copyto!( +function _serial_copyto!( field_out::Field, bc::Base.Broadcast.Broadcasted{S}, Ni::Int, @@ -3288,7 +3289,7 @@ end return field_out end -@inline function _threaded_copyto!( +function _threaded_copyto!( field_out::Field, bc::Base.Broadcast.Broadcasted{S}, Ni::Int, @@ -3305,7 +3306,7 @@ end return field_out end -@inline function Base.copyto!( +function Base.copyto!( field_out::Field, bc::Base.Broadcast.Broadcasted{S}, ) where {S <: AbstractStencilStyle} diff --git a/test/Fields/field_opt.jl b/test/Fields/field_opt.jl index f2bd65baea..8539373b79 100644 --- a/test/Fields/field_opt.jl +++ b/test/Fields/field_opt.jl @@ -19,15 +19,15 @@ end include(joinpath(@__DIR__, "util_spaces.jl")) # https://github.com/CliMA/ClimaCore.jl/issues/946 -@testset "Allocations with broadcasting Refs" begin +@testset "Allocations with broadcasting Scalars" begin FT = Float64 function foo!(Yx::Fields.Field) - Yx .= Ref(1) .+ Yx + Yx .= (1,) .+ Yx return nothing end function foocolumn!(Yx::Fields.Field) Fields.bycolumn(axes(Yx)) do colidx - Yx[colidx] .= Ref(1) .+ Yx[colidx] + Yx[colidx] .= (1,) .+ Yx[colidx] nothing end return nothing @@ -58,7 +58,7 @@ end nothing end function callfill!(Y) - fill!(Y, Ref((; x = 2.0))) + fill!(Y, ((; x = 2.0),)) nothing end for space in all_spaces(FT) @@ -80,13 +80,13 @@ function allocs_test1!(Y) x = Y.x FT = Spaces.undertype(axes(x)) I = sc(FT) - x .= x .+ Ref(I) + x .= x .+ (I,) nothing end function allocs_test2!(Y) x = Y.x FT = Spaces.undertype(axes(x)) - IR = Ref(sc(FT)) + IR = (sc(FT),) @. x += IR nothing end @@ -96,7 +96,7 @@ function allocs_test1_column!(Y) FT = Spaces.undertype(axes(x)) # I = sc(FT) I = Operators.StencilCoefs{-1, 1}((zero(FT), one(FT), zero(FT))) - x[colidx] .= x[colidx] .+ Ref(I) + x[colidx] .= x[colidx] .+ (I,) end nothing end @@ -104,7 +104,7 @@ function allocs_test2_column!(Y) Fields.bycolumn(axes(Y.x)) do colidx x = Y.x FT = Spaces.undertype(axes(x)) - IR = Ref(sc(FT)) + IR = (sc(FT),) @. x[colidx] += IR end nothing @@ -119,10 +119,10 @@ end function allocs_test3_column!(x) FT = Spaces.undertype(axes(x)) - IR = Ref(Operators.StencilCoefs{-1, 1}((zero(FT), one(FT), zero(FT)))) + IR = (Operators.StencilCoefs{-1, 1}((zero(FT), one(FT), zero(FT))),) @. x += IR I = Operators.StencilCoefs{-1, 1}((zero(FT), one(FT), zero(FT))) - x .+= Ref(I) + x .+= (I,) nothing end @@ -154,9 +154,9 @@ end end nothing -function allocs_test_Ref_with_compose!(S, ∂ᶠ𝕄ₜ∂ᶜρ, ∂ᶜρₜ∂ᶠ𝕄) +function allocs_test_scalar_with_compose!(S, ∂ᶠ𝕄ₜ∂ᶜρ, ∂ᶜρₜ∂ᶠ𝕄) Fields.bycolumn(axes(S)) do colidx - allocs_test_Ref_with_compose_column!( + allocs_test_scalar_with_compose_column!( S[colidx], ∂ᶠ𝕄ₜ∂ᶜρ[colidx], ∂ᶜρₜ∂ᶠ𝕄[colidx], @@ -165,15 +165,15 @@ function allocs_test_Ref_with_compose!(S, ∂ᶠ𝕄ₜ∂ᶜρ, ∂ᶜρₜ∂ nothing end -function allocs_test_Ref_with_compose_column!(S, ∂ᶠ𝕄ₜ∂ᶜρ, ∂ᶜρₜ∂ᶠ𝕄) +function allocs_test_scalar_with_compose_column!(S, ∂ᶠ𝕄ₜ∂ᶜρ, ∂ᶜρₜ∂ᶠ𝕄) compose = Operators.ComposeStencils() FT = Spaces.undertype(axes(S)) - IR = Ref(Operators.StencilCoefs{-1, 1}((zero(FT), one(FT), zero(FT)))) + IR = (Operators.StencilCoefs{-1, 1}((zero(FT), one(FT), zero(FT))),) @. S = compose(∂ᶠ𝕄ₜ∂ᶜρ, ∂ᶜρₜ∂ᶠ𝕄) - IR nothing end -@testset "Allocations StencilCoefs Ref with ComposeStencils broadcasting" begin +@testset "Allocations StencilCoefs scalar with ComposeStencils broadcasting" begin FT = Float64 for space in all_spaces(FT) space isa Spaces.CenterExtrudedFiniteDifferenceSpace || continue @@ -185,12 +185,16 @@ end tridiag_type = Operators.StencilCoefs{-1, 1, NTuple{3, FT}} S = Fields.Field(tridiag_type, fspace) - allocs_test_Ref_with_compose!(S, ∂ᶠ𝕄ₜ∂ᶜρ, ∂ᶜρₜ∂ᶠ𝕄) - p = @allocated allocs_test_Ref_with_compose!(S, ∂ᶠ𝕄ₜ∂ᶜρ, ∂ᶜρₜ∂ᶠ𝕄) + allocs_test_scalar_with_compose!(S, ∂ᶠ𝕄ₜ∂ᶜρ, ∂ᶜρₜ∂ᶠ𝕄) + p = @allocated allocs_test_scalar_with_compose!(S, ∂ᶠ𝕄ₜ∂ᶜρ, ∂ᶜρₜ∂ᶠ𝕄) @test p == 0 - allocs_test_Ref_with_compose_column!(S, ∂ᶠ𝕄ₜ∂ᶜρ, ∂ᶜρₜ∂ᶠ𝕄) - p = @allocated allocs_test_Ref_with_compose_column!(S, ∂ᶠ𝕄ₜ∂ᶜρ, ∂ᶜρₜ∂ᶠ𝕄) + allocs_test_scalar_with_compose_column!(S, ∂ᶠ𝕄ₜ∂ᶜρ, ∂ᶜρₜ∂ᶠ𝕄) + p = @allocated allocs_test_scalar_with_compose_column!( + S, + ∂ᶠ𝕄ₜ∂ᶜρ, + ∂ᶜρₜ∂ᶠ𝕄, + ) @test p == 0 end end diff --git a/test/Operators/finitedifference/opt_examples.jl b/test/Operators/finitedifference/opt_examples.jl index 64590e65fa..757a11648f 100644 --- a/test/Operators/finitedifference/opt_examples.jl +++ b/test/Operators/finitedifference/opt_examples.jl @@ -98,7 +98,7 @@ function alloc_test_derivative(cfield, ffield, ∇c, ∇f) p = @allocated begin c∇closure() end - @test_broken p == 0 + @test p == 0 ##### C2F # wvec = Geometry.WVector # cannot re-define, otherwise many allocations @@ -168,7 +168,7 @@ function alloc_test_operators_in_loops(cfield, ffield) p = @allocated begin c∇closure() end - @test_broken p == 0 + @test p == 0 end end function alloc_test_nested_expressions_1(cfield, ffield)