Skip to content

cmd/compile: performance regression in 1.20 - inlining regression due to missed export #57505

Open
@changkun

Description

@changkun

What version of Go are you using (go version)?

$ go version
go version go1.19.4 linux/amd64
$ gotip version
go version devel go1.20-e870de9 Tue Dec 27 21:10:04 2022 +0000 linux/amd64

Does this issue reproduce with the latest release?

Yes, also in 1.20rc1

What did you do?

$ cat go.mod
module mymodule/math

go 1.20
$ cat math.go
package math

import (
        "math"
        "math/rand"
)

// Epsilon is the tolerance that Mat4.Eq passes to ApproxEq for every
// field comparison.
const Epsilon = 1e-7

// Float is a type constraint satisfied by any type whose underlying type
// is float32 or float64.
type Float interface {
        ~float32 | ~float64
}

// Mat4 is a generic struct with sixteen fields of type T, named X00
// through X33.
//
// NOTE(review): the names suggest Xij is row i, column j of a 4x4 matrix;
// confirm before relying on that layout. The benchmark builds values with
// positional composite literals, so the declaration order here matters.
type Mat4[T Float] struct {
        X00, X01, X02, X03 T
        X10, X11, X12, X13 T
        X20, X21, X22, X23 T
        X30, X31, X32, X33 T
}

// Eq reports whether every field of m is within Epsilon of the same-named
// field of n, as decided by ApproxEq.
//
// The comparisons are joined with &&, so evaluation stops at the first
// pair that ApproxEq rejects. Fields are visited in the order X00, X10,
// X20, X30, then X01, X11, ... (second index outermost).
//
// This function is the subject of the reported regression; its exact
// shape is intentional for the reproduction.
func (m Mat4[T]) Eq(n Mat4[T]) bool {
        return ApproxEq(m.X00, n.X00, Epsilon) &&
                ApproxEq(m.X10, n.X10, Epsilon) &&
                ApproxEq(m.X20, n.X20, Epsilon) &&
                ApproxEq(m.X30, n.X30, Epsilon) &&
                ApproxEq(m.X01, n.X01, Epsilon) &&
                ApproxEq(m.X11, n.X11, Epsilon) &&
                ApproxEq(m.X21, n.X21, Epsilon) &&
                ApproxEq(m.X31, n.X31, Epsilon) &&
                ApproxEq(m.X02, n.X02, Epsilon) &&
                ApproxEq(m.X12, n.X12, Epsilon) &&
                ApproxEq(m.X22, n.X22, Epsilon) &&
                ApproxEq(m.X32, n.X32, Epsilon) &&
                ApproxEq(m.X03, n.X03, Epsilon) &&
                ApproxEq(m.X13, n.X13, Epsilon) &&
                ApproxEq(m.X23, n.X23, Epsilon) &&
                ApproxEq(m.X33, n.X33, Epsilon)
}

// Abs returns the absolute value of x. It converts x to float64, calls
// math.Abs, and converts the result back to T.
func Abs[T Float](x T) T {
        return T(math.Abs(float64(x)))
}

// ApproxEq reports whether the absolute difference between v1 and v2 is
// less than or equal to epsilon. The subtraction is performed in type T
// before the result is handed to Abs.
func ApproxEq[T Float](v1, v2, epsilon T) bool {
        return Abs(v1-v2) <= epsilon
}

// Vec4 is a generic struct with four fields of type T: X, Y, Z and W.
type Vec4[T Float] struct {
        X, Y, Z, W T
}

// NewRandVec4 returns a Vec4 whose X, Y, Z and W fields are each set from
// a separate rand.Float64 call, converted to T. The literal is positional,
// so the four calls fill the fields in declaration order.
func NewRandVec4[T Float]() Vec4[T] {
        return Vec4[T]{
                T(rand.Float64()),
                T(rand.Float64()),
                T(rand.Float64()),
                T(rand.Float64()),
        }
}

// Dot returns the sum of the component-wise products of v and u:
// v.X*u.X + v.Y*u.Y + v.Z*u.Z + v.W*u.W.
//
// The sum is built from three nested FMA calls. The innermost term,
// v.W*u.W, is an ordinary multiplication in type T.
func (v Vec4[T]) Dot(u Vec4[T]) T {
        return FMA(v.X, u.X, FMA(v.Y, u.Y, FMA(v.Z, u.Z, v.W*u.W)))
}

// FMA returns x*y + z. It converts all three arguments to float64, calls
// math.FMA, and converts the result back to T.
func FMA[T Float](x, y, z T) T {
        return T(math.FMA(float64(x), float64(y), float64(z)))
}
$ cat bench_test.go 
package math_test

import (
        "testing"

        "mymodule/math"
)

// BenchmarkMat4_Eq measures Mat4[float32].Eq on two matrices built from
// identical literals. Because every field pair is equal, no comparison in
// Eq short-circuits and all sixteen ApproxEq calls are evaluated on each
// iteration.
func BenchmarkMat4_Eq(b *testing.B) {
        m1 := math.Mat4[float32]{
                5, 1, 5, 6,
                8, 71, 2, 47,
                5, 1, 582, 4,
                2, 1, 7, 25,
        }
        m2 := math.Mat4[float32]{
                5, 1, 5, 6,
                8, 71, 2, 47,
                5, 1, 582, 4,
                2, 1, 7, 25,
        }

        // Exclude the setup above from the timing and report allocations.
        b.ResetTimer()
        b.ReportAllocs()
        var m bool
        for i := 0; i < b.N; i++ {
                m = m1.Eq(m2)
        }
        // The result is assigned to a local and then discarded.
        // NOTE(review): a blank assignment may not stop the compiler from
        // eliminating the call; confirm the loop body survives optimisation.
        _ = m
}

// v is a package-level variable that BenchmarkVec_Dot stores each Dot
// result into.
var v float32

// BenchmarkVec_Dot runs a single sub-benchmark named "Vec4" that measures
// Vec4[float32].Dot on two vectors produced by NewRandVec4.
func BenchmarkVec_Dot(b *testing.B) {
        b.Run("Vec4", func(b *testing.B) {
                v1 := math.NewRandVec4[float32]()
                v2 := math.NewRandVec4[float32]()

                // Report allocations and exclude vector construction from the timing.
                b.ReportAllocs()
                b.ResetTimer()
                for i := 0; i < b.N; i++ {
                        // Store into the package-level v rather than a local.
                        v = v1.Dot(v2)
                }
        })
}
$ perflock go test -run=none -bench=. -count=10 | tee bench119.txt
goos: linux
goarch: amd64
pkg: mymodule/math
cpu: Intel(R) Core(TM) i7-9700K CPU @ 3.60GHz
BenchmarkMat4_Eq-8      64214283                18.66 ns/op            0 B/op          0 allocs/op
BenchmarkMat4_Eq-8      64270538                18.66 ns/op            0 B/op          0 allocs/op
BenchmarkMat4_Eq-8      64261249                18.66 ns/op            0 B/op          0 allocs/op
...
$ perflock gotip test -run=none -bench=. -count=10 | tee bench120.txt
goos: linux
goarch: amd64
pkg: mymodule/math
cpu: Intel(R) Core(TM) i7-9700K CPU @ 3.60GHz
BenchmarkMat4_Eq-8      35130938                35.00 ns/op            0 B/op          0 allocs/op
BenchmarkMat4_Eq-8      35127861                34.20 ns/op            0 B/op          0 allocs/op
BenchmarkMat4_Eq-8      34658744                34.21 ns/op            0 B/op          0 allocs/op
...

What did you expect to see?

Same performance.

What did you see instead?

$ benchstat bench119.txt bench120.txt
name            old time/op    new time/op    delta
Mat4_Eq-8         18.7ns ± 0%    34.2ns ± 0%   +83.24%  (p=0.000 n=8+9)
Vec_Dot/Vec4-8    4.44ns ± 0%    9.30ns ± 0%  +109.24%  (p=0.000 n=8+8)

name            old alloc/op   new alloc/op   delta
Mat4_Eq-8          0.00B          0.00B           ~     (all equal)
Vec_Dot/Vec4-8     0.00B          0.00B           ~     (all equal)

name            old allocs/op  new allocs/op  delta
Mat4_Eq-8           0.00           0.00           ~     (all equal)
Vec_Dot/Vec4-8      0.00           0.00           ~     (all equal)

Metadata

Metadata

Assignees

Labels

NeedsInvestigation — Someone must examine and confirm this is a valid issue and not a duplicate of an existing one.
Performance
compiler/runtime — Issues related to the Go compiler and/or runtime.

Type

No type

Projects

Status

Todo

Relationships

None yet

Development

No branches or pull requests

Issue actions