
Commit

Merge branch 'upstream-master'
Datadog Syncup Service committed Jun 11, 2024
2 parents 68c64bb + aaaa86b commit 938bab3
Showing 160 changed files with 1,231 additions and 6,977 deletions.
62 changes: 42 additions & 20 deletions src/hotspot/cpu/aarch64/aarch64_vector.ad
@@ -135,9 +135,9 @@ source %{
         (opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
         (opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
         (opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
-        // The vector implementation of Op_AddReductionVD/F is for the Vector API only.
-        // It is not suitable for auto-vectorization because it does not add the elements
-        // in the same order as sequential code, and FP addition is non-associative.
+        // The implementations of Op_AddReductionVD/F in Neon are for the Vector API only.
+        // They are not suitable for auto-vectorization because the result would not conform
+        // to the JLS, Section Evaluation Order.
         opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
         opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
         opcode == Op_MulVL) {
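
Background on the comment change above: floating-point addition is not associative, so a reduction that reassociates elements can produce a different result than the left-to-right order the JLS mandates for sequential code. A minimal Java illustration (not part of this commit):

    public class FpAssociativity {
        public static void main(String[] args) {
            float a = 1.0e30f, b = -1.0e30f, c = 1.0f;
            System.out.println((a + b) + c); // 1.0 -- left-to-right, as the JLS requires
            System.out.println(a + (b + c)); // 0.0 -- reassociated, as a pairwise vector reduction might compute it
        }
    }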
@@ -2858,26 +2858,28 @@ instruct reduce_addL_sve(iRegLNoSp dst, iRegL isrc, vReg vsrc, vRegD tmp) %{
 %}

 // reduction addF
-// Floating-point addition is not associative, so the rules for AddReductionVF
-// on NEON can't be used to auto-vectorize floating-point reduce-add.
-// Currently, on NEON, AddReductionVF is only generated by Vector API.
-instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
-  predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 2);
+
+instruct reduce_non_strict_order_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
+  // Non-strictly ordered floating-point add reduction for a 64-bits-long vector. This rule is
+  // intended for the VectorAPI (which allows for non-strictly ordered add reduction).
+  predicate(Matcher::vector_length(n->in(2)) == 2 && !n->as_Reduction()->requires_strict_order());
   match(Set dst (AddReductionVF fsrc vsrc));
   effect(TEMP_DEF dst);
-  format %{ "reduce_add2F_neon $dst, $fsrc, $vsrc" %}
+  format %{ "reduce_non_strict_order_add2F_neon $dst, $fsrc, $vsrc" %}
   ins_encode %{
     __ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ S);
     __ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister);
   %}
   ins_pipe(pipe_slow);
 %}

-instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
-  predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 4);
+instruct reduce_non_strict_order_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
+  // Non-strictly ordered floating-point add reduction for 128-bits-long vector. This rule is
+  // intended for the VectorAPI (which allows for non-strictly ordered add reduction).
+  predicate(Matcher::vector_length(n->in(2)) == 4 && !n->as_Reduction()->requires_strict_order());
   match(Set dst (AddReductionVF fsrc vsrc));
   effect(TEMP_DEF dst, TEMP tmp);
-  format %{ "reduce_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
+  format %{ "reduce_non_strict_order_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
   ins_encode %{
     __ faddp($tmp$$FloatRegister, __ T4S, $vsrc$$FloatRegister, $vsrc$$FloatRegister);
     __ faddp($dst$$FloatRegister, $tmp$$FloatRegister, __ S);
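
The Vector API these NEON rules target documents its floating-point ADD reduction as not strictly ordered, which is what permits the pairwise faddp sequence above. A sketch of the kind of Java code that reaches these rules (the jdk.incubator.vector identifiers are the real API; the surrounding loop is illustrative only, not part of this commit):

    import jdk.incubator.vector.FloatVector;
    import jdk.incubator.vector.VectorOperators;
    import jdk.incubator.vector.VectorSpecies;

    public class VectorApiReduce {
        static final VectorSpecies<Float> S = FloatVector.SPECIES_128; // four float lanes

        static float sum(float[] a) {
            float acc = 0.0f;
            int i = 0;
            for (; i < S.loopBound(a.length); i += S.length()) {
                FloatVector v = FloatVector.fromArray(S, a, i);
                // reduceLanes(ADD) may reassociate lanes, so C2 is free to
                // match the non-strict faddp-based rules above.
                acc += v.reduceLanes(VectorOperators.ADD);
            }
            for (; i < a.length; i++) {
                acc += a[i]; // scalar tail
            }
            return acc;
        }
    }

Compiling this requires the incubator module (--add-modules jdk.incubator.vector).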
@@ -2886,11 +2888,21 @@ instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
   ins_pipe(pipe_slow);
 %}

+// This rule calculates the reduction result in strict order. Two cases will
+// reach here:
+// 1. Non strictly-ordered AddReductionVF when vector size > 128-bits. For example -
+//    AddReductionVF generated by Vector API. For vector size > 128-bits, it is more
+//    beneficial performance-wise to generate direct SVE instruction even if it is
+//    strictly ordered.
+// 2. Strictly-ordered AddReductionVF. For example - AddReductionVF generated by
+//    auto-vectorization on SVE machine.
 instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{
-  predicate(UseSVE > 0);
+  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
+            n->as_Reduction()->requires_strict_order());
   match(Set dst_src1 (AddReductionVF dst_src1 src2));
   format %{ "reduce_addF_sve $dst_src1, $dst_src1, $src2" %}
   ins_encode %{
+    assert(UseSVE > 0, "must be sve");
     uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2);
     assert(length_in_bytes == MaxVectorSize, "invalid vector length");
     __ sve_fadda($dst_src1$$FloatRegister, __ S, ptrue, $src2$$FloatRegister);
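
For contrast with the Vector API sketch above, the strictly ordered case this SVE rule covers: a plain Java reduction loop must accumulate left to right per the JLS, so when C2 auto-vectorizes it, only an ordered reduction such as sve_fadda is legal. A minimal sketch (not part of this commit; whether the loop actually vectorizes depends on flags and hardware):

    // Must keep the JLS left-to-right order; eligible only for the strictly
    // ordered SVE rule (sve_fadda), never the pairwise NEON rules.
    static float strictSum(float[] a) {
        float sum = 0.0f;
        for (int i = 0; i < a.length; i++) {
            sum += a[i];
        }
        return sum;
    }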
@@ -2899,26 +2911,36 @@ instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{
 %}

 // reduction addD
-// Floating-point addition is not associative, so the rule for AddReductionVD
-// on NEON can't be used to auto-vectorize floating-point reduce-add.
-// Currently, on NEON, AddReductionVD is only generated by Vector API.
-instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
-  predicate(UseSVE == 0);
+
+instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
+  // Non-strictly ordered floating-point add reduction for doubles. This rule is
+  // intended for the VectorAPI (which allows for non-strictly ordered add reduction).
+  predicate(!n->as_Reduction()->requires_strict_order());
   match(Set dst (AddReductionVD dsrc vsrc));
   effect(TEMP_DEF dst);
-  format %{ "reduce_addD_neon $dst, $dsrc, $vsrc\t# 2D" %}
+  format %{ "reduce_non_strict_order_add2D_neon $dst, $dsrc, $vsrc\t# 2D" %}
   ins_encode %{
     __ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ D);
     __ faddd($dst$$FloatRegister, $dst$$FloatRegister, $dsrc$$FloatRegister);
   %}
   ins_pipe(pipe_slow);
 %}

+// This rule calculates the reduction result in strict order. Two cases will
+// reach here:
+// 1. Non strictly-ordered AddReductionVD when vector size > 128-bits. For example -
+//    AddReductionVD generated by Vector API. For vector size > 128-bits, it is more
+//    beneficial performance-wise to generate direct SVE instruction even if it is
+//    strictly ordered.
+// 2. Strictly-ordered AddReductionVD. For example - AddReductionVD generated by
+//    auto-vectorization on SVE machine.
 instruct reduce_addD_sve(vRegD dst_src1, vReg src2) %{
-  predicate(UseSVE > 0);
+  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
+            n->as_Reduction()->requires_strict_order());
   match(Set dst_src1 (AddReductionVD dst_src1 src2));
   format %{ "reduce_addD_sve $dst_src1, $dst_src1, $src2" %}
   ins_encode %{
+    assert(UseSVE > 0, "must be sve");
     uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2);
     assert(length_in_bytes == MaxVectorSize, "invalid vector length");
     __ sve_fadda($dst_src1$$FloatRegister, __ D, ptrue, $src2$$FloatRegister);
50 changes: 31 additions & 19 deletions src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
@@ -125,9 +125,9 @@ source %{
         (opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
         (opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
         (opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
-        // The vector implementation of Op_AddReductionVD/F is for the Vector API only.
-        // It is not suitable for auto-vectorization because it does not add the elements
-        // in the same order as sequential code, and FP addition is non-associative.
+        // The implementations of Op_AddReductionVD/F in Neon are for the Vector API only.
+        // They are not suitable for auto-vectorization because the result would not conform
+        // to the JLS, Section Evaluation Order.
         opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
         opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
         opcode == Op_MulVL) {
@@ -1752,26 +1752,28 @@ REDUCE_ADD_INT_NEON_SVE_PAIRWISE(I, iRegIorL2I)
 REDUCE_ADD_INT_NEON_SVE_PAIRWISE(L, iRegL)

 // reduction addF
-// Floating-point addition is not associative, so the rules for AddReductionVF
-// on NEON can't be used to auto-vectorize floating-point reduce-add.
-// Currently, on NEON, AddReductionVF is only generated by Vector API.
-instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
-  predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 2);
+
+instruct reduce_non_strict_order_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
+  // Non-strictly ordered floating-point add reduction for a 64-bits-long vector. This rule is
+  // intended for the VectorAPI (which allows for non-strictly ordered add reduction).
+  predicate(Matcher::vector_length(n->in(2)) == 2 && !n->as_Reduction()->requires_strict_order());
   match(Set dst (AddReductionVF fsrc vsrc));
   effect(TEMP_DEF dst);
-  format %{ "reduce_add2F_neon $dst, $fsrc, $vsrc" %}
+  format %{ "reduce_non_strict_order_add2F_neon $dst, $fsrc, $vsrc" %}
   ins_encode %{
     __ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ S);
     __ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister);
   %}
   ins_pipe(pipe_slow);
 %}

-instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
-  predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 4);
+instruct reduce_non_strict_order_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
+  // Non-strictly ordered floating-point add reduction for 128-bits-long vector. This rule is
+  // intended for the VectorAPI (which allows for non-strictly ordered add reduction).
+  predicate(Matcher::vector_length(n->in(2)) == 4 && !n->as_Reduction()->requires_strict_order());
   match(Set dst (AddReductionVF fsrc vsrc));
   effect(TEMP_DEF dst, TEMP tmp);
-  format %{ "reduce_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
+  format %{ "reduce_non_strict_order_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
   ins_encode %{
     __ faddp($tmp$$FloatRegister, __ T4S, $vsrc$$FloatRegister, $vsrc$$FloatRegister);
     __ faddp($dst$$FloatRegister, $tmp$$FloatRegister, __ S);
@@ -1783,11 +1785,21 @@ dnl
 dnl REDUCE_ADD_FP_SVE($1, $2 )
 dnl REDUCE_ADD_FP_SVE(type, size)
 define(`REDUCE_ADD_FP_SVE', `
+// This rule calculates the reduction result in strict order. Two cases will
+// reach here:
+// 1. Non strictly-ordered AddReductionV$1 when vector size > 128-bits. For example -
+//    AddReductionV$1 generated by Vector API. For vector size > 128-bits, it is more
+//    beneficial performance-wise to generate direct SVE instruction even if it is
+//    strictly ordered.
+// 2. Strictly-ordered AddReductionV$1. For example - AddReductionV$1 generated by
+//    auto-vectorization on SVE machine.
 instruct reduce_add$1_sve(vReg$1 dst_src1, vReg src2) %{
-  predicate(UseSVE > 0);
+  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
+            n->as_Reduction()->requires_strict_order());
   match(Set dst_src1 (AddReductionV$1 dst_src1 src2));
   format %{ "reduce_add$1_sve $dst_src1, $dst_src1, $src2" %}
   ins_encode %{
+    assert(UseSVE > 0, "must be sve");
     uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2);
     assert(length_in_bytes == MaxVectorSize, "invalid vector length");
     __ sve_fadda($dst_src1$$FloatRegister, __ $2, ptrue, $src2$$FloatRegister);
@@ -1798,14 +1810,14 @@ dnl
 REDUCE_ADD_FP_SVE(F, S)

 // reduction addD
-// Floating-point addition is not associative, so the rule for AddReductionVD
-// on NEON can't be used to auto-vectorize floating-point reduce-add.
-// Currently, on NEON, AddReductionVD is only generated by Vector API.
-instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
-  predicate(UseSVE == 0);
+
+instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
+  // Non-strictly ordered floating-point add reduction for doubles. This rule is
+  // intended for the VectorAPI (which allows for non-strictly ordered add reduction).
+  predicate(!n->as_Reduction()->requires_strict_order());
   match(Set dst (AddReductionVD dsrc vsrc));
   effect(TEMP_DEF dst);
-  format %{ "reduce_addD_neon $dst, $dsrc, $vsrc\t# 2D" %}
+  format %{ "reduce_non_strict_order_add2D_neon $dst, $dsrc, $vsrc\t# 2D" %}
   ins_encode %{
     __ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ D);
     __ faddd($dst$$FloatRegister, $dst$$FloatRegister, $dsrc$$FloatRegister);
1 change: 1 addition & 0 deletions src/hotspot/cpu/aarch64/nativeInst_aarch64.hpp
@@ -168,6 +168,7 @@ class NativeCall: public NativeInstruction {
     return_address_offset = 4
   };

+  static int byte_size() { return instruction_size; }
   address instruction_address() const { return addr_at(instruction_offset); }
   address next_instruction_address() const { return addr_at(return_address_offset); }
   int displacement() const { return (int_at(displacement_offset) << 6) >> 4; }
1 change: 1 addition & 0 deletions src/hotspot/cpu/arm/nativeInst_arm_32.hpp
@@ -415,6 +415,7 @@ inline NativeJump* nativeJump_at(address address) {

 class NativeCall: public RawNativeCall {
  public:
+  static int byte_size() { return instruction_size; }
   // NativeCall::next_instruction_address() is used only to define the
   // range where to look for the relocation information. We need not
   // walk over composed instructions (as long as the relocation information
2 changes: 2 additions & 0 deletions src/hotspot/cpu/ppc/nativeInst_ppc.hpp
@@ -137,6 +137,8 @@ class NativeCall: public NativeInstruction {
     instruction_size = 16 // Used in shared code for calls with reloc_info.
   };

+  static int byte_size() { return instruction_size; }
+
   static bool is_call_at(address a) {
     return Assembler::is_bl(*(int*)(a));
   }
1 change: 1 addition & 0 deletions src/hotspot/cpu/riscv/nativeInst_riscv.hpp
@@ -126,6 +126,7 @@ class NativeCall: public NativeInstruction {
     return_address_offset = 4
   };

+  static int byte_size() { return instruction_size; }
   address instruction_address() const { return addr_at(instruction_offset); }
   address next_instruction_address() const { return addr_at(return_address_offset); }
   address return_address() const { return addr_at(return_address_offset); }
1 change: 1 addition & 0 deletions src/hotspot/cpu/s390/nativeInst_s390.hpp
@@ -212,6 +212,7 @@ class NativeCall: public NativeInstruction {
     call_far_pcrelative_displacement_alignment = 4
   };

+  static int byte_size() { return instruction_size; }

   // Maximum size (in bytes) of a call to an absolute address.
   // Used when emitting call to deopt handler blob, which is a
Expand Down
