Closed as not planned
Description
Background and motivation
The various `Vector` types have `Sum` methods available for efficiently summing elements. However, they all return the same type as the element type, which means they can't be used if the sum of the elements might overflow.
Writing non-overflowing sum methods that perform well across diverse platforms, each with different SIMD instruction availability, is a lot of work - the obvious solution of Widen + Sum is generally not the most performant choice.
API Proposal
namespace System.Numerics;
// Proposed additions to Vector (signatures only; no implementation is given here).
// Every SumWidening overload takes a Vector<T> with an integer element type and returns
// the next wider integer type of the same signedness, so that the sum of the elements
// does not have to fit in the element type itself.
public static class Vector
{
// Unsigned element types: byte -> ushort, ushort -> uint, uint -> ulong, ulong -> UInt128.
public static ushort SumWidening(Vector<byte> value);
public static uint SumWidening(Vector<ushort> value);
public static ulong SumWidening(Vector<uint> value);
public static UInt128 SumWidening(Vector<ulong> value);
// Signed element types: sbyte -> short, short -> int, int -> long, long -> Int128.
public static short SumWidening(Vector<sbyte> value);
public static int SumWidening(Vector<short> value);
public static long SumWidening(Vector<int> value);
public static Int128 SumWidening(Vector<long> value);
}
namespace System.Runtime.Intrinsics;
// Proposed additions to Vector64 (signatures only; no implementation is given here).
// Every SumWidening overload takes a Vector64<T> with an integer element type and returns
// the next wider integer type of the same signedness, so that the sum of the elements
// does not have to fit in the element type itself.
public static class Vector64
{
// Unsigned element types: byte -> ushort, ushort -> uint, uint -> ulong, ulong -> UInt128.
public static ushort SumWidening(Vector64<byte> value);
public static uint SumWidening(Vector64<ushort> value);
public static ulong SumWidening(Vector64<uint> value);
public static UInt128 SumWidening(Vector64<ulong> value);
// Signed element types: sbyte -> short, short -> int, int -> long, long -> Int128.
public static short SumWidening(Vector64<sbyte> value);
public static int SumWidening(Vector64<short> value);
public static long SumWidening(Vector64<int> value);
public static Int128 SumWidening(Vector64<long> value);
}
namespace System.Runtime.Intrinsics;
// Proposed additions to Vector128 (signatures only; no implementation is given here).
// Every SumWidening overload takes a Vector128<T> with an integer element type and returns
// the next wider integer type of the same signedness, so that the sum of the elements
// does not have to fit in the element type itself.
public static class Vector128
{
// Unsigned element types: byte -> ushort, ushort -> uint, uint -> ulong, ulong -> UInt128.
public static ushort SumWidening(Vector128<byte> value);
public static uint SumWidening(Vector128<ushort> value);
public static ulong SumWidening(Vector128<uint> value);
public static UInt128 SumWidening(Vector128<ulong> value);
// Signed element types: sbyte -> short, short -> int, int -> long, long -> Int128.
public static short SumWidening(Vector128<sbyte> value);
public static int SumWidening(Vector128<short> value);
public static long SumWidening(Vector128<int> value);
public static Int128 SumWidening(Vector128<long> value);
}
namespace System.Runtime.Intrinsics;
// Proposed additions to Vector256 (signatures only; no implementation is given here).
// Every SumWidening overload takes a Vector256<T> with an integer element type and returns
// the next wider integer type of the same signedness, so that the sum of the elements
// does not have to fit in the element type itself.
public static class Vector256
{
// Unsigned element types: byte -> ushort, ushort -> uint, uint -> ulong, ulong -> UInt128.
public static ushort SumWidening(Vector256<byte> value);
public static uint SumWidening(Vector256<ushort> value);
public static ulong SumWidening(Vector256<uint> value);
public static UInt128 SumWidening(Vector256<ulong> value);
// Signed element types: sbyte -> short, short -> int, int -> long, long -> Int128.
public static short SumWidening(Vector256<sbyte> value);
public static int SumWidening(Vector256<short> value);
public static long SumWidening(Vector256<int> value);
public static Int128 SumWidening(Vector256<long> value);
}
namespace System.Runtime.Intrinsics;
// Proposed additions to Vector512 (signatures only; no implementation is given here).
// Every SumWidening overload takes a Vector512<T> with an integer element type and returns
// the next wider integer type of the same signedness, so that the sum of the elements
// does not have to fit in the element type itself.
public static class Vector512
{
// Unsigned element types: byte -> ushort, ushort -> uint, uint -> ulong, ulong -> UInt128.
public static ushort SumWidening(Vector512<byte> value);
public static uint SumWidening(Vector512<ushort> value);
public static ulong SumWidening(Vector512<uint> value);
public static UInt128 SumWidening(Vector512<ulong> value);
// Signed element types: sbyte -> short, short -> int, int -> long, long -> Int128.
public static short SumWidening(Vector512<sbyte> value);
public static int SumWidening(Vector512<short> value);
public static long SumWidening(Vector512<int> value);
public static Int128 SumWidening(Vector512<long> value);
}
API Usage
/// <summary>
/// Adds up the bytes of <paramref name="items"/> whose matching bit in
/// <paramref name="bitset"/> is set, returning the total as a <see cref="ushort"/>.
/// </summary>
/// <param name="bitset">32-bit selection mask, one bit per byte lane of <paramref name="items"/>.</param>
/// <param name="items">The 32 byte values to be conditionally summed.</param>
/// <returns>The widened sum of the selected bytes.</returns>
/// <remarks>
/// The bit-to-lane mapping described in the comments assumes little-endian byte order.
/// </remarks>
public static ushort ConditionalSumByBitSet(uint bitset, Vector256<byte> items)
{
    // Replicate the 32-bit set across the whole vector, then view it as bytes:
    // the four bitset bytes repeat every four lanes.
    Vector256<byte> broadcast = Vector256.Create(bitset).AsByte();

    // Give every lane the bitset byte that contains its bit: lanes 0-7 read byte 0,
    // lanes 8-15 read byte 1, lanes 16-23 read index 0x1E and lanes 24-31 read index 0x1F.
    // Indices 0x1E/0x1F hold copies of bitset bytes 2/3 because the broadcast repeats
    // every four bytes. NOTE(review): presumably chosen so the upper 16 lanes index only
    // into the upper half of the vector - confirm before "simplifying" to 0x02/0x03.
    Vector256<byte> byteSelectors = Vector256.Create(
        0x0000000000000000UL,
        0x0101010101010101UL,
        0x1E1E1E1E1E1E1E1EUL,
        0x1F1F1F1F1F1F1F1FUL).AsByte();
    Vector256<byte> spread = Vector256.Shuffle(broadcast, byteSelectors);

    // Within each group of eight lanes, lane k keeps only bit k (0x01, 0x02, ..., 0x80).
    Vector256<byte> bitPerLane = Vector256.Create(0x8040201008040201UL).AsByte();
    Vector256<byte> isolatedBit = spread & bitPerLane;

    // A lane whose isolated bit is zero was not selected; zero it out and keep the rest.
    Vector256<byte> unselected = Vector256.Equals(isolatedBit, Vector256<byte>.Zero);
    Vector256<byte> kept = Vector256.ConditionalSelect(unselected, Vector256<byte>.Zero, items);

    return Vector256.SumWidening(kept); // <-- Used here
}
Alternative Designs
No response
Risks
None known