-
Notifications
You must be signed in to change notification settings - Fork 256
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1527 from NZJenkins/WalkIndexBuffer-SIMD
Add SSE WalkIndexBuffer implementation
- Loading branch information
Showing
6 changed files
with
199 additions
and
18 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
// https://stackoverflow.com/questions/1666093/cpuid-implementations-in-c | ||
|
||
#ifndef CPUID_H | ||
#define CPUID_H | ||
|
||
#include <bitset>

#ifdef _WIN32
#include <limits.h>
#include <intrin.h>
typedef unsigned __int32 uint32_t;

#else
#include <stdint.h>
#endif
|
||
/// Executes the x86 CPUID instruction for a single leaf and keeps the four
/// result registers so individual feature bits can be queried afterwards.
class CPUID {
    // regs[0..3] receive EAX, EBX, ECX and EDX, in that order.
    uint32_t regs[4];

public:
    /// Runs CPUID with EAX = i and stores the returned registers.
    explicit CPUID(unsigned i) {
#ifdef _WIN32
        // NOTE(review): confirm that __cpuid zeroes ECX for leaves that take a
        // sub-leaf (4, 7); otherwise switch to __cpuidex(regs, i, 0).
        __cpuid(reinterpret_cast<int *>(regs), static_cast<int>(i));

#else
        asm volatile
            ("cpuid" : "=a" (regs[0]), "=b" (regs[1]), "=c" (regs[2]), "=d" (regs[3])
             : "a" (i), "c" (0));
        // ECX is set to zero for CPUID function 4
#endif
    }

    // The accessors return the bitset BY VALUE. The bitset is a temporary built
    // from the stored uint32_t, so returning it by reference (as this class
    // previously did) hands the caller a dangling reference.
    std::bitset<32> EAX() const { return std::bitset<32>(regs[0]); }
    std::bitset<32> EBX() const { return std::bitset<32>(regs[1]); }
    std::bitset<32> ECX() const { return std::bitset<32>(regs[2]); }
    std::bitset<32> EDX() const { return std::bitset<32>(regs[3]); }
};
|
||
class SimdCaps { | ||
|
||
public: | ||
const bool SSE(void) { return f_1.EDX()[25]; } | ||
const bool SSE2(void) { return f_1.EDX()[26]; } | ||
const bool SSE3(void) { return f_1.ECX()[0]; } | ||
const bool SSSE3(void) { return f_1.ECX()[9]; } | ||
const bool SSE41(void) { return f_1.ECX()[19]; } | ||
const bool SSE42(void) { return f_1.ECX()[20]; } | ||
const bool AVX(void) { return f_1.ECX()[1]; } | ||
const bool AVX2(void) { return f_7.EBX()[5]; } | ||
|
||
private: | ||
const CPUID f_1 = CPUID(1); | ||
const CPUID f_7 = CPUID(7); | ||
}; | ||
|
||
static SimdCaps bob; | ||
|
||
#endif // CPUID_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
//#include <mmintrin.h> // - MMX | ||
//#include <xmmintrin.h> // SSE | ||
//#include <emmintrin.h> // SSE2 | ||
//#include <pmmintrin.h> // SSE3 | ||
#include <smmintrin.h> // SSE4.1 | ||
//#include <nmmintrin.h> // SSE4.2 | ||
//#include <immintrin.h> // AVX | ||
|
||
#include "core\kernel\support\Emu.h" | ||
#include "core\kernel\support\EmuXTL.h" | ||
|
||
#include "common\util\CPUID.h" | ||
#include "WalkIndexBuffer.h" | ||
|
||
void WalkIndexBuffer_SSE41(XTL::INDEX16 & LowIndex, XTL::INDEX16 & HighIndex, XTL::INDEX16 * pIndexData, DWORD dwIndexCount); | ||
|
||
void Init_SIMD | ||
( | ||
XTL::INDEX16 &LowIndex, | ||
XTL::INDEX16 &HighIndex, | ||
XTL::INDEX16 *pIndexData, | ||
DWORD dwIndexCount | ||
) | ||
{ | ||
SimdCaps supports; | ||
if (supports.SSE41()) | ||
WalkIndexBuffer_SIMD = WalkIndexBuffer_SSE41; | ||
else | ||
WalkIndexBuffer_SIMD = WalkIndexBuffer; | ||
|
||
WalkIndexBuffer_SIMD(LowIndex, HighIndex, pIndexData, dwIndexCount); | ||
} | ||
|
||
// Scalar scan over an index buffer: on return LowIndex holds the smallest and
// HighIndex the largest of the first dwIndexCount entries of pIndexData.
// pIndexData[0] is read unconditionally, so the buffer must contain at least
// one index.
void WalkIndexBuffer(XTL::INDEX16 & LowIndex, XTL::INDEX16 & HighIndex, XTL::INDEX16 * pIndexData, DWORD dwIndexCount)
{
    // Seed both bounds with the first entry
    HighIndex = LowIndex = pIndexData[0];

    // Widen the bounds with every remaining entry
    const XTL::INDEX16 *const pEnd = pIndexData + dwIndexCount;
    for (const XTL::INDEX16 *pCur = pIndexData + 1; pCur < pEnd; ++pCur) {
        const XTL::INDEX16 Current = *pCur;
        if (Current < LowIndex)
            LowIndex = Current;
        if (Current > HighIndex)
            HighIndex = Current;
    }
}
|
||
// SSE4.1 variant of WalkIndexBuffer: computes the lowest and highest 16-bit
// index in pIndexData[0 .. dwIndexCount) eight entries at a time.
// Buffers with fewer than 16 entries are handed to the scalar WalkIndexBuffer.
// pIndexData does not need to be 16-byte aligned (unaligned loads are used).
void WalkIndexBuffer_SSE41(XTL::INDEX16 & LowIndex, XTL::INDEX16 & HighIndex, XTL::INDEX16 * pIndexData, DWORD dwIndexCount)
{
    // We can fit 8 ushorts into 128 bit SIMD registers
    const DWORD vectorCount = dwIndexCount / 8;
    const DWORD remainder = dwIndexCount % 8;

    // Fallback to basic function if we can't even min / max 2 registers together
    if (vectorCount < 2) {
        WalkIndexBuffer(LowIndex, HighIndex, pIndexData, dwIndexCount);
        return;
    }

    const __m128i *unalignedIndices = reinterpret_cast<const __m128i *>(pIndexData);
    const __m128i allOnes = _mm_set1_epi16(static_cast<short>(USHRT_MAX));
    __m128i vMin = allOnes;              // identity element for unsigned min
    __m128i vMax = _mm_setzero_si128();  // identity element for unsigned max

    // Lane-wise min / max over the index data
    for (DWORD i = 0; i < vectorCount; i++) {
        const __m128i indices = _mm_loadu_si128(&unalignedIndices[i]);
        vMin = _mm_min_epu16(indices, vMin);
        vMax = _mm_max_epu16(indices, vMax);
    }

    // Horizontal min: the result's low 16 bits hold the minimum, bits 16-18 its lane position
    vMin = _mm_minpos_epu16(vMin);

    // Horizontal max (using minpos): invert each lane (0xFFFF - x) so that the
    // largest value becomes the smallest, then undo the inversion on extraction
    vMax = _mm_minpos_epu16(_mm_subs_epu16(allOnes, vMax));

    // Get the min and max out; mask off the lane-position bits explicitly
    // instead of relying on truncation to 16 bits
    LowIndex = static_cast<XTL::INDEX16>(_mm_cvtsi128_si32(vMin) & 0xFFFF);
    HighIndex = static_cast<XTL::INDEX16>(USHRT_MAX - (_mm_cvtsi128_si32(vMax) & 0xFFFF));

    // Compare with the remaining values that didn't fit neatly into the SIMD registers.
    // `else if` is sufficient because LowIndex <= HighIndex already holds here.
    for (DWORD i = dwIndexCount - remainder; i < dwIndexCount; i++) {
        if (pIndexData[i] < LowIndex)
            LowIndex = pIndexData[i];
        else if (pIndexData[i] > HighIndex)
            HighIndex = pIndexData[i];
    }
}
|
||
// TODO AVX2, AVX512 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
#ifndef WALKINDEXBUFFER_H | ||
#define WALKINDEXBUFFER_H | ||
|
||
#include "core\kernel\support\Emu.h" | ||
#include "core\kernel\support\EmuXTL.h" | ||
|
||
void WalkIndexBuffer | ||
( | ||
XTL::INDEX16 &LowIndex, | ||
XTL::INDEX16 &HighIndex, | ||
XTL::INDEX16 *pIndexData, | ||
DWORD dwIndexCount | ||
); | ||
|
||
|
||
void Init_SIMD | ||
( | ||
XTL::INDEX16 &LowIndex, | ||
XTL::INDEX16 &HighIndex, | ||
XTL::INDEX16 *pIndexData, | ||
DWORD dwIndexCount | ||
); | ||
|
||
static void(*WalkIndexBuffer_SIMD) | ||
( | ||
XTL::INDEX16 &LowIndex, | ||
XTL::INDEX16 &HighIndex, | ||
XTL::INDEX16 *pIndexData, | ||
DWORD dwIndexCount | ||
) = Init_SIMD; | ||
|
||
|
||
#endif |