Skip to content

Commit

Permalink
Merge pull request #1527 from NZJenkins/WalkIndexBuffer-SIMD
Browse files Browse the repository at this point in the history
Add SSE WalkIndexBuffer implementation
  • Loading branch information
LukeUsher committed Jan 17, 2019
2 parents aec27c0 + 962b500 commit 318d017
Show file tree
Hide file tree
Showing 6 changed files with 199 additions and 18 deletions.
3 changes: 3 additions & 0 deletions build/win32/Cxbx.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@
<ItemGroup>
<ClInclude Include="..\..\src\common\crypto\EmuDes.h" />
<ClInclude Include="..\..\src\common\Timer.h" />
<ClInclude Include="..\..\src\common\util\CPUID.h" />
<ClInclude Include="..\..\src\common\util\CxbxUtil.h" />
<ClInclude Include="..\..\src\common\input\InputConfig.h" />
<ClInclude Include="..\..\src\common\input\SDL2_Device.h" />
Expand All @@ -211,6 +212,7 @@
<ClInclude Include="..\..\src\common\XADPCM.h" />
<ClInclude Include="..\..\src\common\xbe\XbePrinter.h" />
<ClInclude Include="..\..\src\common\util\crc32c.h" />
<ClInclude Include="..\..\src\core\HLE\D3D8\Direct3D9\WalkIndexBuffer.h" />
<ClInclude Include="..\..\src\core\HLE\D3D8\XbD3D8Logging.h" />
<ClInclude Include="..\..\src\common\EmuEEPROM.h" />
<ClInclude Include="..\..\src\common\Logging.h" />
Expand Down Expand Up @@ -356,6 +358,7 @@
<ClCompile Include="..\..\src\common\CxbxDebugger.cpp" />
<ClCompile Include="..\..\src\common\win32\XBPortMapping.cpp" />
<ClCompile Include="..\..\src\common\xbe\XbePrinter.cpp" />
<ClCompile Include="..\..\src\core\HLE\D3D8\Direct3D9\WalkIndexBuffer.cpp" />
<ClCompile Include="..\..\src\common\util\crc32c.cpp" />
<ClCompile Include="..\..\src\core\hle\D3D8\XbD3D8Logging.cpp" />
<ClCompile Include="..\..\src\common\EmuEEPROM.cpp" />
Expand Down
11 changes: 11 additions & 0 deletions build/win32/Cxbx.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -629,6 +629,9 @@
<ClCompile Include="..\..\src\common\crypto\EmuDes.cpp">
<Filter>Emulator</Filter>
</ClCompile>
<ClCompile Include="..\..\src\core\HLE\D3D8\Direct3D9\WalkIndexBuffer.cpp">
<Filter>core\HLE\D3D8\Direct3D9</Filter>
</ClCompile>
<ClCompile Include="..\..\src\common\Timer.cpp">
<Filter>Emulator</Filter>
</ClCompile>
Expand Down Expand Up @@ -808,6 +811,14 @@
<Image Include="..\..\resource\Cxbx-R.ico" />
<Image Include="..\..\resource\Logo-License-CC4.bmp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\src\core\HLE\D3D8\Direct3D9\WalkIndexBuffer.h">
<Filter>core\HLE\D3D8\Direct3D9</Filter>
</ClInclude>
<ClInclude Include="..\..\src\common\util\CPUID.h">
<Filter>Cross Platform</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\src\devices\video\nv2a_regs.h">
<Filter>Hardware\Video</Filter>
Expand Down
57 changes: 57 additions & 0 deletions src/common/util/CPUID.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// https://stackoverflow.com/questions/1666093/cpuid-implementations-in-c

#ifndef CPUID_H
#define CPUID_H

#ifdef _WIN32
#include <limits.h>
#include <intrin.h>
#include <bitset>
typedef unsigned __int32 uint32_t;

#else
#include <stdint.h>
#endif

class CPUID {
uint32_t regs[4];

public:
explicit CPUID(unsigned i) {
#ifdef _WIN32
__cpuid((int *)regs, (int)i);

#else
asm volatile
("cpuid" : "=a" (regs[0]), "=b" (regs[1]), "=c" (regs[2]), "=d" (regs[3])
: "a" (i), "c" (0));
// ECX is set to zero for CPUID function 4
#endif
}

const std::bitset<32> &EAX() const { return regs[0]; }
const std::bitset<32> &EBX() const { return regs[1]; }
const std::bitset<32> &ECX() const { return regs[2]; }
const std::bitset<32> &EDX() const { return regs[3]; }
};

class SimdCaps {

public:
const bool SSE(void) { return f_1.EDX()[25]; }
const bool SSE2(void) { return f_1.EDX()[26]; }
const bool SSE3(void) { return f_1.ECX()[0]; }
const bool SSSE3(void) { return f_1.ECX()[9]; }
const bool SSE41(void) { return f_1.ECX()[19]; }
const bool SSE42(void) { return f_1.ECX()[20]; }
const bool AVX(void) { return f_1.ECX()[1]; }
const bool AVX2(void) { return f_7.EBX()[5]; }

private:
const CPUID f_1 = CPUID(1);
const CPUID f_7 = CPUID(7);
};

static SimdCaps bob;

#endif // CPUID_H
21 changes: 3 additions & 18 deletions src/core/hle/D3D8/Direct3D9/Direct3D9.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ namespace xboxkrnl
#include "core\hle\Intercept.hpp" // for bLLE_GPU
#include "devices\video\nv2a.h" // For GET_MASK, NV_PGRAPH_CONTROL_0
#include "gui\ResCxbx.h"
#include "WalkIndexBuffer.h"

#include <assert.h>
#include <process.h>
Expand Down Expand Up @@ -7063,22 +7064,6 @@ void CxbxDrawIndexedClosingLineUP(XTL::INDEX16 LowIndex, XTL::INDEX16 HighIndex,
g_dwPrimPerFrame++;
}

// TODO : Move to own file
//Walk through index buffer
void WalkIndexBuffer(XTL::INDEX16 &LowIndex, XTL::INDEX16 &HighIndex, XTL::INDEX16 *pIndexData, DWORD dwIndexCount)
{
// Determine highest and lowest index in use
LowIndex = pIndexData[0];
HighIndex = LowIndex;
for (uint i = 1; i < dwIndexCount; i++) {
XTL::INDEX16 Index = pIndexData[i];
if (LowIndex > Index)
LowIndex = Index;
if (HighIndex < Index)
HighIndex = Index;
}
}

// Requires assigned pIndexData
// Called by D3DDevice_DrawIndexedVertices and EmuExecutePushBufferRaw (twice)
void XTL::CxbxDrawIndexed(CxbxDrawContext &DrawContext)
Expand All @@ -7096,7 +7081,7 @@ void XTL::CxbxDrawIndexed(CxbxDrawContext &DrawContext)
//Walk through index buffer
// Determine highest and lowest index in use :
INDEX16 LowIndex, HighIndex;
WalkIndexBuffer(LowIndex, HighIndex, &(DrawContext.pIndexData[DrawContext.dwStartVertex]), DrawContext.dwVertexCount);
WalkIndexBuffer_SIMD(LowIndex, HighIndex, &(DrawContext.pIndexData[DrawContext.dwStartVertex]), DrawContext.dwVertexCount);
VertexBufferConverter.Apply(&DrawContext, LowIndex);

if (DrawContext.XboxPrimitiveType == X_D3DPT_QUADLIST) {
Expand Down Expand Up @@ -7605,7 +7590,7 @@ VOID WINAPI XTL::EMUPATCH(D3DDevice_DrawIndexedVerticesUP)
else {
// Walk through the index buffer
INDEX16 LowIndex, HighIndex;
WalkIndexBuffer(LowIndex, HighIndex, (INDEX16*)pIndexData, DrawContext.dwVertexCount);
WalkIndexBuffer_SIMD(LowIndex, HighIndex, (INDEX16*)pIndexData, DrawContext.dwVertexCount);

// LOG_TEST_CASE("DrawIndexedPrimitiveUP"); // Test-case : Burnout, Namco Museum 50th Anniversary
HRESULT hRet = g_pD3DDevice->DrawIndexedPrimitiveUP(
Expand Down
92 changes: 92 additions & 0 deletions src/core/hle/D3D8/Direct3D9/WalkIndexBuffer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
//#include <mmintrin.h> // - MMX
//#include <xmmintrin.h> // SSE
//#include <emmintrin.h> // SSE2
//#include <pmmintrin.h> // SSE3
#include <smmintrin.h> // SSE4.1
//#include <nmmintrin.h> // SSE4.2
//#include <immintrin.h> // AVX

#include "core\kernel\support\Emu.h"
#include "core\kernel\support\EmuXTL.h"

#include "common\util\CPUID.h"
#include "WalkIndexBuffer.h"

void WalkIndexBuffer_SSE41(XTL::INDEX16 & LowIndex, XTL::INDEX16 & HighIndex, XTL::INDEX16 * pIndexData, DWORD dwIndexCount);

void Init_SIMD
(
XTL::INDEX16 &LowIndex,
XTL::INDEX16 &HighIndex,
XTL::INDEX16 *pIndexData,
DWORD dwIndexCount
)
{
SimdCaps supports;
if (supports.SSE41())
WalkIndexBuffer_SIMD = WalkIndexBuffer_SSE41;
else
WalkIndexBuffer_SIMD = WalkIndexBuffer;

WalkIndexBuffer_SIMD(LowIndex, HighIndex, pIndexData, dwIndexCount);
}

void WalkIndexBuffer(XTL::INDEX16 & LowIndex, XTL::INDEX16 & HighIndex, XTL::INDEX16 * pIndexData, DWORD dwIndexCount)
{
// Determine highest and lowest index in use
LowIndex = pIndexData[0];
HighIndex = LowIndex;
for (uint i = 1; i < dwIndexCount; i++) {
XTL::INDEX16 Index = pIndexData[i];
if (LowIndex > Index)
LowIndex = Index;
if (HighIndex < Index)
HighIndex = Index;
}
}

void WalkIndexBuffer_SSE41(XTL::INDEX16 & LowIndex, XTL::INDEX16 & HighIndex, XTL::INDEX16 * pIndexData, DWORD dwIndexCount)
{
// We can fit 8 ushorts into 128 bit SIMD registers
int iterations = dwIndexCount / 8;
DWORD remainder = dwIndexCount % 8;

// Fallback to basic function if we can't even min / max 2 registers together
if (iterations < 2) {
WalkIndexBuffer(LowIndex, HighIndex, pIndexData, dwIndexCount);
return;
}

__m128i *unalignedIndices = (__m128i*) pIndexData;\
__m128i min = _mm_set1_epi16(USHRT_MAX);
__m128i max = _mm_setzero_si128();

// Min / max over index data
for (int i = 0; i < iterations; i++) {
__m128i indices = _mm_loadu_si128(&unalignedIndices[i]);
min = _mm_min_epu16(indices, min);
max = _mm_max_epu16(indices, max);
}

// horizontal min
min = _mm_minpos_epu16(min);

// horizontal max (using minpos)
max = _mm_subs_epu16(_mm_set1_epi16(USHRT_MAX), max); //invert
max = _mm_minpos_epu16(max);

// Get the min and max out
LowIndex = (XTL::INDEX16) _mm_cvtsi128_si32(min);
HighIndex = (XTL::INDEX16) USHRT_MAX - _mm_cvtsi128_si32(max);

// Compare with the remaining values that didn't fit neatly into the SIMD registers
for (DWORD i = dwIndexCount - remainder; i < dwIndexCount; i++) {
if (pIndexData[i] < LowIndex)
LowIndex = pIndexData[i];
else if (pIndexData[i] > HighIndex)
HighIndex = pIndexData[i];

}
}

// TODO AVX2, AVX512
33 changes: 33 additions & 0 deletions src/core/hle/D3D8/Direct3D9/WalkIndexBuffer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#ifndef WALKINDEXBUFFER_H
#define WALKINDEXBUFFER_H

#include "core\kernel\support\Emu.h"
#include "core\kernel\support\EmuXTL.h"

void WalkIndexBuffer
(
XTL::INDEX16 &LowIndex,
XTL::INDEX16 &HighIndex,
XTL::INDEX16 *pIndexData,
DWORD dwIndexCount
);


void Init_SIMD
(
XTL::INDEX16 &LowIndex,
XTL::INDEX16 &HighIndex,
XTL::INDEX16 *pIndexData,
DWORD dwIndexCount
);

static void(*WalkIndexBuffer_SIMD)
(
XTL::INDEX16 &LowIndex,
XTL::INDEX16 &HighIndex,
XTL::INDEX16 *pIndexData,
DWORD dwIndexCount
) = Init_SIMD;


#endif

0 comments on commit 318d017

Please sign in to comment.