Skip to content
Permalink
Browse files

MOD: Shifted from _mm_storeu_si128 to _mm_stream_si128. This has improved our performance from 1.8ms to 1.5ms due to non-temporal stores.
  • Loading branch information...
AlexSabourinDev committed Feb 18, 2019
1 parent 2493716 commit fafc0a251ab0d02d4ef00d06cadcc277aa34b153

Large diffs are not rendered by default.

Oops, something went wrong.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -12,6 +12,7 @@

#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>

// SWAP(type, a, b): exchange the values of the lvalues `a` and `b`, both of type `type`,
// through a block-scoped temporary.
//
// The temporary has a deliberately unusual name. When it was called `t`, SWAP(int, t, x)
// expanded to `int t = x; x = t; t = t;`, which shadowed the caller's `t` and swapped nothing.
// Arguments are parenthesized so expressions such as `*p` or `arr[i + 1]` expand safely.
// `a` and `b` are each evaluated twice, so they must not have side effects.
//
// NOTE(review): the trailing `;` after `while(0)` is kept so call sites that omit their own
// semicolon keep compiling. It means `if (c) SWAP(...); else ...` does not parse; drop it once
// every call site is confirmed to end with `;`.
#define SWAP(type, a, b) do{ type swap_tmp_ = (b); (b) = (a); (a) = swap_tmp_; }while(0);

@@ -426,14 +427,14 @@ void game_init(Game_InstanceBuffer* buffer)

// Allocate the per-state farmer arrays, each sized for AI_FarmerCount elements.
// NOTE(review): none of the allocation results is checked for NULL in the lines shown here.
// NOTE(review): this listing shows a diff without +/- markers. Each plain malloc of a *Gen array
// looks like the removed version of the _mm_malloc(..., 64) line directly below it; read literally,
// the malloc result would be overwritten and leaked. Confirm against the repository.
AI_FarmersMoveHot = (AI_FarmerMoveStateHot*)malloc(sizeof(AI_FarmerMoveStateHot) * AI_FarmerCount);
AI_FarmersMoveCold = (AI_FarmerMoveStateCold*)malloc(sizeof(AI_FarmerMoveStateCold) * AI_FarmerCount);
AI_FarmersMoveGen = (AI_FarmerMoveStateGen*)malloc(sizeof(AI_FarmerMoveStateGen) * AI_FarmerCount);
// The Gen array is requested with 64-byte alignment; memory from _mm_malloc must be released with _mm_free.
AI_FarmersMoveGen = (AI_FarmerMoveStateGen*)_mm_malloc(sizeof(AI_FarmerMoveStateGen) * AI_FarmerCount, 64);

AI_FarmersFarmHot = (AI_FarmerFarmStateHot*)malloc(sizeof(AI_FarmerFarmStateHot) * AI_FarmerCount);
AI_FarmersFarmCold = (AI_FarmerFarmStateCold*)malloc(sizeof(AI_FarmerFarmStateCold) * AI_FarmerCount);
AI_FarmersFarmGen = (AI_FarmerFarmStateGen*)malloc(sizeof(AI_FarmerFarmStateGen) * AI_FarmerCount);
AI_FarmersFarmGen = (AI_FarmerFarmStateGen*)_mm_malloc(sizeof(AI_FarmerFarmStateGen) * AI_FarmerCount, 64);

AI_FarmersSearchHot = (AI_FarmerSearchStateHot*)malloc(sizeof(AI_FarmerSearchStateHot) * AI_FarmerCount);
AI_FarmersSearchGen = (AI_FarmerSearchStateGen*)malloc(sizeof(AI_FarmerSearchStateGen) * AI_FarmerCount);
AI_FarmersSearchGen = (AI_FarmerSearchStateGen*)_mm_malloc(sizeof(AI_FarmerSearchStateGen) * AI_FarmerCount, 64);

// The search-state count is initialised to the full farmer count.
AI_FarmerSearchCount = AI_FarmerCount;

@@ -476,19 +477,19 @@ void game_kill(void)
// Release the farmer arrays and clear their pointers.
// NOTE(review): this listing shows a diff without +/- markers. Each free(...Gen) line looks like
// the removed version of the _mm_free(...Gen) line directly below it; read literally, the pair would
// free the same pointer twice. Confirm against the repository.
AI_FarmersMoveHot = NULL;
free(AI_FarmersMoveCold);
AI_FarmersMoveCold = NULL;
free(AI_FarmersMoveGen);
// The *Gen arrays are allocated with _mm_malloc in game_init, so they are released with _mm_free.
_mm_free(AI_FarmersMoveGen);
AI_FarmersMoveGen = NULL;

free(AI_FarmersFarmHot);
AI_FarmersFarmHot = NULL;
free(AI_FarmersFarmCold);
AI_FarmersFarmCold = NULL;
free(AI_FarmersFarmGen);
_mm_free(AI_FarmersFarmGen);
// NOTE(review): likely copy-paste slip. This clears AI_FarmersFarmCold a second time, so
// AI_FarmersFarmGen still holds the freed address. Probably intended: AI_FarmersFarmGen = NULL;
AI_FarmersFarmCold = NULL;

free(AI_FarmersSearchHot);
AI_FarmersSearchHot = NULL;
free(AI_FarmersSearchGen);
_mm_free(AI_FarmersSearchGen);
AI_FarmersSearchGen = NULL;

MIST_PROFILE_END("Game", "Game-Kill");

MIST_PROFILE_END("Game", "Game-Kill");
@@ -518,25 +519,28 @@ uint32_t game_gen_instance_buffer(Game_InstanceBuffer* buffer)

// Each of the three sections below handles one farmer state (search, move, farm):
//   1. memcpy that state's Gen array (two floats per farmer) into buffer->positions at writeIndex;
//   2. fill the matching spriteIndicesAndScales slots (two entries per farmer) with a constant
//      pattern of alternating FarmerState_* / AI_FarmerScale 16-bit lanes, four farmers per 128-bit store;
//   3. advance writeIndex by that state's farmer count.
// NOTE(review): this listing shows a diff without +/- markers. In each section the
// `for (uint32_t i = 0; ...)` header and the `_mm_storeu_si128(... (writeIndex + i) ...)` line inside
// the braces look like the removed versions of the lines that follow them. Confirm against the repository.
// NOTE(review): stores are issued four farmers at a time, and the leading unaligned store is issued
// even when the count is below four, so a section can write a few slots past its own range. Later
// sections overwrite that spill; confirm the buffer has slack after the final section.
memcpy(&buffer->positions[writeIndex * 2], AI_FarmersSearchGen, sizeof(float) * 2 * AI_FarmerSearchCount);
__m128i searchAndScale = _mm_set_epi16(FarmerState_Search, AI_FarmerScale, FarmerState_Search, AI_FarmerScale, FarmerState_Search, AI_FarmerScale, FarmerState_Search, AI_FarmerScale);
for (uint32_t i = 0; i < AI_FarmerSearchCount; i+=4)
// Unaligned store covering the four farmers starting at writeIndex.
_mm_storeu_si128((__m128i*)&buffer->spriteIndicesAndScales[writeIndex * 2], searchAndScale);
// i starts at the distance to the next multiple of 4 (4 when writeIndex is already a multiple of 4),
// so (writeIndex + i) is a multiple of 4 on every iteration.
// NOTE(review): presumably this is what satisfies the 16-byte alignment _mm_stream_si128 requires,
// assuming 16-bit entries and a 16-byte-aligned array start. Confirm against Game_InstanceBuffer.
for (uint32_t i = (4 - writeIndex % 4); i < AI_FarmerSearchCount; i+=4)
{
_mm_storeu_si128((__m128i*)&buffer->spriteIndicesAndScales[(writeIndex + i) * 2], searchAndScale);
_mm_stream_si128((__m128i*)&buffer->spriteIndicesAndScales[(writeIndex + i) * 2], searchAndScale);
}
writeIndex += AI_FarmerSearchCount;

// Same pattern for farmers in the move state.
memcpy(&buffer->positions[writeIndex * 2], AI_FarmersMoveGen, sizeof(float) * 2 * AI_FarmerMoveCount);
__m128i moveAndScale = _mm_set_epi16(FarmerState_Move, AI_FarmerScale, FarmerState_Move, AI_FarmerScale, FarmerState_Move, AI_FarmerScale, FarmerState_Move, AI_FarmerScale);
for (uint32_t i = 0; i < AI_FarmerMoveCount; i+=4)
_mm_storeu_si128((__m128i*)&buffer->spriteIndicesAndScales[writeIndex * 2], moveAndScale);
for (uint32_t i = (4 - writeIndex % 4); i < AI_FarmerMoveCount; i+=4)
{
_mm_storeu_si128((__m128i*)&buffer->spriteIndicesAndScales[(writeIndex + i) * 2], moveAndScale);
_mm_stream_si128((__m128i*)&buffer->spriteIndicesAndScales[(writeIndex + i) * 2], moveAndScale);
}
writeIndex += AI_FarmerMoveCount;

// Same pattern for farmers in the farm state.
memcpy(&buffer->positions[writeIndex * 2], AI_FarmersFarmGen, sizeof(float) * 2 * AI_FarmerFarmCount);
__m128i farmAndScale = _mm_set_epi16(FarmerState_Farm, AI_FarmerScale, FarmerState_Farm, AI_FarmerScale, FarmerState_Farm, AI_FarmerScale, FarmerState_Farm, AI_FarmerScale);
for (uint32_t i = 0; i < AI_FarmerFarmCount; i+=4)
_mm_storeu_si128((__m128i*)&buffer->spriteIndicesAndScales[writeIndex * 2], farmAndScale);
for (uint32_t i = (4 - writeIndex % 4); i < AI_FarmerFarmCount; i+=4)
{
_mm_storeu_si128((__m128i*)&buffer->spriteIndicesAndScales[(writeIndex + i) * 2], farmAndScale);
_mm_stream_si128((__m128i*)&buffer->spriteIndicesAndScales[(writeIndex + i) * 2], farmAndScale);
}
writeIndex += AI_FarmerFarmCount;

@@ -18,6 +18,8 @@
#include <stdint.h>
#include <stdbool.h>

#include <immintrin.h>

// Window configuration constants.
// NOTE(review): width and height are presumably in pixels; confirm where they are consumed.
const uint16_t Window_Width = 1024;
const uint16_t Window_Height = 720;
// Pointer to a constant string; the pointer itself is not const-qualified and can be reassigned.
const char* Window_Title = "Holy Cheese";
@@ -125,7 +127,7 @@ void core_init(void)
.fs_images[0] = image
};

// Allocate the instance buffer and hand it to game_init.
// NOTE(review): this listing shows a diff without +/- markers. The malloc line looks like the
// removed version of the 64-byte-aligned _mm_malloc line below it; confirm against the repository.
// NOTE(review): the allocation result is not checked for NULL before being passed to game_init.
Render_InstanceBuffer = malloc(sizeof(Game_InstanceBuffer));
Render_InstanceBuffer = _mm_malloc(sizeof(Game_InstanceBuffer), 64);
game_init(Render_InstanceBuffer);
}

@@ -159,7 +161,7 @@ void core_frame(void)
void core_cleanup(void)
{
// Tear down game state first, then release the instance buffer.
game_kill();
// NOTE(review): this listing shows a diff without +/- markers. The free line looks like the
// removed version of the _mm_free line below it; read literally, the buffer would be released twice.
// Confirm against the repository.
// NOTE(review): Render_InstanceBuffer is not reset to NULL after release in the lines shown here.
free(Render_InstanceBuffer);
_mm_free(Render_InstanceBuffer);

sg_shutdown();

@@ -172,7 +174,7 @@ void core_initProfile(void)
Mist_ProfileInit();

// Profiled initialisation: the allocation and game_init run between the "Core"/"Init" begin and end markers.
MIST_PROFILE_BEGIN("Core", "Init");
// NOTE(review): this listing shows a diff without +/- markers. The malloc line looks like the
// removed version of the 64-byte-aligned _mm_malloc line below it; confirm against the repository.
// NOTE(review): the allocation result is not checked for NULL before being passed to game_init.
Render_InstanceBuffer = malloc(sizeof(Game_InstanceBuffer));
Render_InstanceBuffer = _mm_malloc(sizeof(Game_InstanceBuffer), 64);
game_init(Render_InstanceBuffer);
MIST_PROFILE_END("Core", "Init");
}
@@ -202,7 +204,7 @@ void core_cleanupProfile(void)
{
// Profiled cleanup: game teardown and buffer release run between the "Core"/"Cleanup" begin and end markers.
MIST_PROFILE_BEGIN("Core", "Cleanup");
game_kill();
// NOTE(review): this listing shows a diff without +/- markers. The free line looks like the
// removed version of the _mm_free line below it; read literally, the buffer would be released twice.
// Confirm against the repository.
free(Render_InstanceBuffer);
_mm_free(Render_InstanceBuffer);
MIST_PROFILE_END("Core", "Cleanup");

if (Mist_ProfileListSize() == 0)

0 comments on commit fafc0a2

Please sign in to comment.
You can’t perform that action at this time.