Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
pl_mpeg: rewrite AC decoding to improve performance (~10/20%)
- Loading branch information
Showing
1 changed file
with
47 additions
and
31 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3473,51 +3473,67 @@ void plm_video_decode_block(plm_video_t *self, int block) { | |
// Decode AC coefficients (+DC for non-intra) | ||
PROFILE_START(PS_MPEG_MB_DECODE_AC, 0); | ||
plm_buffer_has(self->buffer, 64*24); | ||
int level = 0; | ||
while (TRUE) { | ||
static const uint16_t qtable0[128] __attribute__((aligned(16))) = { 0,0,0,0,65535,65535,65535,65535,49666,49666,51457,51457,49156,49156,51201,51201,42753,42753,42753,42753,42497,42497,42497,42497,41218,41218,41218,41218,42241,42241,42241,42241,60673,57350,60417,60161,58114,57603,57349,59905,32771,32771,32771,32771,32771,32771,32771,32771,33793,33793,33793,33793,33793,33793,33793,33793,33537,33537,33537,33537,33537,33537,33537,33537,24578,24578,24578,24578,24578,24578,24578,24578,24578,24578,24578,24578,24578,24578,24578,24578,25089,25089,25089,25089,25089,25089,25089,25089,25089,25089,25089,25089,25089,25089,25089,25089,16641,16641,16641,16641,16641,16641,16641,16641,16641,16641,16641,16641,16641,16641,16641,16641,16641,16641,16641,16641,16641,16641,16641,16641,16641,16641,16641,16641,16641,16641,16641,16641 }; | ||
static const uint16_t qtables1[256] __attribute__((aligned(16))) = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 57618, 57617, 57616, 57615, 58883, 61442, 61186, 60930, 60674, 60418, 60162, 65281, 65025, 64769, 64513, 64257, 49192, 49192, 49191, 49191, 49190, 49190, 49189, 49189, 49188, 49188, 49187, 49187, 49186, 49186, 49185, 49185, 49184, 49184, 49422, 49422, 49421, 49421, 49420, 49420, 49419, 49419, 49418, 49418, 49417, 49417, 49416, 49416, 40991, 40991, 40991, 40991, 40990, 40990, 40990, 40990, 40989, 40989, 40989, 40989, 40988, 40988, 40988, 40988, 40987, 40987, 40987, 40987, 40986, 40986, 40986, 40986, 40985, 40985, 40985, 40985, 40984, 40984, 40984, 40984, 40983, 40983, 40983, 40983, 40982, 40982, 40982, 40982, 40981, 40981, 40981, 40981, 40980, 40980, 40980, 40980, 40979, 40979, 40979, 40979, 40978, 40978, 40978, 40978, 40977, 40977, 40977, 40977, 40976, 40976, 40976, 40976, 35330, 35330, 35330, 35330, 35330, 35330, 35330, 35330, 35074, 35074, 35074, 35074, 35074, 35074, 35074, 35074, 34051, 34051, 34051, 34051, 34051, 34051, 34051, 34051, 33540, 33540, 33540, 33540, 33540, 33540, 33540, 33540, 33285, 33285, 33285, 33285, 33285, 33285, 33285, 33285, 33031, 33031, 33031, 33031, 33031, 33031, 33031, 33031, 33030, 33030, 33030, 33030, 33030, 33030, 33030, 33030, 32783, 32783, 32783, 32783, 32783, 32783, 32783, 32783, 32782, 32782, 32782, 32782, 32782, 32782, 32782, 32782, 32781, 32781, 32781, 32781, 32781, 32781, 32781, 32781, 32780, 32780, 32780, 32780, 32780, 32780, 32780, 32780, 39425, 39425, 39425, 39425, 39425, 39425, 39425, 39425, 39169, 39169, 39169, 39169, 39169, 39169, 39169, 39169, 38913, 38913, 38913, 38913, 38913, 38913, 38913, 38913, 38657, 38657, 38657, 38657, 38657, 38657, 38657, 38657, 38401, 38401, 38401, 38401, 38401, 38401, 38401, 38401 }; | ||
static const uint16_t qtables2[16] __attribute__((aligned(16))) = { 24587, 26626, 25603, 24586, 25092, 26370, 29953, 29697, 24585, 29441, 29185, 24837, 25347, 24584, 26114, 28929 }; | ||
static const uint16_t qtables34[2][4] __attribute__((aligned(16))) = { {12289, 9474, 8199, 8707}, {8452, 12033, 11777, 9218} }; | ||
static const uint16_t *tbl[4] = { qtables1, qtables2, qtables34[0], qtables34[1] }; | ||
static const uint8_t shift[4] = { 0,4,6,6 }; | ||
|
||
int level = 0; | ||
int run = 0; | ||
PROFILE_START(PS_MPEG_MB_DECODE_AC_VLC, 0); | ||
uint16_t coeff = plm_video_decode_dct_coeff(self->buffer); | ||
// uint16_t coeff = plm_buffer_read_vlc_uint(self->buffer, PLM_VIDEO_DCT_COEFF); | ||
PROFILE_STOP(PS_MPEG_MB_DECODE_AC_VLC, 0); | ||
unsigned int coeff; | ||
|
||
PROFILE_START(PS_MPEG_MB_DECODE_AC_CODE, 0); | ||
uint64_t bits = plm_buffer_showbits(self->buffer); | ||
uint64_t bits = plm_buffer_showbits2(self->buffer); | ||
#define readbits(n) ({ uint64_t val = bits>>(64-n); bits <<= n; self->buffer->bit_index += n; val; }) | ||
|
||
if ((coeff == 0x0001) && (n > 0) && (readbits(1) == 0)) { | ||
// end_of_block | ||
break; | ||
} | ||
if (coeff == 0xffff) { | ||
// escape | ||
run = readbits(6); | ||
level = readbits(8); | ||
if (level == 0) { | ||
level = readbits(8); | ||
} | ||
else if (level == 128) { | ||
level = readbits(8) - 256; | ||
} | ||
else if (level > 128) { | ||
level = level - 256; | ||
} | ||
} | ||
else { | ||
run = coeff >> 8; | ||
level = coeff & 0xff; | ||
if (bits>>63 == 1) { | ||
readbits(1); | ||
if ((n > 0) && (readbits(1) == 0)) | ||
break; | ||
run = 0; | ||
level = 1; | ||
if (readbits(1)) { | ||
level = -level; | ||
} | ||
} else { | ||
unsigned int bit0 = (bits>>56) & 0xff; | ||
if (bit0 >= 4) { | ||
coeff = qtable0[bit0]; | ||
} else { | ||
unsigned int bit1 = (bits >> 48) & 0xff; | ||
coeff = tbl[bit0][bit1 >> shift[bit0]]; | ||
readbits(8); | ||
if (coeff == 0xffff) __builtin_unreachable(); | ||
} | ||
if (coeff == 0xffff) { | ||
This comment has been minimized.
Sorry, something went wrong.
This comment has been minimized.
Sorry, something went wrong.
rasky
Author
Collaborator
|
||
readbits(6); | ||
run = readbits(6); | ||
level = readbits(8); | ||
if (level == 0) { | ||
level = readbits(8); | ||
} else if (level == 128) { | ||
level = readbits(8) - 256; | ||
} else if (level > 128) { | ||
level = level - 256; | ||
} | ||
} else { | ||
readbits((coeff >> 13) + 1); | ||
coeff &= 0x1fff; | ||
run = coeff >> 8; | ||
level = coeff & 0xff; | ||
if (readbits(1)) { | ||
level = -level; | ||
} | ||
} | ||
} | ||
|
||
n += run; | ||
if (n < 0 || n >= 64) { | ||
fprintf(stderr, "INVALID AC COEFF\n"); | ||
return; // invalid | ||
} | ||
PROFILE_STOP(PS_MPEG_MB_DECODE_AC_CODE, 0); | ||
PROFILE_START(PS_MPEG_MB_DECODE_AC_DEQUANT, 0); | ||
|
||
if (RSP_MODE < 2) { | ||
int de_zig_zagged = PLM_VIDEO_ZIG_ZAG[n]; | ||
|
@@ -3546,7 +3562,7 @@ void plm_video_decode_block(plm_video_t *self, int block) { | |
rsp_mpeg1_block_coeff(n, level); | ||
} | ||
n++; | ||
PROFILE_STOP(PS_MPEG_MB_DECODE_AC_DEQUANT, 0); | ||
// PROFILE_STOP(PS_MPEG_MB_DECODE_AC_DEQUANT, 0); | ||
} | ||
PROFILE_STOP(PS_MPEG_MB_DECODE_AC, 0); | ||
|
||
|
Random idea, but this case looks very unlikely, so you could:
Then use:
GCC will then make sure the code path that's the most likely will be the one without the branch penalty.