@@ -278,7 +278,7 @@ SECTION .text
278278; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
279279;-----------------------------------------------------------------------------
280280INIT_XMM
281- c global x264_deblock_v_luma_sse2
281+ c global x264_deblock_v_luma_sse2 , 5 , 5 , 10
282282 movd m8 , [ r4 ] ; tc0
283283 lea r4 , [ r1 * 3 ]
284284 dec r2d ; alpha-1
@@ -318,54 +318,66 @@ cglobal x264_deblock_v_luma_sse2
318318 DEBLOCK_P0_Q0
319319 mova [ r4 + 2 * r1 ], m1
320320 mova [ r0 ], m2
321- ret
321+ RET
322322
323323;-----------------------------------------------------------------------------
324324; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
; Filters by transposing the pixels around pix into a stack buffer (pix_tmp),
; calling x264_deblock_v_luma_sse2 on that buffer, and transposing the
; filtered rows back to the original location.
; Register roles in the '+' (new) lines:
;   r10 = stride, r11 = 3*stride, r6/r5 = row pointers handed to PASS8ROWS.
325325;-----------------------------------------------------------------------------
326326INIT_MMX
327- c global x264_deblock_h_luma_sse2
328- movsxd r10 , esi
327+ c global x264_deblock_h_luma_sse2 , 5 , 7
328+ movsxd r10 , r1d ; r10 = r1d sign-extended to 64 bits (stride, going by the prototype's argument order)
329329 lea r11 , [ r10 + r10 * 2 ] ; r11 = 3 * r10
330- lea rax , [ r0 - 4 ]
331- lea r9 , [ r0 - 4 + r11 ]
330+ lea r6 , [ r0 - 4 ] ; r6 = pix - 4
331+ lea r5 , [ r0 - 4 + r11 ] ; r5 = pix - 4 + 3*stride
; WIN64 reserves 0x30 extra bytes below pix_tmp (0x98 = 0x68 + 0x30); the
; non-WIN64 path places pix_tmp directly at rsp.
332+ %ifdef WIN64
333+ sub rsp , 0x98
334+ %define pix_tmp rsp + 0x30
335+ %else
332336 sub rsp , 0x68
333337 %define pix_tmp rsp
338+ %endif
334339
335340 ; transpose 6x16 -> tmp space
336- TRANSPOSE6x8_MEM PASS8ROWS( rax , r9 , r10 , r11 ) , pix_tmp
337- lea rax , [ rax + r10 * 8 ]
338- lea r9 , [ r9 + r10 * 8 ]
339- TRANSPOSE6x8_MEM PASS8ROWS( rax , r9 , r10 , r11 ) , pix_tmp + 8
341+ TRANSPOSE6x8_MEM PASS8ROWS(r6 , r5 , r10 , r11 ) , pix_tmp
342+ lea r6 , [ r6 + r10 * 8 ] ; advance both row pointers by 8 rows for the second half
343+ lea r5 , [ r5 + r10 * 8 ]
344+ TRANSPOSE6x8_MEM PASS8ROWS(r6 , r5 , r10 , r11 ) , pix_tmp + 8
340345
341346 ; vertical filter
342347 ; alpha, beta, tc0 are still in r2d, r3d, r4
343- ; don't backup rax, r9 , r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
348+ ; don't backup r6, r5 , r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
344349 lea r0 , [ pix_tmp + 0x30 ] ; pix argument for the callee: 3 rows of 0x10 bytes into the tmp buffer
345- mov esi , 0x10
350+ mov r1d , 0x10 ; stride argument for the callee: tmp rows are 0x10 bytes apart
351+ %ifdef WIN64
352+ mov [ rsp + 0x20 ], r4 ; store tc0 (r4) at rsp+0x20 - presumably the 5th-argument stack slot just above the 32-byte Win64 shadow space; confirm against the cglobal prologue
353+ %endif
346354 call x264_deblock_v_luma_sse2
347355
348356 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
349- add rax , 2
350- add r9 , 2
357+ add r6 , 2 ; r6/r5 are still 8 rows down (advanced before the call); +2 moves them from pix-4 to pix-2
358+ add r5 , 2
351359 movq m0 , [ pix_tmp + 0x18 ]
352360 movq m1 , [ pix_tmp + 0x28 ]
353361 movq m2 , [ pix_tmp + 0x38 ]
354362 movq m3 , [ pix_tmp + 0x48 ]
355- TRANSPOSE8x4_STORE PASS8ROWS( rax , r9 , r10 , r11 )
363+ TRANSPOSE8x4_STORE PASS8ROWS(r6 , r5 , r10 , r11 )
356364
357365 shl r10 , 3 ; r10 = 8*stride, used only to step the row pointers back
358- sub rax , r10
359- sub r9 , r10
366+ sub r6 , r10 ; back to the first 8 rows
367+ sub r5 , r10
360368 shr r10 , 3 ; NOTE(review): logical shift, so r10 is restored to stride only when stride is non-negative (sar would also cover negative values)
361369 movq m0 , [ pix_tmp + 0x10 ]
362370 movq m1 , [ pix_tmp + 0x20 ]
363371 movq m2 , [ pix_tmp + 0x30 ]
364372 movq m3 , [ pix_tmp + 0x40 ]
365- TRANSPOSE8x4_STORE PASS8ROWS( rax , r9 , r10 , r11 )
373+ TRANSPOSE8x4_STORE PASS8ROWS(r6 , r5 , r10 , r11 )
366374
; Release the tmp space; each size must match the corresponding sub rsp above.
375+ %ifdef WIN64
376+ add rsp , 0x98
377+ %else
367378 add rsp , 0x68
368- ret
379+ %endif
380+ RET
369381
370382%else
371383
@@ -388,7 +400,7 @@ cglobal x264_deblock_%2_luma_%1, 5,5
388400 mova m3 , [ r0 + r1 ] ; q1
389401 LOAD_MASK r2 , r3
390402
391- mov r3 , r4m
403+ mov r3 , r4mp
392404 movd m4 , [ r3 ] ; tc0
393405 punpcklbw m4 , m4
394406 punpcklbw m4 , m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
@@ -428,7 +440,7 @@ cglobal x264_deblock_%2_luma_%1, 5,5
428440;-----------------------------------------------------------------------------
429441INIT_MMX
430442c global x264_deblock_h_luma_% 1 , 0 , 5
431- mov r0 , r0m
443+ mov r0 , r0mp
432444 mov r3 , r1m
433445 lea r4 , [ r3 * 3 ]
434446 sub r0 , 4
@@ -459,7 +471,7 @@ cglobal x264_deblock_h_luma_%1, 0,5
459471 ADD esp , 20
460472
461473 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
462- mov r0 , r0m
474+ mov r0 , r0mp
463475 sub r0 , 2
464476 lea r1 , [ r0 + r4 ]
465477
@@ -607,7 +619,7 @@ DEBLOCK_LUMA sse2, v, 16
607619;-----------------------------------------------------------------------------
608620; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
609621;-----------------------------------------------------------------------------
610- c global x264_deblock_%2_luma_intra_% 1 , 4 , 6
622+ c global x264_deblock_%2_luma_intra_% 1 , 4 , 6 , 16
611623%ifndef ARCH_X86_64
612624 sub esp , 0x60
613625%endif
@@ -669,34 +681,34 @@ INIT_MMX
669681;-----------------------------------------------------------------------------
670682; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
; Filters by transposing 8x16 pixels starting at pix-4 into a 0x88-byte stack
; buffer (pix_tmp), calling x264_deblock_v_luma_intra_%1 on that buffer, and
; transposing the result back to the original location.
; Register roles in the '+' (new) lines:
;   r10 = stride, r11 = 3*stride, r6/r5 = row pointers handed to PASS8ROWS.
; NOTE(review): unlike x264_deblock_h_luma_sse2, no extra WIN64 space is
; reserved below pix_tmp before the call - confirm the callee never writes to
; its caller-provided shadow space, which would overlap pix_tmp here.
671683;-----------------------------------------------------------------------------
672- c global x264_deblock_h_luma_intra_% 1
684+ c global x264_deblock_h_luma_intra_% 1 , 4 , 7
673685 movsxd r10 , r1d ; r10 = r1d sign-extended to 64 bits (stride, going by the prototype's argument order)
674686 lea r11 , [ r10 * 3 ] ; r11 = 3 * r10
675- lea rax , [ r0 - 4 ]
676- lea r9 , [ r0 - 4 + r11 ]
687+ lea r6 , [ r0 - 4 ] ; r6 = pix - 4
688+ lea r5 , [ r0 - 4 + r11 ] ; r5 = pix - 4 + 3*stride
677689 sub rsp , 0x88 ; tmp buffer; released by the matching add rsp before RET
678690 %define pix_tmp rsp
679691
680692 ; transpose 8x16 -> tmp space
681- TRANSPOSE8x8_MEM PASS8ROWS( rax , r9 , r10 , r11 ) , PASS8ROWS(pix_tmp , pix_tmp + 0x30 , 0x10 , 0x30 )
682- lea rax , [ rax + r10 * 8 ]
683- lea r9 , [ r9 + r10 * 8 ]
684- TRANSPOSE8x8_MEM PASS8ROWS( rax , r9 , r10 , r11 ) , PASS8ROWS(pix_tmp + 8 , pix_tmp + 0x38 , 0x10 , 0x30 )
693+ TRANSPOSE8x8_MEM PASS8ROWS(r6 , r5 , r10 , r11 ) , PASS8ROWS(pix_tmp , pix_tmp + 0x30 , 0x10 , 0x30 )
694+ lea r6 , [ r6 + r10 * 8 ] ; advance both row pointers by 8 rows for the second half
695+ lea r5 , [ r5 + r10 * 8 ]
696+ TRANSPOSE8x8_MEM PASS8ROWS(r6 , r5 , r10 , r11 ) , PASS8ROWS(pix_tmp + 8 , pix_tmp + 0x38 , 0x10 , 0x30 )
685697
686698 lea r0 , [ pix_tmp + 0x40 ] ; pix argument for the callee: 4 rows of 0x10 bytes into the tmp buffer
687699 mov r1 , 0x10 ; stride argument for the callee: tmp rows are 0x10 bytes apart
688700 call x264_deblock_v_luma_intra_% 1
689701
690702 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
691- lea r9 , [ rax + r11 ]
703+ lea r5 , [ r6 + r11 ] ; recompute r5 = r6 + 3*stride after the call (presumably because the callee may clobber it - confirm)
692- TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp + 8 , pix_tmp + 0x38 , 0x10 , 0x30 ) , PASS8ROWS( rax , r9 , r10 , r11 )
704+ TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp + 8 , pix_tmp + 0x38 , 0x10 , 0x30 ) , PASS8ROWS(r6 , r5 , r10 , r11 )
693705 shl r10 , 3 ; r10 = 8*stride, used only to step the row pointers back
694- sub rax , r10
695- sub r9 , r10
706+ sub r6 , r10 ; back to the first 8 rows
707+ sub r5 , r10
696708 shr r10 , 3 ; NOTE(review): logical shift, so r10 is restored to stride only when stride is non-negative (sar would also cover negative values)
697- TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp , pix_tmp + 0x30 , 0x10 , 0x30 ) , PASS8ROWS( rax , r9 , r10 , r11 )
709+ TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp , pix_tmp + 0x30 , 0x10 , 0x30 ) , PASS8ROWS(r6 , r5 , r10 , r11 )
698710 add rsp , 0x88
699- ret
711+ RET
700712%else
701713c global x264_deblock_h_luma_intra_% 1 , 2 , 4
702714 lea r3 , [ r1 * 3 ]
@@ -725,7 +737,7 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4
725737 ADD esp , 16
726738
727739 mov r1 , r1m
728- mov r0 , r0m
740+ mov r0 , r0mp
729741 lea r3 , [ r1 * 3 ]
730742 sub r0 , 4
731743 lea r2 , [ r0 + r3 ]
0 commit comments