Skip to content

Commit 3f87f39

Browse files
John Adcock authored and Jason Garrett-Glaser committed
Update x264 asm code to latest to add support for 64-bit Windows.
Use the new x86inc features to support 64-bit Windows on all non-x264 nasm assembly code as well. Patch by John Adcock, dscaler.johnad AT googlemail DOT com. Win64 changes originally by Anton Mitrofanov. x86util changes mostly by Holger Lubitz. Originally committed as revision 19580 to svn://svn.ffmpeg.org/ffmpeg/trunk
1 parent d8c2f8f commit 3f87f39

File tree

5 files changed

+507
-142
lines changed

5 files changed

+507
-142
lines changed

libavcodec/x86/fft_mmx.asm

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -457,7 +457,7 @@ section .text
457457

458458
; On x86_32, this function does the register saving and restoring for all of fft.
459459
; The others pass args in registers and don't spill anything.
460-
cglobal fft_dispatch%3%2, 2,5,0, z, nbits
460+
cglobal fft_dispatch%3%2, 2,5,8, z, nbits
461461
lea r2, [dispatch_tab%3%2 GLOBAL]
462462
mov r2, [r2 + (nbitsq-2)*gprsize]
463463
call r2

libavcodec/x86/h264_deblock_sse2.asm

Lines changed: 49 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ SECTION .text
278278
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
279279
;-----------------------------------------------------------------------------
280280
INIT_XMM
281-
cglobal x264_deblock_v_luma_sse2
281+
cglobal x264_deblock_v_luma_sse2, 5,5,10
282282
movd m8, [r4] ; tc0
283283
lea r4, [r1*3]
284284
dec r2d ; alpha-1
@@ -318,54 +318,66 @@ cglobal x264_deblock_v_luma_sse2
318318
DEBLOCK_P0_Q0
319319
mova [r4+2*r1], m1
320320
mova [r0], m2
321-
ret
321+
RET
322322

323323
;-----------------------------------------------------------------------------
324324
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
325325
;-----------------------------------------------------------------------------
326326
INIT_MMX
327-
cglobal x264_deblock_h_luma_sse2
328-
movsxd r10, esi
327+
cglobal x264_deblock_h_luma_sse2, 5,7
328+
movsxd r10, r1d
329329
lea r11, [r10+r10*2]
330-
lea rax, [r0-4]
331-
lea r9, [r0-4+r11]
330+
lea r6, [r0-4]
331+
lea r5, [r0-4+r11]
332+
%ifdef WIN64
333+
sub rsp, 0x98
334+
%define pix_tmp rsp+0x30
335+
%else
332336
sub rsp, 0x68
333337
%define pix_tmp rsp
338+
%endif
334339

335340
; transpose 6x16 -> tmp space
336-
TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp
337-
lea rax, [rax+r10*8]
338-
lea r9, [r9 +r10*8]
339-
TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8
341+
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp
342+
lea r6, [r6+r10*8]
343+
lea r5, [r5+r10*8]
344+
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8
340345

341346
; vertical filter
342347
; alpha, beta, tc0 are still in r2d, r3d, r4
343-
; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
348+
; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
344349
lea r0, [pix_tmp+0x30]
345-
mov esi, 0x10
350+
mov r1d, 0x10
351+
%ifdef WIN64
352+
mov [rsp+0x20], r4
353+
%endif
346354
call x264_deblock_v_luma_sse2
347355

348356
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
349-
add rax, 2
350-
add r9, 2
357+
add r6, 2
358+
add r5, 2
351359
movq m0, [pix_tmp+0x18]
352360
movq m1, [pix_tmp+0x28]
353361
movq m2, [pix_tmp+0x38]
354362
movq m3, [pix_tmp+0x48]
355-
TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
363+
TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)
356364

357365
shl r10, 3
358-
sub rax, r10
359-
sub r9, r10
366+
sub r6, r10
367+
sub r5, r10
360368
shr r10, 3
361369
movq m0, [pix_tmp+0x10]
362370
movq m1, [pix_tmp+0x20]
363371
movq m2, [pix_tmp+0x30]
364372
movq m3, [pix_tmp+0x40]
365-
TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
373+
TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)
366374

375+
%ifdef WIN64
376+
add rsp, 0x98
377+
%else
367378
add rsp, 0x68
368-
ret
379+
%endif
380+
RET
369381

370382
%else
371383

@@ -388,7 +400,7 @@ cglobal x264_deblock_%2_luma_%1, 5,5
388400
mova m3, [r0+r1] ; q1
389401
LOAD_MASK r2, r3
390402

391-
mov r3, r4m
403+
mov r3, r4mp
392404
movd m4, [r3] ; tc0
393405
punpcklbw m4, m4
394406
punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
@@ -428,7 +440,7 @@ cglobal x264_deblock_%2_luma_%1, 5,5
428440
;-----------------------------------------------------------------------------
429441
INIT_MMX
430442
cglobal x264_deblock_h_luma_%1, 0,5
431-
mov r0, r0m
443+
mov r0, r0mp
432444
mov r3, r1m
433445
lea r4, [r3*3]
434446
sub r0, 4
@@ -459,7 +471,7 @@ cglobal x264_deblock_h_luma_%1, 0,5
459471
ADD esp, 20
460472

461473
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
462-
mov r0, r0m
474+
mov r0, r0mp
463475
sub r0, 2
464476
lea r1, [r0+r4]
465477

@@ -607,7 +619,7 @@ DEBLOCK_LUMA sse2, v, 16
607619
;-----------------------------------------------------------------------------
608620
; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
609621
;-----------------------------------------------------------------------------
610-
cglobal x264_deblock_%2_luma_intra_%1, 4,6
622+
cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
611623
%ifndef ARCH_X86_64
612624
sub esp, 0x60
613625
%endif
@@ -669,34 +681,34 @@ INIT_MMX
669681
;-----------------------------------------------------------------------------
670682
; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
671683
;-----------------------------------------------------------------------------
672-
cglobal x264_deblock_h_luma_intra_%1
684+
cglobal x264_deblock_h_luma_intra_%1, 4,7
673685
movsxd r10, r1d
674686
lea r11, [r10*3]
675-
lea rax, [r0-4]
676-
lea r9, [r0-4+r11]
687+
lea r6, [r0-4]
688+
lea r5, [r0-4+r11]
677689
sub rsp, 0x88
678690
%define pix_tmp rsp
679691

680692
; transpose 8x16 -> tmp space
681-
TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
682-
lea rax, [rax+r10*8]
683-
lea r9, [r9+r10*8]
684-
TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
693+
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
694+
lea r6, [r6+r10*8]
695+
lea r5, [r5+r10*8]
696+
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
685697

686698
lea r0, [pix_tmp+0x40]
687699
mov r1, 0x10
688700
call x264_deblock_v_luma_intra_%1
689701

690702
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
691-
lea r9, [rax+r11]
692-
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
703+
lea r5, [r6+r11]
704+
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
693705
shl r10, 3
694-
sub rax, r10
695-
sub r9, r10
706+
sub r6, r10
707+
sub r5, r10
696708
shr r10, 3
697-
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
709+
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
698710
add rsp, 0x88
699-
ret
711+
RET
700712
%else
701713
cglobal x264_deblock_h_luma_intra_%1, 2,4
702714
lea r3, [r1*3]
@@ -725,7 +737,7 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4
725737
ADD esp, 16
726738

727739
mov r1, r1m
728-
mov r0, r0m
740+
mov r0, r0mp
729741
lea r3, [r1*3]
730742
sub r0, 4
731743
lea r2, [r0+r3]

libavcodec/x86/h264_idct_sse2.asm

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -31,15 +31,8 @@ pw_32: times 8 dw 32
3131

3232
SECTION .text
3333

34-
%macro IDCT4_1D 6
35-
SUMSUB_BA m%3, m%1
36-
SUMSUBD2_AB m%2, m%4, m%6, m%5
37-
SUMSUB_BADC m%2, m%3, m%5, m%1
38-
SWAP %1, %2, %5, %4, %3
39-
%endmacro
40-
4134
INIT_XMM
42-
cglobal x264_add8x4_idct_sse2, 3,3
35+
cglobal x264_add8x4_idct_sse2, 3,3,8
4336
movq m0, [r1+ 0]
4437
movq m1, [r1+ 8]
4538
movq m2, [r1+16]

0 commit comments

Comments
 (0)