103103 vmovl.u8 q15 , \y1 @ 8px of y
104104
105105 vdup. 16 q5 , r9 @ q5 = y_offset
106- vdup. 16 q7 , r10 @ q7 = y_coeff
106+ vmov d14 , d0 @ q7 = y_coeff
107+ vmov d15 , d0 @ q7 = y_coeff
107108
108109 vsub.s16 q14 , q5
109110 vsub.s16 q15 , q5
184185 compute_8px_32 r11 , d30 , \ofmt
185186.endm
186187
187- .macro load_args
188+ .macro load_args_nvx
188189 push {r4 - r12 , lr}
189190 vpush {q4 - q7}
190191 ldr r4 , [ sp , # 104 ] @ r4 = srcY
206207 sub r7 , r7 , r0 @ r7 = linesizeC - width (paddingC)
207208.endm
208209
210+ .macro load_args_yuv420p @ prologue for planar yuv420p input: save registers, fetch stack arguments, set up row pointers and paddings
211+ push {r4 - r12 , lr} @ 10 core registers = 40 bytes
212+ vpush {q4 - q7} @ 4 q registers = 64 bytes, so the stack arguments are read from sp + 104 upward
213+ ldr r4 , [ sp , # 104 ] @ r4 = srcY
214+ ldr r5 , [ sp , # 108 ] @ r5 = linesizeY
215+ ldr r6 , [ sp , # 112 ] @ r6 = srcU
216+ ldr r8 , [ sp , # 128 ] @ r8 = table
217+ ldr r9 , [ sp , # 132 ] @ r9 = y_offset
218+ ldr r10 ,[ sp , # 136 ] @ r10 = y_coeff
219+ vdup. 16 d0 , r10 @ d0 = y_coeff replicated into all four 16-bit lanes
220+ vld1. 16 {d1} , [ r8 ] @ d1 = * table (one 64-bit load = four 16-bit values)
221+ add r11 , r2 , r3 @ r11 = dst + linesize (dst2)
222+ add r12 , r4 , r5 @ r12 = srcY + linesizeY (srcY2)
223+ lsl r3 , r3 , # 1 @ r3 = linesize * 2 (dst and dst2 each skip over the other row)
224+ lsl r5 , r5 , # 1 @ r5 = linesizeY * 2 (same for srcY and srcY2)
225+ lsl r8 , r0 , # 2 @ r8 = width * 4 (table pointer already consumed by the vld1 above)
226+ sub r3 , r3 , r8 @ r3 = linesize * 2 - width * 4 (padding)
227+ sub r5 , r5 , r0 @ r5 = linesizeY * 2 - width (paddingY)
228+ ldr r10 ,[ sp , # 120 ] @ r10 = srcV (loaded last: r10 held y_coeff until it was copied into d0)
229+ .endm
230+
209231.macro declare_func ifmt ofmt precision
210232function ff_\ifmt\()_to_\ofmt\()_neon_\precision\() , export= 1
211- load_args
233+
234+ .ifc \ifmt , nv12
235+ load_args_nvx
236+ .endif
237+
238+ .ifc \ifmt , nv21
239+ load_args_nvx
240+ .endif
241+
242+ .ifc \ifmt , yuv420p
243+ load_args_yuv420p
244+ .endif
245+
2122461 :
213247 mov r8 , r0 @ r8 = width
2142482 :
215249 pld [ r6 , # 64 * 3 ]
216250 pld [ r4 , # 64 * 3 ]
217251 pld [ r12 , # 64 * 3 ]
218252
219- vld2. 8 {d2 , d3} , [ r6 ] ! @ q1: interleaved chroma line
220253 vmov.i8 d10 , # 128
254+
221255.ifc \ifmt , nv12
256+ vld2. 8 {d2 , d3} , [ r6 ] ! @ q1: interleaved chroma line
222257 vsubl.u8 q14 , d2 , d10 @ q14 = U - 128
223258 vsubl.u8 q15 , d3 , d10 @ q15 = V - 128
224- .else
259+ .endif
260+
261+ .ifc \ifmt , nv21
262+ vld2. 8 {d2 , d3} , [ r6 ] ! @ q1: interleaved chroma line
225263 vsubl.u8 q14 , d3 , d10 @ q14 = U - 128
226264 vsubl.u8 q15 , d2 , d10 @ q15 = V - 128
227265.endif
228266
267+ .ifc \ifmt , yuv420p
268+ pld [ r10 , # 64 * 3 ] @ prefetch ahead in the second chroma plane (r10), as done for r6 above
269+
270+ vld1. 8 d2 , [ r6 ] ! @ d2: 8 bytes of the U (Cb) chroma line, r6 post-incremented
271+ vld1. 8 d3 , [ r10 ] ! @ d3: 8 bytes of the V (Cr) chroma line, r10 post-incremented
272+ vsubl.u8 q14 , d2 , d10 @ q14 = U - 128
273+ vsubl.u8 q15 , d3 , d10 @ q15 = V - 128
274+ .endif
275+
276+
229277 process_16px_\precision \ofmt
230278
231279 subs r8 , r8 , # 16 @ width - = 16
@@ -235,7 +283,24 @@ function ff_\ifmt\()_to_\ofmt\()_neon_\precision\(), export=1
235283 add r4 , r4 , r5 @ srcY + = paddingY
236284 add r11 , r11 , r3 @ dst2 + = padding
237285 add r12 , r12 , r5 @ srcY2 + = paddingY
286+
287+ .ifc \ifmt , nv12
238288 add r6 , r6 , r7 @ srcC + = paddingC
289+ .endif
290+
291+ .ifc \ifmt , nv21
292+ add r6 , r6 , r7 @ srcC + = paddingC
293+ .endif
294+
295+ .ifc \ifmt , yuv420p
296+ ldr r7 , [ sp , # 116 ] @ r7 = linesizeU (re-read from the stack; r7 is only a scratch register here)
297+ sub r7 , r7 , r0 , lsr # 1 @ r7 = linesizeU - width / 2 (paddingU)
298+ add r6 , r6 , r7 @ srcU + = paddingU
299+
300+ ldr r7 , [ sp , # 124 ] @ r7 = linesizeV
301+ sub r7 , r7 , r0 , lsr # 1 @ r7 = linesizeV - width / 2 (paddingV)
302+ add r10 , r10 , r7 @ srcV + = paddingV
303+ .endif
239304
240305 subs r1 , r1 , # 2 @ height - = 2
241306 bgt 1b
@@ -257,3 +322,5 @@ declare_rgb_funcs nv12, 16
257322declare_rgb_funcs nv21 , 16
258323declare_rgb_funcs nv12 , 32
259324declare_rgb_funcs nv21 , 32
325+ declare_rgb_funcs yuv420p , 16
326+ declare_rgb_funcs yuv420p , 32
0 commit comments