@@ -10,6 +10,14 @@ declare <3 x half> @llvm.canonicalize.v3f16(<3 x half>) #0
10
10
declare <4 x half > @llvm.canonicalize.v4f16 (<4 x half >) #0 ; canonicalize intrinsic on <4 x half>; called by the v4f16 tests in this file
11
11
declare i32 @llvm.amdgcn.workitem.id.x () #0 ; no call to this intrinsic is visible in this part of the file
12
12
13
+ ; GCN-LABEL: {{^}}test_fold_canonicalize_undef_value_f16:
14
+ ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
15
+ ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
16
+ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16 (half addrspace (1 )* %out ) #1 { ; Kernel test: canonicalize an undef scalar half and store the result to %out.
17
+ %canonicalized = call half @llvm.canonicalize.f16 (half undef ) ; undef operand; the checks above expect this to become the constant 0x7e00 (an f16 quiet-NaN encoding)
18
+ store half %canonicalized , half addrspace (1 )* %out
19
+ ret void
20
+ }
13
21
14
22
; GCN-LABEL: {{^}}v_test_canonicalize_var_f16:
15
23
; GFX89: v_max_f16_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
@@ -34,6 +42,21 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(half addrspace(1)* %out,
34
42
ret void
35
43
}
36
44
45
+ ; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_v2f16:
46
+ ; GFX9: v_and_b32_e32 v0, 0xffff, v0
47
+ ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
48
+ ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
49
+
50
+ ; VI: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
51
+ ; VI: v_max_f16_e32 v0, v0, v0
52
+ ; VI: v_or_b32_e32 v0, v0, v1
53
+ define <2 x half > @v_test_canonicalize_build_vector_v2f16 (half %lo , half %hi ) #1 { ; Test: canonicalize a <2 x half> assembled from two scalar arguments and return it.
54
+ %ins0 = insertelement <2 x half > undef , half %lo , i32 0 ; lane 0 = %lo
55
+ %ins1 = insertelement <2 x half > %ins0 , half %hi , i32 1 ; lane 1 = %hi, so both lanes are defined
56
+ %canonicalized = call <2 x half > @llvm.canonicalize.v2f16 (<2 x half > %ins1 )
57
+ ret <2 x half > %canonicalized
58
+ }
59
+
37
60
; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f16:
38
61
; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], |{{v[0-9]+}}|, |{{v[0-9]+}}|
39
62
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
@@ -515,6 +538,156 @@ define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 {
515
538
ret <4 x half > %canonicalized
516
539
}
517
540
541
+ ; GCN-LABEL: {{^}}s_test_canonicalize_undef_v2f16:
542
+ ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00
543
+ ; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
544
+ define amdgpu_kernel void @s_test_canonicalize_undef_v2f16 (<2 x half > addrspace (1 )* %out ) #1 { ; Kernel test: canonicalize a fully undef <2 x half> and store the result to %out.
545
+ %canonicalized = call <2 x half > @llvm.canonicalize.v2f16 (<2 x half > undef ) ; both lanes undef; the checks above expect the packed constant 0x7e007e00 (0x7e00 in each 16-bit lane)
546
+ store <2 x half > %canonicalized , <2 x half > addrspace (1 )* %out
547
+ ret void
548
+ }
549
+
550
+ ; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_v2f16:
551
+ ; GFX9: s_waitcnt
552
+ ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
553
+ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
554
+ ; GFX9-NEXT: s_setpc_b64
555
+
556
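+ ; On VI the high 16 bits of the result are already known to be zero, so no mask is expected after the v_max_f16.
556
+ ; FIXME: Should the high bits also be known zero on gfx9 by default? (The gfx9 checks above still expect a v_and_b32 with 0xffff.)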
558
+ ; VI: s_waitcnt
559
+ ; VI-NEXT: v_max_f16_e32 v0, v0, v0
560
+ ; VI-NEXT: s_setpc_b64
561
+ define <2 x half > @v_test_canonicalize_reg_undef_v2f16 (half %val ) #1 { ; Test: canonicalize a <2 x half> whose lane 0 is a register value and whose lane 1 is undef.
562
+ %vec = insertelement <2 x half > undef , half %val , i32 0 ; only lane 0 is defined; lane 1 stays undef
563
+ %canonicalized = call <2 x half > @llvm.canonicalize.v2f16 (<2 x half > %vec )
564
+ ret <2 x half > %canonicalized
565
+ }
566
+
567
+ ; GCN-LABEL: {{^}}v_test_canonicalize_undef_reg_v2f16:
568
+ ; GFX9: s_waitcnt
569
+ ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
570
+ ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, 0
571
+ ; GFX9-NEXT: s_setpc_b64
572
+
573
+ ; VI: s_waitcnt
574
+ ; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
575
+ ; VI-NEXT: s_setpc_b64
576
+ define <2 x half > @v_test_canonicalize_undef_reg_v2f16 (half %val ) #1 { ; Test: canonicalize a <2 x half> whose lane 0 is undef and whose lane 1 is a register value.
577
+ %vec = insertelement <2 x half > undef , half %val , i32 1 ; only lane 1 is defined; lane 0 stays undef
578
+ %canonicalized = call <2 x half > @llvm.canonicalize.v2f16 (<2 x half > %vec )
579
+ ret <2 x half > %canonicalized
580
+ }
581
+
582
+ ; GCN-LABEL: {{^}}v_test_canonicalize_reg_k_v2f16:
583
+ ; GFX9: s_waitcnt
584
+ ; GFX9-DAG: v_max_f16_e32 v0, v0, v0
585
+ ; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4000
586
+ ; GFX9: v_and_b32_e32 v0, 0xffff, v0
587
+ ; GFX9: v_lshl_or_b32 v0, [[K]], 16, v0
588
+ ; GFX9: s_setpc_b64
589
+
590
+ ; VI: s_waitcnt
591
+ ; VI-NEXT: v_max_f16_e32 v0, v0, v0
592
+ ; VI-NEXT: v_or_b32_e32 v0, 2.0, v0
593
+ ; VI-NEXT: s_setpc_b64
594
+ define <2 x half > @v_test_canonicalize_reg_k_v2f16 (half %val ) #1 { ; Test: canonicalize a <2 x half> with a register value in lane 0 and the constant 2.0 in lane 1.
595
+ %vec0 = insertelement <2 x half > undef , half %val , i32 0 ; lane 0 = %val
596
+ %vec1 = insertelement <2 x half > %vec0 , half 2.0 , i32 1 ; lane 1 = 2.0, i.e. f16 bits 0x4000 as used by the gfx9 checks above
597
+ %canonicalized = call <2 x half > @llvm.canonicalize.v2f16 (<2 x half > %vec1 )
598
+ ret <2 x half > %canonicalized
599
+ }
600
+
601
+ ; GCN-LABEL: {{^}}v_test_canonicalize_k_reg_v2f16:
602
+ ; GFX9: v_max_f16_e32 v0, v0, v0
603
+ ; GFX9: v_mov_b32_e32 [[K:v[0-9]+]], 0x4000
604
+ ; GFX9: v_lshl_or_b32 v0, v0, 16, [[K]]
605
+ ; GFX9: s_setpc_b64
606
+
607
+ ; VI: s_waitcnt
608
+ ; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
609
+ ; VI-NEXT: v_or_b32_e32 v0, 0x4000, v0
610
+ ; VI-NEXT: s_setpc_b64
611
+ define <2 x half > @v_test_canonicalize_k_reg_v2f16 (half %val ) #1 { ; Test: canonicalize a <2 x half> with the constant 2.0 in lane 0 and a register value in lane 1.
612
+ %vec0 = insertelement <2 x half > undef , half 2.0 , i32 0 ; lane 0 = 2.0, i.e. f16 bits 0x4000 as used by the checks above
613
+ %vec1 = insertelement <2 x half > %vec0 , half %val , i32 1 ; lane 1 = %val
614
+ %canonicalized = call <2 x half > @llvm.canonicalize.v2f16 (<2 x half > %vec1 )
615
+ ret <2 x half > %canonicalized
616
+ }
617
+
618
+ ; GCN-LABEL: {{^}}s_test_canonicalize_undef_v4f16:
619
+ ; GCN: v_mov_b32_e32 v0, 0x7e007e00
620
+ ; GCN: v_mov_b32_e32 v1, v0
621
+ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16 (<4 x half > addrspace (1 )* %out ) #1 { ; Kernel test: canonicalize a fully undef <4 x half> and store the result to %out.
622
+ %canonicalized = call <4 x half > @llvm.canonicalize.v4f16 (<4 x half > undef ) ; all four lanes undef; the checks above expect 0x7e007e00 in v0, then copied to v1
623
+ store <4 x half > %canonicalized , <4 x half > addrspace (1 )* %out
624
+ ret void
625
+ }
626
+
627
+ ; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_undef_undef_v4f16:
628
+ ; GFX9: s_waitcnt
629
+ ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
630
+ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
631
+ ; GFX9-NEXT: v_mov_b32_e32 v1, 0
632
+ ; GFX9-NEXT: s_setpc_b64
633
+
634
+ ; VI: s_waitcnt
635
+ ; VI-NEXT: v_max_f16_e32 v0, v0, v0
636
+ ; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0
637
+ ; VI-NEXT: v_mov_b32_e32 v1, 0x7e007e00
638
+ ; VI-NEXT: s_setpc_b64
639
+ define <4 x half > @v_test_canonicalize_reg_undef_undef_undef_v4f16 (half %val ) #1 { ; Test: canonicalize a <4 x half> in which only lane 0 holds a register value.
640
+ %vec = insertelement <4 x half > undef , half %val , i32 0 ; lane 0 = %val; lanes 1-3 stay undef
641
+ %canonicalized = call <4 x half > @llvm.canonicalize.v4f16 (<4 x half > %vec )
642
+ ret <4 x half > %canonicalized
643
+ }
644
+
645
+ ; GCN-LABEL: {{^}}v_test_canonicalize_reg_reg_undef_undef_v4f16:
646
+ ; GFX9: s_waitcnt
647
+ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
648
+ ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
649
+ ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
650
+ ; GFX9-NEXT: v_mov_b32_e32 v1, 0
651
+ ; GFX9-NEXT: s_setpc_b64
652
+
653
+ ; VI: s_waitcnt
654
+ ; VI-DAG: v_max_f16_e32 v0, v0, v0
655
+ ; VI-DAG: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
656
+ ; VI: v_or_b32_e32 v0, v0, v1
657
+ ; VI-NEXT: v_mov_b32_e32 v1, 0x7e007e00
658
+ ; VI-NEXT: s_setpc_b64
659
+ define <4 x half > @v_test_canonicalize_reg_reg_undef_undef_v4f16 (half %val0 , half %val1 ) #1 { ; Test: canonicalize a <4 x half> whose low two lanes are register values and whose high two lanes are undef.
660
+ %vec0 = insertelement <4 x half > undef , half %val0 , i32 0 ; lane 0 = %val0
661
+ %vec1 = insertelement <4 x half > %vec0 , half %val1 , i32 1 ; lane 1 = %val1; lanes 2-3 stay undef
662
+ %canonicalized = call <4 x half > @llvm.canonicalize.v4f16 (<4 x half > %vec1 )
663
+ ret <4 x half > %canonicalized
664
+ }
665
+
666
+ ; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_reg_reg_v4f16:
667
+ ; GFX9: s_waitcnt
668
+ ; GFX9-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
669
+ ; GFX9-NEXT: v_and_b32_e32 v1, [[MASK]], v1
670
+ ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
671
+ ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
672
+ ; GFX9-NEXT: v_and_b32_e32 v0, [[MASK]], v0
673
+ ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
674
+ ; GFX9-NEXT: s_setpc_b64
675
+
676
+ ; VI: s_waitcnt
677
+ ; VI-NEXT: v_max_f16_e32 v0, v0, v0
678
+ ; VI-NEXT: v_max_f16_e32 v1, v1, v1
679
+ ; VI-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
680
+ ; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0
681
+ ; VI-NEXT: v_or_b32_e32 v1, v1, v2
682
+ ; VI-NEXT: s_setpc_b64
683
+ define <4 x half > @v_test_canonicalize_reg_undef_reg_reg_v4f16 (half %val0 , half %val1 , half %val2 ) #1 { ; Test: canonicalize a <4 x half> with register values in lanes 0, 2 and 3 and an undef lane 1.
684
+ %vec0 = insertelement <4 x half > undef , half %val0 , i32 0 ; lane 0 = %val0
685
+ %vec1 = insertelement <4 x half > %vec0 , half %val1 , i32 2 ; lane 2 = %val1; lane 1 is skipped and stays undef
686
+ %vec2 = insertelement <4 x half > %vec1 , half %val2 , i32 3 ; lane 3 = %val2
687
+ %canonicalized = call <4 x half > @llvm.canonicalize.v4f16 (<4 x half > %vec2 )
688
+ ret <4 x half > %canonicalized
689
+ }
690
+
518
691
attributes #0 = { nounwind readnone } ; attached to the intrinsic declarations
519
692
attributes #1 = { nounwind } ; attached to the test functions defined in this section
520
693
attributes #2 = { nounwind "target-features" ="-fp64-fp16-denormals" } ; no use of #2 is visible here; presumably for tests that run with f16/f64 denormal support turned off -- TODO confirm
0 commit comments