
Commit d710d2d

jroelofs authored and tomtor committed
[Matrix] Propagate shape information through cast insts (llvm#141869)
1 parent aaeba9e commit d710d2d
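
What the change does: isUniformShape now treats element-count-preserving casts (trunc, zext, sext, fptoui, fptosi, uitofp, sitofp, fptrunc, fpext, and same-element-count bitcasts) as shape-preserving, and the lowering gains a VisitCastInstruction that rewrites such casts column by column. As a hedged illustration in the style of the new tests (the function below is hypothetical, not part of the commit), the fpext here inherits the 2x2 shape from the matrix load and is lowered as two <2 x float> to <2 x double> casts instead of one flattened <4 x float> cast:

define void @fpext_2x2_sketch(ptr %in, ptr %out) {
  ; Hypothetical example: the cast sits between two shape-carrying
  ; matrix intrinsics, so the pass can now infer its shape.
  %a = call <4 x float> @llvm.matrix.column.major.load(ptr %in, i64 2, i1 false, i32 2, i32 2)
  %e = fpext <4 x float> %a to <4 x double>
  call void @llvm.matrix.column.major.store(<4 x double> %e, ptr %out, i64 2, i1 false, i32 2, i32 2)
  ret void
}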

File tree

2 files changed: +306 -0 lines changed


llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp

Lines changed: 56 additions & 0 deletions
@@ -33,8 +33,10 @@
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/MatrixBuilder.h"
@@ -249,6 +251,34 @@ static bool isUniformShape(Value *V) {
   if (I->isBinaryOp())
     return true;
 
+  if (auto *Cast = dyn_cast<CastInst>(V)) {
+    switch (Cast->getOpcode()) {
+    case llvm::Instruction::Trunc:
+    case llvm::Instruction::ZExt:
+    case llvm::Instruction::SExt:
+    case llvm::Instruction::FPToUI:
+    case llvm::Instruction::FPToSI:
+    case llvm::Instruction::UIToFP:
+    case llvm::Instruction::SIToFP:
+    case llvm::Instruction::FPTrunc:
+    case llvm::Instruction::FPExt:
+      return true;
+    case llvm::Instruction::AddrSpaceCast:
+    case CastInst::PtrToInt:
+    case CastInst::IntToPtr:
+      return false;
+    case CastInst::BitCast: {
+      if (auto *SrcVTy = dyn_cast<FixedVectorType>(Cast->getSrcTy()))
+        if (auto *DestVTy = dyn_cast<FixedVectorType>(Cast->getDestTy()))
+          return SrcVTy->getNumElements() == DestVTy->getNumElements();
+      return false;
+    }
+    case llvm::Instruction::CastOpsEnd:
+      llvm_unreachable("not an actual cast op");
+    }
+    llvm_unreachable("unhandled cast opcode");
+  }
+
   if (auto *II = dyn_cast<IntrinsicInst>(V))
     switch (II->getIntrinsicID()) {
     case Intrinsic::abs:
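
Note on the classification above: integer and floating-point conversions produce exactly one result element per operand element, so a matrix's column boundaries survive them; the pointer-involving casts (addrspacecast, ptrtoint, inttoptr) are rejected, since matrix values have integer or floating-point elements; and a bitcast preserves shape only when source and destination vectors have the same element count. A minimal sketch of the bitcast distinction, with hypothetical values not taken from the commit:

define void @bitcast_shape_sketch(<4 x double> %m) {
  ; 4 elements in, 4 elements out: a 2x2 column layout is preserved,
  ; so the shape can propagate through this cast.
  %same = bitcast <4 x double> %m to <4 x i64>
  ; 4 elements in, 8 elements out: the original column boundaries no
  ; longer line up, so this cast is left on the flattened vector.
  %widened = bitcast <4 x double> %m to <8 x i32>
  ret void
}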
@@ -1112,6 +1142,8 @@ class LowerMatrixIntrinsics {
     Value *Op2;
     if (auto *BinOp = dyn_cast<BinaryOperator>(Inst))
       VisitBinaryOperator(BinOp, SI);
+    else if (auto *Cast = dyn_cast<CastInst>(Inst))
+      VisitCastInstruction(Cast, SI);
     else if (auto *UnOp = dyn_cast<UnaryOperator>(Inst))
       VisitUnaryOperator(UnOp, SI);
     else if (IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Inst))
@@ -2262,6 +2294,30 @@ class LowerMatrixIntrinsics {
                      Builder);
   }
 
+  /// Lower cast instructions.
+  void VisitCastInstruction(CastInst *Inst, const ShapeInfo &Shape) {
+    Value *Op = Inst->getOperand(0);
+
+    IRBuilder<> Builder(Inst);
+
+    MatrixTy Result;
+    MatrixTy M = getMatrix(Op, Shape, Builder);
+
+    Builder.setFastMathFlags(getFastMathFlags(Inst));
+
+    auto *OrigVTy = cast<VectorType>(Inst->getType());
+    auto *NewVTy = VectorType::get(OrigVTy->getElementType(),
+                                   ElementCount::getFixed(M.getStride()));
+
+    for (auto &Vector : M.vectors())
+      Result.addVector(Builder.CreateCast(Inst->getOpcode(), Vector, NewVTy));
+
+    finalizeLowering(Inst,
+                     Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
+                                             Result.getNumVectors()),
+                     Builder);
+  }
+
   /// Helper to linearize a matrix expression tree into a string. Currently
   /// matrix expressions are linarized by starting at an expression leaf and
   /// linearizing bottom up.
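
How VisitCastInstruction lowers a cast: getMatrix splits the operand into its column vectors, NewVTy retypes the cast result to the per-vector element count (the stride), and one cast is emitted per column, with the compute-op count recorded via addNumComputeOps. A comment-only before/after sketch using hypothetical value names:

; Before lowering: a single cast of the flattened 2x2 matrix value.
;   %op = trunc <4 x i64> %inv to <4 x i32>
; After lowering: the operand is split into stride-sized columns and
; each column is cast separately.
;   %c0 = trunc <2 x i64> %col0 to <2 x i32>
;   %c1 = trunc <2 x i64> %col1 to <2 x i32>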
Lines changed: 250 additions & 0 deletions
@@ -0,0 +1,250 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s
+
+define void @fneg_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @fneg_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[IN:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg <2 x float> [[COL_LOAD]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fneg <2 x float> [[COL_LOAD1]]
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr float, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr [[VEC_GEP2]], align 4
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x float>, ptr %in
+  %op = fneg <4 x float> %inv
+  call void @llvm.matrix.column.major.store(<4 x float> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @trunc_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @trunc_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x i64>, ptr [[IN:%.*]], align 32
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i64, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x i64>, ptr [[VEC_GEP]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i64> [[COL_LOAD]] to <2 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <2 x i64> [[COL_LOAD1]] to <2 x i32>
+; CHECK-NEXT:    store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x i32> [[TMP2]], ptr [[VEC_GEP2]], align 4
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x i64>, ptr %in
+  %op = trunc <4 x i64> %inv to <4 x i32>
+  call void @llvm.matrix.column.major.store(<4 x i32> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @zext_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @zext_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x i16>, ptr [[IN:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i16, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x i16>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <2 x i16> [[COL_LOAD]] to <2 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <2 x i16> [[COL_LOAD1]] to <2 x i32>
+; CHECK-NEXT:    store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x i32> [[TMP2]], ptr [[VEC_GEP2]], align 4
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x i16>, ptr %in
+  %op = zext <4 x i16> %inv to <4 x i32>
+  call void @llvm.matrix.column.major.store(<4 x i32> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @sext_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @sext_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x i8>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i8, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x i8>, ptr [[VEC_GEP]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <2 x i8> [[COL_LOAD]] to <2 x i16>
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i8> [[COL_LOAD1]] to <2 x i16>
+; CHECK-NEXT:    store <2 x i16> [[TMP1]], ptr [[OUT:%.*]], align 2
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr i16, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x i16> [[TMP2]], ptr [[VEC_GEP2]], align 2
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x i8>, ptr %in
+  %op = sext <4 x i8> %inv to <4 x i16>
+  call void @llvm.matrix.column.major.store(<4 x i16> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @fptoui_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @fptoui_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[IN:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = fptoui <2 x float> [[COL_LOAD]] to <2 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = fptoui <2 x float> [[COL_LOAD1]] to <2 x i32>
+; CHECK-NEXT:    store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x i32> [[TMP2]], ptr [[VEC_GEP2]], align 4
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x float>, ptr %in
+  %op = fptoui <4 x float> %inv to <4 x i32>
+  call void @llvm.matrix.column.major.store(<4 x i32> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @fptosi_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @fptosi_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[IN:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = fptosi <2 x float> [[COL_LOAD]] to <2 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = fptosi <2 x float> [[COL_LOAD1]] to <2 x i32>
+; CHECK-NEXT:    store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x i32> [[TMP2]], ptr [[VEC_GEP2]], align 4
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x float>, ptr %in
+  %op = fptosi <4 x float> %inv to <4 x i32>
+  call void @llvm.matrix.column.major.store(<4 x i32> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @uitofp_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @uitofp_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x i64>, ptr [[IN:%.*]], align 32
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i64, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x i64>, ptr [[VEC_GEP]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = uitofp <2 x i64> [[COL_LOAD]] to <2 x double>
+; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <2 x i64> [[COL_LOAD1]] to <2 x double>
+; CHECK-NEXT:    store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x i64>, ptr %in
+  %op = uitofp <4 x i64> %inv to <4 x double>
+  call void @llvm.matrix.column.major.store(<4 x double> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @sitofp_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @sitofp_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x i64>, ptr [[IN:%.*]], align 32
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i64, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x i64>, ptr [[VEC_GEP]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <2 x i64> [[COL_LOAD]] to <2 x double>
+; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <2 x i64> [[COL_LOAD1]] to <2 x double>
+; CHECK-NEXT:    store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x i64>, ptr %in
+  %op = sitofp <4 x i64> %inv to <4 x double>
+  call void @llvm.matrix.column.major.store(<4 x double> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @fptrunc_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @fptrunc_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[IN:%.*]], align 32
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = fptrunc nnan <2 x double> [[COL_LOAD]] to <2 x float>
+; CHECK-NEXT:    [[TMP2:%.*]] = fptrunc nnan <2 x double> [[COL_LOAD1]] to <2 x float>
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr float, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr [[VEC_GEP2]], align 4
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x double>, ptr %in
+  %op = fptrunc nnan <4 x double> %inv to <4 x float>
+  call void @llvm.matrix.column.major.store(<4 x float> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @fpext_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @fpext_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[IN:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = fpext <2 x float> [[COL_LOAD]] to <2 x double>
+; CHECK-NEXT:    [[TMP2:%.*]] = fpext <2 x float> [[COL_LOAD1]] to <2 x double>
+; CHECK-NEXT:    store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x float>, ptr %in
+  %op = fpext <4 x float> %inv to <4 x double>
+  call void @llvm.matrix.column.major.store(<4 x double> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @bitcast_2x2_v4f64_to_v4i64(ptr %in, ptr %out) {
+; CHECK-LABEL: @bitcast_2x2_v4f64_to_v4i64(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[IN:%.*]], align 32
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[COL_LOAD]] to <2 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[COL_LOAD1]] to <2 x i64>
+; CHECK-NEXT:    store <2 x i64> [[TMP1]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr i64, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x i64> [[TMP2]], ptr [[VEC_GEP2]], align 4
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x double>, ptr %in
+  %op = bitcast <4 x double> %inv to <4 x i64>
+  call void @llvm.matrix.column.major.store(<4 x i64> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @bitcast_2x2_v4f64_to_v8i32(ptr %in, ptr %out) {
+; CHECK-LABEL: @bitcast_2x2_v4f64_to_v8i32(
+; CHECK-NEXT:    [[INV:%.*]] = load <4 x double>, ptr [[IN:%.*]], align 32
+; CHECK-NEXT:    [[OP:%.*]] = bitcast <4 x double> [[INV]] to <8 x i32>
+; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <8 x i32> [[OP]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[SPLIT1:%.*]] = shufflevector <8 x i32> [[OP]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    store <4 x i32> [[SPLIT]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i32, ptr [[OUT]], i64 4
+; CHECK-NEXT:    store <4 x i32> [[SPLIT1]], ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x double>, ptr %in
+  %op = bitcast <4 x double> %inv to <8 x i32>
+  call void @llvm.matrix.column.major.store(<8 x i32> %op, ptr %out, i64 4, i1 false, i32 4, i32 2)
+  ret void
+}
+
+define void @bitcast_2x2_i256_to_v4i64(ptr %in, ptr %out) {
+; CHECK-LABEL: @bitcast_2x2_i256_to_v4i64(
+; CHECK-NEXT:    [[INV:%.*]] = load i256, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[OP:%.*]] = bitcast i256 [[INV]] to <4 x double>
+; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <4 x double> [[OP]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT1:%.*]] = shufflevector <4 x double> [[OP]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    store <2 x double> [[SPLIT]], ptr [[OUT:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x double> [[SPLIT1]], ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    ret void
+;
+  %inv = load i256, ptr %in
+  %op = bitcast i256 %inv to <4 x double>
+  call void @llvm.matrix.column.major.store(<4 x double> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @bitcast_2x2_4i64_to_i256(ptr %in, ptr %out) {
+; CHECK-LABEL: @bitcast_2x2_4i64_to_i256(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[IN:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> [[COL_LOAD1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[OP:%.*]] = bitcast <4 x double> [[TMP1]] to i256
+; CHECK-NEXT:    store i256 [[OP]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  %inv = call <4 x double> @llvm.matrix.column.major.load(ptr %in, i64 2, i1 false, i32 2, i32 2)
+  %op = bitcast <4 x double> %inv to i256
+  store i256 %op, ptr %out
+  ret void
+}
