- // RUN: mlir-opt %s --test-transform-dialect-interpreter -canonicalize -split-input-file | FileCheck %s
+ // RUN: mlir-opt %s --test-transform-dialect-interpreter -canonicalize -cse -split-input-file | FileCheck %s
// Offset per thread:
// CHECK-DAG: affine_map<(d0)[s0] -> (d0 * (s0 ceildiv 10))>
@@ -22,7 +22,7 @@ module {
// CHECK: %[[RES:.*]] = linalg.matmul
// CHECK-SAME: ins(%[[tA]], %[[tB]] : tensor<?x?xf32>, tensor<?x?xf32>)
// CHECK-SAME: outs(%[[tC]] : tensor<?x?xf32>) -> tensor<?x?xf32>
- // CHECK-NEXT : scf.foreach_thread.perform_concurrently {
+ // CHECK: scf.foreach_thread.perform_concurrently {
// CHECK-NEXT: tensor.parallel_insert_slice %[[RES]] into %[[C]]{{.*}} :
// CHECK-SAME: tensor<?x?xf32> into tensor<?x?xf32>
// CHECK-NEXT: }
@@ -65,11 +65,9 @@ func.func @matmul_static(%A: tensor<100x200xf32>, %B: tensor<200x300xf32>, %C: t
// CHECK-NOT: affine.max
// CHECK: %[[LB0:.+]] = affine.apply #[[$map2]](%[[IV0]])
// CHECK: %[[LB1:.+]] = affine.apply #[[$map3]](%[[IV1]])
- // CHECK: %[[LB0_1:.+]] = affine.apply #[[$map2]](%[[IV0]])
- // CHECK: %[[LB1_1:.+]] = affine.apply #[[$map3]](%[[IV1]])
// CHECK: %[[tA:.+]] = tensor.extract_slice %[[A]][%[[LB0]], 0] [10, 200] [1, 1] :
// CHECK: %[[tB:.+]] = tensor.extract_slice %[[B]][0, %[[LB1]]] [200, %[[TS]]] [1, 1] :
- // CHECK: %[[tC:.+]] = tensor.extract_slice %[[C]][%[[LB0_1]], %[[LB1_1]]] [10, %[[TS]]] [1, 1] :
+ // CHECK: %[[tC:.+]] = tensor.extract_slice %[[C]][%[[LB0]], %[[LB1]]] [10, %[[TS]]] [1, 1] :
// CHECK: linalg.matmul
// CHECK: scf.foreach_thread.perform_concurrently
// CHECK-NEXT: tensor.parallel_insert_slice
@@ -106,17 +104,13 @@ func.func @matmul_tile_size_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C
// CHECK: %[[N:.+]] = tensor.dim %[[B]], %c1 :
// CHECK: %[[NT0:.+]] = affine.apply #map0()[%[[M]]]
// CHECK: %[[NT1:.+]] = affine.apply #map1()[%[[N]]]
- // CHECK: %[[M:.+]] = tensor.dim %[[A]], %c0 :
- // CHECK: %[[N:.+]] = tensor.dim %[[B]], %c1 :
// CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]])
// CHECK: %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]]
// CHECK: %[[TS1:.+]] = affine.min #[[$map4]](%[[IV1]])[%[[N]]]
// CHECK: %[[LB0:.+]] = affine.apply #[[$map5]](%[[IV0]])
// CHECK tensor.extract_slice %[[A]]
// CHECK: %[[LB1:.+]] = affine.apply #[[$map6]](%[[IV1]])
// CHECK tensor.extract_slice %[[B]]
- // CHECK: %[[LB0:.+]] = affine.apply #[[$map5]](%[[IV0]])
- // CHECK: %[[LB1:.+]] = affine.apply #[[$map6]](%[[IV1]])
// CHECK tensor.extract_slice %[[C]]
// CHECK: linalg.matmul
// CHECK: scf.foreach_thread.perform_concurrently
@@ -156,11 +150,9 @@ func.func @matmul_tile_size_static(%A: tensor<100x200xf32>, %B: tensor<200x300xf
// CHECK-NOT: affine.min
// CHECK: %[[LB0:.+]] = affine.apply #[[$map2]](%[[IV0]])
// CHECK: %[[LB1:.+]] = affine.apply #[[$map3]](%[[IV1]])
- // CHECK: %[[LB0_1:.+]] = affine.apply #[[$map2]](%[[IV0]])
- // CHECK: %[[LB1_1:.+]] = affine.apply #[[$map3]](%[[IV1]])
// CHECK: %[[tA:.+]] = tensor.extract_slice %[[A]][%[[LB0]], 0] [10, 200] [1, 1] :
// CHECK: %[[tB:.+]] = tensor.extract_slice %[[B]][0, %[[LB1]]] [200, %[[TS]]] [1, 1] :
- // CHECK: %[[tC:.+]] = tensor.extract_slice %[[C]][%[[LB0_1]], %[[LB1_1]]] [10, %[[TS]]] [1, 1] :
+ // CHECK: %[[tC:.+]] = tensor.extract_slice %[[C]][%[[LB0]], %[[LB1]]] [10, %[[TS]]] [1, 1] :
// CHECK: linalg.matmul
// CHECK: scf.foreach_thread.perform_concurrently
// CHECK-NEXT: tensor.parallel_insert_slice
@@ -177,3 +169,37 @@ transform.with_pdl_patterns {
%1:2 = transform.structured.tile_to_foreach_thread_op %0 tile_sizes [10, 21]
}
}
+
+ // -----
+
+ module {
+   func.func @extract_source(%A: tensor<4xf32>, %B: tensor<16xf32>) -> tensor<4xf32> {
+     %B1 = tensor.extract_slice %B[10] [4] [1] : tensor<16xf32> to tensor<4xf32>
+     %result = linalg.generic {indexing_maps = [
+       affine_map<(d0) -> (d0)>,affine_map<(d0) -> (d0)>],
+       iterator_types = ["parallel"]}
+       ins(%A : tensor<4xf32>) outs(%B1 : tensor<4xf32>) {
+     ^bb0(%arg3: f32, %arg4: f32):  // no predecessors
+       %2 = arith.addf %arg3, %arg3 : f32
+       linalg.yield %2 : f32
+     } -> tensor<4xf32>
+     return %result : tensor<4xf32>
+   }
+
+   transform.with_pdl_patterns {
+   ^bb0(%arg0: !pdl.operation):
+     transform.sequence %arg0 failures(propagate) {
+     ^bb1(%arg1: !pdl.operation):
+       %0 = transform.structured.match ops{["linalg.generic"]} in %arg1
+       %1:2 = transform.structured.tile_to_foreach_thread_op %0 num_threads [2] (mapped to dims [0])
+     }
+   }
+ }
+ // CHECK-DAG: #[[$map0:.+]] = affine_map<(d0) -> (d0 * 2)>
+
+ // CHECK-LABEL: extract_source(
+ // CHECK: %[[C2:.*]] = arith.constant 2 : index
+ // CHECK: scf.foreach_thread (%[[ARG:.*]]) in (%[[C2]]) -> (tensor<4xf32>) {
+ // CHECK: %[[OFF:.*]] = affine.apply #[[$map0]](%[[ARG]])
+ // CHECK: scf.foreach_thread.perform_concurrently {
+ // CHECK: tensor.parallel_insert_slice %{{.*}} into %{{.*}}[%[[OFF]]] [2] [1] : tensor<2xf32> into tensor<4xf32>