/
Execute.hs
595 lines (531 loc) · 18 KB
/
Execute.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
{-# LANGUAGE CPP #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
-- |
-- Module : Data.Array.Accelerate.LLVM.PTX.Execute
-- Copyright : [2014..2017] Trevor L. McDonell
-- [2014..2014] Vinod Grover (NVIDIA Corporation)
-- License : BSD3
--
-- Maintainer : Trevor L. McDonell <tmcdonell@cse.unsw.edu.au>
-- Stability : experimental
-- Portability : non-portable (GHC extensions)
--
module Data.Array.Accelerate.LLVM.PTX.Execute (
executeAcc, executeAfun,
executeOpenAcc,
) where
-- accelerate
import Data.Array.Accelerate.Analysis.Match
import Data.Array.Accelerate.Array.Sugar
import Data.Array.Accelerate.Error
import Data.Array.Accelerate.Lifetime
import Data.Array.Accelerate.LLVM.Analysis.Match
import Data.Array.Accelerate.LLVM.Execute
import Data.Array.Accelerate.LLVM.State
import Data.Array.Accelerate.LLVM.PTX.Analysis.Launch ( multipleOf )
import Data.Array.Accelerate.LLVM.PTX.Array.Data
import Data.Array.Accelerate.LLVM.PTX.Array.Prim ( memsetArrayAsync )
import Data.Array.Accelerate.LLVM.PTX.Execute.Async
import Data.Array.Accelerate.LLVM.PTX.Execute.Environment
import Data.Array.Accelerate.LLVM.PTX.Execute.Marshal
import Data.Array.Accelerate.LLVM.PTX.Link
import Data.Array.Accelerate.LLVM.PTX.Target
import qualified Data.Array.Accelerate.LLVM.PTX.Debug as Debug
import Data.Range.Range ( Range(..) )
import Control.Parallel.Meta ( runExecutable )
-- cuda
import qualified Foreign.CUDA.Driver as CUDA
-- library
import Control.Monad ( when )
import Control.Monad.State ( gets, liftIO )
import Data.ByteString.Short.Char8 ( ShortByteString, unpack )
import Data.Int ( Int32 )
import Data.List ( find )
import Data.Maybe ( fromMaybe )
import Data.Word ( Word32 )
import Text.Printf ( printf )
import Prelude hiding ( exp, map, sum, scanl, scanr )
import qualified Prelude as P
-- Array expression evaluation
-- ---------------------------
-- Computations are evaluated by traversing the AST bottom up, and for each node
-- distinguishing between three cases:
--
-- 1. If it is a Use node, we return a reference to the array data. The data
-- will already have been copied to the device during compilation of the
-- kernels.
--
-- 2. If it is a non-skeleton node, such as a let binding or shape conversion,
-- then execute directly by updating the environment or similar.
--
-- 3. If it is a skeleton node, then we need to execute the generated LLVM
-- code.
--
-- Dispatch table mapping each skeleton operation to its PTX execution driver.
-- Simple one-kernel skeletons all share 'simpleOp'; folds and scans have
-- dedicated drivers (defined below) that may launch several kernels.
instance Execute PTX where
  map           = simpleOp
  generate      = simpleOp
  transform     = simpleOp
  backpermute   = simpleOp
  fold          = foldOp
  fold1         = fold1Op
  foldSeg       = foldSegOp
  fold1Seg      = foldSegOp
  scanl         = scanOp
  scanl1        = scan1Op
  scanl'        = scan'Op
  scanr         = scanOp
  scanr1        = scan1Op
  scanr'        = scan'Op
  permute       = permuteOp
  stencil1      = stencil1Op
  stencil2      = stencil2Op
-- Skeleton implementation
-- -----------------------
-- Simple kernels just need to know the shape of the output array
--
-- Execute a simple skeleton: allocate an output array of the given shape and
-- run the executable's (single) kernel over the linear index space of that
-- shape. It is an internal error for the executable to contain no kernels.
simpleOp
    :: (Shape sh, Elt e)
    => ExecutableR PTX
    -> Gamma aenv
    -> Aval aenv
    -> Stream
    -> sh
    -> LLVM PTX (Array sh e)
simpleOp exe gamma aenv stream sh = withExecutable exe $ \ptxExecutable -> do
  result <- allocateRemote sh
  target <- gets llvmTarget
  let kernel =
        case functionTable ptxExecutable of
          f : _ -> f
          []    -> $internalError "simpleOp" "no kernels found"
  liftIO $ executeOp target kernel gamma aenv stream (IE 0 (size sh)) result
  return result
-- Execute the kernel with the given name from the executable, writing into a
-- freshly allocated array of the given shape.
simpleNamed
    :: (Shape sh, Elt e)
    => ShortByteString
    -> ExecutableR PTX
    -> Gamma aenv
    -> Aval aenv
    -> Stream
    -> sh
    -> LLVM PTX (Array sh e)
simpleNamed fun exe gamma aenv stream sh = withExecutable exe $ \ptxExecutable -> do
  target <- gets llvmTarget
  result <- allocateRemote sh
  let kernel = ptxExecutable !# fun
  liftIO $ executeOp target kernel gamma aenv stream (IE 0 (size sh)) result
  return result
-- There are two flavours of fold operation:
--
-- 1. If we are collapsing to a single value, then multiple thread blocks are
-- working together. Since thread blocks synchronise with each other via
-- kernel launches, each block computes a partial sum and the kernel is
-- launched recursively until the final value is reached.
--
-- 2. If this is a multidimensional reduction, then each inner dimension is
-- handled by a single thread block, so no global communication is
-- necessary. Furthermore, there are two kernel flavours: each innermost dimension
-- can be cooperatively reduced by (a) a thread warp; or (b) a thread
-- block. Currently we always use the first, but require benchmarking to
-- determine when to select each.
--
-- Reduction along the innermost dimension without a seed element. The inner
-- dimension must be non-empty; an entirely empty array (with some other
-- dimension zero) short-circuits to an allocation of the reduced shape.
fold1Op
    :: (Shape sh, Elt e)
    => ExecutableR PTX
    -> Gamma aenv
    -> Aval aenv
    -> Stream
    -> (sh :. Int)
    -> LLVM PTX (Array sh e)
fold1Op exe gamma aenv stream sh@(sx :. sz)
  = $boundsCheck "fold1" "empty array" (sz > 0)
  $ if size sh == 0
      then allocateRemote sx  -- empty, but possibly with one or more non-zero dimensions
      else foldCore exe gamma aenv stream sh
-- Reduction along the innermost dimension with a seed element. Reducing an
-- empty array just fills the output with the seed via the "generate" kernel,
-- clamping every dimension to at least one element.
foldOp
    :: (Shape sh, Elt e)
    => ExecutableR PTX
    -> Gamma aenv
    -> Aval aenv
    -> Stream
    -> (sh :. Int)
    -> LLVM PTX (Array sh e)
foldOp exe gamma aenv stream sh@(sx :. _)
  | size sh == 0 = simpleNamed "generate" exe gamma aenv stream (listToShape (P.map (max 1) (shapeToList sx)))
  | otherwise    = foldCore exe gamma aenv stream sh
-- Dispatch a non-empty reduction: a full reduction to a single scalar uses the
-- multi-kernel 'foldAllOp', while a multidimensional reduction is handled one
-- inner dimension per thread block by 'foldDimOp'.
foldCore
    :: forall aenv sh e. (Shape sh, Elt e)
    => ExecutableR PTX
    -> Gamma aenv
    -> Aval aenv
    -> Stream
    -> (sh :. Int)
    -> LLVM PTX (Array sh e)
foldCore exe gamma aenv stream sh =
  case matchShapeType (undefined::sh) (undefined::Z) of
    Just Refl -> foldAllOp exe gamma aenv stream sh
    Nothing   -> foldDimOp exe gamma aenv stream sh
-- | Reduce a vector to a single scalar.
--
-- If the input fits in one thread block, a single launch of the "foldAllS"
-- kernel computes the result directly. Otherwise "foldAllM1" produces one
-- partial result per thread block, and "foldAllM2" is launched repeatedly,
-- shrinking the partials by a factor of the block size each round, until a
-- single element remains.
foldAllOp
    :: forall aenv e. Elt e
    => ExecutableR PTX
    -> Gamma aenv
    -> Aval aenv
    -> Stream
    -> DIM1
    -> LLVM PTX (Scalar e)
foldAllOp exe gamma aenv stream (Z :. n) = withExecutable exe $ \ptxExecutable -> do
  ptx <- gets llvmTarget
  let
      ks  = ptxExecutable !# "foldAllS"    -- single-block reduction
      km1 = ptxExecutable !# "foldAllM1"   -- first multi-block pass (reads delayed input)
      km2 = ptxExecutable !# "foldAllM2"   -- recursive reduction of partial sums
  --
  if kernelThreadBlocks ks n == 1
    then do
      -- The array is small enough that we can compute it in a single step
      out <- allocateRemote Z
      liftIO $ executeOp ptx ks gamma aenv stream (IE 0 n) out
      return out

    else do
      -- Multi-kernel reduction to a single element. The first kernel integrates
      -- any delayed elements, and the second is called recursively until
      -- reaching a single element.
      let
          rec :: Vector e -> LLVM PTX (Scalar e)
          rec tmp@(Array ((),m) adata)
            -- A one-element vector already holds the final answer; reinterpret
            -- its payload as a scalar without copying.
            | m <= 1    = return $ Array () adata
            | otherwise = do
                -- One partial result per thread block of km2.
                let s = m `multipleOf` kernelThreadBlockSize km2
                out <- allocateRemote (Z :. s)
                liftIO $ executeOp ptx km2 gamma aenv stream (IE 0 s) (tmp, out)
                rec out
      --
      let s = n `multipleOf` kernelThreadBlockSize km1
      tmp <- allocateRemote (Z :. s)
      liftIO $ executeOp ptx km1 gamma aenv stream (IE 0 s) tmp
      rec tmp
-- Multidimensional reduction: each inner dimension is reduced independently,
-- so one launch of the "fold" kernel suffices. A zero-length inner dimension
-- means every output element is just the seed, produced by "generate".
foldDimOp
    :: forall aenv sh e. (Shape sh, Elt e)
    => ExecutableR PTX
    -> Gamma aenv
    -> Aval aenv
    -> Stream
    -> (sh :. Int)
    -> LLVM PTX (Array sh e)
foldDimOp exe gamma aenv stream (sh :. sz) = withExecutable exe $ \ptxExecutable -> do
  let name | sz > 0    = "fold"
           | otherwise = "generate"
  result <- allocateRemote sh
  target <- gets llvmTarget
  liftIO $ executeOp target (ptxExecutable !# name) gamma aenv stream (IE 0 (size sh)) result
  return result
-- | Segmented reduction along the innermost dimension.
--
-- Two kernel flavours exist: one reduces each segment with a full thread
-- block, the other with a single warp. The warp version is chosen when the
-- average segment length (sz `quot` ss) is small relative to the block size.
-- NOTE(review): 'foldseg' and 'foldseg_cta' are mutually dependent bindings
-- in the same let — lazy evaluation makes this well-defined.
foldSegOp
    :: (Shape sh, Elt e)
    => ExecutableR PTX
    -> Gamma aenv
    -> Aval aenv
    -> Stream
    -> (sh :. Int)
    -> (Z :. Int)
    -> LLVM PTX (Array (sh :. Int) e)
foldSegOp exe gamma aenv stream (sh :. sz) (Z :. ss) = withExecutable exe $ \ptxExecutable -> do
  let
      n       = ss - 1             -- the segments array has been 'scanl (+) 0'-ed, so it holds one extra element
      m       = size sh * n        -- total number of output elements (one per segment per inner row)
      foldseg = if (sz`quot`ss) < (2 * kernelThreadBlockSize foldseg_cta)
                  then foldseg_warp
                  else foldseg_cta
      --
      foldseg_cta  = ptxExecutable !# "foldSeg_block"
      foldseg_warp = ptxExecutable !# "foldSeg_warp"
      -- qinit        = ptxExecutable !# "qinit"
  --
  out <- allocateRemote (sh :. n)
  ptx <- gets llvmTarget
  liftIO $ executeOp ptx foldseg gamma aenv stream (IE 0 m) out
  return out
-- Inclusive-with-seed scan along the innermost dimension: output has one more
-- element than the input per row. Scanning an empty row yields just the seed,
-- produced by the "generate" kernel.
scanOp
    :: (Shape sh, Elt e)
    => ExecutableR PTX
    -> Gamma aenv
    -> Aval aenv
    -> Stream
    -> sh :. Int
    -> LLVM PTX (Array (sh:.Int) e)
scanOp exe gamma aenv stream (sz :. n)
  | n == 0    = simpleNamed "generate" exe gamma aenv stream (sz :. 1)
  | otherwise = scanCore exe gamma aenv stream sz n (n+1)
-- Scan along the innermost dimension without a seed: input and output have the
-- same extent, and the inner dimension must be non-empty.
scan1Op
    :: (Shape sh, Elt e)
    => ExecutableR PTX
    -> Gamma aenv
    -> Aval aenv
    -> Stream
    -> sh :. Int
    -> LLVM PTX (Array (sh:.Int) e)
scan1Op exe gamma aenv stream (sz :. n) =
  $boundsCheck "scan1" "empty array" (n > 0) (scanCore exe gamma aenv stream sz n n)
-- Dispatch a non-empty scan: vectors go through the multi-kernel 'scanAllOp',
-- while multidimensional scans handle one inner row per thread block.
scanCore
    :: forall aenv sh e. (Shape sh, Elt e)
    => ExecutableR PTX
    -> Gamma aenv
    -> Aval aenv
    -> Stream
    -> sh
    -> Int                    -- input size
    -> Int                    -- output size
    -> LLVM PTX (Array (sh:.Int) e)
scanCore exe gamma aenv stream sz n m =
  case matchShapeType (undefined::sh) (undefined::Z) of
    Just Refl -> scanAllOp exe gamma aenv stream n m
    Nothing   -> scanDimOp exe gamma aenv stream sz m
-- | Scan a vector using three kernel phases: (1) independent block-wide scans
-- that also record each block's total, (2) a scan of those per-block totals to
-- produce per-block prefixes, (3) applying each prefix to the corresponding
-- block of partial results. Phases 2 and 3 are skipped for single-block input.
scanAllOp
    :: forall aenv e. Elt e
    => ExecutableR PTX
    -> Gamma aenv
    -> Aval aenv
    -> Stream
    -> Int                    -- input size
    -> Int                    -- output size
    -> LLVM PTX (Vector e)
scanAllOp exe gamma aenv stream n m = withExecutable exe $ \ptxExecutable -> do
  let
      k1  = ptxExecutable !# "scanP1"   -- block-wide scans + per-block totals
      k2  = ptxExecutable !# "scanP2"   -- scan of the per-block totals
      k3  = ptxExecutable !# "scanP3"   -- apply block prefixes to partial results
      --
      c   = kernelThreadBlockSize k1
      s   = n `multipleOf` c            -- number of thread blocks (= per-block totals)
  --
  ptx <- gets llvmTarget
  out <- allocateRemote (Z :. m)
  -- Step 1: Independent thread-block-wide scans of the input. Small arrays
  -- which can be computed by a single thread block will require no
  -- additional work.
  tmp <- allocateRemote (Z :. s)  :: LLVM PTX (Vector e)
  liftIO $ executeOp ptx k1 gamma aenv stream (IE 0 s) (tmp, out)
  -- Step 2: Multi-block reductions need to compute the per-block prefix,
  -- then apply those values to the partial results.
  when (s > 1) $ do
    liftIO $ executeOp ptx k2 gamma aenv stream (IE 0 s)     tmp
    -- The first block needs no prefix, hence s-1 work items; 'i32 c' tells the
    -- kernel the block size used in step 1.
    liftIO $ executeOp ptx k3 gamma aenv stream (IE 0 (s-1)) (tmp, out, i32 c)
  return out
-- Multidimensional scan: each inner row is scanned by a single thread block,
-- so a single launch of the "scan" kernel over the outer shape suffices.
scanDimOp
    :: forall aenv sh e. (Shape sh, Elt e)
    => ExecutableR PTX
    -> Gamma aenv
    -> Aval aenv
    -> Stream
    -> sh
    -> Int
    -> LLVM PTX (Array (sh:.Int) e)
scanDimOp exe gamma aenv stream sz m = withExecutable exe $ \ptxExecutable -> do
  target <- gets llvmTarget
  result <- allocateRemote (sz :. m)
  liftIO $ executeOp target (ptxExecutable !# "scan") gamma aenv stream (IE 0 (size sz)) result
  return result
-- Exclusive scan returning both the scanned array and the final reduction
-- value per row. On empty input the scanned array is empty and the sums are
-- filled with the seed via the "generate" kernel.
scan'Op
    :: forall aenv sh e. (Shape sh, Elt e)
    => ExecutableR PTX
    -> Gamma aenv
    -> Aval aenv
    -> Stream
    -> sh :. Int
    -> LLVM PTX (Array (sh:.Int) e, Array sh e)
scan'Op exe gamma aenv stream sh@(sz :. n)
  | n == 0 = do
      out <- allocateRemote (sz :. 0)
      sum <- simpleNamed "generate" exe gamma aenv stream sz
      return (out, sum)
  | otherwise
  = scan'Core exe gamma aenv stream sh
-- Dispatch a non-empty scan': vectors use the multi-kernel 'scan'AllOp', and
-- multidimensional inputs use one thread block per inner row.
scan'Core
    :: forall aenv sh e. (Shape sh, Elt e)
    => ExecutableR PTX
    -> Gamma aenv
    -> Aval aenv
    -> Stream
    -> sh :. Int
    -> LLVM PTX (Array (sh:.Int) e, Array sh e)
scan'Core exe gamma aenv stream sh =
  case matchShapeType (undefined::sh) (undefined::Z) of
    Just Refl -> scan'AllOp exe gamma aenv stream sh
    Nothing   -> scan'DimOp exe gamma aenv stream sh
-- | Exclusive scan of a vector, also returning the total. Same three-phase
-- structure as 'scanAllOp', except the per-block-totals scan ("scanP2") also
-- emits the final reduction value. For single-block input the lone per-block
-- total in 'tmp' is already the final sum and is reinterpreted as a scalar
-- without copying.
scan'AllOp
    :: forall aenv e. Elt e
    => ExecutableR PTX
    -> Gamma aenv
    -> Aval aenv
    -> Stream
    -> DIM1
    -> LLVM PTX (Vector e, Scalar e)
scan'AllOp exe gamma aenv stream (Z :. n) = withExecutable exe $ \ptxExecutable -> do
  let
      k1  = ptxExecutable !# "scanP1"   -- block-wide scans + per-block totals
      k2  = ptxExecutable !# "scanP2"   -- scan of per-block totals, produces final sum
      k3  = ptxExecutable !# "scanP3"   -- apply block prefixes to partial results
      --
      c   = kernelThreadBlockSize k1
      s   = n `multipleOf` c            -- number of thread blocks
  --
  ptx <- gets llvmTarget
  out <- allocateRemote (Z :. n)
  tmp <- allocateRemote (Z :. s)  :: LLVM PTX (Vector e)
  -- Step 1: independent thread-block-wide scans. Each block stores its partial
  -- sum to a temporary array.
  liftIO $ executeOp ptx k1 gamma aenv stream (IE 0 s) (tmp, out)
  -- If this was a small array that was processed by a single thread block then
  -- we are done, otherwise compute the per-block prefix and apply those values
  -- to the partial results.
  if s == 1
    then case tmp of
           -- Reuse the one-element temporary's payload as the scalar sum.
           Array _ ad -> return (out, Array () ad)
    else do
      sum <- allocateRemote Z
      liftIO $ executeOp ptx k2 gamma aenv stream (IE 0 s)     (tmp, sum)
      liftIO $ executeOp ptx k3 gamma aenv stream (IE 0 (s-1)) (tmp, out, i32 c)
      return (out, sum)
-- Multidimensional scan': one thread block per inner row computes both the
-- scanned row and its total, so a single launch of "scan" suffices.
scan'DimOp
    :: forall aenv sh e. (Shape sh, Elt e)
    => ExecutableR PTX
    -> Gamma aenv
    -> Aval aenv
    -> Stream
    -> sh :. Int
    -> LLVM PTX (Array (sh:.Int) e, Array sh e)
scan'DimOp exe gamma aenv stream sh@(sz :. _) = withExecutable exe $ \ptxExecutable -> do
  target <- gets llvmTarget
  out    <- allocateRemote sh
  sum    <- allocateRemote sz
  liftIO $ executeOp target (ptxExecutable !# "scan") gamma aenv stream (IE 0 (size sz)) (out,sum)
  return (out,sum)
-- | Forward permutation. The defaults array is either mutated in place or
-- cloned first (asynchronously, on this stream), depending on the 'inplace'
-- flag chosen by the caller. The compiled executable contains exactly one of
-- two combination strategies, selected here by kernel name:
--
--   * "permute_rmw"   — atomic read-modify-write updates
--   * "permute_mutex" — a spin-lock word per output element, zero-initialised
--                       here via an async memset before launch
permuteOp
    :: (Shape sh, Shape sh', Elt e)
    => ExecutableR PTX
    -> Gamma aenv
    -> Aval aenv
    -> Stream
    -> Bool                   -- ^ may the defaults array be updated in-place?
    -> sh
    -> Array sh' e
    -> LLVM PTX (Array sh' e)
permuteOp exe gamma aenv stream inplace shIn dfs = withExecutable exe $ \ptxExecutable -> do
  let
      n = size shIn           -- number of source elements to scatter
      m = size (shape dfs)    -- number of destination elements
      kernel = case functionTable ptxExecutable of
                 k:_ -> k
                 _   -> $internalError "permute" "no kernels found"
  --
  ptx <- gets llvmTarget
  out <- if inplace
           then return dfs
           else cloneArrayAsync stream dfs
  --
  case kernelName kernel of
    "permute_rmw"   -> liftIO $ executeOp ptx kernel gamma aenv stream (IE 0 n) out
    "permute_mutex" -> do
      -- One lock word per destination element, initialised to zero (unlocked).
      barrier@(Array _ ad) <- allocateRemote (Z :. m)  :: LLVM PTX (Vector Word32)
      memsetArrayAsync stream m 0 ad
      liftIO $ executeOp ptx kernel gamma aenv stream (IE 0 n) (out, barrier)
    _               -> $internalError "permute" "unexpected kernel image"
  --
  return out
-- Using the defaulting instances for stencil operations (for now).
--
-- One-source stencil, via the defaulting instance: a plain elementwise kernel
-- over the shape of the argument array.
stencil1Op
    :: (Shape sh, Elt b)
    => ExecutableR PTX
    -> Gamma aenv
    -> Aval aenv
    -> Stream
    -> Array sh a
    -> LLVM PTX (Array sh b)
stencil1Op exe gamma aenv stream =
  simpleOp exe gamma aenv stream . shape
-- Two-source stencil, via the defaulting instance: an elementwise kernel over
-- the intersection of the two argument shapes.
stencil2Op
    :: (Shape sh, Elt c)
    => ExecutableR PTX
    -> Gamma aenv
    -> Aval aenv
    -> Stream
    -> Array sh a
    -> Array sh b
    -> LLVM PTX (Array sh c)
stencil2Op exe gamma aenv stream arr brr =
  let sh = intersect (shape arr) (shape brr)
  in  simpleOp exe gamma aenv stream sh
-- Skeleton execution
-- ------------------
-- TODO: Calculate this from the device properties, say [a multiple of] the
-- maximum number of in-flight threads that the device supports.
--
-- Default number of elements processed per thread ("PPT") when partitioning
-- the index range for execution. See the TODO above: this should eventually
-- be derived from the device properties rather than hard-coded.
defaultPPT :: Int
defaultPPT = 32768
-- Narrow a host-side 'Int' to the 32-bit integer type expected as a kernel
-- argument.
{-# INLINE i32 #-}
i32 :: Int -> Int32
i32 x = fromIntegral x
-- | Retrieve the named kernel
--
-- Retrieve the named kernel from the function table, or raise an internal
-- error if no kernel of that name was compiled into the executable.
(!#) :: FunctionTable -> ShortByteString -> Kernel
(!#) exe name =
  case lookupKernel name exe of
    Just kernel -> kernel
    Nothing     -> $internalError "lookupFunction" ("function not found: " ++ unpack name)
-- Linear search of the function table for a kernel with the given name.
lookupKernel :: ShortByteString -> FunctionTable -> Maybe Kernel
lookupKernel name ptxExecutable = find matches (functionTable ptxExecutable)
  where
    matches k = kernelName k == name
-- Execute the function implementing this kernel.
--
-- | Execute the function implementing this kernel over the given index range.
--
-- The range is partitioned by the target's scheduler ('fillP', brought into
-- scope by the PTX{..} record wildcard); each piece is marshalled as
-- (start, end, user args, environment) and launched on the stream.
executeOp
    :: Marshalable args
    => PTX
    -> Kernel
    -> Gamma aenv
    -> Aval aenv
    -> Stream
    -> Range
    -> args
    -> IO ()
executeOp ptx@PTX{..} kernel@Kernel{..} gamma aenv stream r args =
  runExecutable fillP kernelName defaultPPT r $ \start end _ -> do
    -- Kernel ABI: 32-bit start/end indices first, then the user arguments,
    -- then the free-variable environment captured by gamma/aenv.
    argv <- marshal ptx stream (i32 start, i32 end, args, (gamma,aenv))
    launch kernel stream (end-start) argv
-- Execute a device function with the given thread configuration and function
-- parameters.
--
-- | Launch a device function with the kernel's recorded thread configuration
-- (1-D grid and block) and the given function parameters. A launch over zero
-- elements is a no-op. Execution time is traced when the relevant debug flags
-- are enabled.
launch :: Kernel -> Stream -> Int -> [CUDA.FunParam] -> IO ()
launch Kernel{..} stream n args =
  when (n > 0) $
  withLifetime stream $ \st ->
    Debug.monitorProcTime query msg (Just st) $
      CUDA.launchKernel kernelFun grid cta smem (Just st) args
  where
    cta   = (kernelThreadBlockSize, 1, 1)
    grid  = (kernelThreadBlocks n, 1, 1)
    smem  = kernelSharedMemBytes          -- dynamically allocated shared memory per block

    -- Debugging/monitoring support
    query = if Debug.monitoringIsEnabled
              then return True
              else Debug.getFlag Debug.dump_exec

    fst3 (x,_,_)    = x
    msg wall cpu gpu = do
      Debug.addProcessorTime Debug.PTX gpu
      Debug.traceIO Debug.dump_exec $
        printf "exec: %s <<< %d, %d, %d >>> %s"
               (unpack kernelName) (fst3 grid) (fst3 cta) smem (Debug.elapsed wall cpu gpu)