-
Notifications
You must be signed in to change notification settings - Fork 51
/
Target.hs
215 lines (191 loc) · 7.62 KB
/
Target.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
{-# LANGUAGE CPP #-}
{-# LANGUAGE EmptyDataDecls #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE TypeFamilies #-}
-- |
-- Module : Data.Array.Accelerate.LLVM.PTX.Target
-- Copyright : [2014..2020] The Accelerate Team
-- License : BSD3
--
-- Maintainer : Trevor L. McDonell <trevor.mcdonell@gmail.com>
-- Stability : experimental
-- Portability : non-portable (GHC extensions)
--
module Data.Array.Accelerate.LLVM.PTX.Target (
module Data.Array.Accelerate.LLVM.Target,
module Data.Array.Accelerate.LLVM.PTX.Target,
) where
-- llvm-hs
import LLVM.AST.AddrSpace
import LLVM.AST.DataLayout
import LLVM.Target hiding ( Target )
import qualified LLVM.Target as LLVM
import qualified LLVM.Relocation as R
import qualified LLVM.CodeModel as CM
import qualified LLVM.CodeGenOpt as CGO
-- accelerate
import Data.Array.Accelerate.Error
import Data.Array.Accelerate.LLVM.Extra
import Data.Array.Accelerate.LLVM.Target
import Data.Array.Accelerate.LLVM.PTX.Array.Table ( MemoryTable )
import Data.Array.Accelerate.LLVM.PTX.Context ( Context, deviceProperties, deviceName )
import Data.Array.Accelerate.LLVM.PTX.Execute.Stream.Reservoir ( Reservoir )
import Data.Array.Accelerate.LLVM.PTX.Link.Cache ( KernelTable )
-- CUDA
import Foreign.CUDA.Analysis.Device ( DeviceProperties, Compute(..), computeCapability )
-- standard library
import Data.ByteString ( ByteString )
import Data.ByteString.Short ( ShortByteString )
import Data.Primitive.ByteArray
import Data.String
import Debug.Trace
import Foreign.C.String
import Foreign.Ptr
import System.IO.Unsafe
import Text.Printf
import qualified Data.Map as Map
import qualified Data.Set as Set
-- | The PTX execution target for NVIDIA GPUs.
--
-- The execution target carries state specific for the current execution
-- context. The data here --- device memory and execution streams --- are
-- implicitly tied to this CUDA execution context.
--
-- Don't store anything here that is independent of the context, for example
-- state related to [persistent] kernel caching should _not_ go here.
--
data PTX = PTX {
    ptxContext         :: {-# UNPACK #-} !Context       -- ^ the CUDA execution context this state belongs to
  , ptxMemoryTable     :: {-# UNPACK #-} !MemoryTable   -- ^ device-memory table tied to this context
  , ptxKernelTable     :: {-# UNPACK #-} !KernelTable   -- ^ kernel (link) cache for this context
  , ptxStreamReservoir :: {-# UNPACK #-} !Reservoir     -- ^ pool of reusable execution streams
  }
-- When compiling via libNVVM the data layout must be left unset (the
-- library supplies its own); otherwise we provide the NVPTX layout below.
--
instance Target PTX where
  targetTriple = Just ptxTargetTriple
#if ACCELERATE_USE_NVVM
  targetDataLayout = Nothing -- see note: [NVVM and target data layout]
#else
  targetDataLayout = Just ptxDataLayout
#endif
-- | The properties of the device that the given PTX execution state is
-- bound to.
--
ptxDeviceProperties :: PTX -> DeviceProperties
ptxDeviceProperties ptx = deviceProperties (ptxContext ptx)
-- | The name of the device of the given execution context, as a C string.
--
-- NOTE(review): the stored 'ByteArray' contents are cast directly to
-- 'CString', which presumes the name is pinned and NUL-terminated — confirm
-- against 'deviceName' in "Data.Array.Accelerate.LLVM.PTX.Context".
--
ptxDeviceName :: PTX -> CString
ptxDeviceName ptx = castPtr (byteArrayContents (deviceName (ptxContext ptx)))
-- | A description of the various data layout properties that may be used during
-- optimisation. For CUDA the following data layouts are supported:
--
-- 32-bit:
-- e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64
--
-- 64-bit:
-- e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64
--
-- Thus, only the size of the pointer layout changes depending on the host
-- architecture.
--
ptxDataLayout :: DataLayout
ptxDataLayout = DataLayout
  { endianness      = LittleEndian
  , mangling        = Nothing
  , aggregateLayout = AlignmentInfo 0 64
  , stackAlignment  = Nothing
  , pointerLayouts  = Map.singleton (AddrSpace 0) (wordSize, AlignmentInfo wordSize wordSize)
  , typeLayouts     = Map.fromList (intLayouts ++ vecLayouts ++ fltLayouts)
  , nativeSizes     = Just (Set.fromList [16,32,64])
  }
  where
    -- pointer width follows the host architecture (see haddock above)
    wordSize   = bitSize (undefined :: Int)
    intLayouts = ((IntegerAlign, 1), AlignmentInfo 8 8)
               : [ ((IntegerAlign, i), AlignmentInfo i i) | i <- [8,16,32,64] ]
    vecLayouts = [ ((VectorAlign,  v), AlignmentInfo v v) | v <- [16,32,64,128] ]
    fltLayouts = [ ((FloatAlign,   f), AlignmentInfo f f) | f <- [32,64] ]
-- | String that describes the target host.
--
ptxTargetTriple :: HasCallStack => ShortByteString
ptxTargetTriple
  | wordBits == 32 = "nvptx-nvidia-cuda"
  | wordBits == 64 = "nvptx64-nvidia-cuda"
  | otherwise      = internalError "I don't know what architecture I am"
  where
    wordBits = bitSize (undefined :: Int)
-- | Bracket creation and destruction of the NVVM TargetMachine.
--
withPTXTargetMachine
    :: HasCallStack
    => DeviceProperties
    -> (TargetMachine -> IO a)
    -> IO a
withPTXTargetMachine dev k =
  withTargetOptions $ \options ->
    withTargetMachine
      ptxTarget
      ptxTargetTriple
      sm                                    -- CPU
      (Map.singleton (CPUFeature isa) True) -- CPU features
      options                               -- target options
      R.Default                             -- relocation model
      CM.Default                            -- code model
      CGO.Default                           -- optimisation level
      k
  where
    -- pick the SM architecture / PTX ISA for this device's capability
    (sm, isa) = ptxTargetVersion (computeCapability dev)
-- Compile using the earliest version of the SM target PTX ISA supported by
-- the given compute device and this version of LLVM.
--
-- Note that we require at least ptx40 for some libnvvm device functions.
--
-- See table NVPTX supported processors:
--
-- https://github.com/llvm/llvm-project/blob/master/lib/Target/NVPTX/NVPTX.td
--
-- PTX ISA verison history:
--
-- https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes
--
-- The guards compare the capability lexicographically as a (major, minor)
-- pair. The previous form @m >= 7 && n >= 5@ was not monotonic: a device of
-- capability 9.0 (n = 0) skipped the sm_75/sm_72 cases and fell through to
-- sm_70, while the lower capability 8.6 selected sm_75.
--
ptxTargetVersion :: Compute -> (ByteString, ByteString)
ptxTargetVersion compute@(Compute m n)
#if MIN_VERSION_llvm_hs(8,0,0)
  | (m,n) >= (7,5)   = ("sm_75", "ptx63")
#endif
#if MIN_VERSION_llvm_hs(7,0,0)
  | (m,n) >= (7,2)   = ("sm_72", "ptx61")
#endif
#if MIN_VERSION_llvm_hs(6,0,0)
  | m >= 7           = ("sm_70", "ptx60")
#endif
  | m > 6            = ("sm_62", "ptx50")   -- fallthrough
  --
  | m == 6 && n == 2 = ("sm_62", "ptx50")
  | m == 6 && n == 1 = ("sm_61", "ptx50")
  | m == 6           = ("sm_60", "ptx50")
  | m == 5 && n == 3 = ("sm_53", "ptx42")
  | m == 5 && n == 2 = ("sm_52", "ptx41")
  | m == 5           = ("sm_50", "ptx40")
  | m == 3 && n == 7 = ("sm_37", "ptx41")
  | m == 3 && n == 5 = ("sm_35", "ptx40")
  | m == 3 && n == 2 = ("sm_32", "ptx40")
  | m == 3           = ("sm_30", "ptx40")
  | m == 2 && n == 1 = ("sm_21", "ptx40")
  | m == 2           = ("sm_20", "ptx40")
  --
  | otherwise
  = trace warning (fromString (printf "sm_%d%d" m n), "ptx40")
  where
    warning = unlines [ "*** Warning: Unhandled CUDA device compute capability: " ++ show compute
                      , "*** Please submit a bug report at https://github.com/AccelerateHS/accelerate/issues" ]
-- | The NVPTX target for this host.
--
-- The top-level 'unsafePerformIO' is so that 'initializeAllTargets' is run once
-- per program execution (although that might not be necessary?)
--
{-# NOINLINE ptxTarget #-}
ptxTarget :: LLVM.Target
ptxTarget = unsafePerformIO $ do
  initializeAllTargets
  (target, _triple) <- lookupTarget Nothing ptxTargetTriple
  return target