Skip to content

Commit 4753a4e

Browse files
Jisheng Zhao authored and jplehr committed
[OpenMP] asynchronous memory copy support
This patch implements the asynchronous memory-copy routines with depend objects specified in Version 5.1 of the OpenMP Application Programming Interface. In brief, the routines omp_target_memcpy_async and omp_target_memcpy_rect_async perform asynchronous (nonblocking) memory copies between any combination of host and device pointers. The basic idea is to create implicit tasks that carry the memory copy calls and handle the dependencies specified by the depend objects. The implicit tasks are executed by hidden helper threads in the OpenMP runtime. Reviewed By: jdoerfert, tianshilei1992 Committed By: jplehr Differential Revision: https://reviews.llvm.org/D136103
1 parent f2696e4 commit 4753a4e

File tree

8 files changed

+540
-24
lines changed

8 files changed

+540
-24
lines changed

openmp/libomptarget/include/interop.h

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -116,30 +116,6 @@ omp_get_interop_type_desc(const omp_interop_t, omp_interop_property_t);
116116
extern const char *__KAI_KMPC_CONVENTION
117117
omp_get_interop_rc_desc(const omp_interop_t, omp_interop_rc_t);
118118

119-
typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
120-
/* Compiler flags */ /* Total compiler flags must be 16 bits */
121-
unsigned tiedness : 1; /* task is either tied (1) or untied (0) */
122-
unsigned final : 1; /* task is final(1) so execute immediately */
123-
unsigned merged_if0 : 1; // no __kmpc_task_{begin/complete}_if0 calls in if0
124-
unsigned destructors_thunk : 1; // set if the compiler creates a thunk to
125-
unsigned proxy : 1; // task is a proxy task (it will be executed outside the
126-
unsigned priority_specified : 1; // set if the compiler provides priority
127-
unsigned detachable : 1; // 1 == can detach */
128-
unsigned unshackled : 1; /* 1 == unshackled task */
129-
unsigned target : 1; /* 1 == target task */
130-
unsigned reserved : 7; /* reserved for compiler use */
131-
unsigned tasktype : 1; /* task is either explicit(1) or implicit (0) */
132-
unsigned task_serial : 1; // task is executed immediately (1) or deferred (0)
133-
unsigned tasking_ser : 1; // all tasks in team are either executed immediately
134-
unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel
135-
unsigned started : 1; /* 1==started, 0==not started */
136-
unsigned executing : 1; /* 1==executing, 0==not executing */
137-
unsigned complete : 1; /* 1==complete, 0==not complete */
138-
unsigned freed : 1; /* 1==freed, 0==allocated */
139-
unsigned native : 1; /* 1==gcc-compiled task, 0==intel */
140-
unsigned reserved31 : 7; /* reserved for library use */
141-
} kmp_tasking_flags_t;
142-
143119
typedef enum omp_interop_backend_type_t {
144120
// reserve 0
145121
omp_interop_backend_type_cuda_1 = 1,

openmp/libomptarget/src/api.cpp

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
#include "private.h"
1616
#include "rtl.h"
1717

18+
#include "llvm/ADT/SmallVector.h"
19+
1820
#include <climits>
1921
#include <cstdlib>
2022
#include <cstring>
@@ -207,6 +209,105 @@ EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
207209
return Rc;
208210
}
209211

212+
// The helper function that calls omp_target_memcpy or omp_target_memcpy_rect
213+
static int libomp_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
214+
if (Task == nullptr)
215+
return OFFLOAD_FAIL;
216+
217+
TargetMemcpyArgsTy *Args = (TargetMemcpyArgsTy *)Task->shareds;
218+
219+
if (Args == nullptr)
220+
return OFFLOAD_FAIL;
221+
222+
// Call blocked version
223+
int Rc = OFFLOAD_SUCCESS;
224+
if (Args->IsRectMemcpy) {
225+
Rc = omp_target_memcpy_rect(
226+
Args->Dst, Args->Src, Args->ElementSize, Args->NumDims, Args->Volume,
227+
Args->DstOffsets, Args->SrcOffsets, Args->DstDimensions,
228+
Args->SrcDimensions, Args->DstDevice, Args->SrcDevice);
229+
230+
DP("omp_target_memcpy_rect returns %d\n", Rc);
231+
} else {
232+
Rc = omp_target_memcpy(Args->Dst, Args->Src, Args->Length, Args->DstOffset,
233+
Args->SrcOffset, Args->DstDevice, Args->SrcDevice);
234+
235+
DP("omp_target_memcpy returns %d\n", Rc);
236+
}
237+
238+
// Release the arguments object
239+
delete Args;
240+
241+
return Rc;
242+
}
243+
244+
// Allocate and launch helper task
245+
static int libomp_helper_task_creation(TargetMemcpyArgsTy *Args,
246+
int DepObjCount,
247+
omp_depend_t *DepObjList) {
248+
// Create global thread ID
249+
int Gtid = __kmpc_global_thread_num(nullptr);
250+
int (*Fn)(kmp_int32, kmp_task_t *) = &libomp_target_memcpy_async_helper;
251+
252+
// Setup the hidden helper flags;
253+
kmp_int32 Flags = 0;
254+
kmp_tasking_flags_t *InputFlags = (kmp_tasking_flags_t *)&Flags;
255+
InputFlags->hidden_helper = 1;
256+
257+
// Alloc helper task
258+
kmp_task_t *Ptr = __kmpc_omp_target_task_alloc(nullptr, Gtid, Flags,
259+
sizeof(kmp_task_t), 0, Fn, -1);
260+
261+
if (Ptr == nullptr) {
262+
// Task allocation failed, delete the argument object
263+
delete Args;
264+
265+
return OFFLOAD_FAIL;
266+
}
267+
268+
// Setup the arguments passed to helper task
269+
Ptr->shareds = Args;
270+
271+
// Convert the type of depend objects
272+
llvm::SmallVector<kmp_depend_info_t> DepObjs;
273+
for (int i = 0; i < DepObjCount; i++) {
274+
omp_depend_t DepObj = DepObjList[i];
275+
DepObjs.push_back(*((kmp_depend_info_t *)DepObj));
276+
}
277+
278+
// Launch the helper task
279+
int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Ptr, DepObjCount,
280+
DepObjs.data(), 0, nullptr);
281+
282+
return Rc;
283+
}
284+
285+
/// Asynchronous variant of omp_target_memcpy.
///
/// Packages the copy parameters into a heap-allocated TargetMemcpyArgsTy and
/// hands them to libomp_helper_task_creation, which launches a helper task
/// that performs the blocking copy once the given dependences are satisfied.
///
/// \param Dst, Src             destination and source base addresses.
/// \param Length               number of bytes to copy.
/// \param DstOffset, SrcOffset byte offsets applied to \p Dst and \p Src.
/// \param DstDevice, SrcDevice device numbers of destination and source.
/// \param DepObjCount          number of entries in \p DepObjList.
/// \param DepObjList           depend objects the copy has to wait for.
/// \returns OFFLOAD_FAIL if \p Dst or \p Src is null, otherwise the result of
///          libomp_helper_task_creation.
EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
                                   size_t DstOffset, size_t SrcOffset,
                                   int DstDevice, int SrcDevice,
                                   int DepObjCount, omp_depend_t *DepObjList) {
  TIMESCOPE();
  DP("Call to omp_target_memcpy_async, dst device %d, src device %d, "
     "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
     "src offset %zu, length %zu\n",
     DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DstOffset, SrcOffset,
     Length);

  // Both endpoints must be valid before any task is created.
  if (!Dst || !Src)
    return OFFLOAD_FAIL;

  // Bundle the parameters; the helper-task machinery takes ownership of the
  // object and releases it on failure or after the copy has run.
  auto *CopyArgs = new TargetMemcpyArgsTy(Dst, Src, Length, DstOffset,
                                          SrcOffset, DstDevice, SrcDevice);

  const int Status =
      libomp_helper_task_creation(CopyArgs, DepObjCount, DepObjList);

  DP("omp_target_memcpy_async returns %d\n", Status);
  return Status;
}
310+
210311
EXTERN int
211312
omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
212313
int NumDims, const size_t *Volume,
@@ -267,6 +368,43 @@ omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
267368
return Rc;
268369
}
269370

371+
/// Asynchronous variant of omp_target_memcpy_rect.
///
/// Packages the copy parameters into a heap-allocated TargetMemcpyArgsTy and
/// hands them to libomp_helper_task_creation, which launches a helper task
/// that performs the blocking rectangular copy once the given dependences are
/// satisfied.
///
/// The descriptor arrays (\p Volume, offsets, dimensions) are handed to
/// TargetMemcpyArgsTy as given; if that type keeps the raw pointers rather
/// than copies, the caller's arrays must stay alive until the task has run.
///
/// \param Dst, Src     destination and source base addresses.
/// \param ElementSize  size of one element in bytes.
/// \param NumDims      number of dimensions described by the arrays.
/// \param Volume, DstOffsets, SrcOffsets, DstDimensions, SrcDimensions
///                     per-dimension descriptors of the copy.
/// \param DstDevice, SrcDevice device numbers of destination and source.
/// \param DepObjCount  number of entries in \p DepObjList.
/// \param DepObjList   depend objects the copy has to wait for.
/// \returns INT_MAX when both \p Dst and \p Src are null, OFFLOAD_FAIL when
///          exactly one of them is null, otherwise the result of
///          libomp_helper_task_creation.
EXTERN int omp_target_memcpy_rect_async(
    void *Dst, const void *Src, size_t ElementSize, int NumDims,
    const size_t *Volume, const size_t *DstOffsets, const size_t *SrcOffsets,
    const size_t *DstDimensions, const size_t *SrcDimensions, int DstDevice,
    int SrcDevice, int DepObjCount, omp_depend_t *DepObjList) {
  TIMESCOPE();
  DP("Call to omp_target_memcpy_rect_async, dst device %d, src device %d, "
     "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
     "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
     "volume " DPxMOD ", element size %zu, num_dims %d\n",
     DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DPxPTR(DstOffsets),
     DPxPTR(SrcOffsets), DPxPTR(DstDimensions), DPxPTR(SrcDimensions),
     DPxPTR(Volume), ElementSize, NumDims);

  // Both pointers null is the "how many dimensions are supported" query. It
  // must be tested before the single-null check below, which would otherwise
  // turn the query into OFFLOAD_FAIL.
  if (!Dst && !Src) {
    DP("Call to omp_target_memcpy_rect_async returns max supported dimensions "
       "%d\n",
       INT_MAX);
    return INT_MAX;
  }

  // Check the source and dest address
  if (Dst == nullptr || Src == nullptr)
    return OFFLOAD_FAIL;

  // Create task object; ownership passes to libomp_helper_task_creation.
  TargetMemcpyArgsTy *Args = new TargetMemcpyArgsTy(
      Dst, Src, ElementSize, NumDims, Volume, DstOffsets, SrcOffsets,
      DstDimensions, SrcDimensions, DstDevice, SrcDevice);

  // Create and launch helper task
  int Rc = libomp_helper_task_creation(Args, DepObjCount, DepObjList);

  DP("omp_target_memcpy_rect_async returns %d\n", Rc);
  return Rc;
}
407+
270408
EXTERN int omp_target_associate_ptr(const void *HostPtr, const void *DevicePtr,
271409
size_t Size, size_t DeviceOffset,
272410
int DeviceNum) {

openmp/libomptarget/src/exports

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ VERS1.0 {
4141
omp_target_is_present;
4242
omp_target_memcpy;
4343
omp_target_memcpy_rect;
44+
omp_target_memcpy_async;
45+
omp_target_memcpy_rect_async;
4446
omp_target_associate_ptr;
4547
omp_target_disassociate_ptr;
4648
llvm_omp_target_alloc_host;

openmp/libomptarget/src/private.h

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,51 @@ extern "C" {
104104
* We maintain the same data structure for compatibility.
105105
*/
106106
typedef int kmp_int32;
107+
typedef int64_t kmp_int64;
107108
typedef intptr_t kmp_intptr_t;
109+
110+
typedef void *omp_depend_t;
111+
struct kmp_task;
112+
typedef kmp_int32 (*kmp_routine_entry_t)(kmp_int32, struct kmp_task *);
113+
typedef struct kmp_task {
114+
void *shareds;
115+
kmp_routine_entry_t routine;
116+
kmp_int32 part_id;
117+
} kmp_task_t;
118+
119+
/* Task flag word. The total struct must be exactly 32 bits: 16 bits of
 * compiler flags followed by 16 bits of library flags. Field order and widths
 * define the bit layout and must not be changed. */
typedef struct kmp_tasking_flags {
  /* ---- Compiler flags (must total 16 bits) ---- */

  /* task is either tied (1) or untied (0) */
  unsigned tiedness : 1;
  /* task is final (1) so execute immediately */
  unsigned final : 1;
  /* no __kmpc_task_{begin/complete}_if0 calls in if0 code path */
  unsigned merged_if0 : 1;
  /* set if the compiler creates a thunk to invoke destructors from the
   * runtime */
  unsigned destructors_thunk : 1;
  /* task is a proxy task (it will be executed outside the context of the
   * RTL) */
  unsigned proxy : 1;
  /* set if the compiler provides priority setting for the task */
  unsigned priority_specified : 1;
  /* 1 == can detach */
  unsigned detachable : 1;
  /* 1 == hidden helper task */
  unsigned hidden_helper : 1;
  /* reserved for compiler use */
  unsigned reserved : 8;

  /* ---- Library flags (must total 16 bits) ---- */

  /* task is either explicit (1) or implicit (0) */
  unsigned tasktype : 1;
  /* task is executed immediately (1) or deferred (0) */
  unsigned task_serial : 1;
  /* all tasks in team are either executed immediately (1) or may be
   * deferred (0) */
  unsigned tasking_ser : 1;
  /* entire team is serial (1) [1 thread] or parallel (0) [>= 2 threads] */
  unsigned team_serial : 1;
  /* If either team_serial or tasking_ser is set, task team may be NULL */

  /* Task state flags: */

  /* 1 == started, 0 == not started */
  unsigned started : 1;
  /* 1 == executing, 0 == not executing */
  unsigned executing : 1;
  /* 1 == complete, 0 == not complete */
  unsigned complete : 1;
  /* 1 == freed, 0 == allocated */
  unsigned freed : 1;
  /* 1 == gcc-compiled task, 0 == intel */
  unsigned native : 1;
  /* reserved for library use */
  unsigned reserved31 : 7;
} kmp_tasking_flags_t;
151+
108152
// Compiler sends us this info:
109153
typedef struct kmp_depend_info {
110154
kmp_intptr_t base_addr;
@@ -126,6 +170,86 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps,
126170
void **__kmpc_omp_get_target_async_handle_ptr(kmp_int32 gtid)
127171
__attribute__((weak));
128172
bool __kmpc_omp_has_task_team(kmp_int32 gtid) __attribute__((weak));
173+
kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
174+
kmp_int32 flags, size_t sizeof_kmp_task_t,
175+
size_t sizeof_shareds,
176+
kmp_routine_entry_t task_entry)
177+
__attribute__((weak));
178+
179+
kmp_task_t *
180+
__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
181+
size_t sizeof_kmp_task_t, size_t sizeof_shareds,
182+
kmp_routine_entry_t task_entry,
183+
kmp_int64 device_id) __attribute__((weak));
184+
185+
kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
186+
kmp_task_t *new_task, kmp_int32 ndeps,
187+
kmp_depend_info_t *dep_list,
188+
kmp_int32 ndeps_noalias,
189+
kmp_depend_info_t *noalias_dep_list)
190+
__attribute__((weak));
191+
192+
/**
 * The argument set that is passed from an asynchronous memory copy to the
 * blocking version of the memory copy invoked in the helper task.
 *
 * The helper task may run after the asynchronous API call has returned, so
 * for rectangular copies the object owns private copies of the per-dimension
 * descriptor arrays instead of keeping the caller's pointers. The copies are
 * released by the destructor; the object is therefore non-copyable.
 */
struct TargetMemcpyArgsTy {
  /**
   * Common attributes
   */
  void *Dst;
  const void *Src;
  int DstDevice;
  int SrcDevice;

  /**
   * The flag that denotes single dimensional or rectangle dimensional copy
   */
  bool IsRectMemcpy;

  /**
   * Arguments for single dimensional copy
   */
  size_t Length;
  size_t DstOffset;
  size_t SrcOffset;

  /**
   * Arguments for rectangle dimensional copy. Each non-null array holds
   * NumDims elements and is owned by this object.
   */
  size_t ElementSize;
  int NumDims;
  const size_t *Volume;
  const size_t *DstOffsets;
  const size_t *SrcOffsets;
  const size_t *DstDimensions;
  const size_t *SrcDimensions;

  /**
   * Constructor for single dimensional copy
   */
  TargetMemcpyArgsTy(void *Dst, const void *Src, size_t Length,
                     size_t DstOffset, size_t SrcOffset, int DstDevice,
                     int SrcDevice)
      : Dst(Dst), Src(Src), DstDevice(DstDevice), SrcDevice(SrcDevice),
        IsRectMemcpy(false), Length(Length), DstOffset(DstOffset),
        SrcOffset(SrcOffset), ElementSize(0), NumDims(0), Volume(nullptr),
        DstOffsets(nullptr), SrcOffsets(nullptr), DstDimensions(nullptr),
        SrcDimensions(nullptr) {}

  /**
   * Constructor for rectangle dimensional copy. The first NumDims elements of
   * every non-null descriptor array are copied; a null array stays null so
   * the blocking routine still sees the caller's invalid argument.
   */
  TargetMemcpyArgsTy(void *Dst, const void *Src, size_t ElementSize,
                     int NumDims, const size_t *Volume,
                     const size_t *DstOffsets, const size_t *SrcOffsets,
                     const size_t *DstDimensions, const size_t *SrcDimensions,
                     int DstDevice, int SrcDevice)
      : Dst(Dst), Src(Src), DstDevice(DstDevice), SrcDevice(SrcDevice),
        IsRectMemcpy(true), Length(0), DstOffset(0), SrcOffset(0),
        ElementSize(ElementSize), NumDims(NumDims),
        Volume(cloneDims(Volume, NumDims)),
        DstOffsets(cloneDims(DstOffsets, NumDims)),
        SrcOffsets(cloneDims(SrcOffsets, NumDims)),
        DstDimensions(cloneDims(DstDimensions, NumDims)),
        SrcDimensions(cloneDims(SrcDimensions, NumDims)) {}

  /**
   * Releases the descriptor copies (all null for single dimensional copies).
   */
  ~TargetMemcpyArgsTy() {
    delete[] Volume;
    delete[] DstOffsets;
    delete[] SrcOffsets;
    delete[] DstDimensions;
    delete[] SrcDimensions;
  }

  // The object owns heap arrays; a member-wise copy would double-free them.
  TargetMemcpyArgsTy(const TargetMemcpyArgsTy &) = delete;
  TargetMemcpyArgsTy &operator=(const TargetMemcpyArgsTy &) = delete;

private:
  /**
   * Returns a heap copy of the first Count elements of Source, or nullptr
   * when Source is null or Count is not positive.
   */
  static const size_t *cloneDims(const size_t *Source, int Count) {
    if (Source == nullptr || Count <= 0)
      return nullptr;
    size_t *Copy = new size_t[Count];
    for (int I = 0; I < Count; ++I)
      Copy[I] = Source[I];
    return Copy;
  }
};
129253
// Invalid GTID as defined by libomp; keep in sync
130254
#define KMP_GTID_DNE (-2)
131255
#ifdef __cplusplus
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
// RUN: %libomptarget-compile-and-run-generic
2+
3+
// Test case for omp_target_memcpy_async, originally from GCC
4+
5+
#include "stdio.h"
6+
#include <omp.h>
7+
#include <stdlib.h>
8+
9+
/*
 * Round-trip test for omp_target_memcpy_async: copy 128 ints from the host to
 * a target buffer (one int past its start), wait, copy them back into a
 * second host array, wait again, and compare. Any failing copy or mismatch
 * aborts; the test exits quietly if the target allocation is unavailable.
 */
int main() {
  enum { kCount = 128 };

  int host_dev = omp_get_initial_device();
  int target_dev = omp_get_default_device();
  int src[kCount];
  int k;
  void *dev_buf;

  /* Fall back to the initial device when the default one is not usable. */
  if (target_dev < 0 || target_dev >= omp_get_num_devices())
    target_dev = host_dev;

  dev_buf = omp_target_alloc(130 * sizeof(int), target_dev);
  if (dev_buf == NULL)
    return 0;

  for (k = 0; k < kCount; k++)
    src[k] = k;

  /* Host -> target, written at a destination offset of one int. */
  if (omp_target_memcpy_async(dev_buf, src, kCount * sizeof(int), sizeof(int),
                              0, target_dev, host_dev, 0, NULL) != 0)
    abort();

#pragma omp taskwait

  int roundtrip[kCount];
  for (k = 0; k < kCount; ++k)
    roundtrip[k] = 0;

  /* Target -> host, read from the same one-int source offset. */
  if (omp_target_memcpy_async(roundtrip, dev_buf, kCount * sizeof(int), 0,
                              sizeof(int), host_dev, target_dev, 0, NULL) != 0)
    abort();

#pragma omp taskwait

  for (k = 0; k < kCount; ++k) {
    if (roundtrip[k] != src[k])
      abort();
  }

  omp_target_free(dev_buf, target_dev);

  return 0;
}

0 commit comments

Comments
 (0)