Skip to content
Permalink
Browse files
RFC: sched: UMCG: episode IV: A New Hope
A lot of effort has been put into making UMCG based on
userspace TLS data work, and it gets ugly very fast because
it is very hard to guarantee that the pages are present
when needed; and they are needed in non-preemptible (sched)
contexts. The last attempt here:
https://lore.kernel.org/lkml/20220120155517.066795336@infradead.org/
is a good example: a lot of mm-related work, a lot of
extra stuff added to struct task_struct just to deal
with kernel-to-userspace writes in sched contexts.

Here I propose a different approach (actually, it was my first approach,
before we pivoted to userspace TLS). Keep everything the kernel
needs in a kernel-side struct umcg_task, and copy relevant
data out to the userspace when the server's sys_umcg_wait() returns.

Before I go too deep down into implementing and testing this,
I'd like to get some feedback re: if this approach is acceptable.

Please review.

=====================

User Managed Concurrency Groups is an M:N threading toolkit that allows
constructing user space schedulers designed to efficiently manage
heterogeneous in-process workloads while maintaining high CPU
utilization (95%+).

Add UMCG syscall stubs, Kconfig, as well as stubs for hooks into
sched, execve, etc., as this boilerplate is more or less stable,
compared to the various approaches attempted at implementing UMCG.

Signed-off-by: Peter Oskolkov <posk@google.com>
  • Loading branch information
posk-io authored and intel-lab-lkp committed Feb 11, 2022
1 parent c8eaf6a commit 64a46b86ef2bb15a0a1bc7566b6ca87f5e0492dd
Show file tree
Hide file tree
Showing 14 changed files with 442 additions and 6 deletions.
@@ -249,6 +249,7 @@ config X86
select HAVE_RSEQ
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_UMCG if X86_64
select HAVE_USER_RETURN_NOTIFIER
select HAVE_GENERIC_VDSO
select HOTPLUG_SMT if SMP
@@ -372,6 +372,8 @@
448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common umcg_wait sys_umcg_wait
452 common umcg_kick sys_umcg_kick

#
# Due to a historical design error, certain syscalls are numbered differently
@@ -1842,6 +1842,7 @@ static int bprm_execve(struct linux_binprm *bprm,
current->fs->in_exec = 0;
current->in_execve = 0;
rseq_execve(current);
umcg_execve(current);
acct_update_integrals(current);
task_numa_free(current, false);
return retval;
@@ -67,6 +67,7 @@ struct sighand_struct;
struct signal_struct;
struct task_delay_info;
struct task_group;
struct umcg_task;

/*
* Task state bitmask. NOTE! These bits are also
@@ -1299,6 +1300,10 @@ struct task_struct {
unsigned long rseq_event_mask;
#endif

#ifdef CONFIG_UMCG
struct umcg_task *umcg_task;
#endif

struct tlbflush_unmap_batch tlb_ubc;

union {
@@ -1695,6 +1700,13 @@ extern struct pid *cad_pid;
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */

#ifdef CONFIG_UMCG
#define PF_UMCG_WORKER 0x01000000 /* UMCG worker */
#else
#define PF_UMCG_WORKER 0x00000000
#endif

#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
#define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */
@@ -2316,6 +2328,43 @@ static inline void rseq_syscall(struct pt_regs *regs)

#endif

#ifdef CONFIG_UMCG

/*
 * Called on the return-to-userspace path to let UMCG process pending
 * work for the current task. NOTE(review): exact call site not visible
 * in this hunk — confirm (likely the notify-resume path).
 */
extern void umcg_notify_resume(void);

/* Called by do_exit() in kernel/exit.c. */
extern void umcg_handle_exit(void);

/* Called by bprm_execve() in fs/exec.c. */
extern void umcg_execve(struct task_struct *tsk);

/*
* umcg_wq_worker_[sleeping|running] are called in core.c by
* sched_submit_work() and sched_update_worker().
*/
extern void umcg_wq_worker_sleeping(struct task_struct *tsk);
extern void umcg_wq_worker_running(struct task_struct *tsk);

#else /* CONFIG_UMCG */

/* !CONFIG_UMCG: all UMCG hooks compile away to empty inline stubs. */
static inline void umcg_notify_resume(void)
{
}
static inline void umcg_execve(struct task_struct *tsk)
{
}
static inline void umcg_handle_exit(void)
{
}
static inline void umcg_wq_worker_sleeping(struct task_struct *tsk)
{
}
static inline void umcg_wq_worker_running(struct task_struct *tsk)
{
}

#endif

const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq);
char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len);
int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq);
@@ -178,9 +178,9 @@ extern struct trace_event_functions exit_syscall_print_funcs;
SYSCALL_TRACE_EXIT_EVENT(sname); \
static struct syscall_metadata __used \
__syscall_meta_##sname = { \
.name = "sys"#sname, \
.name = "sys"#sname, \
.syscall_nr = -1, /* Filled in at boot */ \
.nb_args = nb, \
.nb_args = nb, \
.types = nb ? types_##sname : NULL, \
.args = nb ? args_##sname : NULL, \
.enter_event = &event_enter_##sname, \
@@ -1060,6 +1060,9 @@ asmlinkage long sys_memfd_secret(unsigned int flags);
asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
unsigned long home_node,
unsigned long flags);
asmlinkage long sys_umcg_wait(u64 flags, pid_t next_tid, u64 abs_timeout,
u64 __user *workers, u64 worker_id_or_sz);
asmlinkage long sys_umcg_kick(u32 flags, pid_t tid);

/*
* Architecture-specific system calls
@@ -886,8 +886,13 @@ __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
#define __NR_set_mempolicy_home_node 450
__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)

#define __NR_umcg_wait 451
__SYSCALL(__NR_umcg_wait, sys_umcg_wait)
#define __NR_umcg_kick 452
__SYSCALL(__NR_umcg_kick, sys_umcg_kick)

#undef __NR_syscalls
#define __NR_syscalls 451
#define __NR_syscalls 453

/*
* 32 bit systems traditionally used different
@@ -0,0 +1,232 @@
#ifndef _UAPI_LINUX_UMCG_H
#define _UAPI_LINUX_UMCG_H

#include <linux/types.h>

/*
* UMCG: User Managed Concurrency Groups.
*
* Syscalls, documented below and implemented in kernel/sched/umcg.c:
* sys_umcg_wait() - wait/wake/context-switch;
* sys_umcg_kick() - prod a UMCG task.
*
* UMCG workers have the following internal states:
*
* .-----------------------.
* | |
* | v
* RUNNING --> BLOCKED --> RUNNABLE
* ^ |
* | |
* .-----------------------.
*
* RUNNING -> BLOCKED transition happens when the worker blocks in the
* kernel in I/O, pagefault, futex, etc.
* UMCG_WORKER_BLOCK event will be delivered
* to the worker's server
*
* RUNNING -> RUNNABLE transition happens when the worker calls
* sys_umcg_wait() (UMCG_WORKER_WAIT event) or
* when the worker is preempted via sys_umcg_kick()
* (UMCG_WORKER_PREEMPT event)
*
* RUNNABLE -> RUNNING transition happens when the worker is "scheduled"
* by a server via sys_umcg_wait() (no events are
* delivered to the server in this case)
*
* Note that umcg_kick() can race with the worker calling a blocking
* syscall; in this case the worker enters BLOCKED state, and both
* BLOCK and PREEMPT events are delivered to the server.
*
* So the high-level usage pattern is like this:
* servers:
* // server loop
* bool start = true;
* struct umcg_worker_event *events = malloc(...);
*
* while (!stop) {
* pid_t next_worker = 0;
*
* int ret = sys_umcg_wait(start ? UMCG_NEW_SERVER : 0, 0,
* 0, events, event_sz);
* start = false;
*
* if (ret > 0)
* next_worker = scheduler_process_events(events, ret);
* if (next_worker)
* ret = sys_umcg_wait(0, next_worker, 0, events, event_sz);
* }
*
* Workers will start by calling
* sys_umcg_wait(UMCG_NEW_WORKER, 0, 0, NULL, worker_id);
* and then potentially yielding by calling
* sys_umcg_wait(0, 0, 0, NULL, 0);
* or cooperatively context-switching by calling
* sys_umcg_wait(0, next_worker_tid, 0, NULL, 0).
*
* See below for more details.
*/

/**
* enum umcg_event_type - types of worker events delivered to UMCG servers
* @UMCG_WORKER_BLOCK: the worker blocked in kernel in any way
* (e.g. I/O, pagefault, futex, etc.) other than
* in sys_umcg_wait()
* @UMCG_WORKER_WAKE: the worker blocking operation, previously
* indicated by @UMCG_WORKER_BLOCK, has
* completed, and the worker can now be "scheduled"
* @UMCG_WORKER_PREEMPT: the worker has been preempted via umcg_kick
* note: can race with BLOCK, i.e. a running
* worker may generate a combined BLOCK | PREEMPT
* event
* @UMCG_WORKER_WAIT: the worker blocked in kernel by calling
* sys_umcg_wait()
* @UMCG_WORKER_EXIT: the worker thread exited or unregistered
*
*/
enum umcg_event_type {
	UMCG_WORKER_BLOCK	= 1 << 0,	/* blocked in the kernel */
	UMCG_WORKER_WAKE	= 1 << 1,	/* blocking op completed */
	UMCG_WORKER_PREEMPT	= 1 << 2,	/* preempted via umcg_kick */
	UMCG_WORKER_WAIT	= 1 << 3,	/* blocked in sys_umcg_wait() */
	UMCG_WORKER_EXIT	= 1 << 4,	/* exited or unregistered */
};

/**
* struct umcg_worker_event - indicates one or more worker state transitions.
* @worker_id: the ID of the worker (see sys_umcg_wait())
* @worker_event_type: ORed values from umcg_event_type
* @counter: a monotonically increasing wraparound counter,
* per worker, of events delivered to userspace;
* if the event represents several distinct events (ORed), the
* counter will reflect that number (e.g. if
* @worker_event_type == BLOCK | WAKE, the counter
* will increment by 2).
*
* Worker events are delivered to UMCG servers upon their return from
* sys_umcg_wait().
*/
/*
 * Fix: this is a UAPI header, so it must use the userspace-visible
 * __u64/__u32 types provided by <linux/types.h>; the kernel-internal
 * u64/u32 aliases are not defined for userspace builds, so the struct
 * as originally written would not compile outside the kernel.
 *
 * Layout: packed + aligned(64) pads the 16 bytes of payload out to one
 * 64-byte (cache-line-sized) slot per event.
 */
struct umcg_worker_event {
	__u64	worker_id;		/* opaque ID supplied at worker registration */
	__u32	worker_event_type;	/* ORed values from enum umcg_event_type */
	__u32	counter;		/* wrapping per-worker count of delivered events */
	/* maybe instead of @counter there should be a @timestamp or two? */
} __attribute__((packed, aligned(64)));

/**
* enum umcg_wait_flag - flags to pass to sys_umcg_wait
* @UMCG_NEW_WORKER: register the current task as a UMCG worker
* @UMCG_NEW_SERVER: register the current task as a UMCG server
* @UMCG_UNREGISTER: unregister the current task as a UMCG task
*
*
* @UMCG_CLOCK_REALTIME: treat @abs_timeout as realtime clock value
* @UMCG_CLOCK_TAI: treat @abs_timeout as TAI clock value
* (default: treat @abs_timeout as MONOTONIC clock value)
*/
enum umcg_wait_flag {
	/* Registration / deregistration of the current task. */
	UMCG_NEW_WORKER		= 1 << 0,
	UMCG_NEW_SERVER		= 1 << 1,
	UMCG_UNREGISTER		= 1 << 2,

	/* Clock selection for @abs_timeout (default: MONOTONIC). */
	UMCG_CLOCK_REALTIME	= 1 << 16,
	UMCG_CLOCK_TAI		= 1 << 17,
};

/*
* int sys_umcg_wait(u64 flags, pid_t next_tid, u64 abs_timeout,
* struct umcg_worker_event __user *events,
* u64 event_sz_or_worker_id);
*
* sys_umcg_wait() context switches, synchronously on-CPU if possible,
* from the currently running thread to @next_tid; also
* @events is used to deliver worker events to servers.
*
* @flags: ORed values from enum umcg_wait_flag.
* - UMCG_NEW_WORKER : register the current thread
* as a new UMCG worker;
* - UMCG_NEW_SERVER : register the current thread
* as a new UMCG server;
* - UMCG_UNREGISTER : unregister the current thread
* as a UMCG task; will not block;
* all other parameters must be zeroes.
*
* if the current thread is a worker,
* UMCG_WORKER_EXIT event will be
* delivered to its server;
*
* if @abs_timeout is not zero, @flags may contain one of the
* UMCG_CLOCK_XXX bits to indicate which clock to use; if
* none of the CLOCK bits are set, the MONOTONIC clock is used;
*
* @next_tid: tid of the UMCG task to context switch to;
*
* if the current thread is a server, @next_tid must be either
* that of a worker, or zero; if @next_tid is a worker, and there
* are no events waiting for this server, sys_umcg_wait() will
* context switch to the worker; if there _are_ events, sys_umcg_wait()
* will wake the worker and immediately return with @events
* populated;
* if the current thread is a server, and @next_tid is zero,
* sys_umcg_wait() will block until there are worker events
* for this server to consume, or sys_umcg_kick() is called (or
* the timeout expired);
*
* if the current thread is a worker, sys_umcg_wait() will block;
* if @next_tid is zero, UMCG_WORKER_WAIT event will be delivered
* to the worker's server; if @next_tid is a RUNNABLE worker,
* sys_umcg_wait() will context-switch to that worker, without
* any events generated;
*
* Note: if a worker calls sys_umcg_wait() with @next_tid as zero,
* its server should be woken so that it can schedule another
* worker in place of the waiting worker; if the worker
* cooperatively context-switches into another worker,
* its server does not really need to do anything, so no
* new events are generated;
*
* @abs_timeout: if not zero, and the current thread is a server,
* sys_umcg_wait() will wake when the timeout expires; if the current thread is a worker,
* the worker will remain RUNNABLE, but UMCG_WORKER_WAKE
* event will be delivered to its server; in this case
* sys_umcg_wait() will return -ETIMEDOUT when the worker
* is eventually scheduled by a server;
*
* @events: a block of memory that is used to deliver worker events to
* their servers; must be NOT NULL if the current thread is
* a server; must be NULL if the current thread is a worker;
*
* @event_sz_or_worker_id: if the current thread is a server, indicates
* the number of struct umcg_worker_event the @events
* buffer can accommodate;
*
* if the current thread is a worker, must be
* zero unless UMCG_NEW_WORKER flag is set,
* in which case it must indicate a
* userspace-provided worker ID, usually
* a pointer to a TLS struct holding the worker's
* userspace state;
*
*
* Returns:
* 0 - Ok;
* >0 - the number of worker events in @events;
* -ESRCH - @next_tid is not a UMCG task;
* -ETIMEDOUT - @abs_timeout expired;
* -EINVAL - another error;
*/

/*
* int sys_umcg_kick(u32 flags, pid_t tid) - preempts a running UMCG worker
* or wakes a sleeping UMCG server.
*
* See sys_umcg_wait() for more details.
*
* Returns:
* 0 - Ok;
* -EAGAIN - the worker is not running or the server is not sleeping;
* -ESRCH - not a related UMCG task;
* -EINVAL - another error happened (unknown flags, etc..).
*/

#endif /* _UAPI_LINUX_UMCG_H */
@@ -1685,6 +1685,21 @@ config MEMBARRIER

If unsure, say Y.

config HAVE_UMCG
bool

config UMCG
bool "Enable User Managed Concurrency Groups API"
depends on 64BIT
depends on GENERIC_ENTRY
depends on HAVE_UMCG
default n
help
Enable the User Managed Concurrency Groups API, which forms the basis
for an in-process M:N userspace scheduling framework.
At the moment this is an experimental/RFC feature that is not
guaranteed to be backward-compatible.

config KALLSYMS
bool "Load all symbols for debugging/ksymoops" if EXPERT
default y

0 comments on commit 64a46b8

Please sign in to comment.