Skip to content

Commit 1446e1d

Browse files
krisman-at-collabora authored and KAGA-KOKO committed
kernel: Implement selective syscall userspace redirection
Introduce a mechanism to quickly disable/enable syscall handling for a specific process and redirect to userspace via SIGSYS. This is useful for processes with parts that require syscall redirection and parts that don't, but that need to perform this boundary crossing really fast, without paying the cost of a system call to reconfigure syscall handling on each boundary transition. This is particularly important for Windows games running over Wine. The proposed interface looks like this: prctl(PR_SET_SYSCALL_USER_DISPATCH, <op>, <offset>, <length>, [selector]) The range [<offset>,<offset>+<length>) is a part of the process memory map that is allowed to by-pass the redirection code and dispatch syscalls directly, such that in fast paths a process doesn't need to disable the trap, nor does the kernel have to check the selector. This is essential to return from SIGSYS to a blocked area without triggering another SIGSYS from rt_sigreturn. selector is an optional pointer to a char-sized userspace memory region that has a key switch for the mechanism. This key switch is set to either PR_SYS_DISPATCH_ON or PR_SYS_DISPATCH_OFF to enable or disable the redirection without calling the kernel. The feature is meant to be set per-thread and it is disabled on fork/clone/execv. Internally, this doesn't add overhead to the syscall hot path, and it requires very little per-architecture support. I avoided using seccomp, even though it duplicates some functionality, due to previous feedback that maybe it shouldn't mix with seccomp since it is not a security mechanism. And obviously, this should never be considered a security mechanism, since any part of the program can by-pass it by using the syscall dispatcher. For the sysinfo benchmark, which measures the overhead added to executing a native syscall that doesn't require interception, the overhead using only the direct dispatcher region to issue syscalls is pretty much irrelevant. 
The overhead of using the selector is around 40ns for a native (unredirected) syscall on my system, and it is (as expected) dominated by the supervisor-mode user-address access. In fact, with SMAP off, the overhead is consistently less than 5ns on my test box. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Andy Lutomirski <luto@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/r/20201127193238.821364-4-krisman@collabora.com
1 parent 1d7637d commit 1446e1d

File tree

10 files changed

+170
-1
lines changed

10 files changed

+170
-1
lines changed

fs/exec.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464
#include <linux/compat.h>
6565
#include <linux/vmalloc.h>
6666
#include <linux/io_uring.h>
67+
#include <linux/syscall_user_dispatch.h>
6768

6869
#include <linux/uaccess.h>
6970
#include <asm/mmu_context.h>
@@ -1302,6 +1303,8 @@ int begin_new_exec(struct linux_binprm * bprm)
13021303
flush_thread();
13031304
me->personality &= ~bprm->per_clear;
13041305

1306+
clear_syscall_work_syscall_user_dispatch(me);
1307+
13051308
/*
13061309
* We have to apply CLOEXEC before we change whether the process is
13071310
* dumpable (in setup_new_exec) to avoid a race with a process in userspace

include/linux/sched.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include <linux/sched/prio.h>
2929
#include <linux/sched/types.h>
3030
#include <linux/signal_types.h>
31+
#include <linux/syscall_user_dispatch.h>
3132
#include <linux/mm_types_task.h>
3233
#include <linux/task_io_accounting.h>
3334
#include <linux/posix-timers.h>
@@ -965,6 +966,7 @@ struct task_struct {
965966
unsigned int sessionid;
966967
#endif
967968
struct seccomp seccomp;
969+
struct syscall_user_dispatch syscall_dispatch;
968970

969971
/* Thread group tracking: */
970972
u64 parent_exec_id;

include/linux/syscall_user_dispatch.h

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/* SPDX-License-Identifier: GPL-2.0 */
2+
/*
3+
* Copyright (C) 2020 Collabora Ltd.
4+
*/
5+
#ifndef _SYSCALL_USER_DISPATCH_H
6+
#define _SYSCALL_USER_DISPATCH_H
7+
8+
#include <linux/thread_info.h>
9+
10+
#ifdef CONFIG_GENERIC_ENTRY
11+
12+
/*
 * Per-task syscall user dispatch state (CONFIG_GENERIC_ENTRY variant).
 * Filled in by set_syscall_user_dispatch() and consulted by
 * syscall_user_dispatch() on syscall entry.
 */
struct syscall_user_dispatch {
13+
/* Optional user-space byte holding PR_SYS_DISPATCH_ON/OFF; NULL if unused. */
char __user *selector;
14+
/* Start of the region whose syscalls are never redirected. */
unsigned long offset;
15+
/* Length of that region; the region is [offset, offset + len). */
unsigned long len;
16+
/* Set to true right before SIGSYS is raised; reset to false at prctl time. */
bool on_dispatch;
17+
};
18+
19+
int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
20+
unsigned long len, char __user *selector);
21+
22+
#define clear_syscall_work_syscall_user_dispatch(tsk) \
23+
clear_task_syscall_work(tsk, SYSCALL_USER_DISPATCH)
24+
25+
#else
26+
struct syscall_user_dispatch {};
27+
28+
/*
 * Stub used when CONFIG_GENERIC_ENTRY is not set: the feature is not
 * available, so every request is rejected with -EINVAL and the
 * arguments are ignored.
 */
static inline int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
29+
unsigned long len, char __user *selector)
30+
{
31+
return -EINVAL;
32+
}
33+
34+
/*
 * Stub used when CONFIG_GENERIC_ENTRY is not set: there is no dispatch
 * work bit to clear, so this is intentionally a no-op.
 */
static inline void clear_syscall_work_syscall_user_dispatch(struct task_struct *tsk)
35+
{
36+
}
37+
38+
#endif /* CONFIG_GENERIC_ENTRY */
39+
40+
#endif /* _SYSCALL_USER_DISPATCH_H */

include/linux/thread_info.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,13 +42,15 @@ enum syscall_work_bit {
4242
SYSCALL_WORK_BIT_SYSCALL_TRACE,
4343
SYSCALL_WORK_BIT_SYSCALL_EMU,
4444
SYSCALL_WORK_BIT_SYSCALL_AUDIT,
45+
SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH,
4546
};
4647

4748
#define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP)
4849
#define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT)
4950
#define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE)
5051
#define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU)
5152
#define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
53+
#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
5254
#endif
5355

5456
#include <asm/thread_info.h>

include/uapi/linux/prctl.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,4 +247,9 @@ struct prctl_mm_map {
247247
#define PR_SET_IO_FLUSHER 57
248248
#define PR_GET_IO_FLUSHER 58
249249

250+
/* Dispatch syscalls to a userspace handler */
251+
#define PR_SET_SYSCALL_USER_DISPATCH 59
252+
# define PR_SYS_DISPATCH_OFF 0
253+
# define PR_SYS_DISPATCH_ON 1
254+
250255
#endif /* _LINUX_PRCTL_H */

kernel/entry/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,5 @@ KCOV_INSTRUMENT := n
99
CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong
1010
CFLAGS_common.o += -fno-stack-protector
1111

12-
obj-$(CONFIG_GENERIC_ENTRY) += common.o
12+
obj-$(CONFIG_GENERIC_ENTRY) += common.o syscall_user_dispatch.o
1313
obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o

kernel/entry/common.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
/* SPDX-License-Identifier: GPL-2.0 */
2+
#ifndef _COMMON_H
3+
#define _COMMON_H
4+
5+
bool syscall_user_dispatch(struct pt_regs *regs);
6+
7+
#endif

kernel/entry/syscall_user_dispatch.c

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
// SPDX-License-Identifier: GPL-2.0
2+
/*
3+
* Copyright (C) 2020 Collabora Ltd.
4+
*/
5+
#include <linux/sched.h>
6+
#include <linux/prctl.h>
7+
#include <linux/syscall_user_dispatch.h>
8+
#include <linux/uaccess.h>
9+
#include <linux/signal.h>
10+
#include <linux/elf.h>
11+
12+
#include <linux/sched/signal.h>
13+
#include <linux/sched/task_stack.h>
14+
15+
#include <asm/syscall.h>
16+
17+
#include "common.h"
18+
19+
/*
 * Build a SIGSYS siginfo describing the intercepted syscall and force
 * it on the current task.
 *
 * @regs: register state of the syscall being redirected; used to read
 *        the syscall number.
 */
static void trigger_sigsys(struct pt_regs *regs)
20+
{
21+
struct kernel_siginfo info;
22+
23+
/* Start from a cleared siginfo so only the fields set below are populated. */
clear_siginfo(&info);
24+
info.si_signo = SIGSYS;
25+
/* si_code distinguishes this SIGSYS as a user-dispatch event. */
info.si_code = SYS_USER_DISPATCH;
26+
/* Address reported to the handler, taken from KSTK_EIP() of the current task. */
info.si_call_addr = (void __user *)KSTK_EIP(current);
27+
info.si_errno = 0;
28+
info.si_arch = syscall_get_arch(current);
29+
info.si_syscall = syscall_get_nr(current, regs);
30+
31+
force_sig_info(&info);
32+
}
33+
34+
/*
 * Decide whether the syscall described by @regs must be redirected to
 * user space via SIGSYS.
 *
 * Returns false when the syscall is exempt from redirection (issued from
 * the allowed region, a vDSO sigreturn, or the selector reads
 * PR_SYS_DISPATCH_OFF). Returns true after the syscall has been rolled
 * back and SIGSYS has been forced on the current task.
 *
 * The task is terminated via do_exit() if the selector cannot be read
 * (SIGSEGV) or holds a value other than ON/OFF (SIGSYS).
 */
bool syscall_user_dispatch(struct pt_regs *regs)
35+
{
36+
struct syscall_user_dispatch *sd = &current->syscall_dispatch;
37+
char state;
38+
39+
/*
 * offset and len are unsigned long, so the subtraction wraps for
 * addresses below offset and this single comparison checks that the
 * instruction pointer lies in [offset, offset + len).
 */
if (likely(instruction_pointer(regs) - sd->offset < sd->len))
40+
return false;
41+
42+
/* Never redirect the architecture's vDSO sigreturn syscall. */
if (unlikely(arch_syscall_is_vdso_sigreturn(regs)))
43+
return false;
44+
45+
/* Without a selector, every syscall outside the region is redirected. */
if (likely(sd->selector)) {
46+
/*
47+
* access_ok() is performed once, at prctl time, when
48+
* the selector is loaded by userspace.
49+
*/
50+
if (unlikely(__get_user(state, sd->selector)))
51+
do_exit(SIGSEGV);
52+
53+
if (likely(state == PR_SYS_DISPATCH_OFF))
54+
return false;
55+
56+
/* Any selector value other than ON or OFF is treated as fatal. */
if (state != PR_SYS_DISPATCH_ON)
57+
do_exit(SIGSYS);
58+
}
59+
60+
/* NOTE(review): the consumer of on_dispatch is not visible here - confirm its use. */
sd->on_dispatch = true;
61+
/* Roll the syscall back before SIGSYS is raised for it. */
syscall_rollback(current, regs);
62+
trigger_sigsys(regs);
63+
64+
return true;
65+
}
66+
67+
/*
 * Configure syscall user dispatch for the current task (backs the
 * PR_SET_SYSCALL_USER_DISPATCH prctl).
 *
 * @mode:     PR_SYS_DISPATCH_ON or PR_SYS_DISPATCH_OFF.
 * @offset:   start of the region allowed to issue syscalls directly.
 * @len:      length of that region.
 * @selector: optional user-space byte used as the on/off switch; may be NULL.
 *
 * Returns 0 on success, -EINVAL for an unknown mode, for non-zero
 * arguments with PR_SYS_DISPATCH_OFF, or for a region that is empty or
 * overflows, and -EFAULT if @selector fails access_ok().
 */
int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
68+
unsigned long len, char __user *selector)
69+
{
70+
switch (mode) {
71+
case PR_SYS_DISPATCH_OFF:
72+
/* Disabling takes no parameters; reject any that are supplied. */
if (offset || len || selector)
73+
return -EINVAL;
74+
break;
75+
case PR_SYS_DISPATCH_ON:
76+
/*
77+
* Validate the direct dispatcher region just for basic
78+
* sanity against overflow and a 0-sized dispatcher
79+
* region. If the user is able to submit a syscall from
80+
* an address, that address is obviously valid.
81+
*/
82+
if (offset && offset + len <= offset)
83+
return -EINVAL;
84+
85+
/* Only range-check the selector here; it is read later with __get_user(). */
if (selector && !access_ok(selector, sizeof(*selector)))
86+
return -EFAULT;
87+
88+
break;
89+
default:
90+
return -EINVAL;
91+
}
92+
93+
/* Arguments are valid: store the configuration in the current task. */
current->syscall_dispatch.selector = selector;
94+
current->syscall_dispatch.offset = offset;
95+
current->syscall_dispatch.len = len;
96+
current->syscall_dispatch.on_dispatch = false;
97+
98+
/* Set or clear the syscall work flag to match the requested mode. */
if (mode == PR_SYS_DISPATCH_ON)
99+
set_syscall_work(SYSCALL_USER_DISPATCH);
100+
else
101+
clear_syscall_work(SYSCALL_USER_DISPATCH);
102+
103+
return 0;
104+
}

kernel/fork.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -906,6 +906,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
906906
clear_user_return_notifier(tsk);
907907
clear_tsk_need_resched(tsk);
908908
set_task_stack_end_magic(tsk);
909+
clear_syscall_work_syscall_user_dispatch(tsk);
909910

910911
#ifdef CONFIG_STACKPROTECTOR
911912
tsk->stack_canary = get_random_canary();

kernel/sys.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
#include <linux/syscore_ops.h>
4343
#include <linux/version.h>
4444
#include <linux/ctype.h>
45+
#include <linux/syscall_user_dispatch.h>
4546

4647
#include <linux/compat.h>
4748
#include <linux/syscalls.h>
@@ -2530,6 +2531,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
25302531

25312532
error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
25322533
break;
2534+
case PR_SET_SYSCALL_USER_DISPATCH:
2535+
error = set_syscall_user_dispatch(arg2, arg3, arg4,
2536+
(char __user *) arg5);
2537+
break;
25332538
default:
25342539
error = -EINVAL;
25352540
break;

0 commit comments

Comments
 (0)