Skip to content
Permalink
Browse files
RFC: add pidfd_send_signal flag to reclaim mm while killing a process
When a process is being killed, it might be in an uninterruptible sleep,
which leads to an unpredictable delay in reclaiming its memory. In low-memory
situations, when it is important to free up memory quickly, such a delay is
problematic. The kernel solves this problem with the oom-reaper thread, which
performs memory reclaim even when the victim process is not runnable.
Userspace currently lacks such a mechanism; the need for one and potential
solutions have been discussed before (see links below).
This patch provides a mechanism to perform memory reclaim in the context
of the process that sends the SIGKILL signal. The new SYNC_REAP_MM flag for
the pidfd_send_signal syscall can be used only when sending SIGKILL; it
makes the caller synchronously reclaim the memory that belongs to the
victim and can be easily reclaimed.

1. https://patchwork.kernel.org/cover/10894999
2. https://lwn.net/Articles/787217
3. https://lore.kernel.org/linux-api/CAJuCfpGz1kPM3G1gZH+09Z7aoWKg05QSAMMisJ7H5MdmRrRhNQ@mail.gmail.com

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
  • Loading branch information
surenbaghdasaryan authored and intel-lab-lkp committed Nov 13, 2020
1 parent 585e5b1 commit f350a31b4bb718aa40224cb93f82197839581274
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 3 deletions.
@@ -111,6 +111,8 @@ bool __oom_reap_task_mm(struct mm_struct *mm);
long oom_badness(struct task_struct *p,
unsigned long totalpages);

extern bool task_will_free_mem(struct task_struct *task);

extern bool out_of_memory(struct oom_control *oc);

extern void exit_oom_victim(void);
@@ -449,6 +449,13 @@ extern bool unhandled_signal(struct task_struct *tsk, int sig);
(!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
(t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)

/*
 * Flag values used in pidfd_send_signal:
 *
 * SYNC_REAP_MM requests that the caller synchronously reclaim the target's
 * mm once SIGKILL has been sent successfully. pidfd_send_signal accepts it
 * only together with SIGKILL; with any other signal the call fails with
 * -EINVAL.
 */
#define SYNC_REAP_MM 0x1

void signals_init(void);

int restore_altstack(const stack_t __user *);
@@ -46,6 +46,7 @@
#include <linux/livepatch.h>
#include <linux/cgroup.h>
#include <linux/audit.h>
#include <linux/oom.h>

#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>
@@ -3711,6 +3712,63 @@ static struct pid *pidfd_to_pid(const struct file *file)
return tgid_pidfd_to_pid(file);
}

/*
 * reap_mm - synchronously reclaim the address space of the task behind @pid
 * @pid: pid of the task whose memory should be reclaimed
 *
 * Runs in the context of the caller. The reclaim is attempted only when
 * task_will_free_mem() reports that the task's memory can be easily
 * reclaimed, and it is skipped when the mm is already marked as an OOM
 * victim or carries MMF_OOM_SKIP.
 *
 * Return:
 *   0        reclaim succeeded, or the mm was already being reclaimed
 *   -ESRCH   no task is attached to @pid
 *   -EBUSY   the task's memory cannot be easily reclaimed
 *   -EAGAIN  part of the address space could not be reaped; MMF_OOM_SKIP is
 *            cleared again so the caller may retry
 */
static int reap_mm(struct pid *pid)
{
	struct task_struct *victim;
	struct mm_struct *victim_mm;
	int err = 0;
	int claimed;

	/* Take a reference on the task; dropped at put_task below */
	victim = get_pid_task(pid, PIDTYPE_PID);
	if (!victim)
		return -ESRCH;

	/* task_will_free_mem() needs victim->mm to be stable */
	task_lock(victim);

	if (!task_will_free_mem(victim)) {
		/* Memory cannot be easily reclaimed */
		task_unlock(victim);
		err = -EBUSY;
		goto put_task;
	}

	/* Pin the mm so that exit_mmap cannot run while we work on it */
	victim_mm = victim->mm;
	mmget(victim_mm);

	/*
	 * Do not compete with the OOM-killer: if the mm is already an OOM
	 * victim or has been flagged MMF_OOM_SKIP, somebody else is (or was)
	 * reclaiming it. Otherwise flag it ourselves, still under task_lock,
	 * so that the OOM-killer and other pidfd_send_signal callers leave
	 * this task alone.
	 */
	claimed = unlikely(mm_is_oom_victim(victim_mm)) ||
		  unlikely(test_bit(MMF_OOM_SKIP, &victim_mm->flags));
	if (!claimed)
		set_bit(MMF_OOM_SKIP, &victim_mm->flags);

	task_unlock(victim);

	if (claimed)
		goto put_mm;	/* already being reclaimed, report success */

	mmap_read_lock(victim_mm);
	if (!__oom_reap_task_mm(victim_mm)) {
		/* Partial failure: undo the flag so the user can retry */
		clear_bit(MMF_OOM_SKIP, &victim_mm->flags);
		err = -EAGAIN;
	}
	mmap_read_unlock(victim_mm);

put_mm:
	mmput(victim_mm);
put_task:
	put_task_struct(victim);
	return err;
}

/**
* sys_pidfd_send_signal - Signal a process through a pidfd
* @pidfd: file descriptor of the process
@@ -3737,10 +3795,16 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
struct pid *pid;
kernel_siginfo_t kinfo;

/* Enforce flags be set to 0 until we add an extension. */
if (flags)
/* Enforce only valid flags. */
if (flags) {
/* Allow SYNC_REAP_MM only with SIGKILL. */
if (flags == SYNC_REAP_MM && sig == SIGKILL)
goto valid;

return -EINVAL;
}

valid:
f = fdget(pidfd);
if (!f.file)
return -EBADF;
@@ -3775,6 +3839,11 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
}

ret = kill_pid_info(sig, &kinfo, pid);
if (unlikely(ret))
goto err;

if (flags & SYNC_REAP_MM)
ret = reap_mm(pid);

err:
fdput(f);
@@ -808,7 +808,7 @@ static inline bool __task_will_free_mem(struct task_struct *task)
* Caller has to make sure that task->mm is stable (hold task_lock or
* it operates on the current).
*/
static bool task_will_free_mem(struct task_struct *task)
bool task_will_free_mem(struct task_struct *task)
{
struct mm_struct *mm = task->mm;
struct task_struct *p;

0 comments on commit f350a31

Please sign in to comment.