Skip to content

Commit

Permalink
support cgroup pool in v1
Browse files Browse the repository at this point in the history
Add pool_size interface and delay_time interface. When the user writes
pool_size, a cgroup pool will be created, and then when the user needs
to create a cgroup, it will take the fast path protected by spinlock to
obtain it from the resource pool. Performance is improved by the
following aspects:
	1.reduce the critical area for creating cgroups
	2.reduce the scheduling time of sleep
	3.avoid competition with other cgroup behaviors which protected
	  by cgroup_mutex

The essence of obtaining resources from the pool is kernfs rename. With
the help of the previous pinned kernfs node function, when the pool is
enabled, these cgroups will be in the pinned state, and the protection
of the kernfs data structure will be protected by the specified
spinlock, thus getting rid of the cgroup_mutex and kernfs_rwsem.

In order to avoid random operations by users, the kernfs nodes of the
cgroups in the pool will be placed under a hidden kernfs tree, and users
can not directly touch them. When a user creates a cgroup, it will take
the fast path, select a node from the hidden tree, and move it to the
correct position.

As users continue to obtain resources from the pool, the number of
cgroups in the pool will gradually decrease. When the number is less
than a certain value, it will be supplemented. In order to avoid
competition with the currently created cgroup, you can delay this by
setting delay_time process

Suggested-by: Shanpei Chen <shanpeic@linux.alibaba.com>
Signed-off-by: Yi Tao <escape@linux.alibaba.com>
  • Loading branch information
Yi Tao authored and intel-lab-lkp committed Sep 8, 2021
1 parent fa7929f commit 6fadbb1
Show file tree
Hide file tree
Showing 5 changed files with 277 additions and 1 deletion.
16 changes: 16 additions & 0 deletions include/linux/cgroup-defs.h
Expand Up @@ -486,6 +486,22 @@ struct cgroup {
/* Used to store internal freezer state */
struct cgroup_freezer_state freezer;

/*
* cgroup pool related members. lock protects cgroup's kernfs node in
* pool. pool_index records index of cgroup which put into pool next.
* pool_amount records how many cgroups pool remains. pool_size is set
* by user, supply pool util pool_amount reach 2*pool_size if
* pool_amount is less than pool_size to retain enough cgroup in pool to
* guarantee cgroup_mkdir take the fast path.
*/
spinlock_t lock;
atomic64_t pool_index;
atomic64_t pool_amount;
u64 pool_size;
bool enable_pool;
struct kernfs_root *hidden_place;
struct delayed_work supply_pool_work;

/* ids of the ancestors at each level including self */
u64 ancestor_ids[];
};
Expand Down
2 changes: 2 additions & 0 deletions include/linux/cgroup.h
Expand Up @@ -432,6 +432,8 @@ static inline void cgroup_put(struct cgroup *cgrp)
css_put(&cgrp->self);
}

extern unsigned int cgroup_supply_delay_time;

/**
* task_css_set_check - obtain a task's css_set with extra access conditions
* @task: the task to obtain css_set for
Expand Down
139 changes: 139 additions & 0 deletions kernel/cgroup/cgroup-v1.c
Expand Up @@ -609,6 +609,136 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
return 0;
}

/*
* kernfs_open_file->mutex can't avoid competition if writing to pool_size
* of parent cgroup and child cgroup at the same time. use cgroup_pool_mutex
* to serialize any write operation to pool_size.
*/
DEFINE_MUTEX(cgroup_pool_mutex);

static u64 cgroup_pool_size_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
/* kernfs r/w access is serialized by kernfs_open_file->mutex */
return css->cgroup->pool_size;
}

extern struct kernfs_node *kernfs_get_active(struct kernfs_node *kn);
extern void kernfs_put_active(struct kernfs_node *kn);

static ssize_t cgroup_pool_size_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
char name[NAME_MAX + 1];
struct cgroup *cgrp;
ssize_t ret = -EPERM;
u64 val;
int i;

cgrp = of->kn->parent->priv;

if (kstrtoull(buf, 0, &val))
return -EINVAL;

if (!cgroup_tryget(cgrp))
return -ENODEV;

kernfs_break_active_protection(of->kn);
mutex_lock(&cgroup_pool_mutex);
mutex_lock(&cgroup_mutex);
spin_lock(&cgrp->lock);

/*
* only non-zero -> zero or zero -> non-zero settings are invalid.
*/
if ((cgrp->pool_size && val) || (!cgrp->pool_size && !val))
goto out_fail;

if (cgroup_is_dead(cgrp)) {
ret = -ENODEV;
goto out_fail;
}

cgrp->pool_size = val;
spin_unlock(&cgrp->lock);
mutex_unlock(&cgroup_mutex);

if (val) {
/* create kernfs root to hide cgroup which belongs to pool */
cgrp->hidden_place = kernfs_create_root(NULL, 0, NULL);

/*
* names of cgroups in pool obey the rule of pool-*, it may
* fail if cgroup has the same name already exists, if failed,
* try again with different name.
*
* cgroup_mkdir called here is under context of writing
* pool_size, so we need to call kernfs_get_active to simulate
* kernfs mkdir context.
*
* normally, the mode of 0xffff is intercepted at the VFS layer
* because it is invalid. use 0xffff to tell cgroup_mkdir it is
* create a cgroup for cgroup pool.
*/
for (i = 0; i < val * 2;) {
sprintf(name, "pool-%llu", atomic64_add_return(1, &cgrp->pool_index));
kernfs_get_active(cgrp->kn);
if (!cgroup_mkdir(cgrp->kn, name, 0xffff))
i++;
kernfs_put_active(cgrp->kn);
}
atomic64_set(&cgrp->pool_amount, val * 2);

/* set kernfs node pinned after generating pool */
mutex_lock(&cgroup_mutex);
spin_lock(&cgrp->lock);
cgrp->enable_pool = true;
kernfs_set_pinned(cgrp->kn, &cgrp->lock);
kernfs_set_pinned(cgrp->hidden_place->kn, &cgrp->lock);
spin_unlock(&cgrp->lock);
mutex_unlock(&cgroup_mutex);
} else {
struct kernfs_node *child, *n;
struct rb_root *hidden_root = &cgrp->hidden_place->kn->dir.children;

/* clear kernfs node pinned before removing pool */
mutex_lock(&cgroup_mutex);
spin_lock(&cgrp->lock);
cgrp->enable_pool = false;
kernfs_clear_pinned(cgrp->kn);
kernfs_clear_pinned(cgrp->hidden_place->kn);
spin_unlock(&cgrp->lock);
mutex_unlock(&cgroup_mutex);

/* pool is disabled, cancel supply work */
cancel_delayed_work_sync(&cgrp->supply_pool_work);

/* traverse cgroup in pool and remove them */
while (hidden_root->rb_node) {
rbtree_postorder_for_each_entry_safe(child, n, hidden_root, rb) {
kernfs_get_active(child);
ret = cgroup_rmdir(child);
kernfs_put_active(child);
}
}
kernfs_destroy_root(cgrp->hidden_place);
atomic64_set(&cgrp->pool_amount, 0);
}


ret = nbytes;
goto out_success;

out_fail:
spin_unlock(&cgrp->lock);
mutex_unlock(&cgroup_mutex);
out_success:
mutex_unlock(&cgroup_pool_mutex);
kernfs_unbreak_active_protection(of->kn);
cgroup_put(cgrp);
return ret;
}

/* cgroup core interface files for the legacy hierarchies */
struct cftype cgroup1_base_files[] = {
{
Expand Down Expand Up @@ -651,6 +781,11 @@ struct cftype cgroup1_base_files[] = {
.write = cgroup_release_agent_write,
.max_write_len = PATH_MAX - 1,
},
{
.name = "pool_size",
.read_u64 = cgroup_pool_size_read,
.write = cgroup_pool_size_write,
},
{ } /* terminate */
};

Expand Down Expand Up @@ -845,9 +980,13 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent

mutex_lock(&cgroup_mutex);

if (kn->parent->pinned)
spin_lock(kn->parent->lock);
ret = kernfs_rename(kn, new_parent, new_name_str);
if (!ret)
TRACE_CGROUP_PATH(rename, cgrp);
if (kn->parent->pinned)
spin_unlock(kn->parent->lock);

mutex_unlock(&cgroup_mutex);

Expand Down
113 changes: 112 additions & 1 deletion kernel/cgroup/cgroup.c
Expand Up @@ -245,6 +245,7 @@ static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
static void cgroup_supply_work(struct work_struct *work);

/**
* cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
Expand Down Expand Up @@ -1925,6 +1926,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->cset_links);
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
spin_lock_init(&cgrp->lock);
cgrp->self.cgroup = cgrp;
cgrp->self.flags |= CSS_ONLINE;
cgrp->dom_cgrp = cgrp;
Expand All @@ -1938,6 +1940,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)

init_waitqueue_head(&cgrp->offline_waitq);
INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
INIT_DELAYED_WORK(&cgrp->supply_pool_work, cgroup_supply_work);
}

void init_cgroup_root(struct cgroup_fs_context *ctx)
Expand Down Expand Up @@ -5419,15 +5422,113 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
return ret;
}

extern struct kernfs_node *kernfs_get_active(struct kernfs_node *kn);
extern void kernfs_put_active(struct kernfs_node *kn);

unsigned int cgroup_supply_delay_time;

/* supply pool_amount to 2*pool_size */
static void cgroup_supply_work(struct work_struct *work)
{
char name[NAME_MAX + 1];
struct cgroup *parent = container_of((struct delayed_work *)work,
struct cgroup, supply_pool_work);
struct kernfs_node *parent_kn = parent->kn;

while (atomic64_read(&parent->pool_amount) < 2 * parent->pool_size) {
sprintf(name, "pool-%llu", atomic64_add_return(1, &parent->pool_index));
kernfs_get_active(parent_kn);
if (!cgroup_mkdir(parent_kn, name, 0xffff))
atomic64_add(1, &parent->pool_amount);
kernfs_put_active(parent_kn);
}
}

static int cgroup_mkdir_fast_path(struct kernfs_node *parent_kn, const char *name)
{
struct cgroup *parent;
struct rb_root *hidden_root;
int ret;

parent = parent_kn->priv;

if (!cgroup_tryget(parent))
return -ENODEV;

/*
* acquire spinlock outside kernfs_rename because choosing kernfs node
* and renaming need to be atomic.
*/
spin_lock(&parent->lock);

/* if pool is disabled or empty, return and take the slowpath */
if (!parent->enable_pool) {
ret = 1;
goto out_unlock;
}

hidden_root = &parent->hidden_place->kn->dir.children;
if (!hidden_root->rb_node) {
ret = 1;
goto out_unlock;
}

#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
ret = kernfs_rename(rb_to_kn(rb_first(hidden_root)), parent_kn, name);
if (ret)
goto out_unlock;

/* supply pool if pool_amount is less than pool_size */
if (atomic64_sub_return(1, &parent->pool_amount) < parent->pool_size)
schedule_delayed_work(&parent->supply_pool_work,
msecs_to_jiffies(cgroup_supply_delay_time));

out_unlock:
spin_unlock(&parent->lock);
cgroup_put(parent);
return ret;
}

/* hide cgroup which belongs to pool */
static void cgroup_hide(struct cgroup *parent, struct cgroup *cgrp, const char *name)
{
/*
* if cgroup_hide is called by cgroup_supply_work, pool is enabled,
* it needs to acquire spinlock to protect kernfs_rename
*/
if (parent->enable_pool)
spin_lock(&parent->lock);
kernfs_get_active(parent->hidden_place->kn);
BUG_ON(kernfs_rename(cgrp->kn, parent->hidden_place->kn, name));
kernfs_put_active(parent->hidden_place->kn);
if (parent->enable_pool) {
spin_unlock(&parent->lock);
kernfs_set_pinned(cgrp->kn, &parent->lock);
}
}

int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
{
struct cgroup *parent, *cgrp;
int ret;
struct kernfs_node *kn;
int ret = 1;
bool hide = false;

/* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
if (strchr(name, '\n'))
return -EINVAL;

/* 0xffff means cgroup is created for pool, set to default mode 0x1ed */
if (mode == 0xffff) {
hide = true;
mode = 0x1ed;
}

if (!hide)
ret = cgroup_mkdir_fast_path(parent_kn, name);
if (ret <= 0)
return ret;

parent = cgroup_kn_lock_live(parent_kn, false);
if (!parent)
return -ENODEV;
Expand Down Expand Up @@ -5466,6 +5567,9 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
/* let's create and online css's */
kernfs_activate(cgrp->kn);

if (hide)
cgroup_hide(parent, cgrp, name);

ret = 0;
goto out_unlock;

Expand Down Expand Up @@ -5658,10 +5762,17 @@ int cgroup_rmdir(struct kernfs_node *kn)
if (!cgrp)
return 0;

/* it may creating cgroup in pool */
if (cgrp->pool_size) {
ret = -EBUSY;
goto out_unlock;
}

ret = cgroup_destroy_locked(cgrp);
if (!ret)
TRACE_CGROUP_PATH(rmdir, cgrp);

out_unlock:
cgroup_kn_unlock(kn);
return ret;
}
Expand Down
8 changes: 8 additions & 0 deletions kernel/sysctl.c
Expand Up @@ -73,6 +73,7 @@
#include <linux/latencytop.h>
#include <linux/pid.h>
#include <linux/delayacct.h>
#include <linux/cgroup.h>

#include "../lib/kstrtox.h"

Expand Down Expand Up @@ -2718,6 +2719,13 @@ static struct ctl_table kern_table[] = {
.extra2 = SYSCTL_ONE,
},
#endif
{
.procname = "cgroup_supply_delay_time",
.data = &cgroup_supply_delay_time,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{ }
};

Expand Down

0 comments on commit 6fadbb1

Please sign in to comment.