diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index e1c705fdfa7c53..e3e486d1b67825 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -486,6 +486,25 @@ struct cgroup {
 	/* Used to store internal freezer state */
 	struct cgroup_freezer_state freezer;
 
+	/*
+	 * cgroup pool related members. lock protects the kernfs nodes of the
+	 * cgroups in the pool. pool_index numbers the next cgroup put into the
+	 * pool. pool_amount counts the cgroups left in the pool. pool_size is
+	 * set by the user: once pool_amount drops below pool_size, the pool is
+	 * refilled until pool_amount reaches 2*pool_size, so that enough
+	 * cgroups are retained for cgroup_mkdir to take the fast path.
+	 * enable_pool is true while the fast path may use the pool,
+	 * hidden_place is the kernfs root the pooled cgroups are hidden under
+	 * and supply_pool_work is the delayed work that refills the pool.
+	 */
+	spinlock_t lock;
+	atomic64_t pool_index;
+	atomic64_t pool_amount;
+	u64 pool_size;
+	bool enable_pool;
+	struct kernfs_root *hidden_place;
+	struct delayed_work supply_pool_work;
+
 	/* ids of the ancestors at each level including self */
 	u64 ancestor_ids[];
 };
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 7bf60454a31361..ade614b7804058 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -432,6 +432,8 @@ static inline void cgroup_put(struct cgroup *cgrp)
 	css_put(&cgrp->self);
 }
 
+extern unsigned int cgroup_supply_delay_time;
+
 /**
  * task_css_set_check - obtain a task's css_set with extra access conditions
  * @task: the task to obtain css_set for
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 35b92032834477..8964c4d0741b5a 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -609,6 +609,184 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
 	return 0;
 }
 
+/*
+ * kernfs_open_file->mutex cannot prevent a race between concurrent writes to
+ * pool_size of a parent cgroup and of its child cgroup. Use cgroup_pool_mutex
+ * to serialize every write to pool_size.
+ */
+DEFINE_MUTEX(cgroup_pool_mutex);
+
+/* cftype read_u64 handler: report the pool size configured for the cgroup */
+static u64 cgroup_pool_size_read(struct cgroup_subsys_state *css,
+				 struct cftype *cft)
+{
+	/* kernfs r/w access is serialized by kernfs_open_file->mutex */
+	return css->cgroup->pool_size;
+}
+
+extern struct kernfs_node *kernfs_get_active(struct kernfs_node *kn);
+extern void kernfs_put_active(struct kernfs_node *kn);
+
+/*
+ * Remove every cgroup hidden under @cgrp's pool root, destroy the root and
+ * reset the pool counter. The caller must already have made sure that the
+ * pool is no longer consumed or refilled.
+ */
+static void cgroup_pool_destroy(struct cgroup *cgrp)
+{
+	struct rb_root *hidden_root = &cgrp->hidden_place->kn->dir.children;
+	struct kernfs_node *child, *n;
+
+	while (hidden_root->rb_node) {
+		rbtree_postorder_for_each_entry_safe(child, n, hidden_root, rb) {
+			kernfs_get_active(child);
+			cgroup_rmdir(child);
+			kernfs_put_active(child);
+		}
+	}
+	kernfs_destroy_root(cgrp->hidden_place);
+	cgrp->hidden_place = NULL;
+	atomic64_set(&cgrp->pool_amount, 0);
+}
+
+/*
+ * cftype write handler for pool_size.
+ *
+ * Writing a non-zero value to a cgroup without a pool creates 2 * value
+ * hidden cgroups and enables the pool; writing zero to a cgroup with a pool
+ * disables the pool and removes its cgroups. Any other transition fails
+ * with -EPERM.
+ *
+ * Returns @nbytes on success or a negative errno. If building the pool fails,
+ * the cgroups created so far are removed and pool_size is reset to zero.
+ */
+static ssize_t cgroup_pool_size_write(struct kernfs_open_file *of,
+				      char *buf, size_t nbytes, loff_t off)
+{
+	char name[NAME_MAX + 1];
+	struct kernfs_root *hidden;
+	struct cgroup *cgrp;
+	ssize_t ret = -EPERM;
+	u64 val, created, idx;
+	int err = 0;
+
+	cgrp = of->kn->parent->priv;
+
+	if (kstrtoull(buf, 0, &val))
+		return -EINVAL;
+
+	/* 2 * val cgroups are created and counted in a signed atomic64 */
+	if (val > S64_MAX / 2)
+		return -EINVAL;
+
+	if (!cgroup_tryget(cgrp))
+		return -ENODEV;
+
+	kernfs_break_active_protection(of->kn);
+	mutex_lock(&cgroup_pool_mutex);
+	mutex_lock(&cgroup_mutex);
+	spin_lock(&cgrp->lock);
+
+	/* only non-zero -> zero or zero -> non-zero settings are valid */
+	if ((cgrp->pool_size && val) || (!cgrp->pool_size && !val))
+		goto out_unlock;
+
+	if (cgroup_is_dead(cgrp)) {
+		ret = -ENODEV;
+		goto out_unlock;
+	}
+
+	cgrp->pool_size = val;
+	spin_unlock(&cgrp->lock);
+	mutex_unlock(&cgroup_mutex);
+
+	if (val) {
+		/* create kernfs root to hide cgroup which belongs to pool */
+		hidden = kernfs_create_root(NULL, 0, NULL);
+		if (IS_ERR(hidden)) {
+			ret = PTR_ERR(hidden);
+			goto out_reset;
+		}
+		cgrp->hidden_place = hidden;
+
+		/*
+		 * Cgroups in the pool are named pool-<index>. cgroup_mkdir
+		 * fails with -EEXIST if a cgroup with that name already
+		 * exists; retry with the next index in that case. Any other
+		 * error aborts the build instead of retrying forever.
+		 *
+		 * cgroup_mkdir is called here in the context of a write to
+		 * pool_size, so kernfs_get_active is needed to simulate the
+		 * kernfs mkdir context.
+		 *
+		 * Normally a mode of 0xffff is intercepted at the VFS layer
+		 * because it is invalid. It is used here to tell cgroup_mkdir
+		 * that the cgroup is created for the cgroup pool.
+		 */
+		for (created = 0; created < val * 2;) {
+			idx = atomic64_add_return(1, &cgrp->pool_index);
+			snprintf(name, sizeof(name), "pool-%llu", idx);
+			if (!kernfs_get_active(cgrp->kn)) {
+				err = -ENODEV;
+				break;
+			}
+			err = cgroup_mkdir(cgrp->kn, name, 0xffff);
+			kernfs_put_active(cgrp->kn);
+			if (!err)
+				created++;
+			else if (err != -EEXIST)
+				break;
+		}
+		if (created < val * 2) {
+			cgroup_pool_destroy(cgrp);
+			ret = err;
+			goto out_reset;
+		}
+		atomic64_set(&cgrp->pool_amount, val * 2);
+
+		/* set kernfs node pinned after generating pool */
+		mutex_lock(&cgroup_mutex);
+		spin_lock(&cgrp->lock);
+		cgrp->enable_pool = true;
+		kernfs_set_pinned(cgrp->kn, &cgrp->lock);
+		kernfs_set_pinned(cgrp->hidden_place->kn, &cgrp->lock);
+		spin_unlock(&cgrp->lock);
+		mutex_unlock(&cgroup_mutex);
+	} else {
+		/* clear kernfs node pinned before removing pool */
+		mutex_lock(&cgroup_mutex);
+		spin_lock(&cgrp->lock);
+		cgrp->enable_pool = false;
+		kernfs_clear_pinned(cgrp->kn);
+		kernfs_clear_pinned(cgrp->hidden_place->kn);
+		spin_unlock(&cgrp->lock);
+		mutex_unlock(&cgroup_mutex);
+
+		/* pool is disabled, cancel supply work */
+		cancel_delayed_work_sync(&cgrp->supply_pool_work);
+
+		/* traverse cgroup in pool and remove them */
+		cgroup_pool_destroy(cgrp);
+	}
+
+	ret = nbytes;
+	goto out_put;
+
+out_reset:
+	/* the pool was not built: take back the pool_size set above */
+	mutex_lock(&cgroup_mutex);
+	spin_lock(&cgrp->lock);
+	cgrp->pool_size = 0;
+out_unlock:
+	spin_unlock(&cgrp->lock);
+	mutex_unlock(&cgroup_mutex);
+out_put:
+	mutex_unlock(&cgroup_pool_mutex);
+	kernfs_unbreak_active_protection(of->kn);
+	cgroup_put(cgrp);
+	return ret;
+}
+
 /* cgroup core interface files for the legacy hierarchies */
 struct cftype cgroup1_base_files[] = {
 	{
@@ -651,6 +829,11 @@ struct cftype cgroup1_base_files[] = {
 		.write = cgroup_release_agent_write,
 		.max_write_len = PATH_MAX - 1,
 	},
+	{
+		.name = "pool_size",
+		.read_u64 = cgroup_pool_size_read,
+		.write = cgroup_pool_size_write,
+	},
 	{ }	/* terminate */
 };
 
@@ -845,9 +980,13 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent
 
 	mutex_lock(&cgroup_mutex);
 
+	if (kn->parent->pinned)
+		spin_lock(kn->parent->lock);
 	ret = kernfs_rename(kn, new_parent, new_name_str);
 	if (!ret)
 		TRACE_CGROUP_PATH(rename, cgrp);
+	if (kn->parent->pinned)
+		spin_unlock(kn->parent->lock);
 
 	mutex_unlock(&cgroup_mutex);
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 881ce1470bebad..d259e3bdca2af3 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -245,6 +245,7 @@ static void kill_css(struct cgroup_subsys_state *css);
 static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 			      struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add);
+static void cgroup_supply_work(struct work_struct *work);
 
 /**
  * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
@@ -1925,6 +1926,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->cset_links);
 	INIT_LIST_HEAD(&cgrp->pidlists);
 	mutex_init(&cgrp->pidlist_mutex);
+	spin_lock_init(&cgrp->lock);
 	cgrp->self.cgroup = cgrp;
 	cgrp->self.flags |= CSS_ONLINE;
 	cgrp->dom_cgrp = cgrp;
@@ -1938,6 +1940,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 
 	init_waitqueue_head(&cgrp->offline_waitq);
 	INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
+	INIT_DELAYED_WORK(&cgrp->supply_pool_work, cgroup_supply_work);
 }
 
 void init_cgroup_root(struct cgroup_fs_context *ctx)
@@ -5419,15 +5422,144 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
 	return ret;
 }
 
+extern struct kernfs_node *kernfs_get_active(struct kernfs_node *kn);
+extern void kernfs_put_active(struct kernfs_node *kn);
+
+unsigned int cgroup_supply_delay_time;
+
+/*
+ * Delayed work which supplies the pool of a cgroup until pool_amount reaches
+ * 2*pool_size. A name that already exists (-EEXIST) is skipped by moving on
+ * to the next index; any other failure ends this run instead of spinning.
+ */
+static void cgroup_supply_work(struct work_struct *work)
+{
+	char name[NAME_MAX + 1];
+	struct cgroup *parent = container_of(to_delayed_work(work),
+					     struct cgroup, supply_pool_work);
+	struct kernfs_node *parent_kn = parent->kn;
+	u64 idx;
+	int err;
+
+	while (atomic64_read(&parent->pool_amount) < 2 * parent->pool_size) {
+		idx = atomic64_add_return(1, &parent->pool_index);
+		snprintf(name, sizeof(name), "pool-%llu", idx);
+		/* hold an active reference on the parent across cgroup_mkdir */
+		if (!kernfs_get_active(parent_kn))
+			break;
+		err = cgroup_mkdir(parent_kn, name, 0xffff);
+		kernfs_put_active(parent_kn);
+		if (!err)
+			atomic64_add(1, &parent->pool_amount);
+		else if (err != -EEXIST)
+			break;
+	}
+}
+
+/*
+ * Try to serve a mkdir from the pool of the parent cgroup by renaming the
+ * first hidden cgroup to @name under @parent_kn.
+ *
+ * Returns 0 if a pooled cgroup was handed out, 1 if the pool is disabled or
+ * empty and the caller has to take the slow path, or a negative errno.
+ *
+ * NOTE(review): kernfs_rename() runs with the parent->lock spinlock held -
+ * confirm that it cannot sleep on this path.
+ */
+static int cgroup_mkdir_fast_path(struct kernfs_node *parent_kn, const char *name)
+{
+	struct cgroup *parent;
+	struct rb_root *hidden_root;
+	struct kernfs_node *pooled_kn;
+	int ret;
+
+	parent = parent_kn->priv;
+
+	if (!cgroup_tryget(parent))
+		return -ENODEV;
+
+	/*
+	 * acquire spinlock outside kernfs_rename because choosing kernfs node
+	 * and renaming need to be atomic.
+	 */
+	spin_lock(&parent->lock);
+
+	/* if pool is disabled or empty, return and take the slowpath */
+	if (!parent->enable_pool) {
+		ret = 1;
+		goto out_unlock;
+	}
+
+	hidden_root = &parent->hidden_place->kn->dir.children;
+	if (!hidden_root->rb_node) {
+		ret = 1;
+		goto out_unlock;
+	}
+
+	pooled_kn = rb_entry(rb_first(hidden_root), struct kernfs_node, rb);
+	ret = kernfs_rename(pooled_kn, parent_kn, name);
+	if (ret)
+		goto out_unlock;
+
+	/* supply pool if pool_amount is less than pool_size */
+	if (atomic64_sub_return(1, &parent->pool_amount) < parent->pool_size)
+		schedule_delayed_work(&parent->supply_pool_work,
+				      msecs_to_jiffies(cgroup_supply_delay_time));
+
+out_unlock:
+	spin_unlock(&parent->lock);
+	cgroup_put(parent);
+	return ret;
+}
+
+/*
+ * Hide a cgroup which belongs to the pool by renaming its kernfs node to
+ * @name under the hidden root of @parent. A failing rename is fatal (BUG_ON).
+ */
+static void cgroup_hide(struct cgroup *parent, struct cgroup *cgrp, const char *name)
+{
+	/* read enable_pool once so that lock and unlock always pair up */
+	bool pool_enabled = parent->enable_pool;
+	int ret;
+
+	/*
+	 * if cgroup_hide is called by cgroup_supply_work, pool is enabled,
+	 * it needs to acquire spinlock to protect kernfs_rename
+	 */
+	if (pool_enabled)
+		spin_lock(&parent->lock);
+	kernfs_get_active(parent->hidden_place->kn);
+	ret = kernfs_rename(cgrp->kn, parent->hidden_place->kn, name);
+	BUG_ON(ret);
+	kernfs_put_active(parent->hidden_place->kn);
+	if (pool_enabled) {
+		spin_unlock(&parent->lock);
+		kernfs_set_pinned(cgrp->kn, &parent->lock);
+	}
+}
+
 int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
 {
 	struct cgroup *parent, *cgrp;
-	int ret;
+	int ret = 1;
+	bool hide = false;
 
 	/* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
 	if (strchr(name, '\n'))
 		return -EINVAL;
 
+	/* 0xffff means cgroup is created for pool, set to default mode 0755 */
+	if (mode == 0xffff) {
+		hide = true;
+		mode = 0755;
+	}
+
+	/* try the pool first; a positive return selects the slow path */
+	if (!hide)
+		ret = cgroup_mkdir_fast_path(parent_kn, name);
+	if (ret <= 0)
+		return ret;
+
 	parent = cgroup_kn_lock_live(parent_kn, false);
 	if (!parent)
 		return -ENODEV;
@@ -5466,6 +5598,10 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
 	/* let's create and online css's */
 	kernfs_activate(cgrp->kn);
 
+	/* a cgroup created for the pool is moved out of sight right away */
+	if (hide)
+		cgroup_hide(parent, cgrp, name);
+
 	ret = 0;
 	goto out_unlock;
 
@@ -5658,10 +5794,17 @@ int cgroup_rmdir(struct kernfs_node *kn)
 	if (!cgrp)
 		return 0;
 
+	/* pool cgroups may still be created under a cgroup with a pool */
+	if (cgrp->pool_size) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
 	ret = cgroup_destroy_locked(cgrp);
 	if (!ret)
 		TRACE_CGROUP_PATH(rmdir, cgrp);
 
+out_unlock:
 	cgroup_kn_unlock(kn);
 	return ret;
 }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 083be6af29d705..9406111a30cb01 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -73,6 +73,7 @@
 #include <linux/coredump.h>
 #include <linux/latencytop.h>
 #include <linux/pid.h>
+#include <linux/cgroup.h>
 
 #include "../lib/kstrtox.h"
 
@@ -2718,6 +2719,13 @@ static struct ctl_table kern_table[] = {
 		.extra2		= SYSCTL_ONE,
 	},
 #endif
+	{
+		.procname	= "cgroup_supply_delay_time",
+		.data		= &cgroup_supply_delay_time,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec,
+	},
 	{ }
 };
 