mm: memcg/slab: reparent memcg kmem_caches on cgroup removal
Let's reparent non-root kmem_caches on memcg offlining.  This allows us to
release the memory cgroup without waiting for the last outstanding kernel
object (e.g.  dentry used by another application).

Since the parent cgroup is already charged, all we need to do is splice the
list of kmem_caches onto the parent's kmem_caches list, swap the memcg
pointer, drop the css refcount for each kmem_cache and adjust the parent's
css refcount accordingly.
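
Condensed, the reparenting loop (an excerpt of the memcg_deactivate_kmem_caches()
hunk in mm/slab_common.c below, executed under slab_mutex) does:

	nr_reparented = 0;
	list_for_each_entry(s, &memcg->kmem_caches,
			    memcg_params.kmem_caches_node) {
		/* swap the memcg pointer; readers use READ_ONCE() + RCU */
		WRITE_ONCE(s->memcg_params.memcg, parent);
		/* drop the offlined child's css reference */
		css_put(&memcg->css);
		nr_reparented++;
	}
	if (nr_reparented) {
		/* move the caches and their references to the parent */
		list_splice_init(&memcg->kmem_caches, &parent->kmem_caches);
		css_get_many(&parent->css, nr_reparented);
	}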

Please note that kmem_cache->memcg_params.memcg isn't a stable pointer
anymore.  It's safe to read it under rcu_read_lock(), cgroup_mutex held,
or any other way that protects the memory cgroup from being released.
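
For example, the charge path pins a live memcg like this (excerpt from the
memcg_charge_slab() hunk in mm/slab.h below):

	rcu_read_lock();
	memcg = READ_ONCE(s->memcg_params.memcg);
	/* the cache may have been reparented to an already-offline
	 * ancestor; walk up until a live css can be pinned */
	while (memcg && !css_tryget_online(&memcg->css))
		memcg = parent_mem_cgroup(memcg);
	rcu_read_unlock();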

We can race with the slab allocation and deallocation paths.  It's not a
big problem: the parent's charge and global slab stats are always correct,
and we no longer care about the child's usage and stats.  The child cgroup
is already offline, so we don't use or show it anywhere.

Local slab stats (NR_SLAB_RECLAIMABLE and NR_SLAB_UNRECLAIMABLE) aren't
used anywhere except count_shadow_nodes().  But even there it won't break
anything: after reparenting, "nodes" will be 0 at the child level (because
we're already reparenting shrinker lists), and at the parent level page
stats have always been 0, which this patch doesn't change.

[guro@fb.com: properly handle kmem_caches reparented to root_mem_cgroup]
  Link: http://lkml.kernel.org/r/20190620213427.1691847-1-guro@fb.com
Link: http://lkml.kernel.org/r/20190611231813.3148843-11-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Waiman Long <longman@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Qian Cai <cai@lca.pw>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
rgushchin authored and torvalds committed Jul 12, 2019
1 parent 4d96ba3 commit fb2f2b0
Showing 4 changed files with 58 additions and 18 deletions.
2 changes: 1 addition & 1 deletion include/linux/slab.h
@@ -152,7 +152,7 @@ void kmem_cache_destroy(struct kmem_cache *);
int kmem_cache_shrink(struct kmem_cache *);

void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *);
void memcg_deactivate_kmem_caches(struct mem_cgroup *);
void memcg_deactivate_kmem_caches(struct mem_cgroup *, struct mem_cgroup *);

/*
* Please use this macro to create slab caches. Simply specify the
14 changes: 8 additions & 6 deletions mm/memcontrol.c
@@ -3284,15 +3284,15 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
*/
memcg->kmem_state = KMEM_ALLOCATED;

memcg_deactivate_kmem_caches(memcg);

kmemcg_id = memcg->kmemcg_id;
BUG_ON(kmemcg_id < 0);

parent = parent_mem_cgroup(memcg);
if (!parent)
parent = root_mem_cgroup;

memcg_deactivate_kmem_caches(memcg, parent);

kmemcg_id = memcg->kmemcg_id;
BUG_ON(kmemcg_id < 0);

/*
* Change kmemcg_id of this cgroup and all its descendants to the
* parent's id, and then move all entries from this cgroup's list_lrus
@@ -3325,7 +3325,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
if (memcg->kmem_state == KMEM_ALLOCATED) {
WARN_ON(!list_empty(&memcg->kmem_caches));
static_branch_dec(&memcg_kmem_enabled_key);
WARN_ON(page_counter_read(&memcg->kmem));
}
}
#else
@@ -4773,6 +4772,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)

/* The following stuff does not apply to the root */
if (!parent) {
#ifdef CONFIG_MEMCG_KMEM
INIT_LIST_HEAD(&memcg->kmem_caches);
#endif
root_mem_cgroup = memcg;
return &memcg->css;
}
41 changes: 32 additions & 9 deletions mm/slab.h
@@ -261,14 +261,17 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
* which do not have slab_cache pointer set.
* So this function assumes that the page can pass PageHead() and PageSlab()
* checks.
*
* The kmem_cache can be reparented asynchronously. The caller must ensure
* the memcg lifetime, e.g. by taking rcu_read_lock() or cgroup_mutex.
*/
static inline struct mem_cgroup *memcg_from_slab_page(struct page *page)
{
struct kmem_cache *s;

s = READ_ONCE(page->slab_cache);
if (s && !is_root_cache(s))
return s->memcg_params.memcg;
return READ_ONCE(s->memcg_params.memcg);

return NULL;
}
@@ -285,19 +288,32 @@ static __always_inline int memcg_charge_slab(struct page *page,
struct lruvec *lruvec;
int ret;

memcg = s->memcg_params.memcg;
rcu_read_lock();
memcg = READ_ONCE(s->memcg_params.memcg);
while (memcg && !css_tryget_online(&memcg->css))
memcg = parent_mem_cgroup(memcg);
rcu_read_unlock();

if (unlikely(!memcg || mem_cgroup_is_root(memcg))) {
mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
(1 << order));
percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);
return 0;
}

ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
if (ret)
return ret;
goto out;

lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
mod_lruvec_state(lruvec, cache_vmstat_idx(s), 1 << order);

/* transfer try_charge() page references to kmem_cache */
percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);
css_put_many(&memcg->css, 1 << order);

return 0;
out:
css_put(&memcg->css);
return ret;
}

/*
@@ -310,10 +326,17 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order,
struct mem_cgroup *memcg;
struct lruvec *lruvec;

memcg = s->memcg_params.memcg;
lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
mod_lruvec_state(lruvec, cache_vmstat_idx(s), -(1 << order));
memcg_kmem_uncharge_memcg(page, order, memcg);
rcu_read_lock();
memcg = READ_ONCE(s->memcg_params.memcg);
if (likely(!mem_cgroup_is_root(memcg))) {
lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
mod_lruvec_state(lruvec, cache_vmstat_idx(s), -(1 << order));
memcg_kmem_uncharge_memcg(page, order, memcg);
} else {
mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
-(1 << order));
}
rcu_read_unlock();

percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order);
}
19 changes: 17 additions & 2 deletions mm/slab_common.c
@@ -252,7 +252,8 @@ static void memcg_unlink_cache(struct kmem_cache *s)
} else {
list_del(&s->memcg_params.children_node);
list_del(&s->memcg_params.kmem_caches_node);
css_put(&s->memcg_params.memcg->css);
mem_cgroup_put(s->memcg_params.memcg);
WRITE_ONCE(s->memcg_params.memcg, NULL);
}
}
#else
@@ -785,11 +786,13 @@ static void kmemcg_cache_deactivate(struct kmem_cache *s)
spin_unlock_irq(&memcg_kmem_wq_lock);
}

void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg,
struct mem_cgroup *parent)
{
int idx;
struct memcg_cache_array *arr;
struct kmem_cache *s, *c;
unsigned int nr_reparented;

idx = memcg_cache_id(memcg);

@@ -807,6 +810,18 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
kmemcg_cache_deactivate(c);
arr->entries[idx] = NULL;
}
nr_reparented = 0;
list_for_each_entry(s, &memcg->kmem_caches,
memcg_params.kmem_caches_node) {
WRITE_ONCE(s->memcg_params.memcg, parent);
css_put(&memcg->css);
nr_reparented++;
}
if (nr_reparented) {
list_splice_init(&memcg->kmem_caches,
&parent->kmem_caches);
css_get_many(&parent->css, nr_reparented);
}
mutex_unlock(&slab_mutex);

put_online_mems();
