Skip to content

Commit

Permalink
High kmalloc-32 slab cache consumption with 10k containers
Browse files Browse the repository at this point in the history
Hi,

When running 10000 (more-or-less-empty-)containers on a bare-metal Power9
server(160 CPUs, 2 NUMA nodes, 256G memory), it is seen that memory
consumption increases quite a lot (around 172G) when the containers are
running. Most of it comes from slab (149G) and within slab, the majority of
it comes from kmalloc-32 cache (102G)

The major allocator of kmalloc-32 slab cache happens to be the list_head
allocations of list_lru_one list. These lists are created whenever a
FS mount happens. Specially two such lists are registered by alloc_super(),
one for dentry and another for inode shrinker list. And these lists
are created for all possible NUMA nodes and for all given memcgs
(memcg_nr_cache_ids to be particular)

If,

A = Nr allocation request per mount: 2 (one for dentry and inode list)
B = Nr NUMA possible nodes
C = memcg_nr_cache_ids
D = size of each kmalloc-32 object: 32 bytes,

then for every mount, the amount of memory consumed by kmalloc-32 slab
cache for list_lru creation is A*B*C*D bytes.

Following factors contribute to the excessive allocations:

- Lists are created for possible NUMA nodes.
- memcg_nr_cache_ids grows in bulk (see memcg_alloc_cache_id() and additional
  list_lrus are created when it grows. Thus we end up creating list_lru_one
  list_heads even for those memcgs which are yet to be created.
  For example, when 10000 memcgs are created, memcg_nr_cache_ids reach
  a value of 12286.
- When a memcg goes offline, the list elements are drained to the parent
  memcg, but the list_head entry remains.
- The lists are destroyed only when the FS is unmounted. So list_heads
  for non-existing memcgs remain and continue to contribute to the
  kmalloc-32 allocation. This is presumably done for performance
  reason as they get reused when new memcgs are created, but they end up
  consuming slab memory until then.
- In case of containers, a few file systems get mounted and are specific
  to the container namespace and hence to a particular memcg, but we
  end up creating lists for all the memcgs.
  As an example, if 7 FS mounts are done for every container and when
  10k containers are created, we end up creating 2*7*12286 list_lru_one
  lists for each NUMA node. It appears that no elements will get added
  to other than 2*7=14 of them in the case of containers.

One straight forward way to prevent this excessive list_lru_one
allocations is to limit the list_lru_one creation only to the
relevant memcg. However I don't see an easy way to figure out
that relevant memcg from FS mount path (alloc_super())

As an alternative approach, I have this below hack that does lazy
list_lru creation. The memcg-specific list is created and initialized
only when there is a request to add an element to that particular
list. Though I am not sure about the full impact of this change
on the owners of the lists and also the performance impact of this,
the overall savings look good.

Used memory
		Before		During		After
W/o patch	23G		172G		40G
W/  patch	23G		69G		29G

Slab consumption
		Before		During		After
W/o patch	1.5G		149G		22G
W/  patch	1.5G		45G		10G

Number of kmalloc-32 allocations
		Before		During		After
W/o patch	178176		3442409472	388933632
W/  patch	190464		468992		468992

Any thoughts on other approaches to address this scenario and
any specific comments about the approach that I have taken is
appreciated. Meanwhile the patch looks like below:

From 9444a0c6734c2853057b1f486f85da2c409fdc84 Mon Sep 17 00:00:00 2001
From: Bharata B Rao <bharata@linux.ibm.com>
Date: Wed, 31 Mar 2021 18:21:45 +0530
Subject: [PATCH 1/1] mm: list_lru: Allocate list_lru_one only when required.

Don't pre-allocate list_lru_one list heads for all memcg_cache_ids.
Instead allocate and initialize it only when required.

Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
  • Loading branch information
Bharata B Rao authored and intel-lab-lkp committed Apr 5, 2021
1 parent 5e46d1b commit b7ba352
Showing 1 changed file with 38 additions and 41 deletions.
79 changes: 38 additions & 41 deletions mm/list_lru.c
Expand Up @@ -112,16 +112,32 @@ list_lru_from_kmem(struct list_lru_node *nlru, void *ptr,
}
#endif /* CONFIG_MEMCG_KMEM */

static void init_one_lru(struct list_lru_one *l)
{
INIT_LIST_HEAD(&l->list);
l->nr_items = 0;
}

bool list_lru_add(struct list_lru *lru, struct list_head *item)
{
int nid = page_to_nid(virt_to_page(item));
struct list_lru_node *nlru = &lru->node[nid];
struct mem_cgroup *memcg;
struct list_lru_one *l;
struct list_lru_memcg *memcg_lrus;

spin_lock(&nlru->lock);
if (list_empty(item)) {
l = list_lru_from_kmem(nlru, item, &memcg);
if (!l) {
l = kmalloc(sizeof(struct list_lru_one), GFP_ATOMIC);
if (!l)
goto out;

init_one_lru(l);
memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true);
memcg_lrus->lru[memcg_cache_id(memcg)] = l;
}
list_add_tail(item, &l->list);
/* Set shrinker bit if the first element was added */
if (!l->nr_items++)
Expand All @@ -131,6 +147,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item)
spin_unlock(&nlru->lock);
return true;
}
out:
spin_unlock(&nlru->lock);
return false;
}
Expand Down Expand Up @@ -176,11 +193,12 @@ unsigned long list_lru_count_one(struct list_lru *lru,
{
struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
unsigned long count;
unsigned long count = 0;

rcu_read_lock();
l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
count = READ_ONCE(l->nr_items);
if (l)
count = READ_ONCE(l->nr_items);
rcu_read_unlock();

return count;
Expand All @@ -207,6 +225,9 @@ __list_lru_walk_one(struct list_lru_node *nlru, int memcg_idx,
unsigned long isolated = 0;

l = list_lru_from_memcg_idx(nlru, memcg_idx);
if (!l)
goto out;

restart:
list_for_each_safe(item, n, &l->list) {
enum lru_status ret;
Expand Down Expand Up @@ -251,6 +272,7 @@ __list_lru_walk_one(struct list_lru_node *nlru, int memcg_idx,
BUG();
}
}
out:
return isolated;
}

Expand Down Expand Up @@ -312,12 +334,6 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
}
EXPORT_SYMBOL_GPL(list_lru_walk_node);

static void init_one_lru(struct list_lru_one *l)
{
INIT_LIST_HEAD(&l->list);
l->nr_items = 0;
}

#ifdef CONFIG_MEMCG_KMEM
static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus,
int begin, int end)
Expand All @@ -328,41 +344,16 @@ static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus,
kfree(memcg_lrus->lru[i]);
}

static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus,
int begin, int end)
{
int i;

for (i = begin; i < end; i++) {
struct list_lru_one *l;

l = kmalloc(sizeof(struct list_lru_one), GFP_KERNEL);
if (!l)
goto fail;

init_one_lru(l);
memcg_lrus->lru[i] = l;
}
return 0;
fail:
__memcg_destroy_list_lru_node(memcg_lrus, begin, i);
return -ENOMEM;
}

static int memcg_init_list_lru_node(struct list_lru_node *nlru)
{
struct list_lru_memcg *memcg_lrus;
int size = memcg_nr_cache_ids;

memcg_lrus = kvmalloc(sizeof(*memcg_lrus) +
memcg_lrus = kvzalloc(sizeof(*memcg_lrus) +
size * sizeof(void *), GFP_KERNEL);
if (!memcg_lrus)
return -ENOMEM;

if (__memcg_init_list_lru_node(memcg_lrus, 0, size)) {
kvfree(memcg_lrus);
return -ENOMEM;
}
RCU_INIT_POINTER(nlru->memcg_lrus, memcg_lrus);

return 0;
Expand All @@ -389,15 +380,10 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,

old = rcu_dereference_protected(nlru->memcg_lrus,
lockdep_is_held(&list_lrus_mutex));
new = kvmalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL);
new = kvzalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL);
if (!new)
return -ENOMEM;

if (__memcg_init_list_lru_node(new, old_size, new_size)) {
kvfree(new);
return -ENOMEM;
}

memcpy(&new->lru, &old->lru, old_size * sizeof(void *));

/*
Expand Down Expand Up @@ -526,6 +512,7 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid,
struct list_lru_node *nlru = &lru->node[nid];
int dst_idx = dst_memcg->kmemcg_id;
struct list_lru_one *src, *dst;
struct list_lru_memcg *memcg_lrus;

/*
* Since list_lru_{add,del} may be called under an IRQ-safe lock,
Expand All @@ -534,7 +521,17 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid,
spin_lock_irq(&nlru->lock);

src = list_lru_from_memcg_idx(nlru, src_idx);
if (!src)
goto out;

dst = list_lru_from_memcg_idx(nlru, dst_idx);
if (!dst) {
/* TODO: Use __GFP_NOFAIL? */
dst = kmalloc(sizeof(struct list_lru_one), GFP_ATOMIC);
init_one_lru(dst);
memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true);
memcg_lrus->lru[dst_idx] = dst;
}

list_splice_init(&src->list, &dst->list);

Expand All @@ -543,7 +540,7 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid,
memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
src->nr_items = 0;
}

out:
spin_unlock_irq(&nlru->lock);
}

Expand Down

0 comments on commit b7ba352

Please sign in to comment.