Skip to content
Permalink
Browse files
mm: memcontrol: Add the missing numa_stat interface for cgroup v2
In the cgroup v1, we have a numa_stat interface. This is useful for
providing visibility into the numa locality information within a
memcg since the pages are allowed to be allocated from any physical
node. One of the use cases is evaluating application performance by
combining this information with the application's CPU allocation.
But the cgroup v2 does not. So this patch adds the missing information.

Suggested-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
  • Loading branch information
Muchun Song authored and intel-lab-lkp committed Sep 12, 2020
1 parent 2932a9e commit 1fadd691a289cdae3e267de7cb5e9444f8b7705d
Show file tree
Hide file tree
Showing 2 changed files with 156 additions and 0 deletions.
@@ -1368,6 +1368,78 @@ PAGE_SIZE multiple when read back.
collapsing an existing range of pages. This counter is not
present when CONFIG_TRANSPARENT_HUGEPAGE is not set.

memory.numa_stat
A read-only flat-keyed file which exists on non-root cgroups.

This breaks down the cgroup's memory footprint into different
types of memory, type-specific details, and other information
per node on the state of the memory management system.

This is useful for providing visibility into the numa locality
information within a memcg since the pages are allowed to be
allocated from any physical node. One of the use cases is evaluating
application performance by combining this information with the
application's CPU allocation.

All memory amounts are in bytes.

The output format of memory.numa_stat is::

type N0=<bytes in node 0> N1=<bytes in node 1> ...

The entries are ordered to be human readable, and new entries
can show up in the middle. Don't rely on items remaining in a
fixed position; use the keys to look up specific values!

anon
Amount of memory per node used in anonymous mappings such
as brk(), sbrk(), and mmap(MAP_ANONYMOUS)

file
Amount of memory per node used to cache filesystem data,
including tmpfs and shared memory.

kernel_stack
Amount of memory per node allocated to kernel stacks.

shmem
Amount of cached filesystem data per node that is swap-backed,
such as tmpfs, shm segments, shared anonymous mmap()s

file_mapped
Amount of cached filesystem data per node mapped with mmap()

file_dirty
Amount of cached filesystem data per node that was modified but
not yet written back to disk

file_writeback
Amount of cached filesystem data per node that was modified and
is currently being written back to disk

anon_thp
Amount of memory per node used in anonymous mappings backed by
transparent hugepages

inactive_anon, active_anon, inactive_file, active_file, unevictable
Amount of memory, swap-backed and filesystem-backed,
per node on the internal memory management lists used
by the page reclaim algorithm.

As these represent internal list state (eg. shmem pages are on anon
memory management lists), inactive_foo + active_foo may not be equal to
the value for the foo counter, since the foo counter is type-based, not
list-based.

slab_reclaimable
Amount of memory per node used for storing in-kernel data
structures which might be reclaimed, such as dentries and
inodes.

slab_unreclaimable
Amount of memory per node used for storing in-kernel data
structures which cannot be reclaimed on memory pressure.

memory.swap.current
A read-only single value file which exists on non-root
cgroups.
@@ -6393,6 +6393,84 @@ static int memory_stat_show(struct seq_file *m, void *v)
return 0;
}

#ifdef CONFIG_NUMA
/*
 * Read one per-node vmstat counter for @memcg.
 *
 * @nid must be a valid node id (checked with VM_BUG_ON); the value is
 * looked up through the memcg's per-node lruvec.
 */
static unsigned long memcg_node_page_state(struct mem_cgroup *memcg,
					   unsigned int nid,
					   enum node_stat_item idx)
{
	struct lruvec *lruvec;

	VM_BUG_ON(nid >= nr_node_ids);
	lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
	return lruvec_page_state(lruvec, idx);
}

/*
 * Format the memory.numa_stat output for @memcg into a kmalloc'ed,
 * NUL-terminated buffer.
 *
 * One line per stat, one " N<nid>=<bytes>" column per online memory
 * node. Each counter is scaled by @ratio to convert its native unit
 * (pages, KiB, or bytes) into bytes, matching the documented format.
 *
 * Returns the buffer on success (caller must kfree() it) or NULL if
 * allocation fails.
 */
static const char *memory_numa_stat_format(struct mem_cgroup *memcg)
{
	struct numa_stat {
		const char *name;
		unsigned int ratio;	/* multiplier converting the counter to bytes */
		enum node_stat_item idx;
	};

	static const struct numa_stat stats[] = {
		/* Fixed "anno" -> "anon": must match the documented key. */
		{ "anon", PAGE_SIZE, NR_ANON_MAPPED },
		{ "file", PAGE_SIZE, NR_FILE_PAGES },
		/* NR_KERNEL_STACK_KB is accounted in KiB, not pages. */
		{ "kernel_stack", 1024, NR_KERNEL_STACK_KB },
		{ "shmem", PAGE_SIZE, NR_SHMEM },
		{ "file_mapped", PAGE_SIZE, NR_FILE_MAPPED },
		{ "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
		{ "file_writeback", PAGE_SIZE, NR_WRITEBACK },
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
		/* NR_ANON_THPS counts huge pages, so scale by the THP size. */
		{ "anon_thp", HPAGE_PMD_SIZE, NR_ANON_THPS },
#endif
		{ "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
		{ "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
		{ "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE },
		{ "active_file", PAGE_SIZE, NR_ACTIVE_FILE },
		{ "unevictable", PAGE_SIZE, NR_UNEVICTABLE },
		/* Slab counters are already maintained in bytes. */
		{ "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B },
		{ "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B },
	};

	int i, nid;
	struct seq_buf s;

	/* Reserve a byte for the trailing null */
	seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE - 1);
	if (!s.buffer)
		return NULL;

	for (i = 0; i < ARRAY_SIZE(stats); i++) {
		seq_buf_printf(&s, "%s", stats[i].name);
		for_each_node_state(nid, N_MEMORY) {
			u64 size;

			size = memcg_node_page_state(memcg, nid, stats[i].idx);
			size *= stats[i].ratio;
			seq_buf_printf(&s, " N%d=%llu", nid, size);
		}
		seq_buf_putc(&s, '\n');
	}

	/* The above should easily fit into one page */
	if (WARN_ON_ONCE(seq_buf_putc(&s, '\0')))
		s.buffer[PAGE_SIZE - 1] = '\0';

	return s.buffer;
}

/*
 * seq_file show handler for memory.numa_stat: format the per-node
 * stats into a temporary buffer, emit it, and free it.
 */
static int memory_numa_stat_show(struct seq_file *m, void *v)
{
	const char *buf;

	buf = memory_numa_stat_format(mem_cgroup_from_seq(m));
	if (!buf)
		return -ENOMEM;

	seq_puts(m, buf);
	kfree(buf);

	return 0;
}
#endif

static int memory_oom_group_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
@@ -6470,6 +6548,12 @@ static struct cftype memory_files[] = {
.name = "stat",
.seq_show = memory_stat_show,
},
#ifdef CONFIG_NUMA
{
.name = "numa_stat",
.seq_show = memory_numa_stat_show,
},
#endif
{
.name = "oom.group",
.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,

0 comments on commit 1fadd69

Please sign in to comment.