Skip to content
Permalink
Browse files
sched/fair: Update nohz.next_balance for newly NOHZ-idle CPUs
Consider a system with some NOHZ-idle CPUs, such that

  nohz.idle_cpus_mask = S
  nohz.next_balance = T

When a new CPU k goes NOHZ idle (nohz_balance_enter_idle()), we end up
with:

  nohz.idle_cpus_mask = S ∪ {k}
  nohz.next_balance = T

Note that the nohz.next_balance hasn't changed - it won't be updated until
a NOHZ balance is triggered. This is problematic if the newly NOHZ idle CPU
has an earlier rq.next_balance than the other NOHZ idle CPUs, IOW if:

  cpu_rq(k).next_balance < nohz.next_balance

In such scenarios, the existing nohz.next_balance will prevent any NOHZ
balance from happening, which itself will prevent nohz.next_balance from
being updated to this new cpu_rq(k).next_balance. Unnecessary load balance
delays of over 12ms caused by this were observed on an arm64 RB5 board.

Track which CPUs are iterated over during a NOHZ idle balance with a new
cpumask. When considering whether to kick a NOHZ idle balance, use this
cpumask to determine if any CPU has entered NOHZ idle but hasn't had its
rq.next_balance collated into nohz.next_balance yet, and kick a
NOHZ_STATS_KICK balance if that is the case.

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
  • Loading branch information
valschneider authored and intel-lab-lkp committed Jul 14, 2021
1 parent 031e3bd commit cbd87e97caf59c1a9d06d35e5a59404e4d7c8660
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 2 deletions.
@@ -8893,6 +8893,10 @@ static struct kmem_cache *task_group_cache __read_mostly;
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);

/*
 * Per-CPU scratch cpumask used by the NOHZ idle balance path.
 *
 * The guard must be spelled CONFIG_NO_HZ_COMMON, the same symbol that guards
 * the DEFINE_PER_CPU() of this mask. A misspelt guard is never defined, which
 * would silently drop both this declaration and the allocation below.
 */
#ifdef CONFIG_NO_HZ_COMMON
DECLARE_PER_CPU(cpumask_var_t, nohz_balance_mask);
#endif /* CONFIG_NO_HZ_COMMON */

void __init sched_init(void)
{
unsigned long ptr = 0;
@@ -8942,6 +8946,10 @@ void __init sched_init(void)
cpumask_size(), GFP_KERNEL, cpu_to_node(i));
per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
cpumask_size(), GFP_KERNEL, cpu_to_node(i));
#ifdef CONFIG_NO_HZ_COMMON
/* Allocated alongside the other per-CPU balance masks. */
per_cpu(nohz_balance_mask, i) = (cpumask_var_t)kzalloc_node(
cpumask_size(), GFP_KERNEL, cpu_to_node(i));
#endif /* CONFIG_NO_HZ_COMMON */
}
#endif /* CONFIG_CPUMASK_OFFSTACK */

@@ -5694,8 +5694,11 @@ DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);

#ifdef CONFIG_NO_HZ_COMMON

DEFINE_PER_CPU(cpumask_var_t, nohz_balance_mask);

static struct {
cpumask_var_t idle_cpus_mask;
cpumask_var_t idle_cpus_mask; /* CPUs in NOHZ idle */
cpumask_var_t last_balance_mask; /* CPUs covered by last NOHZ balance */
atomic_t nr_cpus;
int has_blocked; /* Idle CPUS has blocked load */
unsigned long next_balance; /* in jiffy units */
@@ -10351,6 +10354,13 @@ static void nohz_balancer_kick(struct rq *rq)
unlock:
rcu_read_unlock();
out:
/*
* Some CPUs have recently gone into NOHZ idle; kick a balance to
* collate the proper next balance interval.
*/
if (!cpumask_subset(nohz.idle_cpus_mask, nohz.last_balance_mask))
flags |= NOHZ_STATS_KICK;

if (flags)
kick_ilb(flags);
}
@@ -10487,6 +10497,7 @@ static bool update_nohz_stats(struct rq *rq)
static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
enum cpu_idle_type idle)
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(nohz_balance_mask);
/* Earliest time when we have to do rebalance again */
unsigned long now = jiffies;
unsigned long next_balance = now + 60*HZ;
@@ -10518,7 +10529,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
* Start with the next CPU after this_cpu so we will end with this_cpu and let a
* chance for other idle cpu to pull load.
*/
for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) {
cpumask_copy(cpus, nohz.idle_cpus_mask);
for_each_cpu_wrap(balance_cpu, cpus, this_cpu+1) {
if (!idle_cpu(balance_cpu))
continue;

@@ -10565,6 +10577,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
if (likely(update_next_balance))
nohz.next_balance = next_balance;

cpumask_copy(nohz.last_balance_mask, cpus);

WRITE_ONCE(nohz.next_blocked,
now + msecs_to_jiffies(LOAD_AVG_PERIOD));

@@ -11550,6 +11564,7 @@ __init void init_sched_fair_class(void)
nohz.next_balance = jiffies;
nohz.next_blocked = jiffies;
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
zalloc_cpumask_var(&nohz.last_balance_mask, GFP_NOWAIT);
#endif
#endif /* SMP */

0 comments on commit cbd87e9

Please sign in to comment.