
kernel - usched_dfly revamp (7), bring back td_release, sysv_sem, weights

* Bring back the td_release kernel priority adjustment.

* sysv_sem now attempts to delay wakeups until after releasing its token.

* Tune default weights.

* Do not depress priority until we've become the uschedcp.

* Fix priority sort for LWKT and usched_dfly to avoid context-switching
  across all runnable threads twice.
Commit e3e6be1f3ada3078bf270c3a65637a84a95c4585 (1 parent: 9e603ef), committed by Matthew Dillon on Sep 25, 2012
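The first bullet above, bringing back td_release, refers to LWKT's passive-release idea: a thread entering the kernel does not raise its LWKT priority immediately, it installs a release hook instead, and the boost only happens if the scheduler actually has to switch away from it. Below is a minimal userspace sketch of that idea; the fake_* names and priority values are hypothetical stand-ins, and only the final check mirrors the lwkt_passive_recover() fragment at the end of this diff.

#include <stdio.h>

#define PRI_USER  10
#define PRI_KERN  20

struct fake_thread {
        int  pri;                               /* current LWKT-style priority */
        void (*release)(struct fake_thread *);  /* stand-in for td_release */
};

/* Called only when something else needs the cpu: boost and consume the hook. */
static void
fake_passive_release(struct fake_thread *td)
{
        td->release = NULL;
        td->pri = PRI_KERN;
}

/* Syscall entry: defer the boost by installing the hook. */
static void
fake_kernel_entry(struct fake_thread *td)
{
        td->release = fake_passive_release;
}

/* Return to userland: undo the boost only if the hook was actually consumed. */
static void
fake_kernel_exit(struct fake_thread *td)
{
        if (td->release == NULL)
                td->pri = PRI_USER;
        td->release = NULL;
}

int
main(void)
{
        struct fake_thread td = { PRI_USER, NULL };

        fake_kernel_entry(&td);
        /* a contended path would call td.release(&td) here; this one does not */
        fake_kernel_exit(&td);
        printf("priority on return to user mode: %d\n", td.pri);
        return 0;
}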
@@ -464,6 +464,7 @@ DONTPROBE_1284 opt_ppb_1284.h
ENABLE_ALART opt_intpm.h
# These cause changes all over the kernel
+NO_LWKT_SPLIT_USERPRI opt_global.h
BLKDEV_IOSIZE opt_global.h
DEBUG opt_global.h
DEBUG_LOCKS opt_global.h
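The new opt_global.h entry inverts the sense of the old LWKT_SPLIT_USERPRI option: the user/kernel LWKT priority split is now the default and NO_LWKT_SPLIT_USERPRI opts out, which is why every #ifdef LWKT_SPLIT_USERPRI below becomes #ifndef NO_LWKT_SPLIT_USERPRI. A tiny standalone sketch of the same opt-out pattern (not kernel code):

#include <stdio.h>

/* #define NO_LWKT_SPLIT_USERPRI */    /* uncomment to opt out, as the kernel option does */

int
main(void)
{
#ifndef NO_LWKT_SPLIT_USERPRI
        printf("default: user and kernel LWKT priorities are split\n");
#else
        printf("opted out: no user/kernel priority split\n");
#endif
        return 0;
}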
@@ -673,10 +673,10 @@ lwp_fork(struct lwp *origlp, struct proc *destproc, int flags)
td->td_proc = destproc;
td->td_lwp = lp;
td->td_switch = cpu_heavy_switch;
-#ifdef LWKT_SPLIT_USERPRI
- lwkt_setpri(td, TDPRI_KERN_USER);
-#else
+#ifdef NO_LWKT_SPLIT_USERPRI
lwkt_setpri(td, TDPRI_USER_NORM);
+#else
+ lwkt_setpri(td, TDPRI_KERN_USER);
#endif
lwkt_set_comm(td, "%s", destproc->p_comm);
@@ -183,9 +183,7 @@ _lwkt_dequeue(thread_t td)
* There are a limited number of lwkt threads runnable since user
* processes only schedule one at a time per cpu. However, there can
* be many user processes in kernel mode exiting from a tsleep() which
- * become runnable. We do a secondary comparison using td_upri to try
- * to order these in the situation where several wake up at the same time
- * to avoid excessive switching.
+ * become runnable.
*
* NOTE: lwkt_schedulerclock() will force a round-robin based on td_pri and
* will ignore user priority. This is to ensure that user threads in
@@ -207,10 +205,23 @@ _lwkt_enqueue(thread_t td)
TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
atomic_set_int(&gd->gd_reqflags, RQF_RUNNING);
} else {
+ /*
+ * NOTE: td_upri - higher numbers more desirable, same sense
+ * as td_pri (typically reversed from lwp_upri).
+ *
+ * In the equal priority case we want the best selection
+ * at the beginning so the less desirable selections know
+ * that they have to setrunqueue/go-to-another-cpu, even
+ * though it means switching back to the 'best' selection.
+ * This also avoids degenerate situations when many threads
+ * are runnable or waking up at the same time.
+ *
+ * If upri matches exactly place at end/round-robin.
+ */
while (xtd &&
- (xtd->td_pri > td->td_pri ||
+ (xtd->td_pri >= td->td_pri ||
(xtd->td_pri == td->td_pri &&
- xtd->td_upri >= td->td_pri))) {
+ xtd->td_upri >= td->td_upri))) {
xtd = TAILQ_NEXT(xtd, td_threadq);
}
if (xtd)
@@ -719,7 +730,7 @@ lwkt_switch(void)
goto skip;
while ((ntd = TAILQ_NEXT(ntd, td_threadq)) != NULL) {
-#ifdef LWKT_SPLIT_USERPRI
+#ifndef NO_LWKT_SPLIT_USERPRI
/*
* Never schedule threads returning to userland or the
* user thread scheduler helper thread when higher priority
@@ -1144,7 +1155,7 @@ lwkt_passive_release(struct thread *td)
{
struct lwp *lp = td->td_lwp;
-#ifdef LWKT_SPLIT_USERPRI
+#ifndef NO_LWKT_SPLIT_USERPRI
td->td_release = NULL;
lwkt_setpri_self(TDPRI_KERN_USER);
#endif
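The reworked insertion loop above fixes the td_upri comparison (the old code compared xtd->td_upri against td->td_pri) and, per the new NOTE, keeps the best thread at the head of the run queue while round-robining exact matches at the tail. The userspace toy below mirrors that loop with a sys/queue.h TAILQ; the toy_* names are hypothetical, and the insert-before/insert-tail step is filled in as the natural completion of the truncated hunk.

#include <stdio.h>
#include <sys/queue.h>

struct toy_thread {
        int td_pri;     /* LWKT priority, higher is more desirable */
        int td_upri;    /* user priority hint, same sense as td_pri */
        const char *name;
        TAILQ_ENTRY(toy_thread) td_threadq;
};

TAILQ_HEAD(toy_runq, toy_thread);

/* Mirror of the insertion loop added to _lwkt_enqueue() above. */
static void
toy_enqueue(struct toy_runq *runq, struct toy_thread *td)
{
        struct toy_thread *xtd = TAILQ_FIRST(runq);

        if (xtd == NULL) {
                TAILQ_INSERT_TAIL(runq, td, td_threadq);
        } else {
                while (xtd &&
                       (xtd->td_pri >= td->td_pri ||
                        (xtd->td_pri == td->td_pri &&
                         xtd->td_upri >= td->td_upri))) {
                        xtd = TAILQ_NEXT(xtd, td_threadq);
                }
                if (xtd)
                        TAILQ_INSERT_BEFORE(xtd, td, td_threadq);
                else
                        TAILQ_INSERT_TAIL(runq, td, td_threadq);
        }
}

int
main(void)
{
        struct toy_runq runq = TAILQ_HEAD_INITIALIZER(runq);
        struct toy_thread a = { 10, 5, "a" };
        struct toy_thread b = { 10, 5, "b" };
        struct toy_thread c = { 12, 0, "c" };
        struct toy_thread *td;

        toy_enqueue(&runq, &a);
        toy_enqueue(&runq, &b);         /* equal pri/upri: goes behind 'a' (round-robin) */
        toy_enqueue(&runq, &c);         /* strictly higher pri: goes to the head */
        TAILQ_FOREACH(td, &runq, td_threadq)
                printf("%s (pri %d, upri %d)\n", td->name, td->td_pri, td->td_upri);
        return 0;
}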
@@ -773,7 +773,6 @@ sys_semop(struct semop_args *uap)
#ifdef SEM_DEBUG
kprintf("call to semop(%d, 0x%x, %u)\n", semid, sops, nsops);
#endif
-
if (!jail_sysvipc_allowed && td->td_ucred->cr_prison != NULL)
return (ENOSYS);
@@ -783,8 +782,11 @@ sys_semop(struct semop_args *uap)
eval = EINVAL;
goto done2;
}
+
+ wakeup_start_delayed();
semaptr = &sema[semid];
lockmgr(&semaptr->lk, LK_SHARED);
+
if ((semaptr->ds.sem_perm.mode & SEM_ALLOC) == 0) {
eval = EINVAL;
goto done;
@@ -948,7 +950,7 @@ sys_semop(struct semop_args *uap)
#endif
gen = semaptr->gen;
lockmgr(&semaptr->lk, LK_RELEASE);
- eval = tsleep(semptr, PCATCH | PINTERLOCKED, "semwait", 0);
+ eval = tsleep(semptr, PCATCH | PINTERLOCKED, "semwait", hz);
lockmgr(&semaptr->lk, LK_SHARED);
#ifdef SEM_DEBUG
kprintf("semop: good morning (eval=%d)!\n", eval);
@@ -1073,6 +1075,7 @@ sys_semop(struct semop_args *uap)
eval = 0;
done:
lockmgr(&semaptr->lk, LK_RELEASE);
+ wakeup_end_delayed();
done2:
return(eval);
}
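wakeup_start_delayed() and wakeup_end_delayed() bracket the semaphore code so that wakeups issued while sysv_sem's locks are held are only delivered after they are dropped; a woken sleeper is then far less likely to immediately block on a lock the waker still holds. The kernel buffers the wakeups itself; the pthreads program below is only a rough userspace analogue of the same release-then-wake ordering, with toy_* names of the editor's invention.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t sem_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  sem_cv   = PTHREAD_COND_INITIALIZER;
static int sem_value;

static void
toy_semop_post(void)
{
        bool do_wakeup = false;

        pthread_mutex_lock(&sem_lock);
        if (++sem_value > 0)
                do_wakeup = true;       /* remember the wakeup ... */
        pthread_mutex_unlock(&sem_lock);

        if (do_wakeup)                  /* ... and deliver it after dropping the lock */
                pthread_cond_broadcast(&sem_cv);
}

static void *
toy_semop_wait(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&sem_lock);
        while (sem_value == 0)
                pthread_cond_wait(&sem_cv, &sem_lock);
        --sem_value;
        pthread_mutex_unlock(&sem_lock);
        return NULL;
}

int
main(void)
{
        pthread_t waiter;

        pthread_create(&waiter, NULL, toy_semop_wait, NULL);
        toy_semop_post();
        pthread_join(waiter, NULL);
        printf("waiter released, value now %d\n", sem_value);
        return 0;
}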
@@ -324,8 +324,11 @@ SYSINIT(runqueue, SI_BOOT2_USCHED, SI_ORDER_FIRST, bsd4_rqinit, NULL)
* It is responsible for making the thread the current designated userland
* thread for this cpu, blocking if necessary.
*
- * The kernel has already depressed our LWKT priority so we must not switch
- * until we have either assigned or disposed of the thread.
+ * The kernel will not depress our LWKT priority until after we return,
+ * in case we have to shove over to another cpu.
+ *
+ * We must determine our thread's disposition before we switch away. This
+ * is very sensitive code.
*
* WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
* TO ANOTHER CPU! Because most of the kernel assumes that no migration will
@@ -97,11 +97,12 @@ TAILQ_HEAD(rq, lwp);
#define lwp_uload lwp_usdata.dfly.uload
#define lwp_rqtype lwp_usdata.dfly.rqtype
#define lwp_qcpu lwp_usdata.dfly.qcpu
+#define lwp_rrcount lwp_usdata.dfly.rrcount
struct usched_dfly_pcpu {
struct spinlock spin;
struct thread helper_thread;
- short rrcount;
+ short unusde01;
short upri;
int uload;
int ucount;
@@ -255,7 +256,7 @@ SYSCTL_INT(_debug, OID_AUTO, dfly_chooser, CTLFLAG_RW,
static int usched_dfly_smt = 0;
static int usched_dfly_cache_coherent = 0;
static int usched_dfly_weight1 = 200; /* keep thread on current cpu */
-static int usched_dfly_weight2 = 120; /* synchronous peer's current cpu */
+static int usched_dfly_weight2 = 180; /* synchronous peer's current cpu */
static int usched_dfly_weight3 = 40; /* number of threads on queue */
static int usched_dfly_weight4 = 160; /* availability of idle cores */
static int usched_dfly_features = 0x8F; /* allow pulls */
@@ -281,8 +282,11 @@ KTR_INFO(KTR_USCHED_DFLY, usched, chooseproc, 0,
* It is responsible for making the thread the current designated userland
* thread for this cpu, blocking if necessary.
*
- * The kernel has already depressed our LWKT priority so we must not switch
- * until we have either assigned or disposed of the thread.
+ * The kernel will not depress our LWKT priority until after we return,
+ * in case we have to shove over to another cpu.
+ *
+ * We must determine our thread's disposition before we switch away. This
+ * is very sensitive code.
*
* WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
* TO ANOTHER CPU! Because most of the kernel assumes that no migration will
@@ -382,12 +386,17 @@ dfly_acquire_curproc(struct lwp *lp)
*
* It is important to do a masked test to avoid the edge
* case where two near-equal-priority threads are constantly
- * interrupting each other. Since our context is the one
- * that is active NOW, we WANT to steal the uschedcp
- * designation and not switch-flap.
+ * interrupting each other.
+ *
+ * In the exact match case another thread has already gained
+ * uschedcp and lowered its priority, if we steal it the
+ * other thread will stay stuck on the LWKT runq and not
+ * push to another cpu. So don't steal on equal-priority even
+ * though it might appear to be more beneficial due to not
+ * having to switch back to the other thread's context.
*/
if (dd->uschedcp &&
- (dd->upri & ~PPQMASK) >=
+ (dd->upri & ~PPQMASK) >
(lp->lwp_priority & ~PPQMASK)) {
dd->uschedcp = lp;
dd->upri = lp->lwp_priority;
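The steal test above goes from >= to a strict >, so an exact priority-queue match leaves the existing uschedcp in place instead of flapping the designation between two near-equal threads. A small illustration of the masked compare follows; TOY_PPQ is an illustrative stand-in for the kernel's PPQ/PPQMASK, and lower lwp_priority values are the more urgent ones.

#include <stdbool.h>
#include <stdio.h>

#define TOY_PPQ         4               /* priorities per queue, illustrative value */
#define TOY_PPQMASK     (TOY_PPQ - 1)

/*
 * May the acquiring lwp (priority lp_pri) steal the uschedcp designation
 * from the current holder (priority dd_upri)?  Only if the holder's masked
 * priority is strictly worse (numerically greater).
 */
static bool
toy_may_steal(int dd_upri, int lp_pri)
{
        return (dd_upri & ~TOY_PPQMASK) > (lp_pri & ~TOY_PPQMASK);
}

int
main(void)
{
        printf("same queue:       steal=%d\n", toy_may_steal(101, 100));
        printf("holder far worse: steal=%d\n", toy_may_steal(108, 100));
        return 0;
}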
@@ -516,7 +525,9 @@ dfly_select_curproc(globaldata_t gd)
atomic_set_cpumask(&dfly_curprocmask, CPUMASK(cpuid));
dd->upri = nlp->lwp_priority;
dd->uschedcp = nlp;
+#if 0
dd->rrcount = 0; /* reset round robin */
+#endif
spin_unlock(&dd->spin);
#ifdef SMP
lwkt_acquire(nlp->lwp_thread);
@@ -753,9 +764,8 @@ dfly_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
* Do we need to round-robin? We round-robin 10 times a second.
* This should only occur for cpu-bound batch processes.
*/
- if (++dd->rrcount >= usched_dfly_rrinterval) {
+ if (++lp->lwp_rrcount >= usched_dfly_rrinterval) {
lp->lwp_thread->td_wakefromcpu = -1;
- dd->rrcount = 0;
need_user_resched();
}
@@ -823,7 +833,9 @@ dfly_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
atomic_set_cpumask(&dfly_curprocmask, dd->cpumask);
dd->upri = nlp->lwp_priority;
dd->uschedcp = nlp;
+#if 0
dd->rrcount = 0; /* reset round robin */
+#endif
spin_unlock(&dd->spin);
lwkt_acquire(nlp->lwp_thread);
lwkt_schedule(nlp->lwp_thread);
@@ -1488,11 +1500,16 @@ dfly_choose_best_queue(struct lwp *lp)
load += rdd->ucount * usched_dfly_weight3;
if (rdd->uschedcp == NULL &&
- rdd->runqcount == 0) {
+ rdd->runqcount == 0 &&
+ globaldata_find(cpuid)->gd_tdrunqcount == 0
+ ) {
load -= usched_dfly_weight4;
- } else if (rdd->upri > lp->lwp_priority + PPQ) {
+ }
+#if 0
+ else if (rdd->upri > lp->lwp_priority + PPQ) {
load -= usched_dfly_weight4 / 2;
}
+#endif
mask &= ~CPUMASK(cpuid);
++count;
}
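Only fragments of dfly_choose_best_queue() are visible here, but the two terms above, together with weight2 being raised from 120 to 180 earlier in this diff, give the flavor of the scoring: each candidate cpu accumulates load from the threads queued on it and earns a large credit if it is completely idle. The sketch below is a heavy simplification under those assumptions (the weight1 affinity term is inferred from its comment, not from code shown here), with lower scores marking the more attractive cpu.

#include <stdbool.h>
#include <stdio.h>

static int weight1 = 200;       /* keep thread on current cpu */
static int weight3 = 40;        /* number of threads on queue */
static int weight4 = 160;       /* availability of idle cores */

struct toy_cpu {
        int  ucount;            /* user threads accounted to this cpu */
        bool idle;              /* no uschedcp, empty user and LWKT run queues */
};

/* Lower score: more attractive target for the thread being placed. */
static int
toy_cpu_score(const struct toy_cpu *cpu, bool is_current_cpu)
{
        int load = 0;

        load += cpu->ucount * weight3;  /* per the hunk above */
        if (cpu->idle)
                load -= weight4;        /* per the hunk above */
        if (is_current_cpu)
                load -= weight1;        /* assumed affinity credit */
        return load;
}

int
main(void)
{
        struct toy_cpu busy = { 3, false };
        struct toy_cpu idle = { 0, true };

        printf("current, busy cpu: %d\n", toy_cpu_score(&busy, true));
        printf("remote, idle cpu:  %d\n", toy_cpu_score(&idle, false));
        return 0;
}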
@@ -1652,9 +1669,12 @@ dfly_choose_worst_queue(dfly_pcpu_t dd)
globaldata_find(cpuid)->gd_tdrunqcount == 0
) {
load -= usched_dfly_weight4;
- } else if (rdd->upri > dd->upri + PPQ) {
+ }
+#if 0
+ else if (rdd->upri > dd->upri + PPQ) {
load -= usched_dfly_weight4 / 2;
}
+#endif
mask &= ~CPUMASK(cpuid);
++count;
}
@@ -1901,11 +1921,24 @@ dfly_setrunqueue_locked(dfly_pcpu_t rdd, struct lwp *lp)
* we want a reschedule, calculate the best cpu for the job.
*
* Always run reschedules on the LWPs original cpu.
+ *
+ * If the lp's rrcount has not been exhausted we want to resume with
+ * it when this queue is reached the next time, instead of resuming
+ * with a different lp. This improves cache effects and also avoids
+ * leaving interrupted MP servers out in the cold holding internal
+ * locks while trying to run a different thread.
*/
KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
atomic_set_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
++rdd->runqcount;
- TAILQ_INSERT_TAIL(q, lp, lwp_procq);
+ if (lp->lwp_rrcount >= usched_dfly_rrinterval) {
+ lp->lwp_rrcount = 0;
+ TAILQ_INSERT_TAIL(q, lp, lwp_procq);
+ } else {
+ TAILQ_INSERT_HEAD(q, lp, lwp_procq);
+ if (TAILQ_NEXT(lp, lwp_procq) == NULL)
+ lp->lwp_rrcount = 0;
+ }
*which |= 1 << pri;
}
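The new requeue policy keeps an lwp whose round-robin quantum is not yet exhausted at the head of its queue, so it resumes next on that queue, which is what the comment about cache effects and interrupted MP servers refers to; only an lwp that has used up its rrcount is cycled to the tail. The toy below mirrors that insertion with a sys/queue.h TAILQ; the toy_* names and the rrinterval value are illustrative stand-ins.

#include <stdio.h>
#include <sys/queue.h>

#define TOY_RRINTERVAL  10      /* stands in for usched_dfly_rrinterval */

struct toy_lwp {
        const char *name;
        int lwp_rrcount;        /* scheduler ticks consumed this round */
        TAILQ_ENTRY(toy_lwp) lwp_procq;
};

TAILQ_HEAD(toy_q, toy_lwp);

/* Mirrors the insertion policy added to dfly_setrunqueue_locked() above. */
static void
toy_setrunqueue(struct toy_q *q, struct toy_lwp *lp)
{
        if (lp->lwp_rrcount >= TOY_RRINTERVAL) {
                lp->lwp_rrcount = 0;                    /* quantum used up: go to the back */
                TAILQ_INSERT_TAIL(q, lp, lwp_procq);
        } else {
                TAILQ_INSERT_HEAD(q, lp, lwp_procq);    /* resume this lwp first */
                if (TAILQ_NEXT(lp, lwp_procq) == NULL)
                        lp->lwp_rrcount = 0;            /* alone on the queue: fresh round */
        }
}

int
main(void)
{
        struct toy_q q = TAILQ_HEAD_INITIALIZER(q);
        struct toy_lwp waiting = { "waiting", 0 };
        struct toy_lwp preempted = { "preempted", 4 };  /* quantum not exhausted */
        struct toy_lwp *lp;

        toy_setrunqueue(&q, &waiting);
        toy_setrunqueue(&q, &preempted);        /* goes to the head, resumes first */
        TAILQ_FOREACH(lp, &q, lwp_procq)
                printf("%s (rrcount %d)\n", lp->name, lp->lwp_rrcount);
        return 0;
}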
@@ -1958,7 +1991,9 @@ dfly_helper_thread(void *dummy)
atomic_set_cpumask(&dfly_rdyprocmask, mask);
clear_user_resched(); /* This satisfied the reschedule request */
+#if 0
dd->rrcount = 0; /* Reset the round-robin counter */
+#endif
if (dd->runqcount || dd->uschedcp != NULL) {
/*
@@ -1971,7 +2006,9 @@ dfly_helper_thread(void *dummy)
atomic_set_cpumask(&dfly_curprocmask, mask);
dd->upri = nlp->lwp_priority;
dd->uschedcp = nlp;
+#if 0
dd->rrcount = 0; /* reset round robin */
+#endif
spin_unlock(&dd->spin);
lwkt_acquire(nlp->lwp_thread);
lwkt_schedule(nlp->lwp_thread);
@@ -2009,7 +2046,9 @@ dfly_helper_thread(void *dummy)
atomic_set_cpumask(&dfly_curprocmask, mask);
dd->upri = nlp->lwp_priority;
dd->uschedcp = nlp;
+#if 0
dd->rrcount = 0; /* reset round robin */
+#endif
spin_unlock(&dd->spin);
lwkt_acquire(nlp->lwp_thread);
lwkt_schedule(nlp->lwp_thread);
@@ -132,6 +132,12 @@ SYSINIT(runqueue, SI_BOOT2_USCHED, SI_ORDER_FIRST, dummyinit, NULL)
* It is responsible for making the thread the current designated userland
* thread for this cpu, blocking if necessary.
*
+ * The kernel will not depress our LWKT priority until after we return,
+ * in case we have to shove over to another cpu.
+ *
+ * We must determine our thread's disposition before we switch away. This
+ * is very sensitive code.
+ *
* We are expected to handle userland reschedule requests here too.
*
* WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
@@ -356,19 +356,19 @@ userexit(struct lwp *lp)
}
/*
- * Reduce our priority in preparation for a return to userland. If
- * our passive release function was still in place, our priority was
- * never raised and does not need to be reduced.
- */
- lwkt_passive_recover(td);
-
- /*
* Become the current user scheduled process if we aren't already,
* and deal with reschedule requests and other factors.
*/
lp->lwp_proc->p_usched->acquire_curproc(lp);
/* WARNING: we may have migrated cpu's */
/* gd = td->td_gd; */
+
+ /*
+ * Reduce our priority in preparation for a return to userland. If
+ * our passive release function was still in place, our priority was
+ * never raised and does not need to be reduced.
+ */
+ lwkt_passive_recover(td);
}
#if !defined(KTR_KERNENTRY)
@@ -351,13 +351,14 @@ userexit(struct lwp *lp)
*/
lwkt_passive_recover(td);
+ /* WARNING: we may have migrated cpu's */
+ /* gd = td->td_gd; */
+
/*
* Become the current user scheduled process if we aren't already,
* and deal with reschedule requests and other factors.
*/
lp->lwp_proc->p_usched->acquire_curproc(lp);
- /* WARNING: we may have migrated cpu's */
- /* gd = td->td_gd; */
}
#if !defined(KTR_KERNENTRY)
@@ -330,19 +330,19 @@ userexit(struct lwp *lp)
}
/*
- * Reduce our priority in preparation for a return to userland. If
- * our passive release function was still in place, our priority was
- * never raised and does not need to be reduced.
- */
- lwkt_passive_recover(td);
-
- /*
* Become the current user scheduled process if we aren't already,
* and deal with reschedule requests and other factors.
*/
lp->lwp_proc->p_usched->acquire_curproc(lp);
/* WARNING: we may have migrated cpu's */
/* gd = td->td_gd; */
+
+ /*
+ * Reduce our priority in preparation for a return to userland. If
+ * our passive release function was still in place, our priority was
+ * never raised and does not need to be reduced.
+ */
+ lwkt_passive_recover(td);
}
#if !defined(KTR_KERNENTRY)
@@ -261,7 +261,7 @@ lwkt_getpri_self(void)
static __inline void
lwkt_passive_recover(thread_t td)
{
-#ifdef LWKT_SPLIT_USERPRI
+#ifndef NO_LWKT_SPLIT_USERPRI
if (td->td_release == NULL)
lwkt_setpri_self(TDPRI_USER_NORM);
td->td_release = NULL;