kernel - usched_dfly revamp (3), fix estcpu

* Fix the estcpu calculation, which previously assumed only a single
  runq (in usched_dfly there is a runq per cpu).

* Add a global atomic int accounting for all running and runnable lwps.

* Fix cpu-hogging issues for bursty processes by creating a fast-decay mode
  for estcpu when a thread first starts up, or after it has been asleep
  for more than 1 second.
commit bc55d64fabb369afb8b8b6a30341793ecea21278 (parent 09f49d5)
Matthew Dillon authored
sys/kern/kern_synch.c (33 changed lines)
@@ -141,24 +141,8 @@ sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
141 141 SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
142 142 0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");
143 143
144   -/*
145   - * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
146   - * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
147   - * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
148   - *
149   - * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
150   - * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
151   - *
152   - * If you don't want to bother with the faster/more-accurate formula, you
153   - * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
154   - * (more general) method of calculating the %age of CPU used by a process.
155   - *
156   - * decay 95% of `lwp_pctcpu' in 60 seconds; see CCPU_SHIFT before changing
157   - */
158   -#define CCPU_SHIFT 11
159   -
160   -static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
161   -SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
  144 +static int pctcpu_decay = 10;
  145 +SYSCTL_INT(_kern, OID_AUTO, pctcpu_decay, CTLFLAG_RW, &pctcpu_decay, 0, "");
162 146
163 147 /*
164 148 * kernel uses `FSCALE', userland (SHOULD) use kern.fscale
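
The read-only kern.ccpu export and its CCPU_SHIFT machinery are replaced above by a single read-write tunable, kern.pctcpu_decay. For reference, the small userland program below (assuming a DragonFly kernel that includes this change) reads the new knob through sysctlbyname(3); only the read path is shown, and the program name and structure are illustrative, not part of the commit.

/*
 * Sketch: read the new kern.pctcpu_decay tunable from userland.
 * A write would pass a new value via the last two sysctlbyname()
 * arguments and requires sufficient privilege.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
        int decay;
        size_t len = sizeof(decay);

        if (sysctlbyname("kern.pctcpu_decay", &decay, &len, NULL, 0) != 0) {
                perror("sysctlbyname");
                return 1;
        }
        printf("kern.pctcpu_decay = %d\n", decay);
        return 0;
}
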
@@ -225,11 +209,20 @@ schedcpu_stats(struct proc *p, void *data __unused)
225 209 /*
226 210 * Only recalculate processes that are active or have slept
227 211 * less than 2 seconds. The schedulers understand this.
  212 + * Otherwise decay by 50% per second.
228 213 */
229 214 if (lp->lwp_slptime <= 1) {
230 215 p->p_usched->recalculate(lp);
231 216 } else {
232   - lp->lwp_pctcpu = (lp->lwp_pctcpu * ccpu) >> FSHIFT;
  217 + int decay;
  218 +
  219 + decay = pctcpu_decay;
  220 + cpu_ccfence();
  221 + if (decay <= 1)
  222 + decay = 1;
  223 + if (decay > 100)
  224 + decay = 100;
  225 + lp->lwp_pctcpu = (lp->lwp_pctcpu * (decay - 1)) / decay;
233 226 }
234 227 }
235 228 lwkt_reltoken(&p->p_token);
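
The replacement decay in the hunk above is a simple rational step rather than the old fixed-point ccpu multiply: each second an lwp that has been asleep keeps (decay - 1)/decay of its pctcpu, with the tunable clamped to the range 1..100. A minimal standalone sketch of that arithmetic, assuming FSCALE is 2048 and a starting pctcpu of 100%:

/*
 * Sketch of the new pctcpu decay for lwps asleep for more than a
 * second.  FSCALE and the starting value are assumptions for the
 * example; pctcpu_decay = 10 is the sysctl default from the diff.
 */
#include <stdio.h>

#define FSCALE 2048                     /* assumed fixed-point scale */

int
main(void)
{
        int pctcpu_decay = 10;          /* kern.pctcpu_decay default */
        int decay = pctcpu_decay;
        unsigned int pctcpu = FSCALE;   /* pretend the lwp was at 100% */

        /* same clamping the kernel applies */
        if (decay <= 1)
                decay = 1;
        if (decay > 100)
                decay = 100;

        for (int sec = 1; sec <= 5; ++sec) {
                pctcpu = pctcpu * (decay - 1) / decay;
                printf("after %d s asleep: %.1f%% cpu\n",
                    sec, 100.0 * pctcpu / FSCALE);
        }
        return 0;
}
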
@@ -298,8 +291,6 @@ schedcpu_resource(struct proc *p, void *data __unused)
298 291 /*
299 292 * This is only used by ps. Generate a cpu percentage use over
300 293 * a period of one second.
301   - *
302   - * MPSAFE
303 294 */
304 295 void
305 296 updatepcpu(struct lwp *lp, int cpticks, int ttlticks)
sys/kern/usched_dfly.c (117 changed lines)
@@ -93,7 +93,7 @@ TAILQ_HEAD(rq, lwp);
93 93 #define lwp_forked lwp_usdata.dfly.forked
94 94 #define lwp_rqindex lwp_usdata.dfly.rqindex
95 95 #define lwp_estcpu lwp_usdata.dfly.estcpu
96   -#define lwp_batch lwp_usdata.dfly.batch
  96 +#define lwp_estfast lwp_usdata.dfly.estfast
97 97 #define lwp_rqtype lwp_usdata.dfly.rqtype
98 98 #define lwp_qcpu lwp_usdata.dfly.qcpu
99 99
@@ -185,6 +185,7 @@ static cpumask_t dfly_rdyprocmask; /* ready to accept a user process */
185 185 #ifdef SMP
186 186 static volatile int dfly_scancpu;
187 187 #endif
  188 +static volatile int dfly_ucount; /* total running on whole system */
188 189 static struct usched_dfly_pcpu dfly_pcpu[MAXCPU];
189 190 static struct sysctl_ctx_list usched_dfly_sysctl_ctx;
190 191 static struct sysctl_oid *usched_dfly_sysctl_tree;
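
Every site that adjusts a per-cpu ucount in the rest of this diff now mirrors the same +1/-1 into dfly_ucount, so the new global tracks the total number of running and runnable lwps across all cpus. Below is a schematic of that pattern, using C11 atomics in place of the kernel's atomic_add_int() and hypothetical helper names; the kernel open-codes these updates at each call site.

#include <stdatomic.h>
#include <stdio.h>

struct pcpu {
        atomic_int ucount;              /* runnable lwps on this cpu */
};

static atomic_int dfly_ucount;          /* runnable lwps system-wide */

static void
uload_insert(struct pcpu *dd)
{
        atomic_fetch_add(&dd->ucount, 1);
        atomic_fetch_add(&dfly_ucount, 1);      /* keep the global in sync */
}

static void
uload_remove(struct pcpu *dd)
{
        atomic_fetch_add(&dd->ucount, -1);
        atomic_fetch_add(&dfly_ucount, -1);
}

int
main(void)
{
        struct pcpu cpu0;

        atomic_init(&cpu0.ucount, 0);
        uload_insert(&cpu0);            /* an lwp becomes runnable on cpu0 */
        uload_remove(&cpu0);            /* and leaves the run queue again */
        printf("dfly_ucount = %d\n", atomic_load(&dfly_ucount));
        return 0;
}

dfly_recalculate_estcpu() later samples dfly_ucount with a plain read; the load compensation only needs an approximate count, so no lock is taken around it.
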
@@ -260,7 +261,6 @@ static int usched_dfly_features = 0x8F; /* allow pulls */
260 261 #endif
261 262 static int usched_dfly_rrinterval = (ESTCPUFREQ + 9) / 10;
262 263 static int usched_dfly_decay = 8;
263   -static int usched_dfly_batch_time = 10;
264 264
265 265 /* KTR debug printings */
266 266
@@ -644,6 +644,7 @@ dfly_changeqcpu_locked(struct lwp *lp, dfly_pcpu_t dd, dfly_pcpu_t rdd)
644 644 atomic_add_int(&dd->uload,
645 645 -((lp->lwp_priority & ~PPQMASK) & PRIMASK));
646 646 atomic_add_int(&dd->ucount, -1);
  647 + atomic_add_int(&dfly_ucount, -1);
647 648 }
648 649 lp->lwp_qcpu = rdd->cpuid;
649 650 }
@@ -824,11 +825,8 @@ dfly_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
824 825 * Called from acquire and from kern_synch's one-second timer (one of the
825 826 * callout helper threads) with a critical section held.
826 827 *
827   - * Decay p_estcpu based on the number of ticks we haven't been running
828   - * and our p_nice. As the load increases each process observes a larger
829   - * number of idle ticks (because other processes are running in them).
830   - * This observation leads to a larger correction which tends to make the
831   - * system more 'batchy'.
  828 + * Adjust p_estcpu based on our single-cpu load, p_nice, and compensate for
  829 + * overall system load.
832 830 *
833 831 * Note that no recalculation occurs for a process which sleeps and wakes
834 832 * up in the same tick. That is, a system doing thousands of context
@@ -840,11 +838,11 @@ void
840 838 dfly_recalculate_estcpu(struct lwp *lp)
841 839 {
842 840 globaldata_t gd = mycpu;
843   - dfly_pcpu_t dd = &dfly_pcpu[gd->gd_cpuid];
844 841 sysclock_t cpbase;
845 842 sysclock_t ttlticks;
846 843 int estcpu;
847 844 int decay_factor;
  845 + int ucount;
848 846
849 847 /*
850 848 * We have to subtract periodic to get the last schedclock
@@ -863,9 +861,7 @@ dfly_recalculate_estcpu(struct lwp *lp)
863 861 dfly_resetpriority(lp);
864 862 lp->lwp_cpbase = cpbase;
865 863 lp->lwp_cpticks = 0;
866   - lp->lwp_batch -= ESTCPUFREQ;
867   - if (lp->lwp_batch < 0)
868   - lp->lwp_batch = 0;
  864 + lp->lwp_estfast = 0;
869 865 } else if (lp->lwp_cpbase != cpbase) {
870 866 /*
871 867 * Adjust estcpu if we are in a different tick. Don't waste
@@ -887,51 +883,41 @@ dfly_recalculate_estcpu(struct lwp *lp)
887 883 updatepcpu(lp, lp->lwp_cpticks, ttlticks);
888 884
889 885 /*
890   - * Calculate the percentage of one cpu used factoring in ncpus
891   - * and the load and adjust estcpu. Handle degenerate cases
892   - * by adding 1 to runqcount.
893   - *
894   - * estcpu is scaled by ESTCPUMAX.
  886 + * Calculate the percentage of one cpu being used then
  887 + * compensate for any system load in excess of ncpus.
895 888 *
896   - * runqcount is the excess number of user processes
897   - * that cannot be immediately scheduled to cpus. We want
898   - * to count these as running to avoid range compression
899   - * in the base calculation (which is the actual percentage
900   - * of one cpu used).
901   - */
902   - estcpu = (lp->lwp_cpticks * ESTCPUMAX) *
903   - (dd->runqcount + ncpus) / (ncpus * ttlticks);
904   -
905   - /*
906   - * If estcpu is > 50% we become more batch-like
907   - * If estcpu is <= 50% we become less batch-like
  889 + * For example, if we have 8 cores and 16 running cpu-bound
  890 + * processes then all things being equal each process will
  891 + * get 50% of one cpu. We need to pump this value back
  892 + * up to 100% so the estcpu calculation properly adjusts
  893 + * the process's dynamic priority.
908 894 *
909   - * It takes 30 cpu seconds to traverse the entire range.
  895 + * estcpu is scaled by ESTCPUMAX, pctcpu is scaled by FSCALE.
910 896 */
911   - if (estcpu > ESTCPUMAX / 2) {
912   - lp->lwp_batch += ttlticks;
913   - if (lp->lwp_batch > BATCHMAX)
914   - lp->lwp_batch = BATCHMAX;
915   - } else {
916   - lp->lwp_batch -= ttlticks;
917   - if (lp->lwp_batch < 0)
918   - lp->lwp_batch = 0;
  897 + estcpu = (lp->lwp_pctcpu * ESTCPUMAX) >> FSHIFT;
  898 + ucount = dfly_ucount;
  899 + if (ucount > ncpus) {
  900 + estcpu += estcpu * (ucount - ncpus) / ncpus;
919 901 }
920 902
921 903 if (usched_dfly_debug == lp->lwp_proc->p_pid) {
922   - kprintf("pid %d lwp %p estcpu %3d %3d bat %d cp %d/%d",
  904 + kprintf("pid %d lwp %p estcpu %3d %3d cp %d/%d",
923 905 lp->lwp_proc->p_pid, lp,
924 906 estcpu, lp->lwp_estcpu,
925   - lp->lwp_batch,
926 907 lp->lwp_cpticks, ttlticks);
927 908 }
928 909
929 910 /*
930 911 * Adjust lp->lwp_estcpu. The decay factor determines how
931 912 * quickly lwp_estcpu collapses to its realtime calculation.
932   - * A slower collapse gives us a more accurate number but
933   - * can cause a cpu hog to eat too much cpu before the
934   - * scheduler decides to downgrade it.
  913 + * A slower collapse gives us a more accurate number over
  914 + * the long term but can create problems with bursty threads
  915 + * or threads which become cpu hogs.
  916 + *
  917 + * To solve this problem, newly started lwps and lwps which
  918 + * are restarting after having been asleep for a while are
  919 + * given a much, much faster decay in order to quickly
  920 + * detect whether they become cpu-bound.
935 921 *
936 922 * NOTE: p_nice is accounted for in dfly_resetpriority(),
937 923 * and not here, but we must still ensure that a
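
Working the hunk's own 8-core/16-process example through the new code: each cpu-bound lwp measures roughly 50% of one cpu, and the excess-load term scales that back up to the full-cpu figure the dynamic-priority calculation expects. A quick standalone check, with the ESTCPUMAX and FSHIFT values assumed for illustration:

#include <stdio.h>

#define ESTCPUMAX 200                   /* assumed cap for the example */
#define FSHIFT    11                    /* assumed pctcpu fixed-point shift */

int
main(void)
{
        int ncpus = 8, ucount = 16;             /* the comment's example */
        int pctcpu = (1 << FSHIFT) / 2;         /* 50% of one cpu */
        int estcpu = (pctcpu * ESTCPUMAX) >> FSHIFT;    /* 100 */

        /* compensate for more runnable lwps than cpus */
        if (ucount > ncpus)
                estcpu += estcpu * (ucount - ncpus) / ncpus;
        printf("compensated estcpu: %d of %d\n", estcpu, ESTCPUMAX); /* 200 */
        return 0;
}
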
@@ -947,9 +933,16 @@ dfly_recalculate_estcpu(struct lwp *lp)
947 933 if (decay_factor > 1024)
948 934 decay_factor = 1024;
949 935
950   - lp->lwp_estcpu = ESTCPULIM(
951   - (lp->lwp_estcpu * decay_factor + estcpu) /
952   - (decay_factor + 1));
  936 + if (lp->lwp_estfast < usched_dfly_decay) {
  937 + ++lp->lwp_estfast;
  938 + lp->lwp_estcpu = ESTCPULIM(
  939 + (lp->lwp_estcpu * lp->lwp_estfast + estcpu) /
  940 + (lp->lwp_estfast + 1));
  941 + } else {
  942 + lp->lwp_estcpu = ESTCPULIM(
  943 + (lp->lwp_estcpu * decay_factor + estcpu) /
  944 + (decay_factor + 1));
  945 + }
953 946
954 947 if (usched_dfly_debug == lp->lwp_proc->p_pid)
955 948 kprintf(" finalestcpu %d\n", lp->lwp_estcpu);
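
The effect of lwp_estfast is to shrink the averaging window right after a fork or a wakeup: for the first few recalculations the old estcpu is weighted by the small, still-growing estfast value rather than by decay_factor, so a thread that turns out to be cpu-bound is caught within a handful of ticks. A standalone sketch of the two branches follows; the blend() helper and ESTCPUMAX value are assumptions, while decay_factor = 8 mirrors the usched_dfly_decay default in this diff.

#include <stdio.h>

#define ESTCPUMAX 200                   /* assumed cap for the example */

static int
blend(int old, int target, int weight)
{
        int estcpu = (old * weight + target) / (weight + 1);

        return estcpu > ESTCPUMAX ? ESTCPUMAX : estcpu;
}

int
main(void)
{
        int decay_factor = 8;   /* usched_dfly_decay default */
        int estfast = 0;        /* zeroed at fork and after a long sleep */
        int estcpu = 0;
        int target = ESTCPUMAX; /* thread suddenly becomes cpu-bound */

        for (int tick = 1; tick <= 12; ++tick) {
                int weight;

                if (estfast < decay_factor)
                        weight = ++estfast;     /* fast collapse */
                else
                        weight = decay_factor;  /* steady-state collapse */
                estcpu = blend(estcpu, target, weight);
                printf("recalc %2d  weight %d  estcpu %3d\n",
                    tick, weight, estcpu);
        }
        return 0;
}

With the fast branch the value covers most of the distance to the target within two or three recalculations, whereas a constant weight of 8 approaches it far more slowly; that is the burst-detection fix described in the commit message.
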
@@ -1010,12 +1003,9 @@ dfly_resetpriority(struct lwp *lp)
1010 1003 break;
1011 1004 case RTP_PRIO_NORMAL:
1012 1005 /*
1013   - * Detune estcpu based on batchiness. lwp_batch ranges
1014   - * from 0 to BATCHMAX. Limit estcpu for the sake of
1015   - * the priority calculation to between 50% and 100%.
  1006 + *
1016 1007 */
1017   - estcpu = lp->lwp_estcpu * (lp->lwp_batch + BATCHMAX) /
1018   - (BATCHMAX * 2);
  1008 + estcpu = lp->lwp_estcpu;
1019 1009
1020 1010 /*
1021 1011 * p_nice piece Adds (0-40) * 2 0-80
@@ -1152,8 +1142,9 @@ dfly_yield(struct lwp *lp)
1152 1142 *
1153 1143 * Give the child process an initial estcpu that is more batch than
1154 1144 * its parent and dock the parent for the fork (but do not
1155   - * reschedule the parent). This comprises the main part of our batch
1156   - * detection heuristic for both parallel forking and sequential execs.
  1145 + * reschedule the parent).
  1146 + *
  1147 + * fast
1157 1148 *
1158 1149 * XXX lwp should be "spawning" instead of "forking"
1159 1150 */
@@ -1166,13 +1157,7 @@ dfly_forking(struct lwp *plp, struct lwp *lp)
1166 1157 */
1167 1158 lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ * 4);
1168 1159 lp->lwp_forked = 1;
1169   -
1170   - /*
1171   - * The batch status of children always starts out centerline
1172   - * and will inch-up or inch-down as appropriate. It takes roughly
1173   - * ~15 seconds of >50% cpu to hit the limit.
1174   - */
1175   - lp->lwp_batch = BATCHMAX / 2;
  1160 + lp->lwp_estfast = 0;
1176 1161
1177 1162 /*
1178 1163 * Dock the parent a cost for the fork, protecting us from fork
@@ -1201,6 +1186,7 @@ dfly_exiting(struct lwp *lp, struct proc *child_proc)
1201 1186 atomic_add_int(&dd->uload,
1202 1187 -((lp->lwp_priority & ~PPQMASK) & PRIMASK));
1203 1188 atomic_add_int(&dd->ucount, -1);
  1189 + atomic_add_int(&dfly_ucount, -1);
1204 1190 }
1205 1191 }
1206 1192
@@ -1225,6 +1211,7 @@ dfly_uload_update(struct lwp *lp)
1225 1211 atomic_add_int(&dd->uload,
1226 1212 ((lp->lwp_priority & ~PPQMASK) & PRIMASK));
1227 1213 atomic_add_int(&dd->ucount, 1);
  1214 + atomic_add_int(&dfly_ucount, 1);
1228 1215 }
1229 1216 spin_unlock(&dd->spin);
1230 1217 }
@@ -1237,6 +1224,7 @@ dfly_uload_update(struct lwp *lp)
1237 1224 atomic_add_int(&dd->uload,
1238 1225 -((lp->lwp_priority & ~PPQMASK) & PRIMASK));
1239 1226 atomic_add_int(&dd->ucount, -1);
  1227 + atomic_add_int(&dfly_ucount, -1);
1240 1228 }
1241 1229 spin_unlock(&dd->spin);
1242 1230 }
@@ -1353,11 +1341,13 @@ dfly_chooseproc_locked(dfly_pcpu_t rdd, dfly_pcpu_t dd,
1353 1341 atomic_add_int(&rdd->uload,
1354 1342 -((lp->lwp_priority & ~PPQMASK) & PRIMASK));
1355 1343 atomic_add_int(&rdd->ucount, -1);
  1344 + atomic_add_int(&dfly_ucount, -1);
1356 1345 }
1357 1346 lp->lwp_qcpu = dd->cpuid;
1358 1347 atomic_add_int(&dd->uload,
1359 1348 ((lp->lwp_priority & ~PPQMASK) & PRIMASK));
1360 1349 atomic_add_int(&dd->ucount, 1);
  1350 + atomic_add_int(&dfly_ucount, 1);
1361 1351 atomic_set_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
1362 1352 }
1363 1353 return lp;
@@ -1851,6 +1841,7 @@ dfly_setrunqueue_locked(dfly_pcpu_t rdd, struct lwp *lp)
1851 1841 atomic_add_int(&dfly_pcpu[lp->lwp_qcpu].uload,
1852 1842 (lp->lwp_priority & ~PPQMASK) & PRIMASK);
1853 1843 atomic_add_int(&dfly_pcpu[lp->lwp_qcpu].ucount, 1);
  1844 + atomic_add_int(&dfly_ucount, 1);
1854 1845 }
1855 1846
1856 1847 pri = lp->lwp_rqindex;
@@ -2152,10 +2143,6 @@ dfly_helper_thread_cpu_init(void)
2152 2143 SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2153 2144 OID_AUTO, "decay", CTLFLAG_RW,
2154 2145 &usched_dfly_decay, 0, "Extra decay when not running");
2155   - SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
2156   - SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2157   - OID_AUTO, "batch_time", CTLFLAG_RW,
2158   - &usched_dfly_batch_time, 0, "Min batch counter value");
2159 2146
2160 2147 /* Add enable/disable option for SMT scheduling if supported */
2161 2148 if (smt_not_supported) {
@@ -2257,10 +2244,6 @@ sched_sysctl_tree_init(void)
2257 2244 SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2258 2245 OID_AUTO, "decay", CTLFLAG_RW,
2259 2246 &usched_dfly_decay, 0, "Extra decay when not running");
2260   - SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
2261   - SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2262   - OID_AUTO, "batch_time", CTLFLAG_RW,
2263   - &usched_dfly_batch_time, 0, "Min batch counter value");
2264 2247 }
2265 2248 SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
2266 2249 sched_sysctl_tree_init, NULL)
sys/sys/usched.h (3 changed lines)
@@ -63,7 +63,8 @@ union usched_data {
63 63 short priority; /* lower is better */
64 64 char forked; /* lock cpu during fork */
65 65 char rqindex;
66   - int batch; /* batch mode heuristic */
  66 + short estfast; /* fast estcpu collapse mode */
  67 + short unused01;
67 68 int estcpu; /* dynamic priority modification */
68 69 u_short rqtype; /* protected copy of rtprio type */
69 70 u_short qcpu; /* which cpu are we enqueued on? */
